Changeset View
Standalone View
swh/web/api/views/raw.py
- This file was added.
# Copyright (C) 2018-2019 The Software Heritage developers | |||||||||||||||||||||||||
vlorentz: should be `2022` instead | |||||||||||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | |||||||||||||||||||||||||
# License: GNU Affero General Public License version 3, or any later version | |||||||||||||||||||||||||
# See top-level LICENSE file for more information | |||||||||||||||||||||||||
from django.http import HttpResponse | |||||||||||||||||||||||||
from swh.model.git_objects import ( | |||||||||||||||||||||||||
content_git_object, | |||||||||||||||||||||||||
directory_git_object, | |||||||||||||||||||||||||
revision_git_object, | |||||||||||||||||||||||||
release_git_object, | |||||||||||||||||||||||||
snapshot_git_object, | |||||||||||||||||||||||||
) | |||||||||||||||||||||||||
from swh.model import model | |||||||||||||||||||||||||
from swh.model.swhids import CoreSWHID, ObjectType | |||||||||||||||||||||||||
from swh.web.api.apidoc import api_doc, format_docstring | |||||||||||||||||||||||||
from swh.web.api.apiurls import api_route | |||||||||||||||||||||||||
from swh.web.common import archive | |||||||||||||||||||||||||
from swh.web.common.exc import NotFoundExc | |||||||||||||||||||||||||
from swh.web.common.utils import SWHID_RE | |||||||||||||||||||||||||
@api_route(
    f"/raw/(?P<swhid>{SWHID_RE})/",
    "api-1-raw-object",
)
@api_doc("/raw/")
@format_docstring()
def api_raw_object(request, swhid):
    """
    .. http:get:: /api/1/raw/<swhid>/

        Get the object corresponding to the SWHID in raw form.

        This endpoint exposes the internal representation (see the
        ``*_git_object`` functions in :mod:`swh.model.git_objects` for
        details), and so can be used to fetch a binary blob which hashes
        to the same identifier.

        :param string swhid: the object's SWHID

        :resheader Content-Type: application/octet-stream

        :statuscode 200: no error
        :statuscode 400: an invalid SWHID has been provided
        :statuscode 404: the requested object can not be found in the archive

        **Example:**

        .. parsed-literal::

            :swh_web_api:`raw/swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a`
    """
    # Function-scope imports keep this fix self-contained; both packages are
    # already dependencies of this module.
    from swh.model.exceptions import ValidationError
    from swh.web.common.exc import BadInputExc

    # Keep the URL parameter ``swhid`` as a plain string: it is reused below
    # for the error message and for the download filename. Rebinding it to the
    # parsed object made the later ``swhid.replace(...)`` call fail.
    try:
        parsed_swhid = CoreSWHID.from_string(swhid)
    except ValidationError as exc:
        # A string can match the URL pattern and still be rejected by the
        # parser. Presumably BadInputExc is rendered as the documented HTTP 400
        # by the API error handling -- TODO confirm.
        raise BadInputExc(f"Invalid SWHID: {swhid}") from exc

    object_id = parsed_swhid.object_id
    object_type = parsed_swhid.object_type

    def not_found():
        # Build (not raise) the exception so each branch can ``raise`` it.
        return NotFoundExc(f"Object with id {swhid} not found.")

    if object_type == ObjectType.CONTENT:
        results = archive.storage.content_find({"sha1_git": object_id})
        if not results:
            raise not_found()
        result = content_git_object(results[0])

    elif object_type == ObjectType.DIRECTORY:
        # Directory entries are paginated: follow the page tokens until the
        # storage reports there is no further page.
        # NOTE(review): a reviewer suggested replacing this loop with
        # swh.core.api.classes.stream_results_optional, and pointed out that
        # some directories' git object cannot be rebuilt from the entry list
        # alone -- follow up separately (and add a test).
        entries = []
        page_token = None
        while True:
            batch = archive.storage.directory_get_entries(
                directory_id=object_id,
                page_token=page_token,
            )
            if batch is None:
                raise not_found()
            entries.extend(batch.results)
            page_token = batch.next_page_token
            if page_token is None:
                break
        result = directory_git_object(
            model.Directory(
                id=object_id,
                entries=entries,
            )
        )

    elif object_type == ObjectType.REVISION:
        result = archive.storage.revision_get([object_id])[0]
        if result is None:
            raise not_found()
        result = revision_git_object(result)

    elif object_type == ObjectType.RELEASE:
        result = archive.storage.release_get([object_id])[0]
        if result is None:
            raise not_found()
        result = release_git_object(result)

    elif object_type == ObjectType.SNAPSHOT:
        # NOTE(review): a reviewer flagged that snapshot retrieval is paginated
        # too and recommended
        # swh.storage.algos.snapshot.snapshot_get_all_branches -- verify that a
        # single snapshot_get call returns every branch before relying on it.
        result = archive.storage.snapshot_get(object_id)
        if result is None:
            raise not_found()
        result = snapshot_git_object(result)

    else:
        raise ValueError(f"Unexpected object type variant: {object_type}")

    response = HttpResponse(result, content_type="application/octet-stream")
    # Colons are replaced so the SWHID can be used as a download filename.
    filename = swhid.replace(":", "_") + "_raw"
    response["Content-disposition"] = f"attachment; filename={filename}"
    return response
should be 2022 instead