diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,7 +2,7 @@
 swh.core >= 0.0.95
 swh.counters >= 0.5.1
 swh.indexer >= 0.4.1
-swh.model >= 2.6.3
+swh.model >= 6.2.0
 swh.scheduler >= 0.7.0
 swh.search >= 0.2.0
 swh.storage >= 0.31.0
diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py
--- a/swh/web/api/urls.py
+++ b/swh/web/api/urls.py
@@ -13,6 +13,7 @@
 import swh.web.api.views.origin  # noqa
 import swh.web.api.views.origin_save  # noqa
 import swh.web.api.views.ping  # noqa
+import swh.web.api.views.raw  # noqa
 import swh.web.api.views.release  # noqa
 import swh.web.api.views.revision  # noqa
 import swh.web.api.views.snapshot  # noqa
diff --git a/swh/web/api/views/metadata.py b/swh/web/api/views/metadata.py
--- a/swh/web/api/views/metadata.py
+++ b/swh/web/api/views/metadata.py
@@ -16,9 +16,7 @@
 from swh.web.api.apiurls import api_route
 from swh.web.common import archive, converters
 from swh.web.common.exc import BadInputExc, NotFoundExc
-from swh.web.common.utils import reverse
-
-SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}"
+from swh.web.common.utils import SWHID_RE, reverse
 
 
 @api_route(
diff --git a/swh/web/api/views/raw.py b/swh/web/api/views/raw.py
new file mode 100644
--- /dev/null
+++ b/swh/web/api/views/raw.py
@@ -0,0 +1,127 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Iterable, Optional
+
+from django.http import HttpResponse
+
+from swh.core.api.classes import stream_results_optional
+from swh.model import model
+from swh.model.git_objects import (
+    content_git_object,
+    directory_git_object,
+    release_git_object,
+    revision_git_object,
+    snapshot_git_object,
+)
+from swh.model.hashutil import hash_to_hex
+from swh.model.swhids import CoreSWHID, ObjectType
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.web.api.apidoc import api_doc, format_docstring
+from swh.web.api.apiurls import api_route
+from swh.web.common import archive
+from swh.web.common.exc import NotFoundExc
+from swh.web.common.utils import SWHID_RE
+
+
+@api_route(
+    f"/raw/(?P<swhid>{SWHID_RE})/",
+    "api-1-raw-object",
+)
+@api_doc("/raw/")
+@format_docstring()
+def api_raw_object(request, swhid):
+    """
+    .. http:get:: /api/1/raw/(swhid)/
+
+        Get the object corresponding to the SWHID in raw form.
+
+        This endpoint exposes the internal representation (see the
+        ``*_git_object`` functions in :mod:`swh.model.git_objects`), and
+        so can be used to fetch a binary blob which hashes to the same
+        identifier.
+
+        :param string swhid: the object's SWHID
+
+        :resheader Content-Type: application/octet-stream
+
+        :statuscode 200: no error
+        :statuscode 404: the requested object can not be found in the archive
+
+        **Example:**
+
+        .. parsed-literal::
+
+            :swh_web_api:`raw/swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a`
+    """
+
+    swhid = CoreSWHID.from_string(swhid)
+    object_id = swhid.object_id
+    object_type = swhid.object_type
+
+    def not_found():
+        return NotFoundExc(f"Object with id {swhid} not found.")
+
+    if object_type == ObjectType.CONTENT:
+        results = archive.storage.content_find({"sha1_git": object_id})
+        if not results:
+            raise not_found()
+        cnt = results[0]
+        # `cnt.with_data()` unfortunately doesn't seem to work.
+        if cnt.data is None:
+            data = archive.storage.content_get_data(cnt.sha1)
+            # Compare against None, not truthiness: the empty blob (b"") is a
+            # valid content and must still be served.
+            if data is None:
+                raise NotFoundExc(f"Content {hash_to_hex(cnt.sha1)} ceased to exist")
+            d = cnt.to_dict()
+            d["data"] = data
+            cnt = model.Content.from_dict(d)
+        result = content_git_object(cnt)
+
+    elif object_type == ObjectType.DIRECTORY:
+        entries: Optional[Iterable[model.DirectoryEntry]] = stream_results_optional(
+            archive.storage.directory_get_entries,
+            directory_id=object_id,
+        )
+        if entries is None:
+            raise not_found()
+        # directory_get_raw_manifest() returns a mapping keyed by directory id,
+        # so the manifest is looked up by id rather than by position.
+        raw_manifests = archive.storage.directory_get_raw_manifest([object_id])
+        result = directory_git_object(
+            model.Directory(
+                id=object_id,
+                entries=tuple(entries),
+                raw_manifest=raw_manifests[object_id],
+            )
+        )
+
+    elif object_type == ObjectType.REVISION:
+        revision = archive.storage.revision_get([object_id])[0]
+        if revision is None:
+            raise not_found()
+        result = revision_git_object(revision)
+
+    elif object_type == ObjectType.RELEASE:
+        release = archive.storage.release_get([object_id])[0]
+        if release is None:
+            raise not_found()
+        result = release_git_object(release)
+
+    elif object_type == ObjectType.SNAPSHOT:
+        snapshot = snapshot_get_all_branches(archive.storage, object_id)
+        if snapshot is None:
+            raise not_found()
+        result = snapshot_git_object(snapshot)
+
+    else:
+        raise ValueError(f"Unexpected object type variant: {object_type}")
+
+    response = HttpResponse(result, content_type="application/octet-stream")
+    filename = str(swhid).replace(":", "_") + "_raw"
+    response["Content-disposition"] = f"attachment; filename={filename}"
+
+    return response
diff --git a/swh/web/api/views/vault.py b/swh/web/api/views/vault.py
--- a/swh/web/api/views/vault.py
+++ b/swh/web/api/views/vault.py
@@ -15,13 +15,12 @@
 from swh.web.api.views.utils import api_lookup
 from swh.web.common import archive, query
 from swh.web.common.exc import BadInputExc
-from swh.web.common.utils import reverse
+from swh.web.common.utils import SWHID_RE, reverse
+
 
 ######################################################
 # Common
 
-SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}"
-
 # XXX: a bit spaghetti. Would be better with class-based views.
 
 def _dispatch_cook_progress(request, bundle_type: str, swhid: CoreSWHID):
diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
--- a/swh/web/common/utils.py
+++ b/swh/web/common/utils.py
@@ -39,6 +39,8 @@
 
 SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True)
 
+SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}"
+
 swh_object_icons = {
     "alias": "mdi mdi-star",
     "branch": "mdi mdi-source-branch",
diff --git a/swh/web/tests/api/views/test_raw.py b/swh/web/tests/api/views/test_raw.py
new file mode 100644
--- /dev/null
+++ b/swh/web/tests/api/views/test_raw.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import hashlib
+
+from swh.model.hashutil import hash_to_bytes
+from swh.web.common.utils import reverse
+from swh.web.tests.utils import (
+    check_api_get_responses,
+    check_http_get_response,
+)
+
+
+def test_api_raw_not_found(api_client, unknown_core_swhid):
+    url = reverse("api-1-raw-object", url_args={"swhid": str(unknown_core_swhid)})
+    rv = check_api_get_responses(api_client, url, status_code=404)
+    assert rv.data == {
+        "exception": "NotFoundExc",
+        "reason": f"Object with id {unknown_core_swhid} not found.",
+    }
+
+
+def _test_api_raw_hash(api_client, archive_data, object_id, object_ty):
+    url = reverse(
+        "api-1-raw-object",
+        url_args={"swhid": f"swh:1:{object_ty}:{object_id}"},
+    )
+
+    rv = check_http_get_response(api_client, url, status_code=200)
+    assert rv["Content-Type"] == "application/octet-stream"
+    assert (
+        rv["Content-disposition"]
+        == f"attachment; filename=swh_1_{object_ty}_{object_id}_raw"
+    )
+    sha1_git = hashlib.new("sha1", rv.content).digest()
+    assert sha1_git == hash_to_bytes(object_id)
+
+
+def test_api_raw_content(api_client, archive_data, content):
+    _test_api_raw_hash(api_client, archive_data, content["sha1_git"], "cnt")
+
+
+def test_api_raw_directory(api_client, archive_data, directory):
+    _test_api_raw_hash(api_client, archive_data, directory, "dir")
+
+
+def test_api_raw_revision(api_client, archive_data, revision):
+    _test_api_raw_hash(api_client, archive_data, revision, "rev")
+
+
+def test_api_raw_release(api_client, archive_data, release):
+    _test_api_raw_hash(api_client, archive_data, release, "rel")
+
+
+def test_api_raw_snapshot(api_client, archive_data, snapshot):
+    _test_api_raw_hash(api_client, archive_data, snapshot, "snp")
diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py
--- a/swh/web/tests/conftest.py
+++ b/swh/web/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -31,7 +31,7 @@
     hash_to_hex,
 )
 from swh.model.model import Content, Directory
-from swh.model.swhids import ObjectType
+from swh.model.swhids import CoreSWHID, ObjectType
 from swh.scheduler.tests.common import TASK_TYPES
 from swh.storage.algos.origin import origin_get_latest_visit_status
 from swh.storage.algos.revisions_walker import get_revisions_walker
@@ -52,6 +52,7 @@
     override_storages,
     random_content,
     random_sha1,
+    random_sha1_bytes,
     random_sha256,
 )
 from swh.web.tests.utils import create_django_permission
@@ -815,6 +816,18 @@
     return random.choice(_object_type_swhid(ObjectType.SNAPSHOT))
 
 
+@pytest.fixture(scope="function", params=list(ObjectType))
+def unknown_core_swhid(request) -> CoreSWHID:
+    """Fixture returning an unknown core SWHID.
+
+    Tests using this will be called once per object type.
+    """
+    return CoreSWHID(
+        object_type=request.param,
+        object_id=random_sha1_bytes(),
+    )
+
+
 # Fixture to manipulate data from a sample archive used in the tests
 @pytest.fixture(scope="function")
 def archive_data(tests_data):
diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py
--- a/swh/web/tests/data.py
+++ b/swh/web/tests/data.py
@@ -59,15 +59,19 @@
 }
 
 
-def random_sha1():
-    return hash_to_hex(bytes(random.randint(0, 255) for _ in range(20)))
+def random_sha1_bytes() -> Sha1:
+    return bytes(random.randint(0, 255) for _ in range(20))
 
 
-def random_sha256():
+def random_sha1() -> str:
+    return hash_to_hex(random_sha1_bytes())
+
+
+def random_sha256() -> str:
     return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32)))
 
 
-def random_blake2s256():
+def random_blake2s256() -> str:
     return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32)))
 
 