diff --git a/swh/web/api/utils.py b/swh/web/api/utils.py --- a/swh/web/api/utils.py +++ b/swh/web/api/utils.py @@ -7,6 +7,7 @@ from django.http import HttpRequest +from swh.model.model import Origin from swh.web.common.query import parse_hash from swh.web.common.typing import OriginInfo from swh.web.common.utils import resolve_branch_alias, reverse @@ -283,7 +284,7 @@ request: Absolute URIs will be generated if provided Returns: - An enriched origin dict filled with an additional url + An enriched origin dict filled with additional urls """ origin_dict = dict(origin) if "url" in origin_dict: @@ -292,6 +293,11 @@ url_args={"origin_url": origin_dict["url"]}, request=request, ) + origin_dict["metadata_authorities_url"] = reverse( + "api-1-raw-extrinsic-metadata-swhid-authorities", + url_args={"target": Origin(url=origin_dict["url"]).swhid()}, + request=request, + ) return origin_dict diff --git a/swh/web/api/views/metadata.py b/swh/web/api/views/metadata.py --- a/swh/web/api/views/metadata.py +++ b/swh/web/api/views/metadata.py @@ -44,6 +44,8 @@ {common_headers} :>jsonarr string target: SWHID of the object described by this metadata + (absent when ``target`` is not a core SWHID, i.e. it does not have type + ``cnt``/``dir``/``rev``/``rel``/``snp``) :>jsonarr string discovery_date: ISO8601/RFC3339 timestamp of the moment this metadata was collected. 
:>jsonarr object authority: authority this metadata is coming from @@ -108,9 +110,17 @@ limit = min(limit, 10000) try: - parsed_target = swhids.CoreSWHID.from_string(target).to_extended() + parsed_target = swhids.ExtendedSWHID.from_string(target) except swhids.ValidationError as e: - raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None + raise BadInputExc(f"Invalid target SWHID: {e}") from None + + try: + swhids.CoreSWHID.from_string(target) + except swhids.ValidationError: + # Can be parsed as an extended SWHID, but not as a core SWHID + extended_swhid = True + else: + extended_swhid = False if page_token_str is not None: page_token = base64.urlsafe_b64decode(page_token_str) @@ -125,17 +135,32 @@ limit=limit, ) + filename = None + if parsed_target.object_type == swhids.ExtendedObjectType.ORIGIN: + origin_sha1 = hashutil.hash_to_hex(parsed_target.object_id) + (origin_info,) = list(archive.lookup_origins_by_sha1s([origin_sha1])) + if origin_info is not None: + filename = re.sub("[:/_.]+", "_", origin_info["url"]) + "_metadata" + if filename is None: + filename = f"{target}_metadata" + results = [] for metadata in result_page.results: result = converters.from_raw_extrinsic_metadata(metadata) + if extended_swhid: + # Keep extended SWHIDs away from the public API as much as possible. + # (It is part of the URL, but not documented, and only accessed via + # the link in the response of api-1-origin) + del result["target"] + # We can't reliably send metadata directly, because it is a bytestring, # and we have to return JSON documents. result["metadata_url"] = reverse( "api-1-raw-extrinsic-metadata-get", url_args={"id": hashutil.hash_to_hex(metadata.id)}, - query_params={"filename": f"{target}_metadata"}, + query_params={"filename": filename}, request=request, ) @@ -208,6 +233,13 @@ They can then be used to get the raw `extrinsic metadata `__ collected on that object from each of the authorities. 
+ This endpoint should only be used directly to retrieve metadata from + core SWHIDs (with type ``cnt``, ``dir``, ``rev``, ``rel``, and ``snp``). + For "extended" SWHIDs such as origins, the URL in the + ``metadata_authorities_url`` field of + :http:get:`/api/1/origin/(origin_url)/get/` should be used instead of building + this URL directly. + :param string target: The core SWHID of the object whose metadata-providing authorities should be returned @@ -228,9 +260,9 @@ """ # noqa try: - parsed_target = swhids.CoreSWHID.from_string(target).to_extended() + parsed_target = swhids.ExtendedSWHID.from_string(target) except swhids.ValidationError as e: - raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None + raise BadInputExc(f"Invalid target SWHID: {e}") from None authorities = archive.storage.raw_extrinsic_metadata_get_authorities( target=parsed_target diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py --- a/swh/web/api/views/origin.py +++ b/swh/web/api/views/origin.py @@ -45,6 +45,10 @@ :>json string status: status of the visit (either **full**, **partial** or **ongoing**) :>json number visit: the unique identifier of the visit + :>json string metadata_authorities_url: link to + :http:get:`/api/1/raw-extrinsic-metadata/swhid/(target)/authorities/` + to get the list of metadata authorities providing extrinsic metadata + on this origin (and, indirectly, to the origin's extrinsic metadata itself) """ DOC_RETURN_ORIGIN_VISIT_ARRAY = DOC_RETURN_ORIGIN_VISIT.replace(":>json", ":>jsonarr") diff --git a/swh/web/tests/api/test_utils.py b/swh/web/tests/api/test_utils.py --- a/swh/web/tests/api/test_utils.py +++ b/swh/web/tests/api/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2021 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level 
LICENSE file for more information @@ -6,6 +6,7 @@ import random from swh.model.hashutil import DEFAULT_ALGORITHMS +from swh.model.model import Origin from swh.web.api import utils from swh.web.common.origin_visits import get_origin_visits from swh.web.common.utils import resolve_branch_alias, reverse @@ -533,6 +534,11 @@ origin_data["origin_visits_url"] = reverse( "api-1-origin-visits", url_args={"origin_url": origin["url"]}, request=request ) + origin_data["metadata_authorities_url"] = reverse( + "api-1-raw-extrinsic-metadata-swhid-authorities", + url_args={"target": Origin(url=origin["url"]).swhid()}, + request=request, + ) assert actual_origin == origin_data @@ -544,6 +550,11 @@ origin_visits_url = reverse( "api-1-origin-visits", url_args={"origin_url": origin["url"]}, request=request ) + metadata_authorities_url = reverse( + "api-1-raw-extrinsic-metadata-swhid-authorities", + url_args={"target": Origin(url=origin["url"]).swhid()}, + request=request, + ) origin_search_result_data = ( [{"url": origin["url"]}], @@ -551,7 +562,13 @@ ) enriched_origin_search_result = ( - [{"url": origin["url"], "origin_visits_url": origin_visits_url}], + [ + { + "url": origin["url"], + "origin_visits_url": origin_visits_url, + "metadata_authorities_url": metadata_authorities_url, + } + ], None, ) diff --git a/swh/web/tests/api/views/test_metadata.py b/swh/web/tests/api/views/test_metadata.py --- a/swh/web/tests/api/views/test_metadata.py +++ b/swh/web/tests/api/views/test_metadata.py @@ -1,28 +1,21 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import attr -from hypothesis import given -from hypothesis.strategies import composite, sampled_from, sets +from hypothesis import given, settings +from hypothesis.strategies import 
sets import pytest -from swh.model.hypothesis_strategies import raw_extrinsic_metadata, sha1_git -from swh.model.swhids import CoreSWHID, ObjectType +from swh.model.hypothesis_strategies import raw_extrinsic_metadata +from swh.model.model import Origin from swh.web.common.utils import reverse from swh.web.tests.api.views.utils import scroll_results from swh.web.tests.utils import check_api_get_responses, check_http_get_response -@composite -def core_swhids(draw): - object_type = draw(sampled_from(ObjectType)) - object_id = draw(sha1_git()) - return CoreSWHID(object_type=object_type, object_id=object_id).to_extended() - - -@given(raw_extrinsic_metadata(target=core_swhids())) +@given(raw_extrinsic_metadata()) def test_api_raw_extrinsic_metadata(api_client, subtest, metadata): # ensure archive_data fixture will be reset between each hypothesis # example test run @@ -50,6 +43,9 @@ expected_result["discovery_date"] = expected_result[ "discovery_date" ].isoformat() + if expected_result["target"].startswith(("swh:1:ori:", "swh:1:emd:")): + # non-core SWHID are hidden from the API + del expected_result["target"] assert rv.data == [expected_result] rv = check_http_get_response(api_client, metadata_url, status_code=200) @@ -61,8 +57,43 @@ assert rv.content == metadata.metadata +@settings(max_examples=1) +@given(raw_extrinsic_metadata()) +def test_api_raw_extrinsic_metadata_origin_filename(api_client, subtest, metadata): + # ensure archive_data fixture will be reset between each hypothesis + # example test run + @subtest + def test_inner(archive_data): + nonlocal metadata + origin = Origin(url="http://example.com/repo.git") + metadata = attr.evolve(metadata, target=origin.swhid()) + metadata = attr.evolve(metadata, id=metadata.compute_hash()) + archive_data.origin_add([origin]) + archive_data.metadata_authority_add([metadata.authority]) + archive_data.metadata_fetcher_add([metadata.fetcher]) + archive_data.raw_extrinsic_metadata_add([metadata]) + + authority = 
metadata.authority + url = reverse( + "api-1-raw-extrinsic-metadata-swhid", + url_args={"target": str(metadata.target)}, + query_params={"authority": f"{authority.type.value} {authority.url}"}, + ) + rv = check_api_get_responses(api_client, url, status_code=200) + + assert len(rv.data) == 1 + metadata_url = rv.data[0]["metadata_url"] + rv = check_http_get_response(api_client, metadata_url, status_code=200) + assert rv["Content-Type"] == "application/octet-stream" + assert ( + rv["Content-Disposition"] + == 'attachment; filename="http_example_com_repo_git_metadata"' + ) + assert rv.content == metadata.metadata + + @pytest.mark.parametrize("limit", [1, 2, 10, 100]) -@given(sets(raw_extrinsic_metadata(target=core_swhids()), min_size=1)) +@given(sets(raw_extrinsic_metadata(), min_size=1)) def test_api_raw_extrinsic_metadata_scroll(api_client, subtest, limit, meta): # ensure archive_data fixture will be reset between each hypothesis # example test run @@ -101,12 +132,15 @@ expected_result["discovery_date"] = expected_result[ "discovery_date" ].isoformat() + if expected_result["target"].startswith(("swh:1:ori:", "swh:1:emd:")): + # non-core SWHID are hidden from the API + del expected_result["target"] assert len(results) == len(expected_results) for result in results: del result["metadata_url"] - assert result in expected_results + assert result in expected_results, str(expected_results) _swhid = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307" @@ -168,7 +202,7 @@ check_api_get_responses(api_client, url, status_code=status_code) -@given(raw_extrinsic_metadata(target=core_swhids())) +@given(raw_extrinsic_metadata()) def test_api_raw_extrinsic_metadata_list_authorities(api_client, subtest, metadata): # ensure archive_data fixture will be reset between each hypothesis # example test run