diff --git a/swh/web/api/views/metadata.py b/swh/web/api/views/metadata.py index 10dc14bc..70d5a45e 100644 --- a/swh/web/api/views/metadata.py +++ b/swh/web/api/views/metadata.py @@ -1,250 +1,254 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import re import iso8601 from django.http import HttpResponse from swh.model import hashutil, identifiers from swh.model.model import MetadataAuthority, MetadataAuthorityType from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.common import archive, converters from swh.web.common.exc import BadInputExc, NotFoundExc from swh.web.common.utils import reverse SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}" @api_route( f"/raw-extrinsic-metadata/swhid/(?P{SWHID_RE})/", "api-1-raw-extrinsic-metadata-swhid", ) @api_doc("/raw-extrinsic-metadata/swhid/") @format_docstring() def api_raw_extrinsic_metadata_swhid(request, target): """ .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target) Returns raw `extrinsic metadata `__ collected on a given object. :param string target: The core SWHID of the object whose metadata should be returned :query string authority: A metadata authority identifier, formatted as `` ``. Required. :query string after: An ISO representation of the minimum timestamp of metadata to fetch. Defaults to allowing all metadata. :query int limit: Maximum number of metadata objects to return. {common_headers} :>jsonarr string target: SWHID of the object described by this metadata :>jsonarr string discovery_date: ISO8601 timestamp of the moment this metadata was collected. :>jsonarr object authority: authority this metadata is coming from :>jsonarr object fetcher: tool used to fetch the metadata :>jsonarr string format: short identifier of the format of the metadata :>jsonarr string metadata_url: link to download the metadata "blob" itself :>jsonarr string origin: URL of the origin in which context's the metadata is valid, if any :>jsonarr int visit: identifier of the visit in which context's the metadata is valid, if any :>jsonarr string snapshot: SWHID of the snapshot in which context's the metadata is valid, if any :>jsonarr string release: SWHID of the release in which context's the metadata is valid, if any :>jsonarr string revision: SWHID of the revision in which context's the metadata is valid, if any :>jsonarr string path: SWHID of the path in which context's is valid if any, relative to a release or revision as anchor :>jsonarr string directory: SWHID of the directory in which context's the metadata is valid, if any :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/?authority=forge%20https://pypi.org/` """ # noqa authority_str: str = request.query_params.get("authority") after_str: str = request.query_params.get("after") limit_str: str = request.query_params.get("limit", "100") page_token_str: str = request.query_params.get("page_token") if not authority_str: raise BadInputExc("The 'authority' query parameter is required.") if " " not in authority_str.strip(): raise BadInputExc("The 'authority' query parameter should contain a space.") (authority_type_str, authority_url) = authority_str.split(" ", 1) try: authority_type = MetadataAuthorityType(authority_type_str) except ValueError: raise BadInputExc( f"Invalid 'authority' type, should be one of: " f"{', '.join(member.value for member in MetadataAuthorityType)}" ) authority = MetadataAuthority(authority_type, authority_url) if after_str: try: after = iso8601.parse_date(after_str) except iso8601.ParseError: raise BadInputExc("Invalid format for 'after' parameter.") from None else: after = None try: limit = int(limit_str) except ValueError: raise BadInputExc("'limit' parameter must be an integer.") from None limit = min(limit, 10000) try: target = identifiers.CoreSWHID.from_string(target).to_extended() except identifiers.ValidationError as e: raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None if page_token_str: page_token = base64.urlsafe_b64decode(page_token_str) else: page_token = None result_page = archive.storage.raw_extrinsic_metadata_get( target=target, authority=authority, after=after, page_token=page_token, limit=limit, ) results = [] for metadata in result_page.results: result = converters.from_raw_extrinsic_metadata(metadata) # We can't reliably send metadata directly, because it is a bytestring, # and we have to return JSON documents. result["metadata_url"] = reverse( "api-1-raw-extrinsic-metadata-get", url_args={"id": hashutil.hash_to_hex(metadata.id)}, query_params={"filename": f"{target}_metadata"}, request=request, ) results.append(result) response = { "results": results, "headers": {}, } + if result_page.next_page_token is not None: response["headers"]["link-next"] = reverse( - "api-1-raw-extrinsic-metadata", + "api-1-raw-extrinsic-metadata-swhid", + url_args={"target": target}, query_params=dict( authority=authority_str, after=after_str, limit=limit_str, - page_token=base64.urlsafe_b64encode(result_page.next_page_token), + page_token=base64.urlsafe_b64encode( + result_page.next_page_token.encode() + ), ), request=request, ) return response @api_route( "/raw-extrinsic-metadata/get/(?P[0-9a-z]+)/", "api-1-raw-extrinsic-metadata-get", ) def api_raw_extrinsic_metadata_get(request, id): # This is an internal endpoint that should only be accessed via URLs given # by /raw-extrinsic-metadata/swhid/; so it is not documented. metadata = archive.storage.raw_extrinsic_metadata_get_by_ids( [hashutil.hash_to_bytes(id)] ) if not metadata: raise NotFoundExc( "Metadata not found. Use /raw-extrinsic-metadata/swhid/ to access metadata." ) response = HttpResponse( metadata[0].metadata, content_type="application/octet-stream" ) filename = request.query_params.get("filename") if filename and re.match("[a-zA-Z0-9:._-]+", filename): response["Content-disposition"] = f'attachment; filename="{filename}"' else: # It should always be not-None and match the regexp if the URL was created by # /raw-extrinsic-metadata/swhid/, but we're better safe than sorry. response["Content-disposition"] = "attachment" return response @api_route( f"/raw-extrinsic-metadata/swhid/(?P{SWHID_RE})/authorities/", "api-1-raw-extrinsic-metadata-swhid-authorities", ) @api_doc("/raw-extrinsic-metadata/swhid/authorities/") @format_docstring() def api_raw_extrinsic_metadata_swhid_authorities(request, target): """ .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target)/authorities/ Returns a list of metadata authorities that provided metadata on the given target. They can then be used to get the raw `extrinsic metadata `__ collected on that object from each of the authorities. :param string target: The core SWHID of the object whose metadata-providing authorities should be returned {common_headers} :>jsonarr string type: Type of authority (deposit_client, forge, registry) :>jsonarr string url: Unique IRI identifying the authority :>jsonarr object metadata_list_url: URL to get the list of metadata objects on the given object from this authority :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/authorities/` """ # noqa target_str = target try: target = identifiers.CoreSWHID.from_string(target_str).to_extended() except identifiers.ValidationError as e: raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None authorities = archive.storage.raw_extrinsic_metadata_get_authorities(target=target) results = [ { **authority.to_dict(), "metadata_list_url": reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": target_str}, query_params={"authority": f"{authority.type.value} {authority.url}"}, request=request, ), } for authority in authorities ] return { "results": results, "headers": {}, } diff --git a/swh/web/tests/api/views/test_metadata.py b/swh/web/tests/api/views/test_metadata.py index 53babe4e..7f813ade 100644 --- a/swh/web/tests/api/views/test_metadata.py +++ b/swh/web/tests/api/views/test_metadata.py @@ -1,188 +1,208 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import attr from hypothesis import given, strategies +from hypothesis.strategies._internal.core import sampled_from import pytest -from swh.model.hypothesis_strategies import raw_extrinsic_metadata +from swh.model.hypothesis_strategies import ( + raw_extrinsic_metadata as raw_extrinsic_metadata_orig, +) +from swh.model.hypothesis_strategies import sha1_git +from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID, ObjectType from swh.web.common.utils import reverse from swh.web.tests.api.views.utils import scroll_results from swh.web.tests.utils import check_api_get_responses, check_http_get_response +# public Web API endpoint for raw extrinsic metadata does not support +# extended SWHIDs so we ensure only core ones will be used in test inputs. +@strategies.composite +def raw_extrinsic_metadata(draw): + remd = draw(raw_extrinsic_metadata_orig()) + remd = attr.evolve( + remd, + target=ExtendedSWHID( + object_type=ExtendedObjectType(draw(sampled_from(ObjectType)).value), + object_id=draw(sha1_git()), + ), + ) + return attr.evolve(remd, id=remd.compute_hash()) + + @given(raw_extrinsic_metadata()) def test_api_raw_extrinsic_metadata(api_client, subtest, metadata): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.metadata_authority_add([metadata.authority]) archive_data.metadata_fetcher_add([metadata.fetcher]) archive_data.raw_extrinsic_metadata_add([metadata]) authority = metadata.authority url = reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": str(metadata.target)}, query_params={"authority": f"{authority.type.value} {authority.url}"}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 expected_result = metadata.to_dict() del expected_result["id"] del expected_result["metadata"] metadata_url = rv.data[0]["metadata_url"] expected_result["metadata_url"] = metadata_url expected_result["discovery_date"] = expected_result[ "discovery_date" ].isoformat() assert rv.data == [expected_result] rv = check_http_get_response(api_client, metadata_url, status_code=200) assert rv["Content-Type"] == "application/octet-stream" assert ( rv["Content-Disposition"] == f'attachment; filename="{metadata.target}_metadata"' ) assert rv.content == metadata.metadata @pytest.mark.parametrize("limit", [1, 2, 10, 100]) @given(strategies.sets(raw_extrinsic_metadata(), min_size=1)) def test_api_raw_extrinsic_metadata_scroll(api_client, subtest, limit, meta): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): # Make all metadata objects use the same authority and target metadata0 = next(iter(meta)) metadata = { attr.evolve(m, authority=metadata0.authority, target=metadata0.target) for m in meta } # Metadata ids must also be updated as they depend on authority and target metadata = {attr.evolve(m, id=m.compute_hash()) for m in metadata} authority = metadata0.authority archive_data.metadata_authority_add([authority]) archive_data.metadata_fetcher_add(list({m.fetcher for m in metadata})) archive_data.raw_extrinsic_metadata_add(metadata) url = reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": str(metadata0.target)}, query_params={ "authority": f"{authority.type.value} {authority.url}", "limit": limit, }, ) results = scroll_results(api_client, url) expected_results = [m.to_dict() for m in metadata] for expected_result in expected_results: del expected_result["id"] del expected_result["metadata"] expected_result["discovery_date"] = expected_result[ "discovery_date" ].isoformat() assert len(results) == len(expected_results) for result in results: del result["metadata_url"] assert result in expected_results _swhid = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307" @pytest.mark.parametrize( "status_code,url_args,query_params", [ pytest.param( 200, {"target": _swhid}, {"authority": "forge http://example.org"}, id="minimal working", ), pytest.param( 200, {"target": _swhid}, { "authority": "forge http://example.org", "after": "2021-06-18T09:31:09", "limit": 100, }, id="maximal working", ), pytest.param( 400, {"target": _swhid}, {"authority": "foo http://example.org"}, id="invalid authority type", ), pytest.param( 400, {"target": _swhid}, {"authority": "forge http://example.org", "after": "yesterday",}, id="invalid 'after' format", ), pytest.param( 400, {"target": _swhid}, {"authority": "forge http://example.org", "limit": "abc",}, id="invalid 'limit'", ), ], ) def test_api_raw_extrinsic_metadata_check_params( api_client, archive_data, status_code, url_args, query_params ): url = reverse( "api-1-raw-extrinsic-metadata-swhid", url_args=url_args, query_params=query_params, ) check_api_get_responses(api_client, url, status_code=status_code) @given(raw_extrinsic_metadata()) def test_api_raw_extrinsic_metadata_list_authorities(api_client, subtest, metadata): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.metadata_authority_add([metadata.authority]) archive_data.metadata_fetcher_add([metadata.fetcher]) archive_data.raw_extrinsic_metadata_add([metadata]) authority = metadata.authority url = reverse( "api-1-raw-extrinsic-metadata-swhid-authorities", url_args={"target": str(metadata.target)}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_results = [ { "type": authority.type.value, "url": authority.url, "metadata_list_url": "http://testserver" + reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": str(metadata.target)}, query_params={ "authority": f"{authority.type.value} {authority.url}" }, ), } ] assert rv.data == expected_results