diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py
--- a/swh/web/api/urls.py
+++ b/swh/web/api/urls.py
@@ -8,6 +8,7 @@
 import swh.web.api.views.directory  # noqa
 import swh.web.api.views.graph  # noqa
 import swh.web.api.views.identifiers  # noqa
+import swh.web.api.views.metadata  # noqa
 import swh.web.api.views.origin  # noqa
 import swh.web.api.views.origin_save  # noqa
 import swh.web.api.views.ping  # noqa
diff --git a/swh/web/api/views/metadata.py b/swh/web/api/views/metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/web/api/views/metadata.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import base64
+
+import iso8601
+
+from django.http import HttpResponse
+
+from swh.model import hashutil, identifiers
+from swh.model.model import MetadataAuthority, MetadataAuthorityType
+from swh.web.api.apidoc import api_doc, format_docstring
+from swh.web.api.apiurls import api_route
+from swh.web.common import archive, converters
+from swh.web.common.exc import BadInputExc, NotFoundExc
+from swh.web.common.utils import reverse
+
+SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}"
+
+
+@api_route(
+    f"/raw-extrinsic-metadata/swhid/(?P<target>{SWHID_RE})/",
+    "api-1-raw-extrinsic-metadata-swhid",
+)
+@api_doc("/raw-extrinsic-metadata/swhid/")
+@format_docstring()
+def api_raw_extrinsic_metadata_swhid(request, target):
+    """
+    .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target)
+
+        Returns raw `extrinsic metadata`_ collected on a given object.
+
+        .. _extrinsic metadata: https://docs.softwareheritage.org/devel/glossary.html#term-extrinsic-metadata
+
+        :param string target: The SWHID of the object whose metadata should be returned
+        :query string authority: A metadata authority identifier, formatted as
+            ``<type> <url>``. Required.
+        :query string after: An ISO8601 representation of the minimum timestamp of
+            metadata to fetch. Defaults to allowing all metadata.
+        :query int limit: Maximum number of metadata objects to return.
+
+        {common_headers}
+
+        :>jsonarr string target: SWHID of the object described by this metadata
+        :>jsonarr string discovery_date: ISO8601 timestamp of the moment this
+            metadata was collected.
+        :>jsonarr object authority: authority this metadata is coming from
+        :>jsonarr object fetcher: tool used to fetch the metadata
+        :>jsonarr string format: short identifier of the format of the metadata
+        :>jsonarr string metadata_url: link to download the metadata "blob" itself
+        :>jsonarr string origin: URL of the origin in whose context the
+            metadata is valid, if any
+        :>jsonarr int visit: identifier of the visit in whose context the
+            metadata is valid, if any
+        :>jsonarr string snapshot: SWHID of the snapshot in whose context the
+            metadata is valid, if any
+        :>jsonarr string release: SWHID of the release in whose context the
+            metadata is valid, if any
+        :>jsonarr string revision: SWHID of the revision in whose context the
+            metadata is valid, if any
+        :>jsonarr string path: path, relative to a release or revision as anchor,
+            in whose context the metadata is valid, if any
+        :>jsonarr string directory: SWHID of the directory in whose context the
+            metadata is valid, if any
+
+        :statuscode 200: no error
+
+        **Example:**
+
+        .. parsed-literal::
+
+            :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/?authority=forge%20https://pypi.org/`
+    """  # noqa
+    authority_str: str = request.query_params.get("authority")
+    after_str: str = request.query_params.get("after")
+    limit_str: str = request.query_params.get("limit", "100")
+    page_token_str: str = request.query_params.get("page_token")
+
+    if not authority_str:
+        raise BadInputExc("The 'authority' query parameter is required.")
+    if " " not in authority_str.strip():
+        raise BadInputExc("The 'authority' query parameter should contain a space.")
+
+    (authority_type_str, authority_url) = authority_str.split(" ", 1)
+    try:
+        authority_type = MetadataAuthorityType(authority_type_str)
+    except ValueError:
+        raise BadInputExc(
+            f"Invalid 'authority' type, should be one of: "
+            f"{', '.join(member.value for member in MetadataAuthorityType)}"
+        )
+    authority = MetadataAuthority(authority_type, authority_url)
+
+    if after_str:
+        try:
+            after = iso8601.parse_date(after_str)
+        except iso8601.ParseError:
+            raise BadInputExc("Invalid format for 'after' parameter.") from None
+    else:
+        after = None
+
+    try:
+        limit = int(limit_str)
+    except ValueError:
+        raise BadInputExc("'limit' parameter must be an integer.") from None
+    limit = min(limit, 10000)
+
+    try:
+        target = identifiers.CoreSWHID.from_string(target).to_extended()
+    except identifiers.ValidationError as e:
+        raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None
+
+    if page_token_str:
+        page_token = base64.urlsafe_b64decode(page_token_str)
+    else:
+        page_token = None
+
+    result_page = archive.storage.raw_extrinsic_metadata_get(
+        target=target,
+        authority=authority,
+        after=after,
+        page_token=page_token,
+        limit=limit,
+    )
+
+    results = []
+
+    for metadata in result_page.results:
+        result = converters.from_raw_extrinsic_metadata(metadata)
+
+        # We can't reliably send metadata directly, because it is a bytestring,
+        # and we have to return JSON documents.
+        result["metadata_url"] = reverse(
+            "api-1-raw-extrinsic-metadata-get",
+            url_args={"id": hashutil.hash_to_hex(metadata.id)},
+            request=request,
+        )
+
+        results.append(result)
+
+    response = {
+        "results": results,
+        "headers": {},
+    }
+    if result_page.next_page_token is not None:
+        response["headers"]["link-next"] = reverse(
+            "api-1-raw-extrinsic-metadata-swhid",
+            url_args={"target": target},
+            query_params=dict(
+                authority=authority_str,
+                after=after_str,
+                limit=limit_str,
+                page_token=base64.urlsafe_b64encode(result_page.next_page_token),
+            ),
+            request=request,
+        )
+
+    return response
+
+
+@api_route(
+    "/raw-extrinsic-metadata/get/(?P<id>[0-9a-z]+)/",
+    "api-1-raw-extrinsic-metadata-get",
+)
+def api_raw_extrinsic_metadata_get(request, id):
+    # This is an internal endpoint that should only be accessed via URLs given
+    # by /raw-extrinsic-metadata/swhid/; so it is not documented.
+    metadata = archive.storage.raw_extrinsic_metadata_get_by_ids(
+        [hashutil.hash_to_bytes(id)]
+    )
+    if not metadata:
+        raise NotFoundExc(
+            "Metadata not found. Use /raw-extrinsic-metadata/swhid/ to access metadata."
+        )
+
+    return HttpResponse(metadata[0].metadata, content_type="application/octet-stream")
diff --git a/swh/web/common/converters.py b/swh/web/common/converters.py
--- a/swh/web/common/converters.py
+++ b/swh/web/common/converters.py
@@ -9,7 +9,7 @@
 
 from swh.core.utils import decode_with_escape
 from swh.model import hashutil
-from swh.model.model import Release, Revision
+from swh.model.model import RawExtrinsicMetadata, Release, Revision
 from swh.storage.interface import PartialBranches
 from swh.web.common.typing import OriginInfo, OriginVisitInfo
 
@@ -246,11 +246,11 @@
         return json.JSONEncoder.default(self, obj)
 
 
-def convert_revision_metadata(metadata):
+def convert_metadata(metadata):
     """Convert json specific dict to a json serializable one.
 
     """
-    if not metadata:
+    if metadata is None:
         return {}
 
     return json.loads(json.dumps(metadata, cls=SWHMetadataEncoder))
@@ -281,7 +281,7 @@
         hashess={"id", "directory", "parents", "children"},
         bytess={"name", "fullname", "email", "extra_headers", "message"},
         convert={"metadata"},
-        convert_fn=convert_revision_metadata,
+        convert_fn=convert_metadata,
         dates={"date", "committer_date"},
     )
 
@@ -292,6 +292,18 @@
     return revision_d
 
 
+def from_raw_extrinsic_metadata(
+    metadata: Union[Dict[str, Any], RawExtrinsicMetadata]
+) -> Dict[str, Any]:
+    """Convert RawExtrinsicMetadata model object to a json serializable dictionary.
+    """
+    return from_swh(
+        metadata.to_dict() if isinstance(metadata, RawExtrinsicMetadata) else metadata,
+        blacklist={"id", "metadata"},
+        dates={"discovery_date"},
+    )
+
+
 def from_content(content):
     """Convert swh content to serializable content dictionary.
 
diff --git a/swh/web/tests/api/views/test_metadata.py b/swh/web/tests/api/views/test_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/web/tests/api/views/test_metadata.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import attr
+from hypothesis import given, strategies
+import pytest
+
+from swh.model.hypothesis_strategies import raw_extrinsic_metadata
+from swh.web.common.utils import reverse
+from swh.web.tests.api.views.utils import scroll_results
+from swh.web.tests.utils import check_api_get_responses, check_http_get_response
+
+
+@given(raw_extrinsic_metadata())
+def test_api_raw_extrinsic_metadata(api_client, archive_data, metadata):
+    archive_data.metadata_authority_add([metadata.authority])
+    archive_data.metadata_fetcher_add([metadata.fetcher])
+    archive_data.raw_extrinsic_metadata_add([metadata])
+
+    authority = metadata.authority
+    url = reverse(
+        "api-1-raw-extrinsic-metadata-swhid",
+        url_args={"target": str(metadata.target)},
+        query_params={"authority": f"{authority.type.value} {authority.url}"},
+    )
+    rv = check_api_get_responses(api_client, url, status_code=200)
+
+    assert len(rv.data) == 1
+
+    expected_result = metadata.to_dict()
+    del expected_result["id"]
+    del expected_result["metadata"]
+    metadata_url = rv.data[0]["metadata_url"]
+    expected_result["metadata_url"] = metadata_url
+    expected_result["discovery_date"] = expected_result["discovery_date"].isoformat()
+    assert rv.data == [expected_result]
+
+    rv = check_http_get_response(api_client, metadata_url, status_code=200)
+    assert rv["Content-Type"] == "application/octet-stream"
+    assert rv.content == metadata.metadata
+
+
+@pytest.mark.parametrize("limit", [1, 2, 10, 100])
+@given(strategies.sets(raw_extrinsic_metadata(), min_size=1))
+def test_api_raw_extrinsic_metadata_scroll(api_client, archive_data, limit, metadata):
+    # Make all metadata objects use the same authority and target
+    metadata0 = next(iter(metadata))
+    metadata = {
+        attr.evolve(m, authority=metadata0.authority, target=metadata0.target)
+        for m in metadata
+    }
+    authority = metadata0.authority
+
+    archive_data.metadata_authority_add([authority])
+    archive_data.metadata_fetcher_add(list({m.fetcher for m in metadata}))
+    archive_data.raw_extrinsic_metadata_add(metadata)
+
+    url = reverse(
+        "api-1-raw-extrinsic-metadata-swhid",
+        url_args={"target": str(metadata0.target)},
+        query_params={
+            "authority": f"{authority.type.value} {authority.url}",
+            "limit": limit,
+        },
+    )
+
+    results = scroll_results(api_client, url)
+
+    expected_results = [m.to_dict() for m in metadata]
+    for expected_result in expected_results:
+        del expected_result["id"]
+        del expected_result["metadata"]
+        expected_result["discovery_date"] = expected_result[
+            "discovery_date"
+        ].isoformat()
+
+    for result in results:
+        del result["metadata_url"]
+
+    assert results == expected_results
+
+
+_swhid = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307"
+
+
+@pytest.mark.parametrize(
+    "status_code,url_args,query_params",
+    [
+        pytest.param(
+            200,
+            {"target": _swhid},
+            {"authority": "forge http://example.org"},
+            id="minimal working",
+        ),
+        pytest.param(
+            200,
+            {"target": _swhid},
+            {
+                "authority": "forge http://example.org",
+                "after": "2021-06-18T09:31:09",
+                "limit": 100,
+            },
+            id="maximal working",
+        ),
+        pytest.param(
+            400,
+            {"target": _swhid},
+            {"authority": "foo http://example.org"},
+            id="invalid authority type",
+        ),
+        pytest.param(
+            400,
+            {"target": _swhid},
+            {"authority": "forge http://example.org", "after": "yesterday"},
+            id="invalid 'after' format",
+        ),
+        pytest.param(
+            400,
+            {"target": _swhid},
+            {"authority": "forge http://example.org", "limit": "abc"},
+            id="invalid 'limit'",
+        ),
+    ],
+)
+def test_api_raw_extrinsic_metadata_check_params(
+    api_client, archive_data, status_code, url_args, query_params
+):
+    url = reverse(
+        "api-1-raw-extrinsic-metadata-swhid",
+        url_args=url_args,
+        query_params=query_params,
+    )
+    check_api_get_responses(api_client, url, status_code=status_code)
diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py
--- a/swh/web/tests/api/views/test_origin.py
+++ b/swh/web/tests/api/views/test_origin.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -8,7 +8,6 @@
 
 from hypothesis import given
 import pytest
-from requests.utils import parse_header_links
 
 from swh.indexer.storage.model import OriginIntrinsicMetadataRow
 from swh.model.hashutil import hash_to_bytes
@@ -19,6 +18,7 @@
 from swh.web.common.exc import BadInputExc
 from swh.web.common.origin_visits import get_origin_visits
 from swh.web.common.utils import reverse
+from swh.web.tests.api.views.utils import scroll_results
 from swh.web.tests.data import (
     INDEXER_TOOL,
     ORIGIN_MASTER_REVISION,
@@ -29,31 +29,6 @@
 from swh.web.tests.utils import check_api_get_responses
 
 
-def _scroll_results(api_client, url):
-    """Iterates through pages of results, and returns them all."""
-    results = []
-
-    while True:
-        rv = check_api_get_responses(api_client, url, status_code=200)
-
-        results.extend(rv.data)
-
-        if "Link" in rv:
-            for link in parse_header_links(rv["Link"]):
-                if link["rel"] == "next":
-                    # Found link to next page of results
-                    url = link["url"]
-                    break
-            else:
-                # No link with 'rel=next'
-                break
-        else:
-            # No Link header
-            break
-
-    return results
-
-
 def test_api_lookup_origin_visits_raise_error(api_client, mocker):
     mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits")
     err_msg = "voluntary error to check the bad request middleware."
@@ -389,7 +364,7 @@
 
     url = reverse("api-1-origins", query_params={"origin_count": origin_count})
 
-    results = _scroll_results(api_client, url)
+    results = scroll_results(api_client, url)
 
     assert len(results) == len(origins)
     assert {origin["url"] for origin in results} == origin_urls
@@ -566,7 +541,7 @@
         query_params={"limit": limit},
     )
 
-    results = _scroll_results(api_client, url)
+    results = scroll_results(api_client, url)
 
     assert {origin["url"] for origin in results} == expected_origins
 
diff --git a/swh/web/tests/api/views/utils.py b/swh/web/tests/api/views/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/web/tests/api/views/utils.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2015-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from requests.utils import parse_header_links
+
+from swh.web.tests.utils import check_api_get_responses
+
+
+def scroll_results(api_client, url):
+    """Iterates through pages of results, and returns them all."""
+    results = []
+
+    while True:
+        rv = check_api_get_responses(api_client, url, status_code=200)
+
+        results.extend(rv.data)
+
+        if "Link" in rv:
+            for link in parse_header_links(rv["Link"]):
+                if link["rel"] == "next":
+                    # Found link to next page of results
+                    url = link["url"]
+                    break
+            else:
+                # No link with 'rel=next'
+                break
+        else:
+            # No Link header
+            break
+
+    return results
diff --git a/swh/web/tests/common/test_converters.py b/swh/web/tests/common/test_converters.py
--- a/swh/web/tests/common/test_converters.py
+++ b/swh/web/tests/common/test_converters.py
@@ -97,7 +97,7 @@
         empty_dict={"u"},
         empty_list={"v"},
         convert={"p", "q", "w"},
-        convert_fn=converters.convert_revision_metadata,
+        convert_fn=converters.convert_metadata,
     )
 
     assert expected_output == actual_output
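
As a usage illustration, the sketch below shows how a client could consume the new /api/1/raw-extrinsic-metadata/swhid/ endpoint end to end: it pages through results by following the Link rel="next" header (the same mechanism the scroll_results() test helper relies on) and then downloads each raw metadata blob from the metadata_url field, which points at the internal /raw-extrinsic-metadata/get/ endpoint. This is a minimal sketch, not code from the patch: the base URL is an assumed swh-web deployment exposing these routes, while the target SWHID and authority value mirror the docstring example.

    # Minimal client sketch, assuming a deployed swh-web instance that exposes
    # the endpoints added by this patch. The base URL is a placeholder.
    import requests

    BASE_URL = "https://archive.softwareheritage.org"  # assumed instance
    target = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307"
    authority = "forge https://pypi.org/"

    url = f"{BASE_URL}/api/1/raw-extrinsic-metadata/swhid/{target}/"
    params = {"authority": authority, "limit": 10}
    results = []

    while url is not None:
        response = requests.get(url, params=params)
        response.raise_for_status()
        results.extend(response.json())
        # Pagination: the API advertises the next page through a Link header
        # with rel="next"; requests exposes parsed Link headers as .links.
        url = response.links.get("next", {}).get("url")
        params = None  # the next-page URL already embeds the query parameters

    for entry in results:
        # metadata_url serves the raw metadata bytes ("blob") for this entry.
        blob = requests.get(entry["metadata_url"]).content
        print(entry["discovery_date"], entry["format"], len(blob), "bytes")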