Page MenuHomeSoftware Heritage

D5875.diff
No OneTemporary

D5875.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,8 +2,8 @@
swh.core >= 0.0.95
swh.counters >= 0.5.1
swh.indexer >= 0.4.1
-swh.model >= 0.5.0
+swh.model >= 2.6.3
swh.scheduler >= 0.7.0
swh.search >= 0.2.0
-swh.storage >= 0.11.10
+swh.storage >= 0.31.0
swh.vault >= 0.0.33
diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py
--- a/swh/web/api/urls.py
+++ b/swh/web/api/urls.py
@@ -8,6 +8,7 @@
import swh.web.api.views.directory # noqa
import swh.web.api.views.graph # noqa
import swh.web.api.views.identifiers # noqa
+import swh.web.api.views.metadata # noqa
import swh.web.api.views.origin # noqa
import swh.web.api.views.origin_save # noqa
import swh.web.api.views.ping # noqa
diff --git a/swh/web/api/views/metadata.py b/swh/web/api/views/metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/web/api/views/metadata.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import base64
+
+import iso8601
+
+from django.http import HttpResponse
+
+from swh.model import hashutil, identifiers
+from swh.model.model import MetadataAuthority, MetadataAuthorityType
+from swh.web.api.apidoc import api_doc, format_docstring
+from swh.web.api.apiurls import api_route
+from swh.web.common import archive, converters
+from swh.web.common.exc import BadInputExc, NotFoundExc
+from swh.web.common.utils import reverse
+
+SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}"
+
+
+@api_route(
+    f"/raw-extrinsic-metadata/swhid/(?P<target>{SWHID_RE})/",
+    "api-1-raw-extrinsic-metadata-swhid",
+)
+@api_doc("/raw-extrinsic-metadata/swhid/")
+@format_docstring()
+def api_raw_extrinsic_metadata_swhid(request, target):
+    """
+    .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target)
+
+        Returns raw `extrinsic metadata`_ collected on a given object.
+
+        .. _extrinsic metadata: https://docs.softwareheritage.org/devel/glossary.html#term-extrinsic-metadata
+
+        :param string target: The SWHID of the object whose metadata should be returned
+        :query string authority: A metadata authority identifier, formatted as
+            `<type> <IRI>`. Required.
+        :query string after: An ISO representation of the minimum timestamp of metadata
+            to fetch. Defaults to allowing all metadata.
+        :query int limit: Maximum number of metadata objects to return
+            (defaults to 100, capped at 10000).
+
+        {common_headers}
+
+        :>jsonarr string target: SWHID of the object described by this metadata
+        :>jsonarr string discovery_date: ISO8601 timestamp of the moment this
+            metadata was collected.
+        :>jsonarr object authority: authority this metadata is coming from
+        :>jsonarr object fetcher: tool used to fetch the metadata
+        :>jsonarr string format: short identifier of the format of the metadata
+        :>jsonarr string metadata_url: link to download the metadata "blob" itself
+        :>jsonarr string origin: URL of the origin in whose context
+            the metadata is valid, if any
+        :>jsonarr int visit: identifier of the visit in whose context
+            the metadata is valid, if any
+        :>jsonarr string snapshot: SWHID of the snapshot in whose context
+            the metadata is valid, if any
+        :>jsonarr string release: SWHID of the release in whose context
+            the metadata is valid, if any
+        :>jsonarr string revision: SWHID of the revision in whose context
+            the metadata is valid, if any
+        :>jsonarr string path: path, relative to a release or revision used
+            as anchor, in whose context the metadata is valid, if any
+        :>jsonarr string directory: SWHID of the directory in whose context
+            the metadata is valid, if any
+
+        :statuscode 200: no error
+        :statuscode 400: an invalid query parameter or SWHID has been provided
+
+        **Example:**
+
+        .. parsed-literal::
+
+            :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/?authority=forge%20https://pypi.org/`
+    """  # noqa
+    authority_str: str = request.query_params.get("authority")
+    after_str: str = request.query_params.get("after")
+    limit_str: str = request.query_params.get("limit", "100")
+    page_token_str: str = request.query_params.get("page_token")
+
+    if not authority_str:
+        raise BadInputExc("The 'authority' query parameter is required.")
+    # Validate and split the same (stripped) string, so that surrounding
+    # whitespace can neither defeat the check nor leak into the type/URL.
+    authority_str = authority_str.strip()
+    if " " not in authority_str:
+        raise BadInputExc("The 'authority' query parameter should contain a space.")
+
+    (authority_type_str, authority_url) = authority_str.split(" ", 1)
+    try:
+        authority_type = MetadataAuthorityType(authority_type_str)
+    except ValueError:
+        raise BadInputExc(
+            f"Invalid 'authority' type, should be one of: "
+            f"{', '.join(member.value for member in MetadataAuthorityType)}"
+        ) from None
+    authority = MetadataAuthority(authority_type, authority_url)
+
+    if after_str:
+        try:
+            after = iso8601.parse_date(after_str)
+        except iso8601.ParseError:
+            raise BadInputExc("Invalid format for 'after' parameter.") from None
+    else:
+        after = None
+
+    try:
+        limit = int(limit_str)
+    except ValueError:
+        raise BadInputExc("'limit' parameter must be an integer.") from None
+    if limit < 1:
+        # Do not forward a zero or negative page size to the storage.
+        raise BadInputExc("'limit' parameter must be a positive integer.")
+    limit = min(limit, 10000)
+
+    # Keep the original `target` string untouched: it is needed verbatim to
+    # build the link to the next page of results.
+    try:
+        target_swhid = identifiers.CoreSWHID.from_string(target).to_extended()
+    except identifiers.ValidationError as e:
+        raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None
+
+    if page_token_str:
+        try:
+            page_token = base64.urlsafe_b64decode(page_token_str)
+        except ValueError:
+            # binascii.Error (bad padding/alphabet) is a ValueError subclass.
+            raise BadInputExc("Invalid format for 'page_token' parameter.") from None
+    else:
+        page_token = None
+
+    result_page = archive.storage.raw_extrinsic_metadata_get(
+        target=target_swhid,
+        authority=authority,
+        after=after,
+        page_token=page_token,
+        limit=limit,
+    )
+
+    results = []
+
+    for metadata in result_page.results:
+        result = converters.from_raw_extrinsic_metadata(metadata)
+
+        # We can't reliably send metadata directly, because it is a bytestring,
+        # and we have to return JSON documents.
+        result["metadata_url"] = reverse(
+            "api-1-raw-extrinsic-metadata-get",
+            url_args={"id": hashutil.hash_to_hex(metadata.id)},
+            request=request,
+        )
+
+        results.append(result)
+
+    response = {
+        "results": results,
+        "headers": {},
+    }
+
+    next_page_token = result_page.next_page_token
+    if next_page_token is not None:
+        # base64 works on bytes and returns bytes: accept a textual token from
+        # the storage, and decode the encoded form so that the query string
+        # carries plain text rather than the repr of a bytes object.
+        if isinstance(next_page_token, str):
+            next_page_token = next_page_token.encode()
+        encoded_token = base64.urlsafe_b64encode(next_page_token).decode("ascii")
+        # The next page is served by this very endpoint, for the same target.
+        response["headers"]["link-next"] = reverse(
+            "api-1-raw-extrinsic-metadata-swhid",
+            url_args={"target": target},
+            query_params=dict(
+                authority=authority_str,
+                after=after_str,
+                limit=limit_str,
+                page_token=encoded_token,
+            ),
+            request=request,
+        )
+
+    return response
+
+
+@api_route(
+    "/raw-extrinsic-metadata/get/(?P<id>[0-9a-z]+)/",
+    "api-1-raw-extrinsic-metadata-get",
+)
+def api_raw_extrinsic_metadata_get(request, id):
+    # This is an internal endpoint that should only be accessed via URLs given
+    # by /raw-extrinsic-metadata/swhid/; so it is not documented.
+    #
+    # The `id` parameter name is bound to the named group of the route regex
+    # above, so it cannot be renamed even though it shadows the builtin.
+    try:
+        # The route regex also matches non-hexadecimal letters and odd-length
+        # strings; report those as a bad request instead of crashing.
+        metadata_id = hashutil.hash_to_bytes(id)
+    except ValueError:
+        raise BadInputExc(
+            "Invalid metadata identifier, it should be an hexadecimal string."
+        ) from None
+
+    metadata = archive.storage.raw_extrinsic_metadata_get_by_ids([metadata_id])
+    if not metadata:
+        raise NotFoundExc(
+            "Metadata not found. Use /raw-extrinsic-metadata/swhid/ to access metadata."
+        )
+
+    # The blob is an opaque bytestring, hence the generic binary content type.
+    return HttpResponse(metadata[0].metadata, content_type="application/octet-stream")
diff --git a/swh/web/common/converters.py b/swh/web/common/converters.py
--- a/swh/web/common/converters.py
+++ b/swh/web/common/converters.py
@@ -9,7 +9,7 @@
from swh.core.utils import decode_with_escape
from swh.model import hashutil
-from swh.model.model import Release, Revision
+from swh.model.model import RawExtrinsicMetadata, Release, Revision
from swh.storage.interface import PartialBranches
from swh.web.common.typing import OriginInfo, OriginVisitInfo
@@ -246,11 +246,11 @@
return json.JSONEncoder.default(self, obj)
-def convert_revision_metadata(metadata):
+def convert_metadata(metadata):
"""Convert json specific dict to a json serializable one.
"""
- if not metadata:
+ if metadata is None:
return {}
return json.loads(json.dumps(metadata, cls=SWHMetadataEncoder))
@@ -281,7 +281,7 @@
hashess={"id", "directory", "parents", "children"},
bytess={"name", "fullname", "email", "extra_headers", "message"},
convert={"metadata"},
- convert_fn=convert_revision_metadata,
+ convert_fn=convert_metadata,
dates={"date", "committer_date"},
)
@@ -292,6 +292,18 @@
return revision_d
+def from_raw_extrinsic_metadata(
+    metadata: Union[Dict[str, Any], RawExtrinsicMetadata]
+) -> Dict[str, Any]:
+    """Turn raw extrinsic metadata into a json serializable dictionary.
+
+    Args:
+        metadata: either a RawExtrinsicMetadata model object or a plain
+            dictionary (used as is, without calling ``to_dict()``)
+
+    Returns:
+        the output of ``from_swh``, called with the ``id`` and ``metadata``
+        keys blacklisted and ``discovery_date`` handled as a date
+    """
+    if isinstance(metadata, RawExtrinsicMetadata):
+        metadata_d = metadata.to_dict()
+    else:
+        metadata_d = metadata
+    return from_swh(
+        metadata_d, blacklist={"id", "metadata"}, dates={"discovery_date"},
+    )
+
+
def from_content(content):
"""Convert swh content to serializable content dictionary.
diff --git a/swh/web/tests/api/views/test_metadata.py b/swh/web/tests/api/views/test_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/web/tests/api/views/test_metadata.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import attr
+from hypothesis import given, strategies
+import pytest
+
+from swh.model.hypothesis_strategies import raw_extrinsic_metadata
+from swh.web.common.utils import reverse
+from swh.web.tests.api.views.utils import scroll_results
+from swh.web.tests.utils import check_api_get_responses, check_http_get_response
+
+
+@given(raw_extrinsic_metadata())
+def test_api_raw_extrinsic_metadata(api_client, archive_data, metadata):
+    """A single stored metadata object is listed, and its blob is downloadable."""
+    authority = metadata.authority
+
+    # Load the object and the authority/fetcher it refers to in the archive
+    archive_data.metadata_authority_add([authority])
+    archive_data.metadata_fetcher_add([metadata.fetcher])
+    archive_data.raw_extrinsic_metadata_add([metadata])
+
+    listing_url = reverse(
+        "api-1-raw-extrinsic-metadata-swhid",
+        url_args={"target": str(metadata.target)},
+        query_params={"authority": f"{authority.type.value} {authority.url}"},
+    )
+    listing = check_api_get_responses(api_client, listing_url, status_code=200)
+
+    assert len(listing.data) == 1
+
+    # The blob URL is generated by the server, so take it from the response
+    blob_url = listing.data[0]["metadata_url"]
+
+    # The listing exposes neither the id nor the raw blob of the object
+    expected = metadata.to_dict()
+    for hidden_key in ("id", "metadata"):
+        expected.pop(hidden_key)
+    expected["discovery_date"] = expected["discovery_date"].isoformat()
+    expected["metadata_url"] = blob_url
+    assert listing.data == [expected]
+
+    blob_response = check_http_get_response(api_client, blob_url, status_code=200)
+    assert blob_response["Content-Type"] == "application/octet-stream"
+    assert blob_response.content == metadata.metadata
+
+
+@pytest.mark.parametrize("limit", [1, 2, 10, 100])
+@given(strategies.sets(raw_extrinsic_metadata(), min_size=1))
+def test_api_raw_extrinsic_metadata_scroll(api_client, archive_data, limit, metadata):
+    """Scrolling through all pages returns every stored metadata object."""
+    # Make all metadata objects use the same authority and target
+    metadata0 = next(iter(metadata))
+    metadata = {
+        attr.evolve(m, authority=metadata0.authority, target=metadata0.target)
+        for m in metadata
+    }
+    authority = metadata0.authority
+
+    archive_data.metadata_authority_add([authority])
+    archive_data.metadata_fetcher_add(list({m.fetcher for m in metadata}))
+    archive_data.raw_extrinsic_metadata_add(metadata)
+
+    url = reverse(
+        "api-1-raw-extrinsic-metadata-swhid",
+        url_args={"target": str(metadata0.target)},
+        query_params={
+            "authority": f"{authority.type.value} {authority.url}",
+            "limit": limit,
+        },
+    )
+
+    results = scroll_results(api_client, url)
+
+    expected_results = [m.to_dict() for m in metadata]
+    for expected_result in expected_results:
+        del expected_result["id"]
+        del expected_result["metadata"]
+        expected_result["discovery_date"] = expected_result[
+            "discovery_date"
+        ].isoformat()
+
+    for result in results:
+        del result["metadata_url"]
+
+    # `metadata` is a set, so the order of `expected_results` is arbitrary and
+    # need not match the order in which the API returns objects. Dictionaries
+    # are not hashable, hence the pairwise membership checks instead of sets.
+    assert len(results) == len(expected_results)
+    for result in results:
+        assert result in expected_results
+    for expected_result in expected_results:
+        assert expected_result in results
+
+
+# Syntactically valid directory SWHID used as target by the parameter checks
+_swhid = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307"
+
+
+@pytest.mark.parametrize(
+    "status_code,url_args,query_params",
+    [
+        # Requests expected to be accepted
+        pytest.param(
+            200,
+            {"target": _swhid},
+            {"authority": "forge http://example.org"},
+            id="minimal working",
+        ),
+        pytest.param(
+            200,
+            {"target": _swhid},
+            {
+                "authority": "forge http://example.org",
+                "after": "2021-06-18T09:31:09",
+                "limit": 100,
+            },
+            id="maximal working",
+        ),
+        # Requests expected to be rejected as bad input
+        pytest.param(
+            400,
+            {"target": _swhid},
+            {"authority": "foo http://example.org"},
+            id="invalid authority type",
+        ),
+        pytest.param(
+            400,
+            {"target": _swhid},
+            {"authority": "forge http://example.org", "after": "yesterday"},
+            id="invalid 'after' format",
+        ),
+        pytest.param(
+            400,
+            {"target": _swhid},
+            {"authority": "forge http://example.org", "limit": "abc"},
+            id="invalid 'limit'",
+        ),
+    ],
+)
+def test_api_raw_extrinsic_metadata_check_params(
+    api_client, archive_data, status_code, url_args, query_params
+):
+    """The endpoint answers with the expected status for each parameter set."""
+    endpoint_url = reverse(
+        "api-1-raw-extrinsic-metadata-swhid",
+        url_args=url_args,
+        query_params=query_params,
+    )
+    check_api_get_responses(api_client, endpoint_url, status_code=status_code)
diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py
--- a/swh/web/tests/api/views/test_origin.py
+++ b/swh/web/tests/api/views/test_origin.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -8,7 +8,6 @@
from hypothesis import given
import pytest
-from requests.utils import parse_header_links
from swh.indexer.storage.model import OriginIntrinsicMetadataRow
from swh.model.hashutil import hash_to_bytes
@@ -19,6 +18,7 @@
from swh.web.common.exc import BadInputExc
from swh.web.common.origin_visits import get_origin_visits
from swh.web.common.utils import reverse
+from swh.web.tests.api.views.utils import scroll_results
from swh.web.tests.data import (
INDEXER_TOOL,
ORIGIN_MASTER_REVISION,
@@ -29,31 +29,6 @@
from swh.web.tests.utils import check_api_get_responses
-def _scroll_results(api_client, url):
- """Iterates through pages of results, and returns them all."""
- results = []
-
- while True:
- rv = check_api_get_responses(api_client, url, status_code=200)
-
- results.extend(rv.data)
-
- if "Link" in rv:
- for link in parse_header_links(rv["Link"]):
- if link["rel"] == "next":
- # Found link to next page of results
- url = link["url"]
- break
- else:
- # No link with 'rel=next'
- break
- else:
- # No Link header
- break
-
- return results
-
-
def test_api_lookup_origin_visits_raise_error(api_client, mocker):
mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits")
err_msg = "voluntary error to check the bad request middleware."
@@ -389,7 +364,7 @@
url = reverse("api-1-origins", query_params={"origin_count": origin_count})
- results = _scroll_results(api_client, url)
+ results = scroll_results(api_client, url)
assert len(results) == len(origins)
assert {origin["url"] for origin in results} == origin_urls
@@ -566,7 +541,7 @@
query_params={"limit": limit},
)
- results = _scroll_results(api_client, url)
+ results = scroll_results(api_client, url)
assert {origin["url"] for origin in results} == expected_origins
diff --git a/swh/web/tests/api/views/utils.py b/swh/web/tests/api/views/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/web/tests/api/views/utils.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2015-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from requests.utils import parse_header_links
+
+from swh.web.tests.utils import check_api_get_responses
+
+
+def scroll_results(api_client, url):
+    """Fetch every page of a paginated API endpoint and return all results.
+
+    Starting from ``url``, each page is requested (and checked to answer with
+    status 200); its data is accumulated, then the ``rel="next"`` entry of the
+    ``Link`` response header is followed until no such entry is found.
+    """
+    collected = []
+
+    while True:
+        page = check_api_get_responses(api_client, url, status_code=200)
+        collected.extend(page.data)
+
+        if "Link" not in page:
+            # No Link header: this was the last page
+            return collected
+
+        next_url = next(
+            (
+                link["url"]
+                for link in parse_header_links(page["Link"])
+                if link["rel"] == "next"
+            ),
+            None,
+        )
+        if next_url is None:
+            # No link with 'rel=next': this was the last page
+            return collected
+
+        # Found link to next page of results
+        url = next_url
diff --git a/swh/web/tests/common/test_converters.py b/swh/web/tests/common/test_converters.py
--- a/swh/web/tests/common/test_converters.py
+++ b/swh/web/tests/common/test_converters.py
@@ -97,7 +97,7 @@
empty_dict={"u"},
empty_list={"v"},
convert={"p", "q", "w"},
- convert_fn=converters.convert_revision_metadata,
+ convert_fn=converters.convert_metadata,
)
assert expected_output == actual_output

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 1:26 AM (2 d, 5 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217149

Event Timeline