diff --git a/requirements-swh.txt b/requirements-swh.txt
index 9224ac8a..cc602280 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,9 +1,9 @@
 swh.auth[django] >= 0.5.3
 swh.core >= 0.0.95
 swh.counters >= 0.5.1
 swh.indexer >= 0.4.1
-swh.model >= 0.5.0
+swh.model >= 2.6.3
 swh.scheduler >= 0.7.0
 swh.search >= 0.2.0
-swh.storage >= 0.11.10
+swh.storage >= 0.31.0
 swh.vault >= 0.0.33
diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py
index 9033c291..025afebc 100644
--- a/swh/web/api/urls.py
+++ b/swh/web/api/urls.py
@@ -1,20 +1,21 @@
 # Copyright (C) 2017-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from swh.web.api.apiurls import APIUrls
 import swh.web.api.views.content  # noqa
 import swh.web.api.views.directory  # noqa
 import swh.web.api.views.graph  # noqa
 import swh.web.api.views.identifiers  # noqa
+import swh.web.api.views.metadata  # noqa
 import swh.web.api.views.origin  # noqa
 import swh.web.api.views.origin_save  # noqa
 import swh.web.api.views.ping  # noqa
 import swh.web.api.views.release  # noqa
 import swh.web.api.views.revision  # noqa
 import swh.web.api.views.snapshot  # noqa
 import swh.web.api.views.stat  # noqa
 import swh.web.api.views.vault  # noqa

 urlpatterns = APIUrls.get_url_patterns()
diff --git a/swh/web/api/views/metadata.py b/swh/web/api/views/metadata.py
new file mode 100644
index 00000000..bb3e91b6
--- /dev/null
+++ b/swh/web/api/views/metadata.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import base64
+
+import iso8601
+
+from django.http import HttpResponse
+
+from swh.model import hashutil, identifiers
+from swh.model.model import MetadataAuthority, MetadataAuthorityType
+from swh.web.api.apidoc import api_doc, format_docstring
+from swh.web.api.apiurls import api_route
+from swh.web.common import archive, converters
+from swh.web.common.exc import BadInputExc, NotFoundExc
+from swh.web.common.utils import reverse
+
+SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}"
+
+
+@api_route(
+    f"/raw-extrinsic-metadata/swhid/(?P<target>{SWHID_RE})/",
+    "api-1-raw-extrinsic-metadata-swhid",
+)
+@api_doc("/raw-extrinsic-metadata/swhid/")
+@format_docstring()
+def api_raw_extrinsic_metadata_swhid(request, target):
+    """
+    .. http:get:: /api/1/raw-extrinsic-metadata/swhid/(target)
+
+        Returns raw `extrinsic metadata`_ collected on a given object.
+
+        .. _extrinsic metadata: https://docs.softwareheritage.org/devel/glossary.html#term-extrinsic-metadata
+
+        :param string target: The SWHID of the object whose metadata should be returned
+        :query string authority: A metadata authority identifier, formatted as
+            ``<type> <url>``. Required.
+        :query string after: An ISO8601 representation of the minimum timestamp of metadata
+            to fetch. Defaults to allowing all metadata.
+        :query int limit: Maximum number of metadata objects to return
+            (defaults to 100, capped at 10000).
+
+        {common_headers}
+
+        :>jsonarr string target: SWHID of the object described by this metadata
+        :>jsonarr string discovery_date: ISO8601 timestamp of the moment this
+            metadata was collected.
+        :>jsonarr object authority: authority this metadata is coming from
+        :>jsonarr object fetcher: tool used to fetch the metadata
+        :>jsonarr string format: short identifier of the format of the metadata
+        :>jsonarr string metadata_url: link to download the metadata "blob" itself
+        :>jsonarr string origin: URL of the origin in whose context
+            the metadata is valid, if any
+        :>jsonarr int visit: identifier of the visit in whose context
+            the metadata is valid, if any
+        :>jsonarr string snapshot: SWHID of the snapshot in whose context
+            the metadata is valid, if any
+        :>jsonarr string release: SWHID of the release in whose context
+            the metadata is valid, if any
+        :>jsonarr string revision: SWHID of the revision in whose context
+            the metadata is valid, if any
+        :>jsonarr string path: path in whose context the metadata
+            is valid, if any, relative to a release or revision as anchor
+        :>jsonarr string directory: SWHID of the directory in whose context
+            the metadata is valid, if any
+
+        :statuscode 200: no error
+
+        **Example:**
+
+        .. parsed-literal::
+
+            :swh_web_api:`raw-extrinsic-metadata/swhid/swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307/?authority=forge%20https://pypi.org/`
+    """  # noqa
+    authority_str: str = request.query_params.get("authority")
+    after_str: str = request.query_params.get("after")
+    limit_str: str = request.query_params.get("limit", "100")
+    page_token_str: str = request.query_params.get("page_token")
+
+    if not authority_str:
+        raise BadInputExc("The 'authority' query parameter is required.")
+    if " " not in authority_str.strip():
+        raise BadInputExc("The 'authority' query parameter should contain a space.")
+
+    (authority_type_str, authority_url) = authority_str.split(" ", 1)
+    try:
+        authority_type = MetadataAuthorityType(authority_type_str)
+    except ValueError:
+        raise BadInputExc(
+            f"Invalid 'authority' type, should be one of: "
+            f"{', '.join(member.value for member in MetadataAuthorityType)}"
+        )
+    authority = MetadataAuthority(authority_type, authority_url)
+
+    if after_str:
+        try:
+            after = iso8601.parse_date(after_str)
+        except iso8601.ParseError:
+            raise BadInputExc("Invalid format for 'after' parameter.") from None
+    else:
+        after = None
+
+    try:
+        limit = int(limit_str)
+    except ValueError:
+        raise BadInputExc("'limit' parameter must be an integer.") from None
+    limit = min(limit, 10000)
+
+    try:
+        target = identifiers.CoreSWHID.from_string(target).to_extended()
+    except identifiers.ValidationError as e:
+        raise BadInputExc(f"Invalid target SWHID: {e.args[0]}") from None
+
+    if page_token_str:
+        page_token = base64.urlsafe_b64decode(page_token_str)
+    else:
+        page_token = None
+
+    result_page = archive.storage.raw_extrinsic_metadata_get(
+        target=target,
+        authority=authority,
+        after=after,
+        page_token=page_token,
+        limit=limit,
+    )
+
+    results = []
+
+    for metadata in result_page.results:
+        result = converters.from_raw_extrinsic_metadata(metadata)
+
+        # We can't reliably send metadata directly, because it is a bytestring,
+        # and we have to return JSON documents.
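+        # For illustration only (all values below are hypothetical), one
+        # serialized entry roughly looks like:
+        #   {"target": "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307",
+        #    "discovery_date": "2021-01-25T11:27:51+00:00",
+        #    "authority": {"type": "forge", "url": "https://pypi.org/"},
+        #    "fetcher": {"name": "swh.loader.package", "version": "0.1.0"},
+        #    "format": "pypi-project-json",
+        #    "metadata_url": "..."}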
+        result["metadata_url"] = reverse(
+            "api-1-raw-extrinsic-metadata-get",
+            url_args={"id": hashutil.hash_to_hex(metadata.id)},
+            request=request,
+        )
+
+        results.append(result)
+
+    response = {
+        "results": results,
+        "headers": {},
+    }
+    if result_page.next_page_token is not None:
+        response["headers"]["link-next"] = reverse(
+            "api-1-raw-extrinsic-metadata-swhid",
+            url_args={"target": str(target)},
+            query_params=dict(
+                authority=authority_str,
+                after=after_str,
+                limit=limit_str,
+                page_token=base64.urlsafe_b64encode(
+                    result_page.next_page_token
+                ).decode(),
+            ),
+            request=request,
+        )
+
+    return response
+
+
+@api_route(
+    "/raw-extrinsic-metadata/get/(?P<id>[0-9a-z]+)/",
+    "api-1-raw-extrinsic-metadata-get",
+)
+def api_raw_extrinsic_metadata_get(request, id):
+    # This is an internal endpoint that should only be accessed via URLs given
+    # by /raw-extrinsic-metadata/swhid/; so it is not documented.
+    metadata = archive.storage.raw_extrinsic_metadata_get_by_ids(
+        [hashutil.hash_to_bytes(id)]
+    )
+    if not metadata:
+        raise NotFoundExc(
+            "Metadata not found. Use /raw-extrinsic-metadata/swhid/ to access metadata."
+        )
+
+    return HttpResponse(metadata[0].metadata, content_type="application/octet-stream")
diff --git a/swh/web/common/converters.py b/swh/web/common/converters.py
index 00f09bd0..3b1dcfb7 100644
--- a/swh/web/common/converters.py
+++ b/swh/web/common/converters.py
@@ -1,383 +1,395 @@
 # Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import json
 from typing import Any, Dict, Union

 from swh.core.utils import decode_with_escape
 from swh.model import hashutil
-from swh.model.model import Release, Revision
+from swh.model.model import RawExtrinsicMetadata, Release, Revision
 from swh.storage.interface import PartialBranches
 from swh.web.common.typing import OriginInfo, OriginVisitInfo


 def _group_checksums(data):
     """Groups checksums values computed from hash functions used in swh
     and stored in data dict under a single entry 'checksums'
     """
     if data:
         checksums = {}
         for hash in hashutil.ALGORITHMS:
             if hash in data and data[hash]:
                 checksums[hash] = data[hash]
                 del data[hash]
         if len(checksums) > 0:
             data["checksums"] = checksums


 def fmap(f, data):
     """Map f to data at each level.

     This must keep the origin data structure type:
     - map -> map
     - dict -> dict
     - list -> list
     - None -> None

     Args:
         f: function that expects one argument.
         data: data to traverse to apply the f function.
               list, map, dict or bare value.

     Returns:
         The same data-structure with modified values by the f function.

     """
     if data is None:
         return data
     if isinstance(data, map):
         return map(lambda y: fmap(f, y), (x for x in data))
     if isinstance(data, list):
         return [fmap(f, x) for x in data]
     if isinstance(data, tuple):
         return tuple(fmap(f, x) for x in data)
     if isinstance(data, dict):
         return {k: fmap(f, v) for (k, v) in data.items()}
     return f(data)


 def from_swh(
     dict_swh,
     hashess={},
     bytess={},
     dates={},
     blacklist={},
     removables_if_empty={},
     empty_dict={},
     empty_list={},
     convert={},
     convert_fn=lambda x: x,
 ):
     """Convert from a swh dictionary to something reasonably json
     serializable.

     Args:
         dict_swh: the origin dictionary needed to be transformed
         hashess: list/set of keys representing hashes values (sha1, sha256,
             sha1_git, etc...) as bytes.
Those need to be transformed in hexadecimal string bytess: list/set of keys representing bytes values which needs to be decoded blacklist: set of keys to filter out from the conversion convert: set of keys whose associated values need to be converted using convert_fn convert_fn: the conversion function to apply on the value of key in 'convert' The remaining keys are copied as is in the output. Returns: dictionary equivalent as dict_swh only with its keys converted. """ def convert_hashes_bytes(v): """v is supposedly a hash as bytes, returns it converted in hex. """ if isinstance(v, bytes): return hashutil.hash_to_hex(v) return v def convert_bytes(v): """v is supposedly a bytes string, decode as utf-8. FIXME: Improve decoding policy. If not utf-8, break! """ if isinstance(v, bytes): return v.decode("utf-8") return v def convert_date(v): """ Args: v (dict or datatime): either: - a dict with three keys: - timestamp (dict or integer timestamp) - offset - negative_utc - or, a datetime We convert it to a human-readable string """ if not v: return v if isinstance(v, datetime.datetime): return v.isoformat() tz = datetime.timezone(datetime.timedelta(minutes=v["offset"])) swh_timestamp = v["timestamp"] if isinstance(swh_timestamp, dict): date = datetime.datetime.fromtimestamp(swh_timestamp["seconds"], tz=tz) else: date = datetime.datetime.fromtimestamp(swh_timestamp, tz=tz) datestr = date.isoformat() if v["offset"] == 0 and v["negative_utc"]: # remove the rightmost + and replace it with a - return "-".join(datestr.rsplit("+", 1)) return datestr if not dict_swh: return dict_swh new_dict = {} for key, value in dict_swh.items(): if key in blacklist or (key in removables_if_empty and not value): continue if key in dates: new_dict[key] = convert_date(value) elif key in convert: new_dict[key] = convert_fn(value) elif isinstance(value, dict): new_dict[key] = from_swh( value, hashess=hashess, bytess=bytess, dates=dates, blacklist=blacklist, removables_if_empty=removables_if_empty, empty_dict=empty_dict, empty_list=empty_list, convert=convert, convert_fn=convert_fn, ) elif key in hashess: new_dict[key] = fmap(convert_hashes_bytes, value) elif key in bytess: try: new_dict[key] = fmap(convert_bytes, value) except UnicodeDecodeError: if "decoding_failures" not in new_dict: new_dict["decoding_failures"] = [key] else: new_dict["decoding_failures"].append(key) new_dict[key] = fmap(decode_with_escape, value) elif key in empty_dict and not value: new_dict[key] = {} elif key in empty_list and not value: new_dict[key] = [] else: new_dict[key] = value _group_checksums(new_dict) return new_dict def from_origin(origin: Dict[str, Any]) -> OriginInfo: """Convert from a swh origin to an origin dictionary. """ return from_swh(origin) def from_release(release: Release) -> Dict[str, Any]: """Convert from a swh release to a json serializable release dictionary. Args: release: A release model object Returns: release dictionary with the following keys - id: hexadecimal sha1 (string) - revision: hexadecimal sha1 (string) - comment: release's comment message (string) - name: release's name (string) - author: release's author identifier (swh's id) - synthetic: the synthetic property (boolean) """ return from_swh( release.to_dict(), hashess={"id", "target"}, bytess={"message", "name", "fullname", "email"}, dates={"date"}, ) class SWHMetadataEncoder(json.JSONEncoder): """Special json encoder for metadata field which can contain bytes encoded value. 
""" def default(self, obj): if isinstance(obj, bytes): try: return obj.decode("utf-8") except UnicodeDecodeError: # fallback to binary representation to avoid display errors return repr(obj) # Let the base class default method raise the TypeError return json.JSONEncoder.default(self, obj) -def convert_revision_metadata(metadata): +def convert_metadata(metadata): """Convert json specific dict to a json serializable one. """ - if not metadata: + if metadata is None: return {} return json.loads(json.dumps(metadata, cls=SWHMetadataEncoder)) def from_revision(revision: Union[Dict[str, Any], Revision]) -> Dict[str, Any]: """Convert swh revision model object to a json serializable revision dictionary. Args: revision: revision model object Returns: dict: Revision dictionary with the same keys as inputs, except: - sha1s are in hexadecimal strings (id, directory) - bytes are decoded in string (author_name, committer_name, author_email, committer_email) Remaining keys are left as is """ if isinstance(revision, Revision): revision_d = revision.to_dict() else: revision_d = revision revision_d = from_swh( revision_d, hashess={"id", "directory", "parents", "children"}, bytess={"name", "fullname", "email", "extra_headers", "message"}, convert={"metadata"}, - convert_fn=convert_revision_metadata, + convert_fn=convert_metadata, dates={"date", "committer_date"}, ) if revision_d: if "parents" in revision_d: revision_d["merge"] = len(revision_d["parents"]) > 1 return revision_d +def from_raw_extrinsic_metadata( + metadata: Union[Dict[str, Any], RawExtrinsicMetadata] +) -> Dict[str, Any]: + """Convert RawExtrinsicMetadata model object to a json serializable dictionary. + """ + return from_swh( + metadata.to_dict() if isinstance(metadata, RawExtrinsicMetadata) else metadata, + blacklist={"id", "metadata"}, + dates={"discovery_date"}, + ) + + def from_content(content): """Convert swh content to serializable content dictionary. """ return from_swh( content, hashess={"sha1", "sha1_git", "sha256", "blake2s256"}, blacklist={"ctime"}, convert={"status"}, convert_fn=lambda v: "absent" if v == "hidden" else v, ) def from_person(person): """Convert swh person to serializable person dictionary. """ return from_swh(person, bytess={"name", "fullname", "email"}) def from_origin_visit(visit: Dict[str, Any]) -> OriginVisitInfo: """Convert swh origin_visit to serializable origin_visit dictionary. """ ov = from_swh( visit, hashess={"target", "snapshot"}, bytess={"branch"}, dates={"date"}, empty_dict={"metadata"}, ) return ov def from_snapshot(snapshot): """Convert swh snapshot to serializable (partial) snapshot dictionary. """ sv = from_swh(snapshot, hashess={"id", "target"}, bytess={"next_branch"}) if sv and "branches" in sv: sv["branches"] = {decode_with_escape(k): v for k, v in sv["branches"].items()} for k, v in snapshot["branches"].items(): # alias target existing branch names, not a sha1 if v and v["target_type"] == "alias": branch = decode_with_escape(k) target = decode_with_escape(v["target"]) sv["branches"][branch]["target"] = target return sv def from_partial_branches(branches: PartialBranches): """Convert PartialBranches to serializable partial snapshot dictionary """ return from_snapshot( { "id": branches["id"], "branches": { branch_name: branch.to_dict() if branch else None for (branch_name, branch) in branches["branches"].items() }, "next_branch": branches["next_branch"], } ) def from_directory_entry(dir_entry): """Convert swh directory to serializable directory dictionary. 
""" return from_swh( dir_entry, hashess={"dir_id", "sha1_git", "sha1", "sha256", "blake2s256", "target"}, bytess={"name"}, removables_if_empty={"sha1", "sha1_git", "sha256", "blake2s256", "status"}, convert={"status"}, convert_fn=lambda v: "absent" if v == "hidden" else v, ) def from_filetype(content_entry): """Convert swh content to serializable dictionary containing keys 'id', 'encoding', and 'mimetype'. """ return from_swh(content_entry, hashess={"id"}) diff --git a/swh/web/tests/api/views/test_metadata.py b/swh/web/tests/api/views/test_metadata.py new file mode 100644 index 00000000..fe4bec6b --- /dev/null +++ b/swh/web/tests/api/views/test_metadata.py @@ -0,0 +1,135 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import attr +from hypothesis import given, strategies +import pytest + +from swh.model.hypothesis_strategies import raw_extrinsic_metadata +from swh.web.common.utils import reverse +from swh.web.tests.api.views.utils import scroll_results +from swh.web.tests.utils import check_api_get_responses, check_http_get_response + + +@given(raw_extrinsic_metadata()) +def test_api_raw_extrinsic_metadata(api_client, archive_data, metadata): + archive_data.metadata_authority_add([metadata.authority]) + archive_data.metadata_fetcher_add([metadata.fetcher]) + archive_data.raw_extrinsic_metadata_add([metadata]) + + authority = metadata.authority + url = reverse( + "api-1-raw-extrinsic-metadata-swhid", + url_args={"target": str(metadata.target)}, + query_params={"authority": f"{authority.type.value} {authority.url}"}, + ) + rv = check_api_get_responses(api_client, url, status_code=200) + + assert len(rv.data) == 1 + + expected_result = metadata.to_dict() + del expected_result["id"] + del expected_result["metadata"] + metadata_url = rv.data[0]["metadata_url"] + expected_result["metadata_url"] = metadata_url + expected_result["discovery_date"] = expected_result["discovery_date"].isoformat() + assert rv.data == [expected_result] + + rv = check_http_get_response(api_client, metadata_url, status_code=200) + assert rv["Content-Type"] == "application/octet-stream" + assert rv.content == metadata.metadata + + +@pytest.mark.parametrize("limit", [1, 2, 10, 100]) +@given(strategies.sets(raw_extrinsic_metadata(), min_size=1)) +def test_api_raw_extrinsic_metadata_scroll(api_client, archive_data, limit, metadata): + # Make all metadata objects use the same authority and target + metadata0 = next(iter(metadata)) + metadata = { + attr.evolve(m, authority=metadata0.authority, target=metadata0.target) + for m in metadata + } + authority = metadata0.authority + + archive_data.metadata_authority_add([authority]) + archive_data.metadata_fetcher_add(list({m.fetcher for m in metadata})) + archive_data.raw_extrinsic_metadata_add(metadata) + + url = reverse( + "api-1-raw-extrinsic-metadata-swhid", + url_args={"target": str(metadata0.target)}, + query_params={ + "authority": f"{authority.type.value} {authority.url}", + "limit": limit, + }, + ) + + results = scroll_results(api_client, url) + + expected_results = [m.to_dict() for m in metadata] + for expected_result in expected_results: + del expected_result["id"] + del expected_result["metadata"] + expected_result["discovery_date"] = expected_result[ + "discovery_date" + ].isoformat() + + for result in results: + del result["metadata_url"] + + assert results 
== expected_results + + +_swhid = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307" + + +@pytest.mark.parametrize( + "status_code,url_args,query_params", + [ + pytest.param( + 200, + {"target": _swhid}, + {"authority": "forge http://example.org"}, + id="minimal working", + ), + pytest.param( + 200, + {"target": _swhid}, + { + "authority": "forge http://example.org", + "after": "2021-06-18T09:31:09", + "limit": 100, + }, + id="maximal working", + ), + pytest.param( + 400, + {"target": _swhid}, + {"authority": "foo http://example.org"}, + id="invalid authority type", + ), + pytest.param( + 400, + {"target": _swhid}, + {"authority": "forge http://example.org", "after": "yesterday",}, + id="invalid 'after' format", + ), + pytest.param( + 400, + {"target": _swhid}, + {"authority": "forge http://example.org", "limit": "abc",}, + id="invalid 'limit'", + ), + ], +) +def test_api_raw_extrinsic_metadata_check_params( + api_client, archive_data, status_code, url_args, query_params +): + url = reverse( + "api-1-raw-extrinsic-metadata-swhid", + url_args=url_args, + query_params=query_params, + ) + check_api_get_responses(api_client, url, status_code=status_code) diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py index 4dc42be2..8f3439d6 100644 --- a/swh/web/tests/api/views/test_origin.py +++ b/swh/web/tests/api/views/test_origin.py @@ -1,706 +1,681 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta import json from hypothesis import given import pytest -from requests.utils import parse_header_links from swh.indexer.storage.model import OriginIntrinsicMetadataRow from swh.model.hashutil import hash_to_bytes from swh.model.model import Origin, OriginVisit, OriginVisitStatus from swh.storage.exc import StorageAPIError, StorageDBError from swh.storage.utils import now from swh.web.api.utils import enrich_origin, enrich_origin_visit from swh.web.common.exc import BadInputExc from swh.web.common.origin_visits import get_origin_visits from swh.web.common.utils import reverse +from swh.web.tests.api.views.utils import scroll_results from swh.web.tests.data import ( INDEXER_TOOL, ORIGIN_MASTER_REVISION, ORIGIN_METADATA_KEY, ORIGIN_METADATA_VALUE, ) from swh.web.tests.strategies import new_origin, new_snapshots, origin, visit_dates from swh.web.tests.utils import check_api_get_responses -def _scroll_results(api_client, url): - """Iterates through pages of results, and returns them all.""" - results = [] - - while True: - rv = check_api_get_responses(api_client, url, status_code=200) - - results.extend(rv.data) - - if "Link" in rv: - for link in parse_header_links(rv["Link"]): - if link["rel"] == "next": - # Found link to next page of results - url = link["url"] - break - else: - # No link with 'rel=next' - break - else: - # No Link header - break - - return results - - def test_api_lookup_origin_visits_raise_error(api_client, mocker): mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits") err_msg = "voluntary error to check the bad request middleware." 
mock_get_origin_visits.side_effect = BadInputExc(err_msg) url = reverse("api-1-origin-visits", url_args={"origin_url": "http://foo"}) rv = check_api_get_responses(api_client, url, status_code=400) assert rv.data == {"exception": "BadInputExc", "reason": err_msg} def test_api_lookup_origin_visits_raise_swh_storage_error_db(api_client, mocker): mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits") err_msg = "Storage exploded! Will be back online shortly!" mock_get_origin_visits.side_effect = StorageDBError(err_msg) url = reverse("api-1-origin-visits", url_args={"origin_url": "http://foo"}) rv = check_api_get_responses(api_client, url, status_code=503) assert rv.data == { "exception": "StorageDBError", "reason": "An unexpected error occurred in the backend: %s" % err_msg, } def test_api_lookup_origin_visits_raise_swh_storage_error_api(api_client, mocker): mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits") err_msg = "Storage API dropped dead! Will resurrect asap!" mock_get_origin_visits.side_effect = StorageAPIError(err_msg) url = reverse("api-1-origin-visits", url_args={"origin_url": "http://foo"}) rv = check_api_get_responses(api_client, url, status_code=503) assert rv.data == { "exception": "StorageAPIError", "reason": "An unexpected error occurred in the api backend: %s" % err_msg, } @given(new_origin(), visit_dates(3), new_snapshots(3)) def test_api_lookup_origin_visits( api_client, archive_data, new_origin, visit_dates, new_snapshots ): archive_data.origin_add([new_origin]) for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=origin_visit.visit, date=now(), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) all_visits = list(reversed(get_origin_visits(new_origin.to_dict()))) for last_visit, expected_visits in ( (None, all_visits[:2]), (all_visits[1]["visit"], all_visits[2:]), ): url = reverse( "api-1-origin-visits", url_args={"origin_url": new_origin.url}, query_params={"per_page": 2, "last_visit": last_visit}, ) rv = check_api_get_responses(api_client, url, status_code=200) for i in range(len(expected_visits)): expected_visits[i] = enrich_origin_visit( expected_visits[i], with_origin_link=False, with_origin_visit_link=True, request=rv.wsgi_request, ) assert rv.data == expected_visits @given(new_origin(), visit_dates(3), new_snapshots(3)) def test_api_lookup_origin_visits_by_id( api_client, archive_data, new_origin, visit_dates, new_snapshots ): archive_data.origin_add([new_origin]) for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=origin_visit.visit, date=now(), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) all_visits = list(reversed(get_origin_visits(new_origin.to_dict()))) for last_visit, expected_visits in ( (None, all_visits[:2]), (all_visits[1]["visit"], all_visits[2:4]), ): url = reverse( "api-1-origin-visits", url_args={"origin_url": new_origin.url}, query_params={"per_page": 2, "last_visit": last_visit}, ) rv = check_api_get_responses(api_client, url, status_code=200) for i in 
range(len(expected_visits)): expected_visits[i] = enrich_origin_visit( expected_visits[i], with_origin_link=False, with_origin_visit_link=True, request=rv.wsgi_request, ) assert rv.data == expected_visits @given(new_origin(), visit_dates(3), new_snapshots(3)) def test_api_lookup_origin_visit( api_client, archive_data, new_origin, visit_dates, new_snapshots ): archive_data.origin_add([new_origin]) for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] )[0] visit_id = origin_visit.visit archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=origin_visit.visit, date=visit_date + timedelta(minutes=5), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "api-1-origin-visit", url_args={"origin_url": new_origin.url, "visit_id": visit_id}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_visit = archive_data.origin_visit_get_by(new_origin.url, visit_id) expected_visit = enrich_origin_visit( expected_visit, with_origin_link=True, with_origin_visit_link=False, request=rv.wsgi_request, ) assert rv.data == expected_visit @given(new_origin()) def test_api_lookup_origin_visit_latest_no_visit(api_client, archive_data, new_origin): archive_data.origin_add([new_origin]) url = reverse("api-1-origin-visit-latest", url_args={"origin_url": new_origin.url}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "No visit for origin %s found" % new_origin.url, } @given(new_origin(), visit_dates(2), new_snapshots(1)) def test_api_lookup_origin_visit_latest( api_client, archive_data, new_origin, visit_dates, new_snapshots ): archive_data.origin_add([new_origin]) visit_dates.sort() visit_ids = [] for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] )[0] visit_ids.append(origin_visit.visit) archive_data.snapshot_add([new_snapshots[0]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit_ids[0], date=now(), status="full", snapshot=new_snapshots[0].id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse("api-1-origin-visit-latest", url_args={"origin_url": new_origin.url}) rv = check_api_get_responses(api_client, url, status_code=200) expected_visit = archive_data.origin_visit_get_by(new_origin.url, visit_ids[1]) expected_visit = enrich_origin_visit( expected_visit, with_origin_link=True, with_origin_visit_link=False, request=rv.wsgi_request, ) assert rv.data == expected_visit @given(new_origin(), visit_dates(2), new_snapshots(1)) def test_api_lookup_origin_visit_latest_with_snapshot( api_client, archive_data, new_origin, visit_dates, new_snapshots ): archive_data.origin_add([new_origin]) visit_dates.sort() visit_ids = [] for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] )[0] visit_ids.append(origin_visit.visit) archive_data.snapshot_add([new_snapshots[0]]) # Add snapshot to the latest visit visit_id = visit_ids[-1] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit_id, date=now(), status="full", snapshot=new_snapshots[0].id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "api-1-origin-visit-latest", url_args={"origin_url": 
new_origin.url}, query_params={"require_snapshot": True}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_visit = archive_data.origin_visit_status_get_latest( new_origin.url, type="git", require_snapshot=True ) expected_visit = enrich_origin_visit( expected_visit, with_origin_link=True, with_origin_visit_link=False, request=rv.wsgi_request, ) assert rv.data == expected_visit @given(origin()) def test_api_lookup_origin_visit_not_found(api_client, origin): all_visits = list(reversed(get_origin_visits(origin))) max_visit_id = max([v["visit"] for v in all_visits]) url = reverse( "api-1-origin-visit", url_args={"origin_url": origin["url"], "visit_id": max_visit_id + 1}, ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Origin %s or its visit with id %s not found!" % (origin["url"], max_visit_id + 1), } def test_api_origins_wrong_input(api_client, archive_data): """Should fail with 400 if the input is deprecated. """ # fail if wrong input url = reverse("api-1-origins", query_params={"origin_from": 1}) rv = check_api_get_responses(api_client, url, status_code=400) assert rv.data == { "exception": "BadInputExc", "reason": "Please use the Link header to browse through result", } def test_api_origins(api_client, archive_data): page_result = archive_data.origin_list(limit=10000) origins = page_result.results origin_urls = {origin.url for origin in origins} # Get only one url = reverse("api-1-origins", query_params={"origin_count": 1}) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} <= origin_urls # Get all url = reverse("api-1-origins", query_params={"origin_count": len(origins)}) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(origins) assert {origin["url"] for origin in rv.data} == origin_urls # Get "all + 10" url = reverse("api-1-origins", query_params={"origin_count": len(origins) + 10}) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(origins) assert {origin["url"] for origin in rv.data} == origin_urls @pytest.mark.parametrize("origin_count", [1, 2, 10, 100]) def test_api_origins_scroll(api_client, archive_data, origin_count): page_result = archive_data.origin_list(limit=10000) origins = page_result.results origin_urls = {origin.url for origin in origins} url = reverse("api-1-origins", query_params={"origin_count": origin_count}) - results = _scroll_results(api_client, url) + results = scroll_results(api_client, url) assert len(results) == len(origins) assert {origin["url"] for origin in results} == origin_urls @given(origin()) def test_api_origin_by_url(api_client, archive_data, origin): origin_url = origin["url"] url = reverse("api-1-origin", url_args={"origin_url": origin_url}) rv = check_api_get_responses(api_client, url, status_code=200) expected_origin = archive_data.origin_get([origin_url])[0] expected_origin = enrich_origin(expected_origin, rv.wsgi_request) assert rv.data == expected_origin @given(new_origin()) def test_api_origin_not_found(api_client, new_origin): url = reverse("api-1-origin", url_args={"origin_url": new_origin.url}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Origin with url %s not found!" 
% new_origin.url, } @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search(api_client, mocker, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } # Search for 'github.com', get only one url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": 1}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} <= expected_origins assert rv.data == [ enrich_origin({"url": origin["url"]}, request=rv.wsgi_request) for origin in rv.data ] # Search for 'github.com', get all url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins assert rv.data == [ enrich_origin({"url": origin["url"]}, request=rv.wsgi_request) for origin in rv.data ] # Search for 'github.com', get more than available url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": 10}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins assert rv.data == [ enrich_origin({"url": origin["url"]}, request=rv.wsgi_request) for origin in rv.data ] @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search_words(api_client, mocker, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } url = reverse( "api-1-origin-search", url_args={"url_pattern": "github com"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins url = reverse( "api-1-origin-search", url_args={"url_pattern": "com github"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins url = reverse( "api-1-origin-search", url_args={"url_pattern": "memononen libtess2"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} == { "https://github.com/memononen/libtess2" } url = reverse( "api-1-origin-search", url_args={"url_pattern": "libtess2 memononen"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} == { "https://github.com/memononen/libtess2" } @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search_visit_type(api_client, mocker, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } url = reverse( "api-1-origin-search", url_args={"url_pattern": "github com",}, query_params={"visit_type": "git"}, ) rv = 
check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins url = reverse( "api-1-origin-search", url_args={"url_pattern": "github com",}, query_params={"visit_type": "foo"}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert rv.data == [] @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) @pytest.mark.parametrize("limit", [1, 2, 3, 10]) def test_api_origin_search_scroll(api_client, archive_data, mocker, limit, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": limit}, ) - results = _scroll_results(api_client, url) + results = scroll_results(api_client, url) assert {origin["url"] for origin in results} == expected_origins @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search_limit(api_client, archive_data, tests_data, mocker, backend): if backend == "swh-search": tests_data["search"].origin_update( [{"url": "http://foobar/{}".format(i)} for i in range(2000)] ) else: # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) archive_data.origin_add( [Origin(url="http://foobar/{}".format(i)) for i in range(2000)] ) url = reverse( "api-1-origin-search", url_args={"url_pattern": "foobar"}, query_params={"limit": 1050}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1000 @pytest.mark.parametrize("backend", ["swh-search", "swh-indexer-storage"]) def test_api_origin_metadata_search(api_client, mocker, backend): mock_config = mocker.patch("swh.web.common.archive.config") mock_config.get_config.return_value = {"metadata_search_backend": backend} url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE} ) rv = check_api_get_responses(api_client, url, status_code=200) expected_data = [ { "url": origin_url, "metadata": { "from_revision": master_rev, "tool": { "name": INDEXER_TOOL["tool_name"], "version": INDEXER_TOOL["tool_version"], "configuration": INDEXER_TOOL["tool_configuration"], "id": INDEXER_TOOL["id"], }, "mappings": [], }, } for origin_url, master_rev in ORIGIN_MASTER_REVISION.items() ] for i in range(len(expected_data)): expected = expected_data[i] response = rv.data[i] metadata = response["metadata"].pop("metadata") assert any( [ORIGIN_METADATA_VALUE in json.dumps(val) for val in metadata.values()] ) assert response == expected def test_api_origin_metadata_search_limit(api_client, mocker): mock_idx_storage = mocker.patch("swh.web.common.archive.idx_storage") oimsft = mock_idx_storage.origin_intrinsic_metadata_search_fulltext oimsft.side_effect = lambda conjunction, limit: [ OriginIntrinsicMetadataRow( id=origin_url, from_revision=hash_to_bytes(master_rev), indexer_configuration_id=INDEXER_TOOL["id"], metadata={ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE}, mappings=[], ) for origin_url, master_rev in ORIGIN_MASTER_REVISION.items() ] url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE} ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(ORIGIN_MASTER_REVISION) oimsft.assert_called_with(conjunction=[ORIGIN_METADATA_VALUE], limit=70) url = 
reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE, "limit": 10}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(ORIGIN_MASTER_REVISION) oimsft.assert_called_with(conjunction=[ORIGIN_METADATA_VALUE], limit=10) url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE, "limit": 987}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(ORIGIN_MASTER_REVISION) oimsft.assert_called_with(conjunction=[ORIGIN_METADATA_VALUE], limit=100) @given(origin()) def test_api_origin_intrinsic_metadata(api_client, origin): url = reverse( "api-origin-intrinsic-metadata", url_args={"origin_url": origin["url"]} ) rv = check_api_get_responses(api_client, url, status_code=200) expected_data = {ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE} assert rv.data == expected_data def test_api_origin_metadata_search_invalid(api_client, mocker): mock_idx_storage = mocker.patch("swh.web.common.archive.idx_storage") url = reverse("api-1-origin-metadata-search") check_api_get_responses(api_client, url, status_code=400) mock_idx_storage.assert_not_called() @pytest.mark.parametrize("backend", ["swh-counters", "swh-storage"]) def test_api_stat_counters(api_client, mocker, backend): mock_config = mocker.patch("swh.web.common.archive.config") mock_config.get_config.return_value = {"counters_backend": backend} url = reverse("api-1-stat-counters") rv = check_api_get_responses(api_client, url, status_code=200) counts = json.loads(rv.content) for obj in ["content", "origin", "release", "directory", "revision"]: assert counts.get(obj, 0) > 0 diff --git a/swh/web/tests/api/views/utils.py b/swh/web/tests/api/views/utils.py new file mode 100644 index 00000000..9887dba3 --- /dev/null +++ b/swh/web/tests/api/views/utils.py @@ -0,0 +1,33 @@ +# Copyright (C) 2015-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from requests.utils import parse_header_links + +from swh.web.tests.utils import check_api_get_responses + + +def scroll_results(api_client, url): + """Iterates through pages of results, and returns them all.""" + results = [] + + while True: + rv = check_api_get_responses(api_client, url, status_code=200) + + results.extend(rv.data) + + if "Link" in rv: + for link in parse_header_links(rv["Link"]): + if link["rel"] == "next": + # Found link to next page of results + url = link["url"] + break + else: + # No link with 'rel=next' + break + else: + # No Link header + break + + return results diff --git a/swh/web/tests/common/test_converters.py b/swh/web/tests/common/test_converters.py index a4cc597c..53ed0f86 100644 --- a/swh/web/tests/common/test_converters.py +++ b/swh/web/tests/common/test_converters.py @@ -1,757 +1,757 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from swh.model import hashutil from swh.model.model import ( ObjectType, Person, Release, Revision, RevisionType, Timestamp, TimestampWithTimezone, ) from swh.web.common import converters def test_fmap(): assert [2, 3, None, 4] == converters.fmap(lambda x: x + 1, [1, 2, None, 3]) assert [11, 12, 13] == 
list( converters.fmap(lambda x: x + 10, map(lambda x: x, [1, 2, 3])) ) assert {"a": 2, "b": 4} == converters.fmap(lambda x: x * 2, {"a": 1, "b": 2}) assert 100 == converters.fmap(lambda x: x * 10, 10) assert {"a": [2, 6], "b": 4} == converters.fmap( lambda x: x * 2, {"a": [1, 3], "b": 2} ) assert converters.fmap(lambda x: x, None) is None def test_from_swh(): some_input = { "a": "something", "b": "someone", "c": b"sharp-0.3.4.tgz", "d": hashutil.hash_to_bytes("b04caf10e9535160d90e874b45aa426de762f19f"), "e": b"sharp.html/doc_002dS_005fISREG.html", "g": [b"utf-8-to-decode", b"another-one"], "h": "something filtered", "i": {"e": b"something"}, "j": { "k": { "l": [b"bytes thing", b"another thingy", b""], "n": "don't care either", }, "m": "don't care", }, "o": "something", "p": b"foo", "q": {"extra-headers": [["a", b"intact"]]}, "w": None, "r": {"p": "also intact", "q": "bar"}, "s": {"timestamp": 42, "offset": -420, "negative_utc": None,}, "s1": { "timestamp": {"seconds": 42, "microseconds": 0}, "offset": -420, "negative_utc": None, }, "s2": datetime.datetime(2013, 7, 1, 20, 0, 0, tzinfo=datetime.timezone.utc), "t": None, "u": None, "v": None, "x": None, } expected_output = { "a": "something", "b": "someone", "c": "sharp-0.3.4.tgz", "d": "b04caf10e9535160d90e874b45aa426de762f19f", "e": "sharp.html/doc_002dS_005fISREG.html", "g": ["utf-8-to-decode", "another-one"], "i": {"e": "something"}, "j": {"k": {"l": ["bytes thing", "another thingy", ""]}}, "p": "foo", "q": {"extra-headers": [["a", "intact"]]}, "w": {}, "r": {"p": "also intact", "q": "bar"}, "s": "1969-12-31T17:00:42-07:00", "s1": "1969-12-31T17:00:42-07:00", "s2": "2013-07-01T20:00:00+00:00", "u": {}, "v": [], "x": None, } actual_output = converters.from_swh( some_input, hashess={"d", "o", "x"}, bytess={"c", "e", "g", "l"}, dates={"s", "s1", "s2"}, blacklist={"h", "m", "n", "o"}, removables_if_empty={"t"}, empty_dict={"u"}, empty_list={"v"}, convert={"p", "q", "w"}, - convert_fn=converters.convert_revision_metadata, + convert_fn=converters.convert_metadata, ) assert expected_output == actual_output def test_from_swh_edge_cases_do_no_conversion_if_none_or_not_bytes(): some_input = {"a": "something", "b": None, "c": "someone", "d": None, "e": None} expected_output = { "a": "something", "b": None, "c": "someone", "d": None, "e": None, } actual_output = converters.from_swh( some_input, hashess={"a", "b"}, bytess={"c", "d"}, dates={"e"} ) assert expected_output == actual_output def test_from_swh_edge_cases_convert_invalid_utf8_bytes(): some_input = { "a": "something", "b": "someone", "c": b"a name \xff", "d": b"an email \xff", } expected_output = { "a": "something", "b": "someone", "c": "a name \\xff", "d": "an email \\xff", "decoding_failures": ["c", "d"], } actual_output = converters.from_swh( some_input, hashess={"a", "b"}, bytess={"c", "d"} ) for v in ["a", "b", "c", "d"]: assert expected_output[v] == actual_output[v] assert len(expected_output["decoding_failures"]) == len( actual_output["decoding_failures"] ) for v in expected_output["decoding_failures"]: assert v in actual_output["decoding_failures"] def test_from_swh_empty(): assert {} == converters.from_swh({}) def test_from_swh_none(): assert converters.from_swh(None) is None def test_from_origin(): origin_input = { "id": 9, "type": "ftp", "url": "rsync://ftp.gnu.org/gnu/octave", } expected_origin = { "id": 9, "type": "ftp", "url": "rsync://ftp.gnu.org/gnu/octave", } actual_origin = converters.from_origin(origin_input) assert actual_origin == expected_origin def 
test_from_origin_visit(): snap_hash = "b5f0b7f716735ebffe38505c60145c4fd9da6ca3" for snap in [snap_hash, None]: visit = { "date": { "timestamp": datetime.datetime( 2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "origin": 10, "visit": 100, "metadata": None, "status": "full", "snapshot": hashutil.hash_to_bytes(snap) if snap else snap, } expected_visit = { "date": "2015-01-01T22:00:00+00:00", "origin": 10, "visit": 100, "metadata": {}, "status": "full", "snapshot": snap_hash if snap else snap, } actual_visit = converters.from_origin_visit(visit) assert actual_visit == expected_visit def test_from_release(): """Convert release model object to a dict should be ok""" ts = int( datetime.datetime( 2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc ).timestamp() ) release_input = Release( id=hashutil.hash_to_bytes("aad23fa492a0c5fed0708a6703be875448c86884"), target=hashutil.hash_to_bytes("5e46d564378afc44b31bb89f99d5675195fbdf67"), target_type=ObjectType.REVISION, date=TimestampWithTimezone( timestamp=Timestamp(seconds=ts, microseconds=0), offset=0, negative_utc=False, ), author=Person( name=b"author name", fullname=b"Author Name author@email", email=b"author@email", ), name=b"v0.0.1", message=b"some comment on release", synthetic=True, ) expected_release = { "id": "aad23fa492a0c5fed0708a6703be875448c86884", "target": "5e46d564378afc44b31bb89f99d5675195fbdf67", "target_type": "revision", "date": "2015-01-01T22:00:00+00:00", "author": { "name": "author name", "fullname": "Author Name author@email", "email": "author@email", }, "name": "v0.0.1", "message": "some comment on release", "target_type": "revision", "synthetic": True, } actual_release = converters.from_release(release_input) assert actual_release == expected_release def test_from_revision_model_object(): ts = int( datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp() ) revision_input = Revision( directory=hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), author=Person( name=b"Software Heritage", fullname=b"robot robot@softwareheritage.org", email=b"robot@softwareheritage.org", ), committer=Person( name=b"Software Heritage", fullname=b"robot robot@softwareheritage.org", email=b"robot@softwareheritage.org", ), message=b"synthetic revision message", date=TimestampWithTimezone( timestamp=Timestamp(seconds=ts, microseconds=0), offset=0, negative_utc=False, ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=ts, microseconds=0), offset=0, negative_utc=False, ), synthetic=True, type=RevisionType.TAR, parents=tuple( [ hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"), hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"), ] ), extra_headers=((b"gpgsig", b"some-signature"),), metadata={ "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, ) expected_revision = { "id": "a001358278a0d811fe7072463f805da601121c2a", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "synthetic revision message", "date": 
"2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "parents": tuple( [ "29d8be353ed3480476f032475e7c244eff7371d5", "30d8be353ed3480476f032475e7c244eff7371d5", ] ), "type": "tar", "synthetic": True, "extra_headers": (("gpgsig", "some-signature"),), "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, "merge": True, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision(): ts = datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp() revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "directory": hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), "author": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "committer": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "message": b"synthetic revision message", "date": {"timestamp": ts, "offset": 0, "negative_utc": False,}, "committer_date": {"timestamp": ts, "offset": 0, "negative_utc": False,}, "synthetic": True, "type": "tar", "parents": [ hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"), hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"), ], "children": [ hashutil.hash_to_bytes("123546353ed3480476f032475e7c244eff7371d5"), ], "metadata": { "extra_headers": [["gpgsig", b"some-signature"]], "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "synthetic revision message", "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], "parents": [ "29d8be353ed3480476f032475e7c244eff7371d5", "30d8be353ed3480476f032475e7c244eff7371d5", ], "type": "tar", "synthetic": True, "metadata": { "extra_headers": [["gpgsig", "some-signature"]], "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, "merge": True, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision_nomerge(): revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "parents": [hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5")], } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "parents": ["29d8be353ed3480476f032475e7c244eff7371d5"], 
"merge": False, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision_noparents(): revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "directory": hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), "author": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "committer": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "message": b"synthetic revision message", "date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "committer_date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "synthetic": True, "type": "tar", "children": [ hashutil.hash_to_bytes("123546353ed3480476f032475e7c244eff7371d5"), ], "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "synthetic revision message", "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], "type": "tar", "synthetic": True, "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision_invalid(): revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "directory": hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), "author": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "committer": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "message": b"invalid message \xff", "date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "committer_date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "synthetic": True, "type": "tar", "parents": [ hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"), hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"), ], "children": [ hashutil.hash_to_bytes("123546353ed3480476f032475e7c244eff7371d5"), ], "metadata": { "original_artifact": [ { "archive_type": "tar", "name": 
"webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "invalid message \\xff", "decoding_failures": ["message"], "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], "parents": [ "29d8be353ed3480476f032475e7c244eff7371d5", "30d8be353ed3480476f032475e7c244eff7371d5", ], "type": "tar", "synthetic": True, "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, "merge": True, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_content_none(): assert converters.from_content(None) is None def test_from_content(): content_input = { "sha1": hashutil.hash_to_bytes("5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5"), "sha256": hashutil.hash_to_bytes( "39007420ca5de7cb3cfc15196335507e" "e76c98930e7e0afa4d2747d3bf96c926" ), "blake2s256": hashutil.hash_to_bytes( "49007420ca5de7cb3cfc15196335507e" "e76c98930e7e0afa4d2747d3bf96c926" ), "sha1_git": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "ctime": "something-which-is-filtered-out", "data": b"data in bytes", "length": 10, "status": "hidden", } # 'status' is filtered expected_content = { "checksums": { "sha1": "5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5", "sha256": "39007420ca5de7cb3cfc15196335507ee76c98" "930e7e0afa4d2747d3bf96c926", "blake2s256": "49007420ca5de7cb3cfc15196335507ee7" "6c98930e7e0afa4d2747d3bf96c926", "sha1_git": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", }, "data": b"data in bytes", "length": 10, "status": "absent", } actual_content = converters.from_content(content_input) assert actual_content == expected_content def test_from_person(): person_input = { "id": 10, "anything": "else", "name": b"bob", "fullname": b"bob bob@alice.net", "email": b"bob@foo.alice", } expected_person = { "id": 10, "anything": "else", "name": "bob", "fullname": "bob bob@alice.net", "email": "bob@foo.alice", } actual_person = converters.from_person(person_input) assert actual_person == expected_person def test_from_directory_entries(): dir_entries_input = { "sha1": hashutil.hash_to_bytes("5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5"), "sha256": hashutil.hash_to_bytes( "39007420ca5de7cb3cfc15196335507e" "e76c98930e7e0afa4d2747d3bf96c926" ), "sha1_git": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "blake2s256": hashutil.hash_to_bytes( "685395c5dc57cada459364f0946d3dd45bad5fcbab" "c1048edb44380f1d31d0aa" ), "target": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "dir_id": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "name": b"bob", "type": 10, "status": "hidden", } expected_dir_entries = { "checksums": { "sha1": 
"5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5", "sha256": "39007420ca5de7cb3cfc15196335507ee76c98" "930e7e0afa4d2747d3bf96c926", "sha1_git": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", "blake2s256": "685395c5dc57cada459364f0946d3dd45bad5f" "cbabc1048edb44380f1d31d0aa", }, "target": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", "dir_id": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", "name": "bob", "type": 10, "status": "absent", } actual_dir_entries = converters.from_directory_entry(dir_entries_input) assert actual_dir_entries == expected_dir_entries def test_from_filetype(): content_filetype = { "id": hashutil.hash_to_bytes("5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5"), "encoding": "utf-8", "mimetype": "text/plain", } expected_content_filetype = { "id": "5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5", "encoding": "utf-8", "mimetype": "text/plain", } actual_content_filetype = converters.from_filetype(content_filetype) assert actual_content_filetype == expected_content_filetype