diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py index ac8e4a04..6a97cc93 100644 --- a/swh/web/api/views/identifiers.py +++ b/swh/web/api/views/identifiers.py @@ -1,120 +1,121 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.common import archive from swh.web.common.exc import LargePayloadExc from swh.web.common.identifiers import get_swhid, group_swhids, resolve_swhid @api_route(r"/resolve/(?P.*)/", "api-1-resolve-swhid") @api_doc("/resolve/") @format_docstring() def api_resolve_swhid(request, swhid): """ .. http:get:: /api/1/resolve/(swhid)/ Resolve a SoftWare Heritage persistent IDentifier (SWHID) Try to resolve a provided `SoftWare Heritage persistent IDentifier `_ into an url for browsing the pointed archive object. If the provided identifier is valid, the existence of the object in the archive will also be checked. :param string swhid: a SoftWare Heritage persistent IDentifier :>json string browse_url: the url for browsing the pointed object :>json object metadata: object holding optional parts of the SWHID :>json string namespace: the SWHID namespace :>json string object_id: the hash identifier of the pointed object :>json string object_type: the type of the pointed object :>json number scheme_version: the scheme version of the SWHID {common_headers} :statuscode 200: no error :statuscode 400: an invalid SWHID has been provided :statuscode 404: the pointed object does not exist in the archive **Example:** .. parsed-literal:: :swh_web_api:`resolve/swh:1:rev:96db9023b881d7cd9f379b0c154650d6c108e9a3;origin=https://github.com/openssl/openssl/` """ # try to resolve the provided swhid swhid_resolved = resolve_swhid(swhid) # id is well-formed, now check that the pointed # object is present in the archive, NotFoundExc # will be raised otherwise swhid_parsed = swhid_resolved["swhid_parsed"] object_type = swhid_parsed.object_type object_id = hash_to_hex(swhid_parsed.object_id) archive.lookup_object(swhid_parsed.object_type, object_id) # id is well-formed and the pointed object exists return { "namespace": swhid_parsed.namespace, "scheme_version": swhid_parsed.scheme_version, "object_type": object_type.name.lower(), "object_id": object_id, "metadata": swhid_parsed.qualifiers(), "browse_url": request.build_absolute_uri(swhid_resolved["browse_url"]), } @api_route(r"/known/", "api-1-known", methods=["POST"]) @api_doc("/known/") @format_docstring() def api_swhid_known(request): """ .. http:post:: /api/1/known/ Check if a list of objects are present in the Software Heritage archive. The objects to check existence must be provided using `SoftWare Heritage persistent IDentifiers `_. :json object : an object whose keys are input SWHIDs and values objects with the following keys: * **known (bool)**: whether the object was found {common_headers} :statuscode 200: no error :statuscode 400: an invalid SWHID was provided :statuscode 413: the input array of SWHIDs is too large """ limit = 1000 if len(request.data) > limit: raise LargePayloadExc( "The maximum number of SWHIDs this endpoint can receive is %s" % limit ) swhids = [get_swhid(swhid) for swhid in request.data] response = {str(swhid): {"known": False} for swhid in swhids} # group swhids by their type swhids_by_type = group_swhids(swhids) # search for hashes not present in the storage - missing_hashes = set( - map(hash_to_bytes, archive.lookup_missing_hashes(swhids_by_type)) - ) + missing_hashes = { + k: set(map(hash_to_bytes, archive.lookup_missing_hashes({k: v}))) + for k, v in swhids_by_type.items() + } for swhid in swhids: - if swhid.object_id not in missing_hashes: + if swhid.object_id not in missing_hashes[swhid.object_type]: response[str(swhid)]["known"] = True return response diff --git a/swh/web/tests/api/views/test_identifiers.py b/swh/web/tests/api/views/test_identifiers.py index 30312470..456c9faa 100644 --- a/swh/web/tests/api/views/test_identifiers.py +++ b/swh/web/tests/api/views/test_identifiers.py @@ -1,164 +1,184 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model.swhids import ObjectType from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import reverse from swh.web.tests.data import random_sha1 from swh.web.tests.utils import check_api_get_responses, check_api_post_responses def test_swhid_resolve_success( api_client, content, directory, origin, release, revision, snapshot ): for obj_type, obj_id in ( (ObjectType.CONTENT, content["sha1_git"]), (ObjectType.DIRECTORY, directory), (ObjectType.RELEASE, release), (ObjectType.REVISION, revision), (ObjectType.SNAPSHOT, snapshot), ): swhid = gen_swhid(obj_type, obj_id, metadata={"origin": origin["url"]}) url = reverse("api-1-resolve-swhid", url_args={"swhid": swhid}) resp = check_api_get_responses(api_client, url, status_code=200) if obj_type == ObjectType.CONTENT: url_args = {"query_string": "sha1_git:%s" % obj_id} elif obj_type == ObjectType.SNAPSHOT: url_args = {"snapshot_id": obj_id} else: url_args = {"sha1_git": obj_id} obj_type_str = obj_type.name.lower() browse_rev_url = reverse( f"browse-{obj_type_str}", url_args=url_args, query_params={"origin_url": origin["url"]}, request=resp.wsgi_request, ) expected_result = { "browse_url": browse_rev_url, "metadata": {"origin": origin["url"]}, "namespace": "swh", "object_id": obj_id, "object_type": obj_type_str, "scheme_version": 1, } assert resp.data == expected_result def test_swhid_resolve_invalid(api_client): rev_id_invalid = "96db9023b8_foo_50d6c108e9a3" swhid = "swh:1:rev:%s" % rev_id_invalid url = reverse("api-1-resolve-swhid", url_args={"swhid": swhid}) check_api_get_responses(api_client, url, status_code=400) def test_swhid_resolve_not_found( api_client, unknown_content, unknown_directory, unknown_release, unknown_revision, unknown_snapshot, ): for obj_type, obj_id in ( (ObjectType.CONTENT, unknown_content["sha1_git"]), (ObjectType.DIRECTORY, unknown_directory), (ObjectType.RELEASE, unknown_release), (ObjectType.REVISION, unknown_revision), (ObjectType.SNAPSHOT, unknown_snapshot), ): swhid = gen_swhid(obj_type, obj_id) url = reverse("api-1-resolve-swhid", url_args={"swhid": swhid}) check_api_get_responses(api_client, url, status_code=404) def test_swh_origin_id_not_resolvable(api_client): ori_swhid = "swh:1:ori:8068d0075010b590762c6cb5682ed53cb3c13deb" url = reverse("api-1-resolve-swhid", url_args={"swhid": ori_swhid}) check_api_get_responses(api_client, url, status_code=400) def test_api_known_swhid_all_present( api_client, content, directory, release, revision, snapshot ): input_swhids = [ gen_swhid(ObjectType.CONTENT, content["sha1_git"]), gen_swhid(ObjectType.DIRECTORY, directory), gen_swhid(ObjectType.REVISION, revision), gen_swhid(ObjectType.RELEASE, release), gen_swhid(ObjectType.SNAPSHOT, snapshot), ] url = reverse("api-1-known") resp = check_api_post_responses(api_client, url, data=input_swhids, status_code=200) assert resp.data == {swhid: {"known": True} for swhid in input_swhids} def test_api_known_swhid_some_present(api_client, content, directory): content_ = gen_swhid(ObjectType.CONTENT, content["sha1_git"]) directory_ = gen_swhid(ObjectType.DIRECTORY, directory) unknown_revision_ = gen_swhid(ObjectType.REVISION, random_sha1()) unknown_release_ = gen_swhid(ObjectType.RELEASE, random_sha1()) unknown_snapshot_ = gen_swhid(ObjectType.SNAPSHOT, random_sha1()) input_swhids = [ content_, directory_, unknown_revision_, unknown_release_, unknown_snapshot_, ] url = reverse("api-1-known") resp = check_api_post_responses(api_client, url, data=input_swhids, status_code=200) assert resp.data == { content_: {"known": True}, directory_: {"known": True}, unknown_revision_: {"known": False}, unknown_release_: {"known": False}, unknown_snapshot_: {"known": False}, } +def test_api_known_swhid_same_hash(api_client, content): + content_ = gen_swhid(ObjectType.CONTENT, content["sha1_git"]) + # Reuse hash to make invalid directory SHWID + directory_ = gen_swhid(ObjectType.DIRECTORY, content["sha1_git"]) + + input_swhids = [ + content_, + directory_, + ] + + url = reverse("api-1-known") + + resp = check_api_post_responses(api_client, url, data=input_swhids, status_code=200) + + assert resp.data == { + content_: {"known": True}, + directory_: {"known": False}, + } + + def test_api_known_invalid_swhid(api_client): invalid_swhid_sha1 = ["swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13de;"] invalid_swhid_type = ["swh:1:cnn:8068d0075010b590762c6cb5682ed53cb3c13deb"] url = reverse("api-1-known") check_api_post_responses(api_client, url, data=invalid_swhid_sha1, status_code=400) check_api_post_responses(api_client, url, data=invalid_swhid_type, status_code=400) def test_api_known_raises_large_payload_error(api_client): random_swhid = "swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13deb" limit = 10000 err_msg = "The maximum number of SWHIDs this endpoint can receive is 1000" swhids = [random_swhid for i in range(limit)] url = reverse("api-1-known") resp = check_api_post_responses(api_client, url, data=swhids, status_code=413) assert resp.data == {"exception": "LargePayloadExc", "reason": err_msg}