diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -3,7 +3,8 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_hex
+from swh.model.swhids import CoreSWHID
 from swh.web.api.apidoc import api_doc, format_docstring
 from swh.web.api.apiurls import api_route
 from swh.web.common import archive
@@ -104,18 +105,16 @@
     swhids = [get_swhid(swhid) for swhid in request.data]
 
-    response = {str(swhid): {"known": False} for swhid in swhids}
+    response = {str(swhid): {"known": True} for swhid in swhids}
 
     # group swhids by their type
     swhids_by_type = group_swhids(swhids)
 
     # search for hashes not present in the storage
-    missing_hashes = {
-        k: set(map(hash_to_bytes, archive.lookup_missing_hashes({k: v})))
-        for k, v in swhids_by_type.items()
-    }
+    missing_hashes = archive.lookup_missing_hashes(swhids_by_type)
 
-    for swhid in swhids:
-        if swhid.object_id not in missing_hashes[swhid.object_type]:
-            response[str(swhid)]["known"] = True
+    for object_type, hashes in missing_hashes.items():
+        for object_id in hashes:
+            swhid = CoreSWHID(object_type=object_type, object_id=object_id)
+            response[str(swhid)]["known"] = False
 
     return response
diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -1406,7 +1406,25 @@
         raise ValueError(f"Unexpected object type variant: {object_type}")
 
 
-def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[str]:
+# TODO factored out into swh-storage in D7751. Use that version once it
+# lands.
+def _identifiers_missing(obj_type: ObjectType, obj_ids: List[bytes]) -> Iterable[bytes]:
+    """Return the ids among obj_ids that the storage reports as missing.
+
+    Dispatches to the storage ``*_missing`` endpoint matching obj_type.
+    """
+    return {
+        ObjectType.CONTENT: storage.content_missing_per_sha1_git,
+        ObjectType.DIRECTORY: storage.directory_missing,
+        ObjectType.REVISION: storage.revision_missing,
+        ObjectType.RELEASE: storage.release_missing,
+        ObjectType.SNAPSHOT: storage.snapshot_missing,
+    }[obj_type](obj_ids)
+
+
+def lookup_missing_hashes(
+    grouped_swhids: Dict[ObjectType, List[bytes]]
+) -> Dict[ObjectType, Set[bytes]]:
     """Lookup missing Software Heritage persistent identifier hash, using
     batch processing.
 
@@ -1415,27 +1433,12 @@
             keys: object types
             values: object hashes
     Returns:
-        A set(hexadecimal) of the hashes not found in the storage
+        A dictionary per type with set(bytes) of the hashes not found in the storage
     """
-    missing_hashes = []
-
-    for obj_type, obj_ids in grouped_swhids.items():
-        if obj_type == ObjectType.CONTENT:
-            missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids))
-        elif obj_type == ObjectType.DIRECTORY:
-            missing_hashes.append(storage.directory_missing(obj_ids))
-        elif obj_type == ObjectType.REVISION:
-            missing_hashes.append(storage.revision_missing(obj_ids))
-        elif obj_type == ObjectType.RELEASE:
-            missing_hashes.append(storage.release_missing(obj_ids))
-        elif obj_type == ObjectType.SNAPSHOT:
-            missing_hashes.append(storage.snapshot_missing(obj_ids))
-
-    missing = set(
-        map(lambda x: hashutil.hash_to_hex(x), itertools.chain(*missing_hashes))
-    )
-
-    return missing
+    return {
+        obj_type: set(_identifiers_missing(obj_type, obj_ids))
+        for obj_type, obj_ids in grouped_swhids.items()
+    }
 
 
 def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]:
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -31,7 +31,7 @@
 from swh.web.common.exc import BadInputExc, NotFoundExc
 from swh.web.common.typing import OriginInfo, PagedResult
 from swh.web.tests.conftest import ctags_json_missing, fossology_missing
-from swh.web.tests.data import random_content, random_sha1
+from swh.web.tests.data import random_content, random_sha1, random_sha1_bytes
 from swh.web.tests.strategies import new_origin, new_revision, visit_dates
 
 
@@ -941,47 +941,53 @@
 
 
 def test_lookup_missing_hashes_non_present():
-    missing_cnt = random_sha1()
-    missing_dir = random_sha1()
-    missing_rev = random_sha1()
-    missing_rel = random_sha1()
-    missing_snp = random_sha1()
+    missing_cnt = random_sha1_bytes()
+    missing_dir = random_sha1_bytes()
+    missing_rev = random_sha1_bytes()
+    missing_rel = random_sha1_bytes()
+    missing_snp = random_sha1_bytes()
 
     grouped_swhids = {
-        ObjectType.CONTENT: [hash_to_bytes(missing_cnt)],
-        ObjectType.DIRECTORY: [hash_to_bytes(missing_dir)],
-        ObjectType.REVISION: [hash_to_bytes(missing_rev)],
-        ObjectType.RELEASE: [hash_to_bytes(missing_rel)],
-        ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)],
+        ObjectType.CONTENT: [missing_cnt],
+        ObjectType.DIRECTORY: [missing_dir],
+        ObjectType.REVISION: [missing_rev],
+        ObjectType.RELEASE: [missing_rel],
+        ObjectType.SNAPSHOT: [missing_snp],
     }
 
     actual_result = archive.lookup_missing_hashes(grouped_swhids)
 
     assert actual_result == {
-        missing_cnt,
-        missing_dir,
-        missing_rev,
-        missing_rel,
-        missing_snp,
+        ObjectType.CONTENT: {missing_cnt},
+        ObjectType.DIRECTORY: {missing_dir},
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
     }
 
 
 def test_lookup_missing_hashes_some_present(content, directory):
-    missing_rev = random_sha1()
-    missing_rel = random_sha1()
-    missing_snp = random_sha1()
+    missing_rev = random_sha1_bytes()
+    missing_rel = random_sha1_bytes()
+    missing_snp = random_sha1_bytes()
 
     grouped_swhids = {
         ObjectType.CONTENT: [hash_to_bytes(content["sha1_git"])],
         ObjectType.DIRECTORY: [hash_to_bytes(directory)],
-        ObjectType.REVISION: [hash_to_bytes(missing_rev)],
-        ObjectType.RELEASE: [hash_to_bytes(missing_rel)],
-        ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)],
+        ObjectType.REVISION: [missing_rev],
+        ObjectType.RELEASE: [missing_rel],
+        ObjectType.SNAPSHOT: [missing_snp],
     }
 
     actual_result = archive.lookup_missing_hashes(grouped_swhids)
 
-    assert actual_result == {missing_rev, missing_rel, missing_snp}
+    assert actual_result == {
+        ObjectType.CONTENT: set(),
+        ObjectType.DIRECTORY: set(),
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
+    }
 
 
 def test_lookup_origin_extra_trailing_slash(origin):