diff --git a/swh/storage/algos/identifier.py b/swh/storage/algos/identifier.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/algos/identifier.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Iterable, List
+
+from swh.model.model import ObjectType, Sha1Git
+from swh.storage.interface import StorageInterface
+
+
+def identifiers_missing(
+    storage: StorageInterface, obj_type: ObjectType, obj_ids: List[Sha1Git]
+) -> Iterable[Sha1Git]:
+    """Lookup missing Software Heritage hashes, choosing the correct
+    underlying method based on the type.
+
+    The idea is that conceptually we are looking up a list of Core
+    SWHIDs with the same object type part, and to enforce that we pass
+    the type and the hashes separately.
+
+    Args:
+        storage (swh.storage.interface.StorageInterface): the storage
+            instance
+
+        obj_type: What type of object are the hashes supposed to point
+            to.
+
+        obj_ids: list of hashes to look up
+
+    Returns:
+        missing hashes for the given object type
+
+    Raises:
+        ValueError: if ``obj_type`` is not one of the supported object
+            types
+    """
+
+    if obj_type == ObjectType.CONTENT:
+        return storage.content_missing_per_sha1_git(obj_ids)
+    elif obj_type == ObjectType.DIRECTORY:
+        return storage.directory_missing(obj_ids)
+    elif obj_type == ObjectType.REVISION:
+        return storage.revision_missing(obj_ids)
+    elif obj_type == ObjectType.RELEASE:
+        return storage.release_missing(obj_ids)
+    elif obj_type == ObjectType.SNAPSHOT:
+        return storage.snapshot_missing(obj_ids)
+    else:
+        # Fail loudly instead of implicitly returning None, which would
+        # break the declared Iterable return type.
+        raise ValueError(f"Unexpected object type: {obj_type!r}")
diff --git a/swh/storage/tests/algos/test_identifier.py b/swh/storage/tests/algos/test_identifier.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/algos/test_identifier.py
@@ -0,0 +1,62 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+
+import pytest
+
+from swh.model.model import ObjectType, Sha1Git
+from swh.storage.algos.identifier import identifiers_missing
+
+from ..storage_data import StorageData
+
+# NOTE(review): these tests presume the ``swh_storage`` fixture already holds
+# the StorageData objects; confirm, or add them to the storage explicitly.
+
+
+@pytest.fixture(scope="function")
+def missing_id() -> Sha1Git:
+    """Return 20 random bytes, to serve as an identifier absent from storage."""
+    return bytes(random.randint(0, 255) for _ in range(20))
+
+
+@pytest.mark.parametrize("content_id", [c.sha1_git for c in StorageData.contents])
+def test_missing_contents(swh_storage, content_id, missing_id):
+    hashes = [content_id, missing_id]
+
+    result = identifiers_missing(swh_storage, ObjectType.CONTENT, hashes)
+    assert set(result) == {missing_id}
+
+
+@pytest.mark.parametrize("directory_id", [o.id for o in StorageData.directories])
+def test_missing_directories(swh_storage, directory_id, missing_id):
+    hashes = [directory_id, missing_id]
+
+    result = identifiers_missing(swh_storage, ObjectType.DIRECTORY, hashes)
+    assert set(result) == {missing_id}
+
+
+@pytest.mark.parametrize("revision_id", [o.id for o in StorageData.revisions])
+def test_missing_revisions(swh_storage, revision_id, missing_id):
+    hashes = [revision_id, missing_id]
+
+    result = identifiers_missing(swh_storage, ObjectType.REVISION, hashes)
+    assert set(result) == {missing_id}
+
+
+@pytest.mark.parametrize("release_id", [o.id for o in StorageData.releases])
+def test_missing_releases(swh_storage, release_id, missing_id):
+    hashes = [release_id, missing_id]
+
+    result = identifiers_missing(swh_storage, ObjectType.RELEASE, hashes)
+    assert set(result) == {missing_id}
+
+
+@pytest.mark.parametrize("snapshot_id", [o.id for o in StorageData.snapshots])
+def test_missing_snapshots(swh_storage, snapshot_id, missing_id):
+    hashes = [snapshot_id, missing_id]
+
+    result = identifiers_missing(swh_storage, ObjectType.SNAPSHOT, hashes)
+    assert set(result) == {missing_id}