Page MenuHomeSoftware Heritage

D7749.diff
No OneTemporary

D7749.diff

diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -3,7 +3,8 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_hex
+from swh.model.swhids import CoreSWHID
from swh.web.api.apidoc import api_doc, format_docstring
from swh.web.api.apiurls import api_route
from swh.web.common import archive
@@ -104,18 +105,16 @@
swhids = [get_swhid(swhid) for swhid in request.data]
- response = {str(swhid): {"known": False} for swhid in swhids}
+ response = {str(swhid): {"known": True} for swhid in swhids}
# group swhids by their type
swhids_by_type = group_swhids(swhids)
# search for hashes not present in the storage
- missing_hashes = {
- k: set(map(hash_to_bytes, archive.lookup_missing_hashes({k: v})))
- for k, v in swhids_by_type.items()
- }
+ missing_hashes = archive.lookup_missing_hashes(swhids_by_type)
- for swhid in swhids:
- if swhid.object_id not in missing_hashes[swhid.object_type]:
- response[str(swhid)]["known"] = True
+    for object_type, object_ids in missing_hashes.items():
+        for object_id in object_ids:  # each value is a set of raw hash bytes
+            swhid = CoreSWHID(object_type=object_type, object_id=object_id)
+            response[str(swhid)]["known"] = False
return response
diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -1406,7 +1406,21 @@
raise ValueError(f"Unexpected object type variant: {object_type}")
-def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[str]:
+# TODO: this helper is being factored out into swh-storage in D7751; use that
+# version once it lands.
+def _identifiers_missing(obj_type: ObjectType, obj_ids: List[bytes]) -> Iterable[bytes]:
+ return {
+ ObjectType.CONTENT: storage.content_missing_per_sha1_git,
+ ObjectType.DIRECTORY: storage.directory_missing,
+ ObjectType.REVISION: storage.revision_missing,
+ ObjectType.RELEASE: storage.release_missing,
+ ObjectType.SNAPSHOT: storage.snapshot_missing,
+ }[obj_type](obj_ids)
+
+
+def lookup_missing_hashes(
+ grouped_swhids: Dict[ObjectType, List[bytes]]
+) -> Dict[ObjectType, Set[bytes]]:
"""Lookup missing Software Heritage persistent identifier hash, using
batch processing.
@@ -1415,27 +1429,12 @@
keys: object types
values: object hashes
Returns:
- A set(hexadecimal) of the hashes not found in the storage
+ A dict mapping each object type to the set of hashes (bytes) not in storage
"""
- missing_hashes = []
-
- for obj_type, obj_ids in grouped_swhids.items():
- if obj_type == ObjectType.CONTENT:
- missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids))
- elif obj_type == ObjectType.DIRECTORY:
- missing_hashes.append(storage.directory_missing(obj_ids))
- elif obj_type == ObjectType.REVISION:
- missing_hashes.append(storage.revision_missing(obj_ids))
- elif obj_type == ObjectType.RELEASE:
- missing_hashes.append(storage.release_missing(obj_ids))
- elif obj_type == ObjectType.SNAPSHOT:
- missing_hashes.append(storage.snapshot_missing(obj_ids))
-
- missing = set(
- map(lambda x: hashutil.hash_to_hex(x), itertools.chain(*missing_hashes))
- )
-
- return missing
+ return {
+ obj_type: set(_identifiers_missing(obj_type, obj_ids))
+ for obj_type, obj_ids in grouped_swhids.items()
+ }
def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]:
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -31,7 +31,7 @@
from swh.web.common.exc import BadInputExc, NotFoundExc
from swh.web.common.typing import OriginInfo, PagedResult
from swh.web.tests.conftest import ctags_json_missing, fossology_missing
-from swh.web.tests.data import random_content, random_sha1
+from swh.web.tests.data import random_content, random_sha1, random_sha1_bytes
from swh.web.tests.strategies import new_origin, new_revision, visit_dates
@@ -941,47 +941,51 @@
def test_lookup_missing_hashes_non_present():
- missing_cnt = random_sha1()
- missing_dir = random_sha1()
- missing_rev = random_sha1()
- missing_rel = random_sha1()
- missing_snp = random_sha1()
+ missing_cnt = random_sha1_bytes()
+ missing_dir = random_sha1_bytes()
+ missing_rev = random_sha1_bytes()
+ missing_rel = random_sha1_bytes()
+ missing_snp = random_sha1_bytes()
grouped_swhids = {
- ObjectType.CONTENT: [hash_to_bytes(missing_cnt)],
- ObjectType.DIRECTORY: [hash_to_bytes(missing_dir)],
- ObjectType.REVISION: [hash_to_bytes(missing_rev)],
- ObjectType.RELEASE: [hash_to_bytes(missing_rel)],
- ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)],
+ ObjectType.CONTENT: [missing_cnt],
+ ObjectType.DIRECTORY: [missing_dir],
+ ObjectType.REVISION: [missing_rev],
+ ObjectType.RELEASE: [missing_rel],
+ ObjectType.SNAPSHOT: [missing_snp],
}
actual_result = archive.lookup_missing_hashes(grouped_swhids)
assert actual_result == {
- missing_cnt,
- missing_dir,
- missing_rev,
- missing_rel,
- missing_snp,
+ ObjectType.CONTENT: {missing_cnt},
+ ObjectType.DIRECTORY: {missing_dir},
+ ObjectType.REVISION: {missing_rev},
+ ObjectType.RELEASE: {missing_rel},
+ ObjectType.SNAPSHOT: {missing_snp},
}
def test_lookup_missing_hashes_some_present(content, directory):
- missing_rev = random_sha1()
- missing_rel = random_sha1()
- missing_snp = random_sha1()
+ missing_rev = random_sha1_bytes()
+ missing_rel = random_sha1_bytes()
+ missing_snp = random_sha1_bytes()
grouped_swhids = {
ObjectType.CONTENT: [hash_to_bytes(content["sha1_git"])],
ObjectType.DIRECTORY: [hash_to_bytes(directory)],
- ObjectType.REVISION: [hash_to_bytes(missing_rev)],
- ObjectType.RELEASE: [hash_to_bytes(missing_rel)],
- ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)],
+ ObjectType.REVISION: [missing_rev],
+ ObjectType.RELEASE: [missing_rel],
+ ObjectType.SNAPSHOT: [missing_snp],
}
actual_result = archive.lookup_missing_hashes(grouped_swhids)
- assert actual_result == {missing_rev, missing_rel, missing_snp}
+ assert actual_result == {
+ ObjectType.REVISION: {missing_rev},
+ ObjectType.RELEASE: {missing_rel},
+ ObjectType.SNAPSHOT: {missing_snp},
+ }
def test_lookup_origin_extra_trailing_slash(origin):

File Metadata

Mime Type
text/plain
Expires
Mon, Aug 18, 12:46 AM (3 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228016

Event Timeline