D7749.diff
diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -3,7 +3,8 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_hex
+from swh.model.swhids import CoreSWHID
from swh.web.api.apidoc import api_doc, format_docstring
from swh.web.api.apiurls import api_route
from swh.web.common import archive
@@ -104,18 +105,16 @@
    swhids = [get_swhid(swhid) for swhid in request.data]
-    response = {str(swhid): {"known": False} for swhid in swhids}
+    response = {str(swhid): {"known": True} for swhid in swhids}
    # group swhids by their type
    swhids_by_type = group_swhids(swhids)
    # search for hashes not present in the storage
-    missing_hashes = {
-        k: set(map(hash_to_bytes, archive.lookup_missing_hashes({k: v})))
-        for k, v in swhids_by_type.items()
-    }
+    missing_hashes = archive.lookup_missing_hashes(swhids_by_type)
-    for swhid in swhids:
-        if swhid.object_id not in missing_hashes[swhid.object_type]:
-            response[str(swhid)]["known"] = True
+    for ty, missing in missing_hashes.items():
+        for hash in missing:
+            swhid = CoreSWHID(object_type=ty, object_id=hash)
+            response[str(swhid)]["known"] = False
    return response
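The rewritten view starts from "known": True for every queried SWHID and only flips the entries that the archive reports as missing. A minimal, self-contained sketch of that marking logic; the missing_hashes dict below is dummy data standing in for the archive.lookup_missing_hashes() call introduced further down in this diff:

from swh.model.swhids import CoreSWHID, ObjectType

# Two example content SWHIDs: one we pretend the archive knows, one it does not.
known_id = bytes.fromhex("8624bcdae55baeef00cd11d5dfcfa60f68710a02")
missing_id = bytes.fromhex("0000000000000000000000000000000000000000")
swhids = [
    CoreSWHID(object_type=ObjectType.CONTENT, object_id=known_id),
    CoreSWHID(object_type=ObjectType.CONTENT, object_id=missing_id),
]

# Assume everything is known until the storage says otherwise.
response = {str(swhid): {"known": True} for swhid in swhids}

# Dummy stand-in for archive.lookup_missing_hashes(swhids_by_type).
missing_hashes = {ObjectType.CONTENT: {missing_id}}
for ty, missing in missing_hashes.items():
    for object_id in missing:
        swhid = CoreSWHID(object_type=ty, object_id=object_id)
        response[str(swhid)]["known"] = False

assert response["swh:1:cnt:" + known_id.hex()] == {"known": True}
assert response["swh:1:cnt:" + missing_id.hex()] == {"known": False}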
diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -1406,7 +1406,21 @@
    raise ValueError(f"Unexpected object type variant: {object_type}")
-def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[str]:
+# TODO factored out into swh-storage in D7751. Use that version once it
+# lands.
+def _identifiers_missing(obj_type: ObjectType, obj_ids: List[bytes]) -> Iterable[bytes]:
+    return {
+        ObjectType.CONTENT: storage.content_missing_per_sha1_git,
+        ObjectType.DIRECTORY: storage.directory_missing,
+        ObjectType.REVISION: storage.revision_missing,
+        ObjectType.RELEASE: storage.release_missing,
+        ObjectType.SNAPSHOT: storage.snapshot_missing,
+    }[obj_type](obj_ids)
+
+
+def lookup_missing_hashes(
+    grouped_swhids: Dict[ObjectType, List[bytes]]
+) -> Dict[ObjectType, Set[bytes]]:
    """Lookup missing Software Heritage persistent identifier hash, using
    batch processing.
@@ -1415,27 +1429,12 @@
        keys: object types
        values: object hashes
    Returns:
-        A set(hexadecimal) of the hashes not found in the storage
+        A dictionary per type with set(bytes) of the hashes not found in the storage
    """
-    missing_hashes = []
-
-    for obj_type, obj_ids in grouped_swhids.items():
-        if obj_type == ObjectType.CONTENT:
-            missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids))
-        elif obj_type == ObjectType.DIRECTORY:
-            missing_hashes.append(storage.directory_missing(obj_ids))
-        elif obj_type == ObjectType.REVISION:
-            missing_hashes.append(storage.revision_missing(obj_ids))
-        elif obj_type == ObjectType.RELEASE:
-            missing_hashes.append(storage.release_missing(obj_ids))
-        elif obj_type == ObjectType.SNAPSHOT:
-            missing_hashes.append(storage.snapshot_missing(obj_ids))
-
-    missing = set(
-        map(lambda x: hashutil.hash_to_hex(x), itertools.chain(*missing_hashes))
-    )
-
-    return missing
+    return {
+        obj_type: set(_identifiers_missing(obj_type, obj_ids))
+        for obj_type, obj_ids in grouped_swhids.items()
+    }
def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]:
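The return type change is the main compatibility point for callers: lookup_missing_hashes() used to return a flat set of hexadecimal strings, and now returns the missing hashes as raw bytes grouped per object type. A small sketch with made-up hashes, showing how the old flat shape could be recovered from the new one if a caller still needs it:

from swh.model.hashutil import hash_to_hex
from swh.model.swhids import ObjectType

# Hypothetical result in the new shape: Dict[ObjectType, Set[bytes]].
missing_by_type = {
    ObjectType.REVISION: {bytes.fromhex("aafb16d69fd30ff58afdd69036a26047f3aebdc6")},
    ObjectType.SNAPSHOT: set(),
}

# Equivalent of the old return value: a flat set of hex strings.
flat_missing = {
    hash_to_hex(object_id)
    for object_ids in missing_by_type.values()
    for object_id in object_ids
}
assert flat_missing == {"aafb16d69fd30ff58afdd69036a26047f3aebdc6"}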
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -31,7 +31,7 @@
from swh.web.common.exc import BadInputExc, NotFoundExc
from swh.web.common.typing import OriginInfo, PagedResult
from swh.web.tests.conftest import ctags_json_missing, fossology_missing
-from swh.web.tests.data import random_content, random_sha1
+from swh.web.tests.data import random_content, random_sha1, random_sha1_bytes
from swh.web.tests.strategies import new_origin, new_revision, visit_dates
@@ -941,47 +941,51 @@
def test_lookup_missing_hashes_non_present():
-    missing_cnt = random_sha1()
-    missing_dir = random_sha1()
-    missing_rev = random_sha1()
-    missing_rel = random_sha1()
-    missing_snp = random_sha1()
+    missing_cnt = random_sha1_bytes()
+    missing_dir = random_sha1_bytes()
+    missing_rev = random_sha1_bytes()
+    missing_rel = random_sha1_bytes()
+    missing_snp = random_sha1_bytes()
    grouped_swhids = {
-        ObjectType.CONTENT: [hash_to_bytes(missing_cnt)],
-        ObjectType.DIRECTORY: [hash_to_bytes(missing_dir)],
-        ObjectType.REVISION: [hash_to_bytes(missing_rev)],
-        ObjectType.RELEASE: [hash_to_bytes(missing_rel)],
-        ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)],
+        ObjectType.CONTENT: [missing_cnt],
+        ObjectType.DIRECTORY: [missing_dir],
+        ObjectType.REVISION: [missing_rev],
+        ObjectType.RELEASE: [missing_rel],
+        ObjectType.SNAPSHOT: [missing_snp],
    }
    actual_result = archive.lookup_missing_hashes(grouped_swhids)
    assert actual_result == {
-        missing_cnt,
-        missing_dir,
-        missing_rev,
-        missing_rel,
-        missing_snp,
+        ObjectType.CONTENT: {missing_cnt},
+        ObjectType.DIRECTORY: {missing_dir},
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
    }
def test_lookup_missing_hashes_some_present(content, directory):
-    missing_rev = random_sha1()
-    missing_rel = random_sha1()
-    missing_snp = random_sha1()
+    missing_rev = random_sha1_bytes()
+    missing_rel = random_sha1_bytes()
+    missing_snp = random_sha1_bytes()
    grouped_swhids = {
        ObjectType.CONTENT: [hash_to_bytes(content["sha1_git"])],
        ObjectType.DIRECTORY: [hash_to_bytes(directory)],
-        ObjectType.REVISION: [hash_to_bytes(missing_rev)],
-        ObjectType.RELEASE: [hash_to_bytes(missing_rel)],
-        ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)],
+        ObjectType.REVISION: [missing_rev],
+        ObjectType.RELEASE: [missing_rel],
+        ObjectType.SNAPSHOT: [missing_snp],
    }
    actual_result = archive.lookup_missing_hashes(grouped_swhids)
-    assert actual_result == {missing_rev, missing_rel, missing_snp}
+    assert actual_result == {
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
+    }
def test_lookup_origin_extra_trailing_slash(origin):
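The test changes follow the same bytes-in/bytes-out contract: random_sha1_bytes() feeds raw 20-byte identifiers straight into grouped_swhids, where random_sha1() previously produced hex strings that had to go through hash_to_bytes(). Roughly the following distinction, with illustrative helpers only (the real ones live in swh.web.tests.data and may differ in detail):

import os


def make_random_sha1_hex() -> str:
    # a random 40-character hex string, like what random_sha1() returns
    return os.urandom(20).hex()


def make_random_sha1_bytes() -> bytes:
    # a random raw 20-byte identifier, like what random_sha1_bytes() returns
    return os.urandom(20)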
Attached To
D7749: Cleanup `archive.lookup_missing_hashes` and `api_swhid_known`