D6888.id.diff

diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -394,7 +394,7 @@
         else:
             return None
 
-    def content_missing_from_hashes(
+    def content_missing_from_all_hashes(
         self, contents_hashes: List[Dict[str, bytes]]
     ) -> Iterator[Dict[str, bytes]]:
         for group in grouper(contents_hashes, PARTITION_KEY_RESTRICTION_MAX_SIZE):
@@ -410,7 +410,7 @@
             for content in group:
                 for algo in HASH_ALGORITHMS:
                     assert content.get(algo) is not None, (
-                        "content_missing_from_hashes must not be called with "
+                        "content_missing_from_all_hashes must not be called with "
                         "partial hashes."
                     )
                 if tuple(content[algo] for algo in HASH_ALGORITHMS) not in present:
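
Note: the rename makes the contract explicit: content_missing_from_all_hashes may only be called with complete hash dicts, so these contents can be checked directly against the main table, while partial hashes still need the expensive index lookups (see the storage.py hunks below). A minimal sketch of how a caller might split requests to honour that contract; the helper and list names here are illustrative, not part of this diff:

    HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"]

    def split_by_completeness(contents):
        # Separate dicts that carry every algorithm (usable with
        # content_missing_from_all_hashes) from partial ones that still
        # need the slower per-index lookups.
        complete, partial = [], []
        for content in contents:
            if all(content.get(algo) is not None for algo in HASH_ALGORITHMS):
                complete.append(content)
            else:
                partial.append(content)
        return complete, partial
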
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -351,6 +351,7 @@
     def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]:
         # Find an algorithm that is common to all the requested contents.
         # It will be used to do an initial filtering efficiently.
+        # TODO: prioritize sha256, we can do more efficient lookups from this hash.
         filter_algos = set(HASH_ALGORITHMS)
         for content in contents:
             filter_algos &= set(content)
@@ -402,16 +403,28 @@
                 contents_with_missing_hashes.append(content)
 
         # These contents can be queried efficiently directly in the main table
-        for content in self._cql_runner.content_missing_from_hashes(
+        for content in self._cql_runner.content_missing_from_all_hashes(
             contents_with_all_hashes
         ):
             yield content[key_hash]
 
-        # For these, we need the expensive index lookups + main table.
-        for content in contents_with_missing_hashes:
-            res = self.content_find(content)
-            if not res:
-                yield content[key_hash]
+        if contents_with_missing_hashes:
+            # For these, we need the expensive index lookups + main table.
+
+            # Get all contents in the database that match (at least) one of the
+            # requested contents, concurrently.
+            found_contents = self._content_find_many(contents_with_missing_hashes)
+
+            for missing_content in contents_with_missing_hashes:
+                for found_content in found_contents:
+                    # check if the found_content.hashes() dictionary contains a superset
+                    # of the (key, value) pairs in missing_content
+                    if missing_content.items() <= found_content.hashes().items():
+                        # Found!
+                        break
+                else:
+                    # Not found
+                    yield missing_content[key_hash]
 
     @timed
     def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]:
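
Note: the replacement loop relies on two stock Python behaviours: dict .items() views compare like sets, so a.items() <= b.items() is true exactly when every (key, value) pair of a also appears in b, and a for/else block runs its else branch only when the loop finished without hitting break. A small standalone illustration, with hash values shortened for readability:

    missing_content = {"sha1": b"\x01", "sha256": b"\x02"}
    found_hashes = {"sha1": b"\x01", "sha1_git": b"\x03", "sha256": b"\x02"}

    # True: every (key, value) pair of missing_content is also in found_hashes.
    assert missing_content.items() <= found_hashes.items()

    for candidate in [found_hashes]:
        if missing_content.items() <= candidate.items():
            break  # a matching content exists, so it is not "missing"
    else:
        print("missing")  # only reached if no candidate matched
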
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -216,7 +216,7 @@
         matches.sort()
         return matches[0:limit]
 
-    def content_missing_from_hashes(
+    def content_missing_from_all_hashes(
         self, contents_hashes: List[Dict[str, bytes]]
     ) -> Iterator[Dict[str, bytes]]:
         for content_hashes in contents_hashes:
