Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9123408
D6888.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
3 KB
Subscribers
None
D6888.id.diff
View Options
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -394,7 +394,7 @@
else:
return None
- def content_missing_from_hashes(
+ def content_missing_from_all_hashes(
self, contents_hashes: List[Dict[str, bytes]]
) -> Iterator[Dict[str, bytes]]:
for group in grouper(contents_hashes, PARTITION_KEY_RESTRICTION_MAX_SIZE):
@@ -410,7 +410,7 @@
for content in group:
for algo in HASH_ALGORITHMS:
assert content.get(algo) is not None, (
- "content_missing_from_hashes must not be called with "
+ "content_missing_from_all_hashes must not be called with "
"partial hashes."
)
if tuple(content[algo] for algo in HASH_ALGORITHMS) not in present:
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -351,6 +351,7 @@
def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]:
# Find an algorithm that is common to all the requested contents.
# It will be used to do an initial filtering efficiently.
+ # TODO: prioritize sha256; we can do more efficient lookups from this hash.
filter_algos = set(HASH_ALGORITHMS)
for content in contents:
filter_algos &= set(content)
@@ -402,16 +403,28 @@
contents_with_missing_hashes.append(content)
# These contents can be queried efficiently directly in the main table
- for content in self._cql_runner.content_missing_from_hashes(
+ for content in self._cql_runner.content_missing_from_all_hashes(
contents_with_all_hashes
):
yield content[key_hash]
- # For these, we need the expensive index lookups + main table.
- for content in contents_with_missing_hashes:
- res = self.content_find(content)
- if not res:
- yield content[key_hash]
+ if contents_with_missing_hashes:
+ # For these, we need the expensive index lookups + main table.
+
+ # Get all contents in the database that match (at least) one of the
+ # requested contents, concurrently.
+ found_contents = self._content_find_many(contents_with_missing_hashes)
+
+ for missing_content in contents_with_missing_hashes:
+ for found_content in found_contents:
+ # check if the found_content.hashes() dictionary contains a superset
+ # of the (key, value) pairs in missing_content
+ if missing_content.items() <= found_content.hashes().items():
+ # Found!
+ break
+ else:
+ # Not found
+ yield missing_content[key_hash]
@timed
def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]:
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -216,7 +216,7 @@
matches.sort()
return matches[0:limit]
- def content_missing_from_hashes(
+ def content_missing_from_all_hashes(
self, contents_hashes: List[Dict[str, bytes]]
) -> Iterator[Dict[str, bytes]]:
for content_hashes in contents_hashes:
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Jun 20, 5:26 PM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217288
Attached To
D6888: cassandra: Rewrite content_missing to run queries concurrently.
Event Timeline
Log In to Comment