diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -439,7 +439,7 @@ else: return None - def content_missing_from_hashes( + def content_missing_from_all_hashes( self, contents_hashes: List[Dict[str, bytes]] ) -> Iterator[Dict[str, bytes]]: for group in grouper(contents_hashes, PARTITION_KEY_RESTRICTION_MAX_SIZE): @@ -455,7 +455,7 @@ for content in group: for algo in HASH_ALGORITHMS: assert content.get(algo) is not None, ( - "content_missing_from_hashes must not be called with " + "content_missing_from_all_hashes must not be called with " "partial hashes." ) if tuple(content[algo] for algo in HASH_ALGORITHMS) not in present: diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -371,6 +371,7 @@ def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]: # Find an algorithm that is common to all the requested contents. # It will be used to do an initial filtering efficiently. + # TODO: prioritize sha256, we can do more efficient lookups from this hash. filter_algos = set(HASH_ALGORITHMS) for content in contents: filter_algos &= set(content) @@ -422,16 +423,24 @@ contents_with_missing_hashes.append(content) # These contents can be queried efficiently directly in the main table - for content in self._cql_runner.content_missing_from_hashes( + for content in self._cql_runner.content_missing_from_all_hashes( contents_with_all_hashes ): yield content[key_hash] - # For these, we need the expensive index lookups + main table. - for content in contents_with_missing_hashes: - res = self.content_find(content) - if not res: - yield content[key_hash] + if contents_with_missing_hashes: + # For these, we need the expensive index lookups + main table. + + found_contents = self._content_find_many(contents_with_missing_hashes) + + for missing_content in contents_with_missing_hashes: + for found_content in found_contents: + if missing_content.items() <= found_content.hashes().items(): + # Found! + break + else: + # Not found + yield missing_content[key_hash] @timed def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]: