D6888.id.diff
View Options

	diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
	--- a/swh/storage/cassandra/cql.py
	+++ b/swh/storage/cassandra/cql.py
	@@ -394,7 +394,7 @@
	else:
	return None

	- def content_missing_from_hashes(
	+ def content_missing_from_all_hashes(
	self, contents_hashes: List[Dict[str, bytes]]
	) -> Iterator[Dict[str, bytes]]:
	for group in grouper(contents_hashes, PARTITION_KEY_RESTRICTION_MAX_SIZE):
	@@ -410,7 +410,7 @@
	for content in group:
	for algo in HASH_ALGORITHMS:
	assert content.get(algo) is not None, (
	- "content_missing_from_hashes must not be called with "
	+ "content_missing_from_all_hashes must not be called with "
	"partial hashes."
	)
	if tuple(content[algo] for algo in HASH_ALGORITHMS) not in present:
	diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
	--- a/swh/storage/cassandra/storage.py
	+++ b/swh/storage/cassandra/storage.py
	@@ -351,6 +351,7 @@
	def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]:
	# Find an algorithm that is common to all the requested contents.
	# It will be used to do an initial filtering efficiently.
	+ # TODO: prioritize sha256, we can do more efficient lookups from this hash.
	filter_algos = set(HASH_ALGORITHMS)
	for content in contents:
	filter_algos &= set(content)
	@@ -402,16 +403,28 @@
	contents_with_missing_hashes.append(content)

	# These contents can be queried efficiently directly in the main table
	- for content in self._cql_runner.content_missing_from_hashes(
	+ for content in self._cql_runner.content_missing_from_all_hashes(
	contents_with_all_hashes
	):
	yield content[key_hash]

	- # For these, we need the expensive index lookups + main table.
	- for content in contents_with_missing_hashes:
	- res = self.content_find(content)
	- if not res:
	- yield content[key_hash]
	+ if contents_with_missing_hashes:
	+ # For these, we need the expensive index lookups + main table.
	+
	+ # Get all contents in the database that match (at least) one of the
	+ # requested contents, concurrently.
	+ found_contents = self._content_find_many(contents_with_missing_hashes)
	+
	+ for missing_content in contents_with_missing_hashes:
	+ for found_content in found_contents:
	+ # check if the found_content.hashes() dictionary contains a superset
	+ # of the (key, value) pairs in missing_content
	+ if missing_content.items() <= found_content.hashes().items():
	+ # Found!
	+ break
	+ else:
	+ # Not found
	+ yield missing_content[key_hash]

	@timed
	def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]:
	diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
	--- a/swh/storage/in_memory.py
	+++ b/swh/storage/in_memory.py
	@@ -216,7 +216,7 @@
	matches.sort()
	return matches[0:limit]

	- def content_missing_from_hashes(
	+ def content_missing_from_all_hashes(
	self, contents_hashes: List[Dict[str, bytes]]
	) -> Iterator[Dict[str, bytes]]:
	for content_hashes in contents_hashes:

File Metadata

Mime Type: text/plain
Expires: Fri, Jun 20, 5:26 PM (1 w, 4 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3217288

D6888.id.diff
No OneTemporary
Actions

D6888.id.diff
View Options

File Metadata

Event Timeline

D6888.id.diff
No OneTemporary
Actions

D6888.id.diff
View Options

File Metadata

Event Timeline

D6888.id.diff
No OneTemporary
Actions

D6888.id.diff
View Options