Differential D6888 Diff 24984 swh/storage/cassandra/storage.py

Changeset View

Standalone View

swh/storage/cassandra/storage.py

Show First 20 Lines • Show All 345 Lines • ▼ Show 20 Lines	class CassandraStorage:

@timed		@timed
def content_find(self, content: Dict[str, Any]) -> List[Content]:		def content_find(self, content: Dict[str, Any]) -> List[Content]:
return self._content_find_many([content])		return self._content_find_many([content])

def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]:		def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]:
# Find an algorithm that is common to all the requested contents.		# Find an algorithm that is common to all the requested contents.
# It will be used to do an initial filtering efficiently.		# It will be used to do an initial filtering efficiently.
		# TODO: prioritize sha256, we can do more efficient lookups from this hash.
filter_algos = set(HASH_ALGORITHMS)		filter_algos = set(HASH_ALGORITHMS)
for content in contents:		for content in contents:
filter_algos &= set(content)		filter_algos &= set(content)
if not filter_algos:		if not filter_algos:
raise StorageArgumentException(		raise StorageArgumentException(
"content keys must contain at least one "		"content keys must contain at least one "
f"of: {', '.join(sorted(HASH_ALGORITHMS))}"		f"of: {', '.join(sorted(HASH_ALGORITHMS))}"
)		)
Show All 35 Lines	) -> Iterable[bytes]:
contents_with_missing_hashes = []		contents_with_missing_hashes = []
for content in contents:		for content in contents:
if DEFAULT_ALGORITHMS <= set(content):		if DEFAULT_ALGORITHMS <= set(content):
contents_with_all_hashes.append(content)		contents_with_all_hashes.append(content)
else:		else:
contents_with_missing_hashes.append(content)		contents_with_missing_hashes.append(content)

# These contents can be queried efficiently directly in the main table		# These contents can be queried efficiently directly in the main table
for content in self._cql_runner.content_missing_from_hashes(		for content in self._cql_runner.content_missing_from_all_hashes(
contents_with_all_hashes		contents_with_all_hashes
):		):
yield content[key_hash]		yield content[key_hash]

		if contents_with_missing_hashes:
# For these, we need the expensive index lookups + main table.		# For these, we need the expensive index lookups + main table.
for content in contents_with_missing_hashes:
res = self.content_find(content)		found_contents = self._content_find_many(contents_with_missing_hashes)
		douardda (unsubmitted, done): It would be nice to have a comment explaining why this more convoluted code is better (i.e., reminding the reader of the concurrency gained by using _content_find_many).
if not res:
yield content[key_hash]		for missing_content in contents_with_missing_hashes:
		for found_content in found_contents:
		if missing_content.items() <= found_content.hashes().items():
		douardda (unsubmitted, done): Not a big fan of the double for loop, but meh (an alternative implementation would probably be much worse).
		vlorentz (author; unsubmitted, done): The alternative implementation is in D6889.
		# Found!
		break
		else:
		# Not found
		yield missing_content[key_hash]

@timed		@timed
def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]:		def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]:
return self.content_missing([{"sha1": c} for c in contents])		return self.content_missing([{"sha1": c} for c in contents])

@timed		@timed
def content_missing_per_sha1_git(		def content_missing_per_sha1_git(
self, contents: List[Sha1Git]		self, contents: List[Sha1Git]
▲ Show 20 Lines • Show All 1,291 Lines • Show Last 20 Lines