Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/cassandra/storage.py
Show First 20 Lines • Show All 345 Lines • ▼ Show 20 Lines | class CassandraStorage: | ||||
@timed | @timed | ||||
def content_find(self, content: Dict[str, Any]) -> List[Content]: | def content_find(self, content: Dict[str, Any]) -> List[Content]: | ||||
return self._content_find_many([content]) | return self._content_find_many([content]) | ||||
def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]: | def _content_find_many(self, contents: List[Dict[str, Any]]) -> List[Content]: | ||||
# Find an algorithm that is common to all the requested contents. | # Find an algorithm that is common to all the requested contents. | ||||
# It will be used to do an initial filtering efficiently. | # It will be used to do an initial filtering efficiently. | ||||
# TODO: prioritize sha256, we can do more efficient lookups from this hash. | |||||
filter_algos = set(HASH_ALGORITHMS) | filter_algos = set(HASH_ALGORITHMS) | ||||
for content in contents: | for content in contents: | ||||
filter_algos &= set(content) | filter_algos &= set(content) | ||||
if not filter_algos: | if not filter_algos: | ||||
raise StorageArgumentException( | raise StorageArgumentException( | ||||
"content keys must contain at least one " | "content keys must contain at least one " | ||||
f"of: {', '.join(sorted(HASH_ALGORITHMS))}" | f"of: {', '.join(sorted(HASH_ALGORITHMS))}" | ||||
) | ) | ||||
Show All 35 Lines | ) -> Iterable[bytes]: | ||||
contents_with_missing_hashes = [] | contents_with_missing_hashes = [] | ||||
for content in contents: | for content in contents: | ||||
if DEFAULT_ALGORITHMS <= set(content): | if DEFAULT_ALGORITHMS <= set(content): | ||||
contents_with_all_hashes.append(content) | contents_with_all_hashes.append(content) | ||||
else: | else: | ||||
contents_with_missing_hashes.append(content) | contents_with_missing_hashes.append(content) | ||||
# These contents can be queried efficiently directly in the main table | # These contents can be queried efficiently directly in the main table | ||||
for content in self._cql_runner.content_missing_from_hashes( | for content in self._cql_runner.content_missing_from_all_hashes( | ||||
contents_with_all_hashes | contents_with_all_hashes | ||||
): | ): | ||||
yield content[key_hash] | yield content[key_hash] | ||||
if contents_with_missing_hashes: | |||||
# For these, we need the expensive index lookups + main table. | # For these, we need the expensive index lookups + main table. | ||||
for content in contents_with_missing_hashes: | |||||
res = self.content_find(content) | found_contents = self._content_find_many(contents_with_missing_hashes) | ||||
douardda: would be nice to have a comment explaining why this more convoluted code is better (aka remind… | |||||
if not res: | |||||
yield content[key_hash] | for missing_content in contents_with_missing_hashes: | ||||
for found_content in found_contents: | |||||
if missing_content.items() <= found_content.hashes().items(): | |||||
Done Inline Actions — douardda: Not a big fan of the double for loop, but meh (an alternative implementation would probably be much worse). | |||||
Done Inline Actions — vlorentz: The alternative implementation is in D6889. | |||||
# Found! | |||||
break | |||||
else: | |||||
# Not found | |||||
yield missing_content[key_hash] | |||||
@timed | @timed | ||||
def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]: | def content_missing_per_sha1(self, contents: List[bytes]) -> Iterable[bytes]: | ||||
return self.content_missing([{"sha1": c} for c in contents]) | return self.content_missing([{"sha1": c} for c in contents]) | ||||
@timed | @timed | ||||
def content_missing_per_sha1_git( | def content_missing_per_sha1_git( | ||||
self, contents: List[Sha1Git] | self, contents: List[Sha1Git] | ||||
▲ Show 20 Lines • Show All 1,291 Lines • Show Last 20 Lines |
It would be nice to have a comment explaining why this more convoluted code is better (i.e., reminding the reader of the concurrency gained by using content_find_many).