diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -247,11 +247,18 @@
             results.append(Content(**row_d))
         return results
 
-    def content_missing(self, content, key_hash="sha1"):
-        for cont in content:
-            res = self.content_find(cont)
+    def content_missing(
+        self, contents: List[Dict[str, Any]], key_hash: str = "sha1"
+    ) -> Iterable[bytes]:
+        if key_hash not in DEFAULT_ALGORITHMS:
+            raise StorageArgumentException(
+                f"key_hash should be one of {','.join(DEFAULT_ALGORITHMS)}"
+            )
+
+        for content in contents:
+            res = self.content_find(content)
             if not res:
-                yield cont[key_hash]
+                yield content[key_hash]
 
     def content_missing_per_sha1(self, contents):
         return self.content_missing([{"sha1": c for c in contents}])
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -345,13 +345,20 @@
         keys = list(set.intersection(*found))
         return [self._contents[key] for key in keys]
 
-    def content_missing(self, content, key_hash="sha1"):
-        for cont in content:
-            for (algo, hash_) in cont.items():
+    def content_missing(
+        self, contents: List[Dict[str, Any]], key_hash: str = "sha1"
+    ) -> Iterable[bytes]:
+        if key_hash not in DEFAULT_ALGORITHMS:
+            raise StorageArgumentException(
+                f"key_hash should be one of {','.join(DEFAULT_ALGORITHMS)}"
+            )
+
+        for content in contents:
+            for (algo, hash_) in content.items():
                 if algo not in DEFAULT_ALGORITHMS:
                     continue
                 if hash_ not in self._content_indexes.get(algo, []):
-                    yield cont[key_hash]
+                    yield content[key_hash]
                     break
 
     def content_missing_per_sha1(self, contents):
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -247,26 +247,24 @@
         ...
 
     @remote_api_endpoint("content/missing")
-    def content_missing(self, content, key_hash="sha1"):
+    def content_missing(
+        self, contents: List[Dict[str, Any]], key_hash: str = "sha1"
+    ) -> Iterable[bytes]:
         """List content missing from storage
 
         Args:
-            content ([dict]): iterable of dictionaries whose keys are
-                              either 'length' or an item of
-                              :data:`swh.model.hashutil.ALGORITHMS`;
-                              mapped to the corresponding checksum
-                              (or length).
-
-            key_hash (str): name of the column to use as hash id
-                            result (default: 'sha1')
-
-        Returns:
-            iterable ([bytes]): missing content ids (as per the
-                                key_hash column)
+            contents: iterable of dictionaries whose keys are either 'length' or an item
+                of :data:`swh.model.hashutil.ALGORITHMS`; mapped to the
+                corresponding checksum (or length).
+            key_hash: name of the column to use as hash id result (default: 'sha1')
 
         Raises:
+            StorageArgumentException when key_hash is unknown.
             TODO: an exception when we get a hash collision.
 
+        Returns:
+            iterable of missing content ids (as per the `key_hash` column)
+
         """
         ...
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -338,18 +338,18 @@
 
     @timed
     @db_transaction_generator()
-    def content_missing(self, content, key_hash="sha1", db=None, cur=None):
-        keys = db.content_hash_keys
-
-        if key_hash not in keys:
-            raise StorageArgumentException("key_hash should be one of %s" % keys)
+    def content_missing(
+        self, contents: List[Dict[str, Any]], key_hash: str = "sha1", db=None, cur=None
+    ) -> Iterable[bytes]:
+        if key_hash not in DEFAULT_ALGORITHMS:
+            raise StorageArgumentException(
+                f"key_hash should be one of {','.join(DEFAULT_ALGORITHMS)}"
+            )
 
+        keys = db.content_hash_keys
         key_hash_idx = keys.index(key_hash)
 
-        if not content:
-            return
-
-        for obj in db.content_missing_from_list(content, cur):
+        for obj in db.content_missing_from_list(contents, cur):
             yield obj[key_hash_idx]
 
     @timed