diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -209,16 +209,17 @@
         assert len(contents) <= limit
         return PagedResult(results=contents, next_page_token=next_page_token)
 
-    def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
-        result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents}
+    def content_get(self, contents: List[Sha1]) -> List[Optional[Content]]:
+        contents_by_sha1: Dict[Sha1, Optional[Content]] = {}
         for sha1 in contents:
             # Get all (sha1, sha1_git, sha256, blake2s256) whose sha1
             # matches the argument, from the index table ('content_by_sha1')
             for row in self._content_get_from_hash("sha1", sha1):
-                content_metadata = row._asdict()
-                content_metadata.pop("ctime")
-                result[content_metadata["sha1"]].append(content_metadata)
-        return result
+                row_d = row._asdict()
+                row_d.pop("ctime")
+                content = Content(**row_d)
+                contents_by_sha1[content.sha1] = content
+        return [contents_by_sha1.get(sha1) for sha1 in contents]
 
     def content_find(self, content: Dict[str, Any]) -> List[Content]:
         # Find an algorithm that is common to all the requested contents.
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -294,19 +294,18 @@
         assert len(contents) <= limit
         return PagedResult(results=contents, next_page_token=next_page_token)
 
-    def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
-        result: Dict = {sha1: [] for sha1 in contents}
+    def content_get(self, contents: List[Sha1]) -> List[Optional[Content]]:
+        contents_by_sha1: Dict[Sha1, Optional[Content]] = {}
         for sha1 in contents:
             if sha1 in self._content_indexes["sha1"]:
                 objs = self._content_indexes["sha1"][sha1]
                 # only 1 element as content_add_metadata would have raised a
                 # hash collision otherwise
+                assert len(objs) == 1
                 for key in objs:
-                    d = attr.evolve(
-                        self._contents[key], data=None, ctime=None
-                    ).to_dict()
-                    result[sha1].append(d)
-        return result
+                    content = attr.evolve(self._contents[key], data=None, ctime=None)
+                    contents_by_sha1[sha1] = content
+        return [contents_by_sha1.get(sha1) for sha1 in contents]
 
     def content_find(self, content: Dict[str, Any]) -> List[Content]:
         if not set(content).intersection(DEFAULT_ALGORITHMS):
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -187,16 +187,14 @@
         ...
 
     @remote_api_endpoint("content/metadata")
-    def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
+    def content_get(self, contents: List[Sha1]) -> List[Optional[Content]]:
         """Retrieve content metadata in bulk
 
         Args:
-            content: iterable of content identifiers (sha1)
+            content: List of content identifiers
 
         Returns:
-            a dict with keys the content's sha1 and the associated value
-            either the existing content's metadata or None if the content does
-            not exist.
+            List of contents model objects when they exist, None otherwise.
 
         """
         ...
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -308,14 +308,16 @@
 
     @timed
     @db_transaction(statement_timeout=500)
-    def content_get_metadata(
-        self, contents: List[bytes], db=None, cur=None
-    ) -> Dict[bytes, List[Dict]]:
-        result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents}
+    def content_get(
+        self, contents: List[Sha1], db=None, cur=None
+    ) -> List[Optional[Content]]:
+        contents_by_sha1: Dict[Sha1, Optional[Content]] = {}
         for row in db.content_get_metadata_from_sha1s(contents, cur):
-            content_meta = dict(zip(db.content_get_metadata_keys, row))
-            result[content_meta["sha1"]].append(content_meta)
-        return result
+            row_d = dict(zip(db.content_get_metadata_keys, row))
+            content = Content(**row_d)
+            contents_by_sha1[content.sha1] = content
+
+        return [contents_by_sha1.get(sha1) for sha1 in contents]
 
     @timed
     @db_transaction_generator()
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -267,15 +267,14 @@
             swh_storage._cql_runner, "content_get_from_token", mock_cgft
         )
 
-        actual_result = swh_storage.content_get_metadata([cont.sha1])
-
+        actual_result = swh_storage.content_get([cont.sha1])
         assert called == 2
 
         # dropping extra column not returned
-        expected_cont = attr.evolve(cont, data=None, ctime=None).to_dict()
+        expected_cont = attr.evolve(cont, data=None)
 
         # but cont2 should be filtered out
-        assert actual_result == {cont.sha1: [expected_cont]}
+        assert actual_result == [expected_cont]
 
     def test_content_find_murmur3_collision(self, swh_storage, mocker, sample_data):
         """The Murmur3 token is used as link from index tables to the main
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -117,17 +117,17 @@
 
     content = attr.evolve(sample_content, data=None)
     pk = content.sha1
-    content_metadata = swh_storage.content_get_metadata([pk])
-    assert not content_metadata[pk]
+    content_metadata = swh_storage.content_get([pk])
+    assert content_metadata == [None]
 
     s = swh_storage.content_add_metadata([content])
     assert s == {
        "content:add": 1,
    }
 
-    content_metadata = swh_storage.content_get_metadata([pk])
-    assert len(content_metadata[pk]) == 1
-    assert content_metadata[pk][0]["sha1"] == pk
+    content_metadata = swh_storage.content_get([pk])
+    assert len(content_metadata) == 1
+    assert content_metadata[0].sha1 == pk
 
 
 def test_retrying_proxy_storage_content_add_metadata_with_retry(
@@ -175,10 +175,6 @@
 
     sample_content = sample_data.content
     content = attr.evolve(sample_content, data=None)
-    pk = content.sha1
-    content_metadata = swh_storage.content_get_metadata([pk])
-    assert not content_metadata[pk]
-
    with pytest.raises(StorageArgumentException, match="Refuse to add"):
        swh_storage.content_add_metadata([content])
 
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -330,10 +330,9 @@
 
         swh_storage.content_update([cont1b.to_dict()], keys=["sha1_git"])
 
-        results = swh_storage.content_get_metadata([cont1.sha1])
-
-        expected_content = attr.evolve(cont1b, data=None, ctime=None).to_dict()
-        assert tuple(results[cont1.sha1]) == (expected_content,)
+        actual_contents = swh_storage.content_get([cont1.sha1])
+        expected_content = attr.evolve(cont1b, data=None)
+        assert actual_contents == [expected_content]
 
     def test_content_add_metadata(self, swh_storage, sample_data):
         cont = attr.evolve(sample_data.content, data=None, ctime=now())
@@ -343,12 +342,8 @@
             "content:add": 1,
         }
 
-        expected_cont = cont.to_dict()
-        expected_cont.pop("ctime", None)
-
-        assert tuple(swh_storage.content_get_metadata([cont.sha1])[cont.sha1]) == (
-            expected_cont,
-        )
+        expected_cont = cont
+        assert swh_storage.content_get([cont.sha1]) == [expected_cont]
 
         contents = [
             obj for (obj_type, obj) in swh_storage.journal_writer.journal.objects
@@ -617,32 +612,33 @@
         for content in actual_contents:
             assert content in expected_contents
 
-    def test_content_get_metadata(self, swh_storage, sample_data):
+    def test_content_get(self, swh_storage, sample_data):
         cont1, cont2 = sample_data.contents[:2]
 
         swh_storage.content_add([cont1, cont2])
 
-        actual_md = swh_storage.content_get_metadata([cont1.sha1, cont2.sha1])
+        actual_contents = swh_storage.content_get([cont1.sha1, cont2.sha1])
 
         # we only retrieve the metadata so no data nor ctime within
-        expected_cont1, expected_cont2 = [
-            attr.evolve(c, data=None, ctime=None).to_dict() for c in [cont1, cont2]
-        ]
+        expected_contents = [attr.evolve(c, data=None) for c in [cont1, cont2]]
 
-        assert tuple(actual_md[cont1.sha1]) == (expected_cont1,)
-        assert tuple(actual_md[cont2.sha1]) == (expected_cont2,)
-        assert len(actual_md.keys()) == 2
+        assert actual_contents == expected_contents
 
-    def test_content_get_metadata_missing_sha1(self, swh_storage, sample_data):
+    def test_content_get_missing_sha1(self, swh_storage, sample_data):
         cont1, cont2 = sample_data.contents[:2]
+        assert cont1.sha1 != cont2.sha1
         missing_cont = sample_data.skipped_content
 
         swh_storage.content_add([cont1, cont2])
 
-        actual_contents = swh_storage.content_get_metadata([missing_cont.sha1])
+        actual_contents = swh_storage.content_get(
+            [cont1.sha1, cont2.sha1, missing_cont.sha1]
+        )
 
-        assert len(actual_contents) == 1
-        assert tuple(actual_contents[missing_cont.sha1]) == ()
+        expected_contents = [
+            attr.evolve(c, data=None) if c else None for c in [cont1, cont2, None]
+        ]
+        assert actual_contents == expected_contents
 
     def test_content_get_random(self, swh_storage, sample_data):
         cont, cont2, cont3 = sample_data.contents[:3]
@@ -3829,25 +3825,15 @@
         assert actual_content_data is not None
         assert actual_content_data == content.data
 
-    def test_generate_content_get_metadata(self, swh_storage, swh_contents):
-        # input the list of sha1s we want from storage
-        expected_contents = [c.to_dict() for c in swh_contents if c.status != "absent"]
-        get_sha1s = [c["sha1"] for c in expected_contents]
-
-        # retrieve contents
-        meta_contents = swh_storage.content_get_metadata(get_sha1s)
-
-        assert len(list(meta_contents)) == len(get_sha1s)
-
-        actual_contents = []
-        for contents in meta_contents.values():
-            actual_contents.extend(contents)
+    def test_generate_content_get(self, swh_storage, swh_contents):
+        expected_contents = [
+            attr.evolve(c, data=None) for c in swh_contents if c.status != "absent"
+        ]
 
-        keys_to_check = {"length", "status", "sha1", "sha1_git", "sha256", "blake2s256"}
+        actual_contents = swh_storage.content_get([c.sha1 for c in expected_contents])
 
-        assert_contents_ok(
-            expected_contents, actual_contents, keys_to_check=keys_to_check
-        )
+        assert len(actual_contents) == len(expected_contents)
+        assert actual_contents == expected_contents
 
     @pytest.mark.parametrize("limit", [1, 7, 10, 100, 1000])
     def test_origin_list(self, swh_storage, swh_origins, limit):
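
Reviewer note (not part of the diff above): a minimal caller-side sketch of the migration this change implies, assuming a storage object that implements the interface shown here; the helper name and the way the storage handle is obtained are illustrative only.

    from typing import Any, Dict, List, Tuple


    def split_found_missing(storage: Any, sha1s: List[bytes]) -> Tuple[Dict[bytes, Any], List[bytes]]:
        """Illustrative helper: partition sha1s into found/missing via the new API."""
        # New API: content_get() returns a list positionally aligned with the input
        # sha1s; missing contents appear as None instead of empty lists.
        results = storage.content_get(sha1s)
        found = {sha1: content for sha1, content in zip(sha1s, results) if content is not None}
        missing = [sha1 for sha1, content in zip(sha1s, results) if content is None]
        # Old API equivalent, for comparison:
        #   metadata = storage.content_get_metadata(sha1s)  # Dict[bytes, List[Dict]]
        #   found = {sha1: rows[0] for sha1, rows in metadata.items() if rows}
        #   missing = [sha1 for sha1, rows in metadata.items() if not rows]
        # Each returned item is now a Content model object (with data and ctime unset),
        # so fields are attributes (content.sha1, content.length) rather than dict keys.
        return found, missing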