diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -34,7 +34,7 @@ RawExtrinsicMetadata, Sha1Git, ) -from swh.storage.interface import ListOrder, PagedResult, VISIT_STATUSES +from swh.storage.interface import ListOrder, PagedResult, Sha1, VISIT_STATUSES from swh.storage.objstorage import ObjStorage from swh.storage.writer import JournalWriter from swh.storage.utils import map_optional, now @@ -163,15 +163,9 @@ def content_add_metadata(self, content: List[Content]) -> Dict: return self._content_add(content, with_data=False) - def content_get( - self, contents: List[bytes] - ) -> Iterable[Optional[Dict[str, bytes]]]: - # FIXME: Make this method support slicing the `data`. - if len(contents) > BULK_BLOCK_CONTENT_LEN_MAX: - raise StorageArgumentException( - f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents." - ) - yield from self.objstorage.content_get(contents) + def content_get_data(self, content: Sha1) -> Optional[bytes]: + # FIXME: Make this method support slicing the `data` + return self.objstorage.content_get(content) def content_get_partition( self, diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -51,6 +51,7 @@ MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, + Sha1, Sha1Git, ) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex @@ -254,15 +255,9 @@ def content_add_metadata(self, content: List[Content]) -> Dict: return self._content_add(content, with_data=False) - def content_get( - self, contents: List[bytes] - ) -> Iterable[Optional[Dict[str, bytes]]]: - # FIXME: Make this method support slicing the `data`. - if len(contents) > BULK_BLOCK_CONTENT_LEN_MAX: - raise StorageArgumentException( - f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents." - ) - yield from self.objstorage.content_get(contents) + def content_get_data(self, content: Sha1) -> Optional[bytes]: + # FIXME: Make this method support slicing the `data` + return self.objstorage.content_get(content) def content_get_partition( self, diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -27,6 +27,7 @@ MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, + Sha1, Sha1Git, ) @@ -146,31 +147,14 @@ ... @remote_api_endpoint("content/data") - def content_get( - self, contents: List[bytes] - ) -> Iterable[Optional[Dict[str, bytes]]]: - """Retrieve in bulk contents and their data. - - This generator yields exactly as many items than provided sha1 - identifiers, but callers should not assume this will always be true. - - It may also yield `None` values in case an object was not found. - - TODO: - Rename to content_get_data + def content_get_data(self, content: Sha1) -> Optional[bytes]: + """Given a content identifier, returns its associated data if any. Args: - contents: iterables of sha1 - - Raises: - StorageArgumentException in case of too much contents are required. - (cf. BULK_BLOCK_CONTENT_LEN_MAX) - - Yields: - Streams of contents as dict with their raw data: + content: sha1 identifier - - sha1 (bytes): content id - - data (bytes): content's raw data + Returns: + raw content data (bytes) """ ... diff --git a/swh/storage/objstorage.py b/swh/storage/objstorage.py --- a/swh/storage/objstorage.py +++ b/swh/storage/objstorage.py @@ -3,8 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Dict, Generator, Iterable +from typing import Dict, Iterable, Optional +from swh.storage.interface import Sha1 from swh.model.model import Content, MissingData from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError @@ -26,21 +27,22 @@ raise AttributeError(key) return getattr(self.objstorage, key) - def content_get(self, contents: Iterable[bytes]) -> Generator: - """Retrieve content data from the objstorage + def content_get(self, obj_id: Sha1) -> Optional[bytes]: + """Retrieve data associated to the content from the objstorage Args: - contents: List of contents to retrieve data from + content: content identitier + + Returns: + associated content's data if any, None otherwise. """ - for obj_id in contents: - try: - data = self.objstorage.get(obj_id) - except ObjNotFoundError: - yield None - continue - - yield {"sha1": obj_id, "data": data} + try: + data = self.objstorage.get(obj_id) + except ObjNotFoundError: + data = None + + return data def content_add(self, contents: Iterable[Content]) -> Dict: """Add contents to the objstorage. diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -37,6 +37,7 @@ Revision, Release, SkippedContent, + Sha1, Sha1Git, Snapshot, SHA1_SIZE, @@ -266,15 +267,9 @@ } @timed - def content_get( - self, contents: List[bytes] - ) -> Iterable[Optional[Dict[str, bytes]]]: - # FIXME: Make this method support slicing the `data`. - if len(contents) > BULK_BLOCK_CONTENT_LEN_MAX: - raise StorageArgumentException( - f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents." - ) - yield from self.objstorage.content_get(contents) + def content_get_data(self, content: Sha1) -> Optional[bytes]: + # FIXME: Make this method support slicing the `data` + return self.objstorage.content_get(content) @timed @db_transaction() diff --git a/swh/storage/tests/test_filter.py b/swh/storage/tests/test_filter.py --- a/swh/storage/tests/test_filter.py +++ b/swh/storage/tests/test_filter.py @@ -23,8 +23,8 @@ def test_filtering_proxy_storage_content(swh_storage, sample_data): sample_content = sample_data.content - content = next(swh_storage.content_get([sample_content.sha1])) - assert not content + content = swh_storage.content_get_data(sample_content.sha1) + assert content is None s = swh_storage.content_add([sample_content]) assert s == { @@ -32,7 +32,7 @@ "content:add:bytes": sample_content.length, } - content = next(swh_storage.content_get([sample_content.sha1])) + content = swh_storage.content_get_data(sample_content.sha1) assert content is not None s = swh_storage.content_add([sample_content]) diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py --- a/swh/storage/tests/test_retry.py +++ b/swh/storage/tests/test_retry.py @@ -47,8 +47,8 @@ """ sample_content = sample_data.content - content = next(swh_storage.content_get([sample_content.sha1])) - assert not content + content = swh_storage.content_get_data(sample_content.sha1) + assert content is None s = swh_storage.content_add([sample_content]) assert s == { @@ -56,8 +56,8 @@ "content:add:bytes": sample_content.length, } - content = next(swh_storage.content_get([sample_content.sha1])) - assert content["sha1"] == sample_content.sha1 + content = swh_storage.content_get_data(sample_content.sha1) + assert content == sample_content.data def test_retrying_proxy_storage_content_add_with_retry( @@ -78,8 +78,8 @@ sample_content = sample_data.content - content = next(swh_storage.content_get([sample_content.sha1])) - assert not content + content = swh_storage.content_get_data(sample_content.sha1) + assert content is None s = swh_storage.content_add([sample_content]) assert s == {"content:add": 1} @@ -100,8 +100,8 @@ sample_content = sample_data.content - content = next(swh_storage.content_get([sample_content.sha1])) - assert not content + content = swh_storage.content_get_data(sample_content.sha1) + assert content is None with pytest.raises(StorageArgumentException, match="Refuse to add"): swh_storage.content_add([sample_content]) diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -186,9 +186,7 @@ "content:add:bytes": cont.length, } - assert list(swh_storage.content_get([cont.sha1])) == [ - {"sha1": cont.sha1, "data": cont.data} - ] + assert swh_storage.content_get_data(cont.sha1) == cont.data expected_cont = attr.evolve(cont, data=None) @@ -223,9 +221,7 @@ # the fact that we retrieve the content object from the storage with # the correct 'data' field ensures it has been 'called' - assert list(swh_storage.content_get([cont.sha1])) == [ - {"sha1": cont.sha1, "data": cont.data} - ] + assert swh_storage.content_get_data(cont.sha1) == cont.data expected_cont = attr.evolve(lazy_content, data=None, ctime=None) contents = [ @@ -242,23 +238,20 @@ swh_storage.refresh_stat_counters() assert swh_storage.stat_counters()["content"] == 1 - def test_content_get_missing(self, swh_storage, sample_data): + def test_content_get_data_missing(self, swh_storage, sample_data): cont, cont2 = sample_data.contents[:2] swh_storage.content_add([cont]) # Query a single missing content - results = list(swh_storage.content_get([cont2.sha1])) - assert results == [None] + actual_content_data = swh_storage.content_get_data(cont2.sha1) + assert actual_content_data is None # Check content_get does not abort after finding a missing content - results = list(swh_storage.content_get([cont.sha1, cont2.sha1])) - assert results == [{"sha1": cont.sha1, "data": cont.data}, None] - - # Check content_get does not discard found countent when it finds - # a missing content. - results = list(swh_storage.content_get([cont2.sha1, cont.sha1])) - assert results == [None, {"sha1": cont.sha1, "data": cont.data}] + actual_content_data = swh_storage.content_get_data(cont.sha1) + assert actual_content_data == cont.data + actual_content_data = swh_storage.content_get_data(cont2.sha1) + assert actual_content_data is None def test_content_add_different_input(self, swh_storage, sample_data): cont, cont2 = sample_data.contents[:2] @@ -320,9 +313,7 @@ cont = sample_data.content swh_storage.content_add([cont, cont]) - assert list(swh_storage.content_get([cont.sha1])) == [ - {"sha1": cont.sha1, "data": cont.data} - ] + assert swh_storage.content_get_data(cont.sha1) == cont.data def test_content_update(self, swh_storage, sample_data): cont1 = sample_data.content @@ -3829,15 +3820,14 @@ class TestStorageGeneratedData: - def test_generate_content_get(self, swh_storage, swh_contents): - contents_with_data = [c.to_dict() for c in swh_contents if c.status != "absent"] - # input the list of sha1s we want from storage - get_sha1s = [c["sha1"] for c in contents_with_data] + def test_generate_content_get_data(self, swh_storage, swh_contents): + contents_with_data = [c for c in swh_contents if c.status != "absent"] # retrieve contents - actual_contents = list(swh_storage.content_get(get_sha1s)) - assert None not in actual_contents - assert_contents_ok(contents_with_data, actual_contents) + for content in contents_with_data: + actual_content_data = swh_storage.content_get_data(content.sha1) + assert actual_content_data is not None + assert actual_content_data == content.data def test_generate_content_get_metadata(self, swh_storage, swh_contents): # input the list of sha1s we want from storage