D695.id.diff

diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -17,6 +17,9 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS
from swh.model.identifiers import normalize_timestamp
+# Max block size of contents to return
+BULK_BLOCK_CONTENT_LEN_MAX = 10000
+
def now():
return datetime.datetime.now(tz=datetime.timezone.utc)
@@ -42,6 +45,9 @@
self._metadata_providers = {}
self._objects = defaultdict(list)
+ # ideally we would want a skip list for both fast inserts and searches
+ self._sorted_sha1s = []
+
def check_config(self, *, check_write):
"""Check that the storage is configured and ready to go."""
return True
@@ -79,9 +85,88 @@
('content', content['sha1']))
self._contents[key] = copy.deepcopy(content)
self._contents[key]['ctime'] = now()
+ bisect.insort(self._sorted_sha1s, content['sha1'])
if self._contents[key]['status'] == 'visible':
self._contents_data[key] = self._contents[key].pop('data')
+ def content_get(self, ids):
+ """Retrieve in bulk contents and their data.
+
+ This function may yield more blobs than provided sha1 identifiers,
+ in case they collide.
+
+ Args:
+ ids: iterable of sha1 checksums
+
+ Yields:
+ Dict[str, bytes]: Generates streams of contents as dict with their
+ raw data:
+
+ - sha1 (bytes): content id
+ - data (bytes): content's raw data
+
+ Raises:
+ ValueError: in case too many contents are requested
+ (cf. BULK_BLOCK_CONTENT_LEN_MAX).
+
+ """
+ # FIXME: Make this method support slicing the `data`.
+ if len(ids) > BULK_BLOCK_CONTENT_LEN_MAX:
+ raise ValueError(
+ "Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX)
+ for id_ in ids:
+ for key in self._content_indexes['sha1'][id_]:
+ yield {
+ 'sha1': id_,
+ 'data': self._contents_data[key],
+ }
+
+ def content_get_range(self, start, end, limit=1000, db=None, cur=None):
+ """Retrieve contents within range [start, end] bound by limit.
+
+ Note that this function may return more than one blob per hash. The
+ limit is enforced with multiplicity (i.e. two blobs with the same hash
+ will count twice toward the limit).
+
+ Args:
+ **start** (bytes): Starting identifier of the range (expected to
+ be smaller than end)
+ **end** (bytes): Ending identifier of the range (expected to
+ be larger than start)
+ **limit** (int): Maximum number of results to return (defaults
+ to 1000)
+
+ Returns:
+ a dict with keys:
+ - contents (List[dict]): contents within the requested range.
+ - next (bytes): the sha1 from which to resume if contents
+ remain in the range, None otherwise
+
+ """
+ if limit is None:
+ raise ValueError('Development error: limit should not be None')
+ from_index = bisect.bisect_left(self._sorted_sha1s, start)
+ sha1s = itertools.islice(self._sorted_sha1s, from_index, None)
+ sha1s = ((sha1, content_key)
+ for sha1 in sha1s
+ for content_key in self._content_indexes['sha1'][sha1])
+ matched = []
+ for sha1, key in sha1s:
+ if sha1 > end:
+ break
+ if len(matched) >= limit:
+ return {
+ 'contents': matched,
+ 'next': sha1,
+ }
+ matched.append({
+ 'data': self._contents_data[key],
+ **self._contents[key],
+ })
+ return {
+ 'contents': matched,
+ 'next': None,
+ }
+
def content_get_metadata(self, sha1s):
"""Retrieve content metadata in bulk
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -228,17 +228,18 @@
content: iterables of sha1
Yields:
- dict: Generates streams of contents as dict with their raw data:
+ Dict[str, bytes]: Generates streams of contents as dict with their
+ raw data:
- - sha1: sha1's content
- - data: bytes data of the content
+ - sha1 (bytes): content id
+ - data (bytes): content's raw data
Raises:
ValueError in case of too much contents are required.
cf. BULK_BLOCK_CONTENT_LEN_MAX
"""
- # FIXME: Improve on server module to slice the result
+ # FIXME: Make this method support slicing the `data`.
if len(content) > BULK_BLOCK_CONTENT_LEN_MAX:
raise ValueError(
"Send at maximum %s contents." % BULK_BLOCK_CONTENT_LEN_MAX)
diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py
--- a/swh/storage/tests/test_in_memory.py
+++ b/swh/storage/tests/test_in_memory.py
@@ -45,18 +45,5 @@
super().setUp()
self.storage = Storage()
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get(self):
- super().test_generate_content_get()
-
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get_range_limit_none(self):
- super().test_generate_content_get_range_limit_none()
-
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get_range_no_limit(self):
- super().test_generate_content_get_range_no_limit()
-
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get_range_limit(self):
- super().test_generate_content_get_range_limit()
+ def reset_storage_tables(self):
+ self.storage = Storage()
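
A note on the data structure behind content_get_range: the diff keeps a plain sorted list of sha1s, maintained with bisect.insort at insertion time and scanned from bisect.bisect_left when answering range queries; as the inline comment says, a skip list would also make inserts cheap. A minimal standalone sketch of that access pattern (illustrative names, not the Storage class itself):

import bisect

sorted_sha1s = []          # kept sorted at all times

def add_sha1(sha1):
    # O(n) insert because the list has to shift elements, while
    # lookups stay O(log n); a skip list would make both operations
    # O(log n) on average, as the comment in __init__ suggests.
    bisect.insort(sorted_sha1s, sha1)

def sha1s_in_range(start, end):
    # locate the first sha1 >= start, then walk until we pass end
    idx = bisect.bisect_left(sorted_sha1s, start)
    for sha1 in sorted_sha1s[idx:]:
        if sha1 > end:
            break
        yield sha1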
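The 'next' key in content_get_range's result is what lets a caller page through a hash range without ever exceeding the limit. A minimal sketch of such a pagination loop, assuming only the interface documented in the diff (any storage object exposing content_get_range with the same return shape):

def iter_content_range(storage, start, end, batch_size=1000):
    """Yield every content between start and end, one page at a time."""
    while start is not None:
        result = storage.content_get_range(start, end, limit=batch_size)
        yield from result['contents']
        # 'next' is the sha1 to resume from, or None once the range
        # is exhausted
        start = result['next']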
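Both implementations of content_get reject requests for more than BULK_BLOCK_CONTENT_LEN_MAX (10000) contents, so a caller holding a longer list of sha1s has to chunk it client-side. A possible helper, again assuming only the documented behaviour (the function name is illustrative):

def content_get_chunked(storage, ids, chunk_size=10000):
    """Call content_get in chunks small enough to stay under the
    server-side BULK_BLOCK_CONTENT_LEN_MAX limit."""
    ids = list(ids)
    for i in range(0, len(ids), chunk_size):
        yield from storage.content_get(ids[i:i + chunk_size])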
