Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7343115
D695.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
D695.id.diff
View Options
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -17,6 +17,9 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS
from swh.model.identifiers import normalize_timestamp
+# Max block size of contents to return
+BULK_BLOCK_CONTENT_LEN_MAX = 10000
+
def now():
return datetime.datetime.now(tz=datetime.timezone.utc)
@@ -42,6 +45,9 @@
self._metadata_providers = {}
self._objects = defaultdict(list)
+ # ideally we would want a skip list for both fast inserts and searches
+ self._sorted_sha1s = []
+
def check_config(self, *, check_write):
"""Check that the storage is configured and ready to go."""
return True
@@ -79,9 +85,88 @@
('content', content['sha1']))
self._contents[key] = copy.deepcopy(content)
self._contents[key]['ctime'] = now()
+ bisect.insort(self._sorted_sha1s, content['sha1'])
if self._contents[key]['status'] == 'visible':
self._contents_data[key] = self._contents[key].pop('data')
+ def content_get(self, ids):
+ """Retrieve in bulk contents and their data.
+
+ This function may yield more blobs than provided sha1 identifiers,
+ in case they collide.
+
+ Args:
+ ids: iterable of sha1 identifiers
+
+ Yields:
+ Dict[str, bytes]: Generates streams of contents as dict with their
+ raw data:
+
+ - sha1 (bytes): content id
+ - data (bytes): content's raw data
+
+ Raises:
+ ValueError in case too many contents are required.
+ cf. BULK_BLOCK_CONTENT_LEN_MAX
+
+ """
+ # FIXME: Make this method support slicing the `data`.
+ if len(ids) > BULK_BLOCK_CONTENT_LEN_MAX:
+ raise ValueError(
+ "Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX)
+ for id_ in ids:
+ for key in self._content_indexes['sha1'][id_]:
+ yield {
+ 'sha1': id_,
+ 'data': self._contents_data[key],
+ }
+
+ def content_get_range(self, start, end, limit=1000, db=None, cur=None):
+ """Retrieve contents within range [start, end] bound by limit.
+
+ Note that this function may return more than one blob per hash. The
+ limit is enforced with multiplicity (ie. two blobs with the same hash
+ will count twice toward the limit).
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **limit** (int): Limit result (default to 1000)
+
+ Returns:
+ a dict with keys:
+ - contents [dict]: iterable of contents in between the range.
+ - next (bytes): if content remains in the range, the
+ sha1 to use as the start of the next request; None otherwise
+
+ """
+ if limit is None:
+ raise ValueError('Development error: limit should not be None')
+ from_index = bisect.bisect_left(self._sorted_sha1s, start)
+ sha1s = itertools.islice(self._sorted_sha1s, from_index, None)
+ sha1s = ((sha1, content_key)
+ for sha1 in sha1s
+ for content_key in self._content_indexes['sha1'][sha1])
+ matched = []
+ for sha1, key in sha1s:
+ if sha1 > end:
+ break
+ if len(matched) >= limit:
+ return {
+ 'contents': matched,
+ 'next': sha1,
+ }
+ matched.append({
+ 'data': self._contents_data[key],
+ **self._contents[key],
+ })
+ return {
+ 'contents': matched,
+ 'next': None,
+ }
+
def content_get_metadata(self, sha1s):
"""Retrieve content metadata in bulk
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -228,17 +228,18 @@
content: iterables of sha1
Yields:
- dict: Generates streams of contents as dict with their raw data:
+ Dict[str, bytes]: Generates streams of contents as dict with their
+ raw data:
- - sha1: sha1's content
- - data: bytes data of the content
+ - sha1 (bytes): content id
+ - data (bytes): content's raw data
Raises:
ValueError in case of too much contents are required.
cf. BULK_BLOCK_CONTENT_LEN_MAX
"""
- # FIXME: Improve on server module to slice the result
+ # FIXME: Make this method support slicing the `data`.
if len(content) > BULK_BLOCK_CONTENT_LEN_MAX:
raise ValueError(
"Send at maximum %s contents." % BULK_BLOCK_CONTENT_LEN_MAX)
diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py
--- a/swh/storage/tests/test_in_memory.py
+++ b/swh/storage/tests/test_in_memory.py
@@ -45,18 +45,5 @@
super().setUp()
self.storage = Storage()
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get(self):
- super().test_generate_content_get()
-
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get_range_limit_none(self):
- super().test_generate_content_get_range_limit_none()
-
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get_range_no_limit(self):
- super().test_generate_content_get_range_no_limit()
-
- @pytest.mark.xfail(reason='not implemented yet')
- def test_generate_content_get_range_limit(self):
- super().test_generate_content_get_range_limit()
+ def reset_storage_tables(self):
+ self.storage = Storage()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mar 17 2025, 7:29 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219135
Attached To
D695: Implement content_get and content_get_range for the in-mem storage.
Event Timeline
Log In to Comment