Changeset View
Standalone View
swh/storage/in_memory.py
Show All 11 Lines | |||||
import datetime | import datetime | ||||
import itertools | import itertools | ||||
import random | import random | ||||
import warnings | import warnings | ||||
from swh.model.hashutil import DEFAULT_ALGORITHMS | from swh.model.hashutil import DEFAULT_ALGORITHMS | ||||
from swh.model.identifiers import normalize_timestamp | from swh.model.identifiers import normalize_timestamp | ||||
# Max block size of contents to return | |||||
BULK_BLOCK_CONTENT_LEN_MAX = 10000 | |||||
def now(): | def now(): | ||||
return datetime.datetime.now(tz=datetime.timezone.utc) | return datetime.datetime.now(tz=datetime.timezone.utc) | ||||
OriginVisitKey = collections.namedtuple('OriginVisitKey', 'origin date') | OriginVisitKey = collections.namedtuple('OriginVisitKey', 'origin date') | ||||
Show All 9 Lines | def __init__(self): | ||||
self._snapshots = {} | self._snapshots = {} | ||||
self._origins = {} | self._origins = {} | ||||
self._origin_visits = {} | self._origin_visits = {} | ||||
self._origin_metadata = defaultdict(list) | self._origin_metadata = defaultdict(list) | ||||
self._tools = {} | self._tools = {} | ||||
self._metadata_providers = {} | self._metadata_providers = {} | ||||
self._objects = defaultdict(list) | self._objects = defaultdict(list) | ||||
# ideally we would want a skip list for both fast inserts and searches | |||||
self._sorted_sha1s = [] | |||||
def check_config(self, *, check_write): | def check_config(self, *, check_write): | ||||
"""Check that the storage is configured and ready to go.""" | """Check that the storage is configured and ready to go.""" | ||||
return True | return True | ||||
def content_add(self, contents): | def content_add(self, contents): | ||||
"""Add content blobs to the storage | """Add content blobs to the storage | ||||
Args: | Args: | ||||
Show All 20 Lines | def content_add(self, contents): | ||||
if content[algorithm] in self._content_indexes[algorithm]: | if content[algorithm] in self._content_indexes[algorithm]: | ||||
from . import HashCollision | from . import HashCollision | ||||
raise HashCollision(algorithm, content[algorithm], key) | raise HashCollision(algorithm, content[algorithm], key) | ||||
for algorithm in DEFAULT_ALGORITHMS: | for algorithm in DEFAULT_ALGORITHMS: | ||||
self._content_indexes[algorithm][content[algorithm]].add(key) | self._content_indexes[algorithm][content[algorithm]].add(key) | ||||
self._objects[content['sha1_git']].append( | self._objects[content['sha1_git']].append( | ||||
('content', content['sha1'])) | ('content', content['sha1'])) | ||||
self._contents[key] = copy.deepcopy(content) | self._contents[key] = copy.deepcopy(content) | ||||
self._contents[key]['ctime'] = now() | self._contents[key]['ctime'] = now() | ||||
ardumont: Why?
A comment would help. | |||||
Done Inline ActionsWhy what? It just inserts the new sha1 in the list vlorentz: Why what? It just inserts the new sha1 in the list | |||||
Not Done Inline ActionsYou change something in the function content_add. Now it's clearer ;) ardumont: You change something in the function content_add.
Your diff is about content_get and… | |||||
Done Inline ActionsYeah, I added self._sorted_sha1 to make searches run in logtime. vlorentz: Yeah, I added `self._sorted_sha1` to make searches run in logtime. | |||||
bisect.insort(self._sorted_sha1s, content['sha1']) | |||||
if self._contents[key]['status'] == 'visible': | if self._contents[key]['status'] == 'visible': | ||||
self._contents_data[key] = self._contents[key].pop('data') | self._contents_data[key] = self._contents[key].pop('data') | ||||
def content_get(self, ids): | |||||
"""Retrieve in bulk contents and their data. | |||||
This function may yield more blobs than provided sha1 identifiers, | |||||
in case they collide. | |||||
Args: | |||||
content: iterables of sha1 | |||||
Yields: | |||||
Done Inline ActionsDict[bytes, bytes]? ardumont: Dict[bytes, bytes]? | |||||
Not Done Inline ActionsWhy is the key str? What did i miss? ardumont: Why is the key str? What did i miss? | |||||
Done Inline ActionsKeys are 'sha1' and 'data'. vlorentz: Keys are `'sha1'` and `'data'`. | |||||
Not Done Inline Actionsright! ardumont: right! | |||||
Dict[str, bytes]: Generates streams of contents as dict with their | |||||
raw data: | |||||
Done Inline Actionssha1 (bytes): content id ardumont: sha1 (bytes): content id | |||||
Done Inline Actionsdata (bytes): content's raw data ardumont: data (bytes): content's raw data | |||||
- sha1 (bytes): content id | |||||
- data (bytes): content's raw data | |||||
Raises: | |||||
ValueError in case of too much contents are required. | |||||
cf. BULK_BLOCK_CONTENT_LEN_MAX | |||||
Not Done Inline Actions
You mean irrelevant for the in-memory usage (test) i guess. Explaining why i ask. That would be out of scope for that diff ;) ardumont: > remove irrelevant fixme.
You mean irrelevant for the in-memory usage (test) i guess. | |||||
Done Inline Actionsnvm, I misunderstood that comment vlorentz: nvm, I misunderstood that comment | |||||
Not Done Inline ActionsSince you touch the comment, you can now fix it to make it clearer then ;) ardumont: Since you touch the comment, you can now fix it to make it clearer then ;) | |||||
""" | |||||
# FIXME: Make this method support slicing the `data`. | |||||
if len(ids) > BULK_BLOCK_CONTENT_LEN_MAX: | |||||
raise ValueError( | |||||
"Sending at most %s contents." % BULK_BLOCK_CONTENT_LEN_MAX) | |||||
for id_ in ids: | |||||
for key in self._content_indexes['sha1'][id_]: | |||||
yield { | |||||
'sha1': id_, | |||||
'data': self._contents_data[key], | |||||
} | |||||
def content_get_range(self, start, end, limit=1000, db=None, cur=None): | |||||
"""Retrieve contents within range [start, end] bound by limit. | |||||
Done Inline ActionsCan you please clarify what you mean by 'enforced with multiplicity'? ardumont: Can you please clarify what you mean by 'enforced with multiplicity'? | |||||
Note that this function may return more than one blob per hash. The | |||||
Not Done Inline ActionsCan't we skip those? Pushing filtering on the endpoint consumer feels bad to me. (and if in the main storage there is such a case, that's an out of scope issue, an issue nonetheless). ardumont: Can't we skip those?
Pushing filtering on the endpoint consumer feels bad to me.
(and if in… | |||||
Done Inline ActionsThat sentence was wrong, content_get_range returns no None item. vlorentz: That sentence was wrong, `content_get_range` returns no `None` item. | |||||
limit is enforced with multiplicity (ie. two blobs with the same hash | |||||
will count twice toward the limit). | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**limit** (int): Limit result (default to 1000) | |||||
Returns: | |||||
a dict with keys: | |||||
- contents [dict]: iterable of contents in between the range. | |||||
- next (bytes): There remains content in the range | |||||
starting from this next sha1 | |||||
""" | |||||
if limit is None: | |||||
raise ValueError('Development error: limit should not be None') | |||||
from_index = bisect.bisect_left(self._sorted_sha1s, start) | |||||
sha1s = itertools.islice(self._sorted_sha1s, from_index, None) | |||||
sha1s = ((sha1, content_key) | |||||
for sha1 in sha1s | |||||
for content_key in self._content_indexes['sha1'][sha1]) | |||||
matched = [] | |||||
for sha1, key in sha1s: | |||||
if sha1 > end: | |||||
break | |||||
Done Inline ActionsI never said it before. So this should be avoided. Naturally, that's up for discussion but in the mean time, that'd be great to align to current conventions. return { 'contents': matched, 'next': sha1, } As per tool, that'd be great not to bother with this and have some tool doing that for us. ardumont: I never said it before.
We are growing inconsistent with the formatting.
That seems like… | |||||
if len(matched) >= limit: | |||||
return { | |||||
'contents': matched, | |||||
'next': sha1, | |||||
} | |||||
matched.append({ | |||||
'data': self._contents_data[key], | |||||
**self._contents[key], | |||||
Not Done Inline ActionsSame as a previous comment, prefer a single return statement when possible/not too obfuscating. Here, something like: from itertools import chain, islice next = None, contents = [] sha1s = islice(self._sorted_sha1s, 0, from_index) it_contents = (((sha1, v) for v in self._content_indexes['sha1'][sha1]) for sha1 in sha1s) for sha1, key in islice(chain.from_iterable(it_contents), limit): if sha1 > end: break contents.append({ 'data': self._contents_data[key], **self._contents[key], }) else: next = sha1 return dict(next=next, contents=contents) douardda: Same as a previous comment, prefer a single return statement when possible/not too obfuscating. | |||||
Done Inline ActionsYou can't do that, because it assigns the last returned value to next, instead of the first non-returned one. As a consequence, UnboundLocalError when there is no result because the for loop does not run. So the for loop needs to be rewritten like this: next = None for sha1, key in chain.from_iterable(it_contents): if sha1 > end: break if len(contents) >= limit: next = sha1 break contents.append({ 'data': self._contents_data[key], **self._contents[key], }) return dict(next=next, contents=contents) Which I find slightly less readable than this: for sha1, key in chain.from_iterable(it_contents): if sha1 > end: break if len(contents) >= limit: return dict(next=sha1, contents=contents) contents.append({ 'data': self._contents_data[key], **self._contents[key], }) return dict(next=None, contents=contents) because the former uses an extra next variable to do the same thing (with a "break then immediately return" instead of just a return). vlorentz: You can't do that, because it assigns the last returned value to `next`, instead of the first… | |||||
Not Done Inline ActionsYes, i was wondering whether what david proposed would work. Pick either one of the last one you proposed. ardumont: Yes, i was wondering whether what david proposed would work.
Pick either one of the last one… | |||||
Not Done Inline ActionsIndeed I changed the behavior in my snippet, but a small fix could be: for sha1, key in islice(chain.from_iterable(it_contents), limit+1): #[...] return dict(next=next, contents=contents[:limit]) would do the trick, but again, it's readability is debatable. Whatever, I still don't like your return "hidden" in the middle of the loop, but I won't fight. douardda: Indeed I changed the behavior in my snippet, but a small fix could be:
```
for sha1, key in… | |||||
}) | |||||
return { | |||||
'contents': matched, | |||||
'next': None, | |||||
} | |||||
def content_get_metadata(self, sha1s): | def content_get_metadata(self, sha1s): | ||||
"""Retrieve content metadata in bulk | """Retrieve content metadata in bulk | ||||
Args: | Args: | ||||
content: iterable of content identifiers (sha1) | content: iterable of content identifiers (sha1) | ||||
Returns: | Returns: | ||||
an iterable with content metadata corresponding to the given ids | an iterable with content metadata corresponding to the given ids | ||||
▲ Show 20 Lines • Show All 1,017 Lines • Show Last 20 Lines |
Why?
A comment would help.