Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/in_memory.py
Show All 35 Lines | |||||
def now(): | def now(): | ||||
return datetime.datetime.now(tz=datetime.timezone.utc) | return datetime.datetime.now(tz=datetime.timezone.utc) | ||||
class InMemoryStorage: | class InMemoryStorage: | ||||
def __init__(self, journal_writer=None): | def __init__(self, journal_writer=None): | ||||
self._contents = {} | |||||
self._content_indexes = defaultdict(lambda: defaultdict(set)) | |||||
self._skipped_contents = {} | |||||
self._skipped_content_indexes = defaultdict(lambda: defaultdict(set)) | |||||
self.reset() | self.reset() | ||||
self.journal_writer = JournalWriter(journal_writer) | self.journal_writer = JournalWriter(journal_writer) | ||||
def reset(self): | def reset(self): | ||||
self._contents = {} | |||||
self._content_indexes = defaultdict(lambda: defaultdict(set)) | |||||
self._skipped_contents = {} | |||||
self._skipped_content_indexes = defaultdict(lambda: defaultdict(set)) | |||||
self._directories = {} | self._directories = {} | ||||
self._revisions = {} | self._revisions = {} | ||||
self._releases = {} | self._releases = {} | ||||
self._snapshots = {} | self._snapshots = {} | ||||
self._origins = {} | self._origins = {} | ||||
self._origins_by_id = [] | self._origins_by_id = [] | ||||
self._origins_by_sha1 = {} | self._origins_by_sha1 = {} | ||||
self._origin_visits = {} | self._origin_visits = {} | ||||
▲ Show 20 Lines • Show All 185 Lines • ▼ Show 20 Lines | class InMemoryStorage: | ||||
def content_missing_per_sha1_git(self, contents): | def content_missing_per_sha1_git(self, contents): | ||||
for content in contents: | for content in contents: | ||||
if content not in self._content_indexes['sha1_git']: | if content not in self._content_indexes['sha1_git']: | ||||
yield content | yield content | ||||
def content_get_random(self): | def content_get_random(self): | ||||
return random.choice(list(self._content_indexes['sha1_git'])) | return random.choice(list(self._content_indexes['sha1_git'])) | ||||
def _skipped_content_add(self, contents: Iterable[SkippedContent]) -> Dict: | def _skipped_content_add(self, contents: List[SkippedContent]) -> Dict: | ||||
self.journal_writer.skipped_content_add(contents) | self.journal_writer.skipped_content_add(contents) | ||||
summary = { | summary = { | ||||
'skipped_content:add': 0 | 'skipped_content:add': 0 | ||||
} | } | ||||
skipped_content_missing = self.skipped_content_missing( | missing_contents = self.skipped_content_missing( | ||||
[c.to_dict() for c in contents]) | [c.to_dict() for c in contents]) | ||||
for content in skipped_content_missing: | missing = {self._content_key(c) for c in missing_contents} | ||||
key = self._content_key(content, allow_missing=True) | contents = [c for c in contents | ||||
if self._content_key(c) in missing] | |||||
for content in contents: | |||||
key = self._content_key(content) | |||||
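olasd: It took me a while to understand this logic. Why not use `_content_key_algorithm()` here?
You could also build a dict keyed on these keys, which would avoid the n² nature of the lookup (even though it's probably not a big deal considering how tiny n is). | |||||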
for algo in DEFAULT_ALGORITHMS: | for algo in DEFAULT_ALGORITHMS: | ||||
if content.get(algo): | if content.get_hash(algo): | ||||
self._skipped_content_indexes[algo][ | self._skipped_content_indexes[algo][ | ||||
content.get(algo)].add(key) | content.get_hash(algo)].add(key) | ||||
self._skipped_contents[key] = content | self._skipped_contents[key] = content | ||||
summary['skipped_content:add'] += 1 | summary['skipped_content:add'] += 1 | ||||
return summary | return summary | ||||
def skipped_content_add(self, content: Iterable[SkippedContent]) -> Dict: | |||||
now = datetime.datetime.now(tz=datetime.timezone.utc) | |||||
content = [attr.evolve(c, ctime=now) for c in content] | |||||
Not Done | Inline Actions | olasd: As you only iterate once, there is no need for the `list(content)`. | |||||
return self._skipped_content_add(content) | |||||
def skipped_content_missing(self, contents): | def skipped_content_missing(self, contents): | ||||
for content in contents: | for content in contents: | ||||
for (key, algorithm) in self._content_key_algorithm(content): | matches = list(self._skipped_contents.values()) | ||||
for (algorithm, key) in self._content_key(content): | |||||
if algorithm == 'blake2s256': | if algorithm == 'blake2s256': | ||||
continue | continue | ||||
if key not in self._skipped_content_indexes[algorithm]: | # Filter out skipped contents with the same hash | ||||
# index must contain hashes of algos except blake2s256 | matches = [ | ||||
# else the content is considered skipped | match for match in matches | ||||
if match.get_hash(algorithm) == key] | |||||
# if none of the contents match | |||||
if not matches: | |||||
yield {algo: content[algo] | yield {algo: content[algo] | ||||
for algo in DEFAULT_ALGORITHMS | for algo in DEFAULT_ALGORITHMS} | ||||
if content[algo] is not None} | |||||
break | |||||
def skipped_content_add(self, content: Iterable[SkippedContent]) -> Dict: | |||||
content = list(content) | |||||
now = datetime.datetime.now(tz=datetime.timezone.utc) | |||||
content = [attr.evolve(c, ctime=now) for c in content] | |||||
return self._skipped_content_add(content) | |||||
def directory_add(self, directories: Iterable[Directory]) -> Dict: | def directory_add(self, directories: Iterable[Directory]) -> Dict: | ||||
directories = [dir_ for dir_ in directories | directories = [dir_ for dir_ in directories | ||||
if dir_.id not in self._directories] | if dir_.id not in self._directories] | ||||
self.journal_writer.directory_add(directories) | self.journal_writer.directory_add(directories) | ||||
count = 0 | count = 0 | ||||
for directory in directories: | for directory in directories: | ||||
▲ Show 20 Lines • Show All 666 Lines • ▼ Show 20 Lines | def _person_add(self, person): | ||||
self._persons.append(person) | self._persons.append(person) | ||||
self._objects[key].append(('person', person_id)) | self._objects[key].append(('person', person_id)) | ||||
else: | else: | ||||
person_id = self._objects[key][0][1] | person_id = self._objects[key][0][1] | ||||
person = self._persons[person_id-1] | person = self._persons[person_id-1] | ||||
return person | return person | ||||
@staticmethod | @staticmethod | ||||
def _content_key(content, allow_missing=False): | def _content_key(content): | ||||
"""A stable key for a content""" | |||||
return tuple(getattr(content, key, None) | |||||
for key in sorted(DEFAULT_ALGORITHMS)) | |||||
@staticmethod | |||||
def _content_key_algorithm(content): | |||||
""" A stable key and the algorithm for a content""" | """ A stable key and the algorithm for a content""" | ||||
if isinstance(content, BaseContent): | if isinstance(content, BaseContent): | ||||
content = content.to_dict() | content = content.to_dict() | ||||
return tuple((content.get(key), key) | return tuple((key, content.get(key)) | ||||
for key in sorted(DEFAULT_ALGORITHMS)) | for key in sorted(DEFAULT_ALGORITHMS)) | ||||
@staticmethod | @staticmethod | ||||
def _tool_key(tool): | def _tool_key(tool): | ||||
return '%r %r %r' % (tool['name'], tool['version'], | return '%r %r %r' % (tool['name'], tool['version'], | ||||
tuple(sorted(tool['configuration'].items()))) | tuple(sorted(tool['configuration'].items()))) | ||||
@staticmethod | @staticmethod | ||||
Show All 11 Lines |
It took me a while to understand this logic. Why not use `_content_key_algorithm()` here?
You could also build a dict keyed on these keys, which would avoid the n² nature of the lookup (even though it's probably not a big deal considering how tiny n is).