diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -38,6 +38,8 @@ def __init__(self, journal_writer=None):
 
         self._contents = {}
         self._content_indexes = defaultdict(lambda: defaultdict(set))
+        self._skipped_contents = {}
+        self._skipped_content_indexes = defaultdict(lambda: defaultdict(set))
 
         self.reset()
 
@@ -77,10 +79,43 @@
                     del content['data']
                 self.journal_writer.write_addition('content', content)
 
-        count_contents = 0
         count_content_added = 0
+        count_skipped_content_added = 0
         count_content_bytes_added = 0
 
+        content_by_status = defaultdict(list)
+        for content in contents:
+            if 'status' not in content:
+                content['status'] = 'visible'
+            if 'length' not in content:
+                content['length'] = -1
+            content_by_status[content['status']].append(content)
+
+        # store in content
+        content_with_data = content_by_status['visible']
+        # store in skipped content
+        content_without_data = content_by_status['absent']
+
+        count_content_added, count_content_bytes_added = \
+            self._content_with_data_add(content_with_data, with_data)
+
+        count_skipped_content_added = self._content_without_data_add(
+            content_without_data
+        )
+
+        summary = {
+            'content:add': count_content_added,
+            'skipped_content:add': count_skipped_content_added,
+        }
+
+        if with_data:
+            summary['content:add:bytes'] = count_content_bytes_added
+
+        return summary
+
+    def _content_with_data_add(self, contents, with_data):
+        count_content_added = 0
+        count_content_bytes_added = 0
         for content in contents:
             key = self._content_key(content)
             if key in self._contents:
@@ -96,23 +131,28 @@
                 ('content', content['sha1']))
             self._contents[key] = copy.deepcopy(content)
             bisect.insort(self._sorted_sha1s, content['sha1'])
-            count_contents += 1
-            if self._contents[key]['status'] == 'visible':
-                count_content_added += 1
-                if with_data:
-                    content_data = self._contents[key].pop('data')
-                    count_content_bytes_added += len(content_data)
-                    self.objstorage.add(content_data, content['sha1'])
+            count_content_added += 1
+            if with_data:
+                content_data = self._contents[key].pop('data')
+                count_content_bytes_added += len(content_data)
+                self.objstorage.add(content_data, content['sha1'])
 
-        summary = {
-            'content:add': count_content_added,
-            'skipped_content:add': count_contents - count_content_added,
-        }
+        return (count_content_added, count_content_bytes_added)
 
-        if with_data:
-            summary['content:add:bytes'] = count_content_bytes_added
+    def _content_without_data_add(self, contents):
+        """Store skipped contents not yet in storage; return their count."""
+        count = 0
+        for content in self.skipped_content_missing(contents):
+            key = self._content_key(content)
+            for (hash_, algorithm) in self._content_key_algorithm(content):
+                if algorithm == 'blake2s256':
+                    continue
+                # index layout mirrors _content_indexes: algo -> hash -> keys
+                self._skipped_content_indexes[algorithm][hash_].add(key)
+            self._skipped_contents[key] = copy.deepcopy(content)
+            count += 1
 
-        return summary
+        return count
 
     def content_add(self, content):
         """Add content blobs to the storage
@@ -352,6 +392,26 @@
             if content not in self._content_indexes['sha1']:
                 yield content
 
+    def skipped_content_missing(self, contents):
+        """List all skipped_content missing from storage
+
+        Args:
+            contents: Iterable of content dicts (hashes keyed by algorithm
+                name) to check for a skipped content entry
+
+        Returns:
+            iterable: the content dicts with no skipped content entry
+        """
+
+        for content in contents:
+            for (key, algorithm) in self._content_key_algorithm(content):
+                if algorithm == 'blake2s256':
+                    continue
+                # missing as soon as one of its hashes is not indexed
+                if key not in self._skipped_content_indexes[algorithm]:
+                    yield content
+                    break
+
     def directory_add(self, directories):
         """Add directories to the storage
 
@@ -1658,6 +1717,12 @@
         return tuple(content.get(key) for key in sorted(DEFAULT_ALGORITHMS))
 
     @staticmethod
+    def _content_key_algorithm(content):
+        """(hash, algorithm) pairs of a content, in stable algorithm order"""
+        return tuple((content.get(key), key)
+                     for key in sorted(DEFAULT_ALGORITHMS))
+
+    @staticmethod
     def _tool_key(tool):
         return '%r %r %r' % (tool['name'], tool['version'],
                              tuple(sorted(tool['configuration'].items())))