diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -38,6 +38,8 @@ def __init__(self, journal_writer=None): self._contents = {} self._content_indexes = defaultdict(lambda: defaultdict(set)) + self._skipped_contents = {} + self._skipped_content_indexes = defaultdict(lambda: defaultdict(set)) self.reset() @@ -77,10 +79,38 @@ del content['data'] self.journal_writer.write_addition('content', content) - count_contents = 0 + content_with_data = [] + content_without_data = [] + for content in contents: + if 'status' not in content: + content['status'] = 'visible' + if 'length' not in content: + content['length'] = -1 + if content['status'] == 'visible': + content_with_data.append(content) + elif content['status'] == 'absent': + content_without_data.append(content) + + count_content_added, count_content_bytes_added = \ + self._content_add_present(content_with_data, with_data) + + count_skipped_content_added = self._content_add_absent( + content_without_data + ) + + summary = { + 'content:add': count_content_added, + 'skipped_content:add': count_skipped_content_added, + } + + if with_data: + summary['content:add:bytes'] = count_content_bytes_added + + return summary + + def _content_add_present(self, contents, with_data): count_content_added = 0 count_content_bytes_added = 0 - for content in contents: key = self._content_key(content) if key in self._contents: @@ -96,23 +126,25 @@ ('content', content['sha1'])) self._contents[key] = copy.deepcopy(content) bisect.insort(self._sorted_sha1s, content['sha1']) - count_contents += 1 - if self._contents[key]['status'] == 'visible': - count_content_added += 1 - if with_data: - content_data = self._contents[key].pop('data') - count_content_bytes_added += len(content_data) - self.objstorage.add(content_data, content['sha1']) + count_content_added += 1 + if with_data: + content_data = self._contents[key].pop('data') + count_content_bytes_added += 
len(content_data) + self.objstorage.add(content_data, content['sha1']) - summary = { - 'content:add': count_content_added, - 'skipped_content:add': count_contents - count_content_added, - } + return (count_content_added, count_content_bytes_added) - if with_data: - summary['content:add:bytes'] = count_content_bytes_added + def _content_add_absent(self, contents): + count = 0 + skipped_content_missing = self.skipped_content_missing(contents) + for content in skipped_content_missing: + key = self._content_key(content) + for algo in DEFAULT_ALGORITHMS: + self._skipped_content_indexes[algo][content[algo]].add(key) + self._skipped_contents[key] = copy.deepcopy(content) + count += 1 - return summary + return count def content_add(self, content): """Add content blobs to the storage @@ -352,6 +384,26 @@ if content not in self._content_indexes['sha1']: yield content + def skipped_content_missing(self, contents): + """List all skipped_content missing from storage + + Args: + contents: Iterable of content dicts to check for a skipped entry + + Returns: + iterable: content dicts with no skipped content entry + """ + + for content in contents: + for (key, algorithm) in self._content_key_algorithm(content): + if algorithm == 'blake2s256': + continue + if key not in self._skipped_content_indexes[algorithm]: + # every hash except blake2s256 must be in the index, + # otherwise the content is considered missing + yield content + break + def directory_add(self, directories): """Add directories to the storage @@ -1660,6 +1712,12 @@ return tuple(content.get(key) for key in sorted(DEFAULT_ALGORITHMS)) @staticmethod + def _content_key_algorithm(content): + """A stable tuple of (hash, algorithm) pairs for a content""" + return tuple((content.get(key), key) + for key in sorted(DEFAULT_ALGORITHMS)) + + @staticmethod def _tool_key(tool): return '%r %r %r' % (tool['name'], tool['version'], tuple(sorted(tool['configuration'].items()))) diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py 
--- a/swh/storage/tests/test_in_memory.py +++ b/swh/storage/tests/test_in_memory.py @@ -37,11 +37,6 @@ def test_content_add_metadata_db(self): pass - @pytest.mark.skip( - 'not implemented, see https://forge.softwareheritage.org/T1633') - def test_skipped_content_add(self): - pass - if not _test_origin_ids: @pytest.mark.skip('requires origin ids') def test_origin_metadata_add(self):