diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -19,8 +19,8 @@ import attr from swh.model.model import ( - Content, Directory, Revision, Release, Snapshot, OriginVisit, Origin, - SHA1_SIZE) + BaseContent, Content, SkippedContent, Directory, Revision, Release, + Snapshot, OriginVisit, Origin, SHA1_SIZE) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError @@ -75,47 +75,26 @@ return True def _content_add(self, contents, with_data): - content_with_data = [] - content_without_data = [] for content in contents: if content.status is None: content.status = 'visible' + if content.status == 'absent': + raise ValueError('content with status=absent') if content.length is None: - content.length = -1 - if content.status != 'absent': - if self._content_key(content) not in self._contents: - content_with_data.append(content) - else: - if self._content_key(content) not in self._skipped_contents: - content_without_data.append(content) + raise ValueError('content with length=None') if self.journal_writer: - for content in content_with_data: + for content in contents: content = attr.evolve(content, data=None) self.journal_writer.write_addition('content', content) - for content in content_without_data: - self.journal_writer.write_addition('content', content) - - count_content_added, count_content_bytes_added = \ - self._content_add_present(content_with_data, with_data) - - count_skipped_content_added = self._content_add_absent( - content_without_data - ) summary = { - 'content:add': count_content_added, - 'skipped_content:add': count_skipped_content_added, + 'content:add': 0, } if with_data: - summary['content:add:bytes'] = count_content_bytes_added + summary['content:add:bytes'] = 0 - return summary - - def _content_add_present(self, contents, with_data): - count_content_added = 0 - count_content_bytes_added = 0 for content in contents: key = self._content_key(content) if key in self._contents: @@ -133,40 +112,21 @@ ('content', content.sha1)) self._contents[key] = content bisect.insort(self._sorted_sha1s, content.sha1) - count_content_added += 1 + summary['content:add'] += 1 if with_data: content_data = self._contents[key].data self._contents[key] = attr.evolve( self._contents[key], data=None) - count_content_bytes_added += len(content_data) + summary['content:add:bytes'] += len(content_data) self.objstorage.add(content_data, content.sha1) - return (count_content_added, count_content_bytes_added) - - def _content_add_absent(self, contents): - count = 0 - skipped_content_missing = self.skipped_content_missing(contents) - for content in skipped_content_missing: - key = self._content_key(content) - for algo in DEFAULT_ALGORITHMS: - self._skipped_content_indexes[algo][content.get_hash(algo)] \ - .add(key) - self._skipped_contents[key] = content - count += 1 - - return count - - def _content_to_model(self, contents): - for content in contents: - content = content.copy() - content.pop('origin', None) - yield Content.from_dict(content) + return summary def content_add(self, content): now = datetime.datetime.now(tz=datetime.timezone.utc) - content = [attr.evolve(c, ctime=now) - for c in self._content_to_model(content)] + content = [attr.evolve(Content.from_dict(c), ctime=now) + for c in content] return self._content_add(content, with_data=True) def content_update(self, content, keys=[]): @@ -194,7 +154,7 @@ self._content_indexes[algorithm][hash_].add(new_key) def content_add_metadata(self, content): - content = list(self._content_to_model(content)) + content = [Content.from_dict(c) for c in content] return self._content_add(content, with_data=False) def content_get(self, content): @@ -308,6 +268,37 @@ if content not in self._content_indexes['sha1_git']: yield content + def content_get_random(self): + return random.choice(list(self._content_indexes['sha1_git'])) + + def _skipped_content_add(self, contents): + for content in contents: + if content.status is None: + content.status = 'absent' + if content.length is None: + content.length = -1 + if content.status != 'absent': + raise ValueError(f'Content with status={content.status}') + + if self.journal_writer: + for content in contents: + self.journal_writer.write_addition('content', content) + + summary = { + 'skipped_content:add': 0 + } + + skipped_content_missing = self.skipped_content_missing(contents) + for content in skipped_content_missing: + key = self._content_key(content) + for algo in DEFAULT_ALGORITHMS: + self._skipped_content_indexes[algo][content.get_hash(algo)] \ + .add(key) + self._skipped_contents[key] = content + summary['skipped_content:add'] += 1 + + return summary + def skipped_content_missing(self, contents): for content in contents: for (key, algorithm) in self._content_key_algorithm(content): @@ -319,8 +310,11 @@ yield content break - def content_get_random(self): - return random.choice(list(self._content_indexes['sha1_git'])) + def skipped_content_add(self, content): + now = datetime.datetime.now(tz=datetime.timezone.utc) + content = [attr.evolve(SkippedContent.from_dict(c), ctime=now) + for c in content] + return self._skipped_content_add(content) def directory_add(self, directories): directories = list(directories) @@ -1029,7 +1023,7 @@ @staticmethod def _content_key_algorithm(content): """ A stable key and the algorithm for a content""" - if isinstance(content, Content): + if isinstance(content, BaseContent): content = content.to_dict() return tuple((content.get(key), key) for key in sorted(DEFAULT_ALGORITHMS)) diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py --- a/swh/storage/tests/conftest.py +++ b/swh/storage/tests/conftest.py @@ -58,7 +58,10 @@ @pytest.fixture def swh_contents(swh_storage): contents = gen_contents(n=20) - swh_storage.content_add(contents) + swh_storage.content_add( + [c for c in contents if c['status'] != 'absent']) + swh_storage.skipped_content_add( + [c for c in contents if c['status'] == 'absent']) return contents diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -141,7 +141,6 @@ assert actual_result == { 'content:add': 1, 'content:add:bytes': cont['length'], - 'skipped_content:add': 0 } assert list(swh_storage.content_get([cont['sha1']])) == \ @@ -168,7 +167,6 @@ assert actual_result == { 'content:add': 1, 'content:add:bytes': data.cont['length'], - 'skipped_content:add': 0 } swh_storage.refresh_stat_counters() @@ -177,25 +175,35 @@ def test_content_add_validation(self, swh_storage): cont = data.cont + with pytest.raises(ValueError, match='status'): + swh_storage.content_add([{**cont, 'status': 'absent'}]) + with pytest.raises(ValueError, match='status'): swh_storage.content_add([{**cont, 'status': 'foobar'}]) with pytest.raises(ValueError, match="(?i)length"): swh_storage.content_add([{**cont, 'length': -2}]) + with pytest.raises( + (ValueError, TypeError), + match="reason"): + swh_storage.content_add([{**cont, 'reason': 'foobar'}]) + + def test_skipped_content_add_validation(self, swh_storage): + cont = data.cont.copy() + del cont['data'] + + with pytest.raises(ValueError, match='status'): + swh_storage.skipped_content_add([{**cont, 'status': 'visible'}]) + with pytest.raises((ValueError, psycopg2.IntegrityError), match='reason') as cm: - swh_storage.content_add([{**cont, 'status': 'absent'}]) + swh_storage.skipped_content_add([{**cont, 'status': 'absent'}]) if type(cm.value) == psycopg2.IntegrityError: assert cm.exception.pgcode == \ psycopg2.errorcodes.NOT_NULL_VIOLATION - with pytest.raises( - ValueError, - match="^Must not provide a reason if content is not absent.$"): - swh_storage.content_add([{**cont, 'reason': 'foobar'}]) - def test_content_get_missing(self, swh_storage): cont = data.cont @@ -225,7 +233,6 @@ assert actual_result == { 'content:add': 2, 'content:add:bytes': cont['length'] + cont2['length'], - 'skipped_content:add': 0 } def test_content_add_twice(self, swh_storage): @@ -233,7 +240,6 @@ assert actual_result == { 'content:add': 1, 'content:add:bytes': data.cont['length'], - 'skipped_content:add': 0 } assert len(swh_storage.journal_writer.objects) == 1 @@ -241,9 +247,8 @@ assert actual_result == { 'content:add': 1, 'content:add:bytes': data.cont2['length'], - 'skipped_content:add': 0 } - assert len(swh_storage.journal_writer.objects) == 2 + assert 2 <= len(swh_storage.journal_writer.objects) <= 3 assert len(swh_storage.content_find(data.cont)) == 1 assert len(swh_storage.content_find(data.cont2)) == 1 @@ -286,7 +291,6 @@ actual_result = swh_storage.content_add_metadata([cont]) assert actual_result == { 'content:add': 1, - 'skipped_content:add': 0 } expected_cont = cont.copy() @@ -308,7 +312,6 @@ actual_result = swh_storage.content_add_metadata([cont, cont2]) assert actual_result == { 'content:add': 2, - 'skipped_content:add': 0 } def test_content_add_metadata_collision(self, swh_storage): @@ -336,11 +339,9 @@ assert len(missing) == 2 - actual_result = swh_storage.content_add([cont, cont, cont2]) + actual_result = swh_storage.skipped_content_add([cont, cont, cont2]) assert actual_result == { - 'content:add': 0, - 'content:add:bytes': 0, 'skipped_content:add': 2, } @@ -3727,7 +3728,6 @@ assert actual_result == { 'content:add': 1, 'content:add:bytes': cont['length'], - 'skipped_content:add': 0 } if hasattr(swh_storage, 'objstorage'): @@ -3758,7 +3758,6 @@ assert actual_result == { 'content:add': 1, - 'skipped_content:add': 0 } if hasattr(swh_storage, 'objstorage'): @@ -3778,11 +3777,9 @@ cont2 = data.skipped_cont2 cont2['blake2s256'] = None - actual_result = swh_storage.content_add([cont, cont, cont2]) + actual_result = swh_storage.skipped_content_add([cont, cont, cont2]) assert actual_result == { - 'content:add': 0, - 'content:add:bytes': 0, 'skipped_content:add': 2, }