diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -675,6 +675,8 @@ ] db.mktemp("extid", cur) + self.journal_writer.extid_add(ids) + db.copy_to(extid, "tmp_extid", db.extid_cols, cur) # move metadata in place diff --git a/swh/storage/tests/test_backfill.py b/swh/storage/tests/test_backfill.py --- a/swh/storage/tests/test_backfill.py +++ b/swh/storage/tests/test_backfill.py @@ -19,6 +19,7 @@ compute_query, raw_extrinsic_metadata_target_ranges, ) +from swh.storage.in_memory import InMemoryStorage from swh.storage.replay import process_replay_objects from swh.storage.tests.test_replay import check_replayed @@ -287,6 +288,8 @@ replayer2.process(worker_fn2) # Compare storages + assert isinstance(sto1, InMemoryStorage) # needed to help mypy + assert isinstance(sto2, InMemoryStorage) check_replayed(sto1, sto2) for record in caplog.records: diff --git a/swh/storage/tests/test_replay.py b/swh/storage/tests/test_replay.py --- a/swh/storage/tests/test_replay.py +++ b/swh/storage/tests/test_replay.py @@ -85,7 +85,9 @@ nb_inserted = replayer.process(worker_fn) assert nb_sent == nb_inserted - _check_replayed(src, dst) + assert isinstance(src, InMemoryStorage) # needed to help mypy + assert isinstance(dst, InMemoryStorage) + check_replayed(src, dst) collision = 0 for record in caplog.records: @@ -165,7 +167,9 @@ assert expected_content_hashes in actual_colliding_hashes # all objects from the src should exists in the dst storage - _check_replayed(src, dst, exclude=["contents"]) + assert isinstance(src, InMemoryStorage) # needed to help mypy + assert isinstance(dst, InMemoryStorage) # needed to help mypy + check_replayed(src, dst, exclude=["contents"]) # but the dst has one content more (one of the 2 colliding ones) assert ( len(list(src._cql_runner._contents.iter_all())) @@ -188,12 +192,29 @@ # utility functions -def _check_replayed( - src: InMemoryStorage, dst: InMemoryStorage, exclude: Optional[Container] = None +def check_replayed( + src: InMemoryStorage, + dst: InMemoryStorage, + exclude: Optional[Container] = None, + expected_anonymized=False, ): - """Simple utility function to compare the content of 2 in_memory storages + """Simple utility function to compare the content of 2 in_memory storages""" + + def fix_expected(attr, row): + if expected_anonymized: + if attr == "releases": + row = dataclasses.replace( + row, author=row.author and row.author.anonymize() + ) + elif attr == "revisions": + row = dataclasses.replace( + row, + author=row.author.anonymize(), + committer=row.committer.anonymize(), + ) + + return row - """ for attr_ in ( "contents", "skipped_contents", @@ -210,7 +231,7 @@ if exclude and attr_ in exclude: continue expected_objects = [ - (id, nullify_ctime(obj)) + (id, nullify_ctime(fix_expected(attr_, obj))) for id, obj in sorted(getattr(src._cql_runner, f"_{attr_}").iter_all()) ] got_objects = [ @@ -321,46 +342,6 @@ assert nb_sent == nb_inserted # Check the contents of the destination storage, and whether the anonymization was # properly used + assert isinstance(storage, InMemoryStorage) # needed to help mypy + assert isinstance(dst_storage, InMemoryStorage) check_replayed(storage, dst_storage, expected_anonymized=not privileged) - - -def check_replayed(src, dst, expected_anonymized=False): - """Simple utility function to compare the content of 2 in_memory storages - - If expected_anonymized is True, objects from the source storage are anonymized - before comparing with the destination storage. - - """ - - def maybe_anonymize(attr_, row): - if expected_anonymized: - if attr_ == "releases": - row = dataclasses.replace(row, author=row.author.anonymize()) - elif attr_ == "revisions": - row = dataclasses.replace( - row, - author=row.author.anonymize(), - committer=row.committer.anonymize(), - ) - return row - - for attr_ in ( - "contents", - "skipped_contents", - "directories", - "revisions", - "releases", - "snapshots", - "origins", - "origin_visit_statuses", - "raw_extrinsic_metadata", - ): - expected_objects = [ - (id, nullify_ctime(maybe_anonymize(attr_, obj))) - for id, obj in sorted(getattr(src._cql_runner, f"_{attr_}").iter_all()) - ] - got_objects = [ - (id, nullify_ctime(obj)) - for id, obj in sorted(getattr(dst._cql_runner, f"_{attr_}").iter_all()) - ] - assert got_objects == expected_objects, f"Mismatch object list for {attr_}"