diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -24,6 +24,7 @@ from swh.model.model import ( Content, Directory, + DirectoryEntry, ExtID, Origin, OriginVisit, @@ -804,6 +805,44 @@ ("directory", directory) ] + def test_directory_add_raw_manifest__different_entries( + self, swh_storage, check_ls=True + ): + """Add two directories with the same raw_manifest (and therefore, same id) + but different entries. + """ + dir1 = Directory( + entries=( + DirectoryEntry( + name=b"name1", type="file", target=b"\x00" * 20, perms=0o100000 + ), + ), + raw_manifest=b"abc", + ) + dir2 = Directory( + entries=( + DirectoryEntry( + name=b"name2", type="file", target=b"\x00" * 20, perms=0o100000 + ), + ), + raw_manifest=b"abc", + ) + assert dir1.id == dir2.id # because it is computed from the raw_manifest only + + assert swh_storage.directory_add([dir1])["directory:add"] == 1 + assert swh_storage.directory_add([dir2])["directory:add"] in (0, 1) + + if check_ls: + # This assertion is skipped when running from + # test_directory_add_raw_manifest__different_entries__allow_overwrite + assert [entry["name"] for entry in swh_storage.directory_ls(dir1.id)] == ( + [b"name1"] + ) + + # used in TestCassandraStorage by + # test_directory_add_raw_manifest__different_entries__allow_overwrite + return dir1.id + def test_directory_ls_recursive(self, swh_storage, sample_data): # create consistent dataset regarding the directories we want to list content, content2 = sample_data.contents[:2] diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py --- a/swh/storage/tests/test_cassandra.py +++ b/swh/storage/tests/test_cassandra.py @@ -580,6 +580,43 @@ assert list(swh_storage.directory_ls(directory.id)) == [] assert swh_storage.directory_get_entries(directory.id) is None + def test_directory_add_raw_manifest__different_entries__allow_overwrite( + self, swh_storage + ): + """This test demonstrates a shortcoming of the Cassandra storage backend's + design: + + 1. add a directory with an entry named "name1" and raw_manifest="abc" + 2. add a directory with an entry named "name2" and the same raw_manifest + 3. the directories' id is computed only from the raw_manifest, so both + directories have the same id, which causes their entries to be + "additive" in the database; so directory_ls returns both entries + + However, by default, the Cassandra storage has allow_overwrite=False, + which "accidentally" avoids this issue most of the time, by skipping insertion + if an object with the same id is already in the database. + + This can still be an issue when either allow_overwrite=True or when inserting + both directories at about the same time (because of the lack of + transactionality); but the likelihood of two clients inserting two different + objects with the same manifest at the same time is very low, it could only + happen if loaders running in parallel used different (or nondeterministic) + parsers on corrupt objects. + """ + assert ( + swh_storage._allow_overwrite is False + ), "Unexpected default _allow_overwrite value" + swh_storage._allow_overwrite = True + + # Run the other test, but skip its last assertion + dir_id = self.test_directory_add_raw_manifest__different_entries( + swh_storage, check_ls=False + ) + assert [entry["name"] for entry in swh_storage.directory_ls(dir_id)] == [ + b"name1", + b"name2", + ] + def test_snapshot_add_atomic(self, swh_storage, sample_data, mocker): """Checks that a crash occurring after some snapshot branches were written does not cause the snapshot to be (partially) visible.