diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -844,6 +844,7 @@
 
     def origin_add(self, origins: List[Origin]) -> Dict[str, int]:
         to_add = [ori for ori in origins if self.origin_get_one(ori.url) is None]
+        to_add = list(dict.fromkeys(to_add))  # dedupe, keeping first-seen order
         self.journal_writer.origin_add(to_add)
         for origin in to_add:
             self._cql_runner.origin_add_one(
diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
--- a/swh/storage/postgresql/storage.py
+++ b/swh/storage/postgresql/storage.py
@@ -1209,7 +1209,7 @@
         urls = [o.url for o in origins]
         known_origins = set(url for (url,) in db.origin_get_by_url(urls, cur))
         # use lists here to keep origins sorted; some tests depend on this
-        to_add = [url for url in urls if url not in known_origins]
+        to_add = [url for url in dict.fromkeys(urls) if url not in known_origins]
         self.journal_writer.origin_add([Origin(url=url) for url in to_add])
 
         added = 0
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -1189,6 +1189,24 @@
         )
         assert add2 == {"origin:add": 0}
 
+    def test_origin_add_twice_at_once(self, swh_storage, sample_data):
+        """Duplicate origins within a single origin_add call are added once."""
+        origin, origin2 = sample_data.origins[:2]
+        expected = [("origin", origin), ("origin", origin2)]
+
+        add1 = swh_storage.origin_add([origin, origin2, origin, origin2])
+        journal_objects = list(swh_storage.journal_writer.journal.objects)
+        # Check the length too: a set comparison alone would hide duplicates.
+        assert len(journal_objects) == len(expected)
+        assert set(journal_objects) == set(expected)
+        assert add1 == {"origin:add": 2}
+
+        add2 = swh_storage.origin_add([origin, origin2, origin, origin2])
+        journal_objects = list(swh_storage.journal_writer.journal.objects)
+        assert len(journal_objects) == len(expected)
+        assert set(journal_objects) == set(expected)
+        assert add2 == {"origin:add": 0}
+
     def test_origin_get(self, swh_storage, sample_data):
         origin, origin2 = sample_data.origins[:2]
 