diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -261,6 +261,13 @@
         select_cols = ListedOrigin.select_columns()
         insert_cols, insert_meta = ListedOrigin.insert_columns_and_metavars()
 
+        # Collapse origins sharing a primary key (last occurrence wins) so a single
+        # execute_values page never carries two rows hitting the same conflict.
+        deduplicated_origins = {
+            tuple(getattr(origin, k) for k in pk_cols): origin
+            for origin in listed_origins
+        }
+
         upsert_cols = [col for col in insert_cols if col not in pk_cols]
         upsert_set = ", ".join(f"{col} = EXCLUDED.{col}" for col in upsert_cols)
 
@@ -274,7 +281,7 @@
         ret = psycopg2.extras.execute_values(
             cur=cur,
             sql=query,
-            argslist=(attr.asdict(origin) for origin in listed_origins),
+            argslist=(attr.asdict(origin) for origin in deduplicated_origins.values()),
             template=f"({', '.join(insert_meta)})",
             page_size=1000,
             fetch=True,
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -686,6 +686,15 @@
 
         assert all(origin.first_seen == origin.last_seen for origin in ret)
 
+    def test_record_listed_origins_with_duplicate(self, swh_scheduler, listed_origins):
+        # the duplicates must be in the same page to raise the "on conflict error"
+        listed_origins.insert(0, listed_origins[0])
+
+        ret = swh_scheduler.record_listed_origins(listed_origins)
+
+        # the duplicated origin must only be returned once
+        assert len(ret) == len(listed_origins) - 1
+
     def test_record_listed_origins_upsert(self, swh_scheduler, listed_origins):
         # First, insert `cutoff` origins
         cutoff = 100