Page Menu · Home · Software Heritage

D6753.id24529.diff
No One · Temporary

D6753.id24529.diff

diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -261,6 +261,11 @@
select_cols = ListedOrigin.select_columns()
insert_cols, insert_meta = ListedOrigin.insert_columns_and_metavars()
+ deduplicated_origins = {
+ tuple(getattr(origin, k) for k in pk_cols): origin
+ for origin in listed_origins
+ }
+
upsert_cols = [col for col in insert_cols if col not in pk_cols]
upsert_set = ", ".join(f"{col} = EXCLUDED.{col}" for col in upsert_cols)
@@ -274,7 +279,7 @@
ret = psycopg2.extras.execute_values(
cur=cur,
sql=query,
- argslist=(attr.asdict(origin) for origin in listed_origins),
+ argslist=(attr.asdict(origin) for origin in deduplicated_origins.values()),
template=f"({', '.join(insert_meta)})",
page_size=1000,
fetch=True,
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -686,6 +686,15 @@
assert all(origin.first_seen == origin.last_seen for origin in ret)
+ def test_record_listed_origins_with_duplicate(self, swh_scheduler, listed_origins):
+ # duplicates must share an execute_values page to hit the ON CONFLICT error
+ listed_origins.insert(0, listed_origins[0])
+
+ ret = swh_scheduler.record_listed_origins(listed_origins)
+
+ # without the duplicate
+ assert len(ret) == len(listed_origins) - 1
+
def test_record_listed_origins_upsert(self, swh_scheduler, listed_origins):
# First, insert `cutoff` origins
cutoff = 100

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 8:54 PM (17 h, 38 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221433

Event Timeline