diff --git a/swh/provenance/postgresql/provenancedb.py b/swh/provenance/postgresql/provenancedb.py --- a/swh/provenance/postgresql/provenancedb.py +++ b/swh/provenance/postgresql/provenancedb.py @@ -146,53 +146,20 @@ def insert_all(self): # Performe insertions with cached information - if self.cache["content"]["added"]: - psycopg2.extras.execute_values( - self.cursor, - """ - LOCK TABLE ONLY content; - INSERT INTO content(sha1, date) VALUES %s - ON CONFLICT (sha1) DO - UPDATE SET date=LEAST(EXCLUDED.date,content.date) - """, - [ - (x, self.cache["content"]["data"][x]) - for x in self.cache["content"]["added"] - ], - ) - self.cache["content"]["added"].clear() - - if self.cache["directory"]["added"]: - psycopg2.extras.execute_values( - self.cursor, - """ - LOCK TABLE ONLY directory; - INSERT INTO directory(sha1, date) VALUES %s - ON CONFLICT (sha1) DO - UPDATE SET date=LEAST(EXCLUDED.date,directory.date) - """, - [ - (x, self.cache["directory"]["data"][x]) - for x in self.cache["directory"]["added"] - ], - ) - self.cache["directory"]["added"].clear() - - if self.cache["revision"]["added"]: - psycopg2.extras.execute_values( - self.cursor, - """ - LOCK TABLE ONLY revision; - INSERT INTO revision(sha1, date) VALUES %s - ON CONFLICT (sha1) DO - UPDATE SET date=LEAST(EXCLUDED.date,revision.date) - """, - [ - (x, self.cache["revision"]["data"][x]) - for x in self.cache["revision"]["added"] - ], - ) - self.cache["revision"]["added"].clear() + for table in ("content", "directory", "revision"): + cache = self.cache[table] + if cache["added"]: + psycopg2.extras.execute_values( + self.cursor, + f""" + LOCK TABLE ONLY {table}; + INSERT INTO {table}(sha1, date) VALUES %s + ON CONFLICT (sha1) DO + UPDATE SET date=LEAST(EXCLUDED.date, {table}.date) + """, + [(x, cache["data"][x]) for x in cache["added"]], + ) + cache["added"].clear() # Relations should come after ids for elements were resolved if self.cache["content_early_in_rev"]: @@ -401,24 +368,6 @@ # SQL queries (one to insert missing entries in the location' table, # one to insert entries in the dst_table) - # Resolve src0 ids - src0_sha1s = tuple(set(sha1 for (sha1, _, _) in self.cache[dst_table])) - fmt = ",".join(["%s"] * len(src0_sha1s)) - self.cursor.execute( - f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({fmt})""", - src0_sha1s, - ) - src0_values = dict(self.cursor.fetchall()) - - # Resolve src1 ids - src1_sha1s = tuple(set(sha1 for (_, sha1, _) in self.cache[dst_table])) - fmt = ",".join(["%s"] * len(src1_sha1s)) - self.cursor.execute( - f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({fmt})""", - src1_sha1s, - ) - src1_values = dict(self.cursor.fetchall()) - # insert missing locations locations = tuple(set((loc,) for (_, _, loc) in self.cache[dst_table])) psycopg2.extras.execute_values( @@ -430,26 +379,14 @@ """, locations, ) - # fetch location ids - fmt = ",".join(["%s"] * len(locations)) - self.cursor.execute( - f"SELECT path, id FROM location WHERE path IN ({fmt})", - locations, - ) - loc_ids = dict(self.cursor.fetchall()) - # Insert values in dst_table - rows = [ - (src0_values[sha1_src], src1_values[sha1_dst], loc_ids[loc]) - for (sha1_src, sha1_dst, loc) in self.cache[dst_table] - ] - psycopg2.extras.execute_values( - self.cursor, - f""" - LOCK TABLE ONLY {dst_table}; - INSERT INTO {dst_table} VALUES %s - ON CONFLICT DO NOTHING - """, - rows, - ) + sql = f""" + INSERT INTO {dst_table} + SELECT {src0_table}.id, {src1_table}.id, location.id + FROM (VALUES %s) AS V(src, dst, path) + INNER JOIN {src0_table} on ({src0_table}.sha1=V.src) + INNER JOIN {src1_table} on ({src1_table}.sha1=V.dst) + INNER JOIN location on (location.path=V.path) + """ + psycopg2.extras.execute_values(self.cursor, sql, self.cache[dst_table]) self.cache[dst_table].clear()