diff --git a/swh/provenance/model.py b/swh/provenance/model.py --- a/swh/provenance/model.py +++ b/swh/provenance/model.py @@ -10,14 +10,11 @@ class OriginEntry: - def __init__( - self, url: str, date: datetime, snapshot: bytes, id: Optional[int] = None - ): + def __init__(self, url: str, date: datetime, snapshot: bytes): self.url = url # TODO: this is probably not needed and will be removed! # self.date = date self.snapshot = snapshot - self.id = id self._revisions: Optional[List[RevisionEntry]] = None def retrieve_revisions(self, archive: ArchiveInterface): diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -69,8 +69,6 @@ origin: OriginEntry, graph: HistoryNode, ): - origin.id = provenance.origin_get_id(origin) - # head is treated separately since it should always be added to the given origin head = graph.entry check_preferred_origin(provenance, origin, head) diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -112,7 +112,7 @@ # This might be useless! data.clear() - def insert_origin_head(self, data: Dict[bytes, int]): + def insert_origin_head(self, data: Set[Tuple[bytes, str]]): if data: psycopg2.extras.execute_values( self.cursor, @@ -120,9 +120,10 @@ """ LOCK TABLE ONLY revision_in_origin; INSERT INTO revision_in_origin - SELECT R.id, V.org + SELECT R.id, O.id FROM (VALUES %s) AS V(rev, org) INNER JOIN revision AS R on (R.sha1=V.rev) + INNER JOIN origin AS O on (O.url=V.org::unix_path) """, data, ) @@ -149,28 +150,18 @@ ) data.clear() - def origin_get_id(self, url: str) -> int: - # Insert origin in the DB and return the assigned id - # XXX: not sure this works as expected if url is already in the db! + def revision_get_preferred_origin(self, revision: bytes) -> Optional[str]: self.cursor.execute( """ - LOCK TABLE ONLY origin; - INSERT INTO origin(url) VALUES (%s) - ON CONFLICT DO NOTHING - RETURNING id - """, - (url,), - ) - return self.cursor.fetchone()[0] - - def revision_get_preferred_origin(self, revision: bytes) -> Optional[int]: - self.cursor.execute( - """SELECT COALESCE(origin, 0) FROM revision WHERE sha1=%s""", (revision,) + SELECT O.url + FROM revision AS R + JOIN origin as O + ON R.origin=O.id + WHERE R.sha1=%s""", + (revision,), ) row = self.cursor.fetchone() - # None means revision is not in database; - # 0 means revision has no preferred origin - return row[0] if row is not None and row[0] != 0 else None + return str(row[0], encoding="utf-8") if row is not None else None def revision_in_history(self, revision: bytes) -> bool: self.cursor.execute( @@ -198,7 +189,7 @@ ) return self.cursor.fetchone() is not None - def update_preferred_origin(self, data: Dict[bytes, int]): + def update_preferred_origin(self, data: Dict[bytes, str]): if data: # XXX: this is assuming the revision already exists in the db! It should # be improved by allowing null dates in the revision table. @@ -206,9 +197,10 @@ self.cursor, """ UPDATE revision - SET origin=V.org - FROM (VALUES %s) AS V(rev, org) - WHERE sha1=V.rev + SET origin=O.id + FROM (VALUES %s) AS V(rev, org) + INNER JOIN origin AS O on (O.url=V.org::unix_path) + WHERE sha1=V.rev """, data.items(), ) diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -70,9 +70,6 @@ ) -> None: ... - def origin_get_id(self, origin: OriginEntry) -> int: - ... - def revision_add(self, revision: RevisionEntry) -> None: ... @@ -89,7 +86,7 @@ def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: ... - def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[int]: + def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]: ... def revision_in_history(self, revision: RevisionEntry) -> bool: @@ -110,7 +107,7 @@ class OriginCache(TypedDict): - data: Dict[bytes, int] # TODO: we should switch to use Url instead + data: Dict[bytes, str] added: Set[bytes] @@ -124,7 +121,7 @@ directory_in_revision: Set[Tuple[bytes, bytes, bytes]] # these two are for the origin layer revision_before_revision: Dict[bytes, Set[bytes]] - revision_in_origin: Set[Tuple[bytes, int]] + revision_in_origin: Set[Tuple[bytes, str]] revision_preferred_origin: OriginCache @@ -241,12 +238,6 @@ cache["data"].update(self.storage.get_dates(entity, list(missing_ids))) return {sha1: cache["data"][sha1] for sha1 in ids if sha1 in cache["data"]} - def origin_get_id(self, origin: OriginEntry) -> int: - if origin.id is None: - return self.storage.origin_get_id(origin.url) - else: - return origin.id - def revision_add(self, revision: RevisionEntry): # Add current revision to the compact DB assert revision.date is not None @@ -261,17 +252,16 @@ ) def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): - assert origin.id is not None - self.cache["revision_in_origin"].add((revision.id, origin.id)) + self.cache["revision_in_origin"].add((revision.id, origin.url)) def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: return self.get_dates("revision", [revision.id]).get(revision.id, None) - def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[int]: + def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]: if revision.id not in self.cache["revision_preferred_origin"]["data"]: - origin = self.storage.revision_get_preferred_origin(revision.id) - if origin is not None: - self.cache["revision_preferred_origin"]["data"][revision.id] = origin + url = self.storage.revision_get_preferred_origin(revision.id) + if url is not None: + self.cache["revision_preferred_origin"]["data"][revision.id] = url return self.cache["revision_preferred_origin"]["data"].get(revision.id) def revision_in_history(self, revision: RevisionEntry) -> bool: @@ -282,8 +272,7 @@ def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): - assert origin.id is not None - self.cache["revision_preferred_origin"]["data"][revision.id] = origin.id + self.cache["revision_preferred_origin"]["data"][revision.id] = origin.url self.cache["revision_preferred_origin"]["added"].add(revision.id) def revision_visited(self, revision: RevisionEntry) -> bool: