diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -367,6 +367,10 @@
         target of an snapshot for `origin` in the archive)."""
         ...
 
+    def revision_is_head(self, revision: RevisionEntry) -> bool:
+        """Check if `revision` is associated as a head revision for some origin."""
+        ...
+
     def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]:
         """Retrieve the date associated to `revision`."""
         ...
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -50,14 +50,21 @@
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     origins: List[OriginEntry],
+    commit: bool = True,
 ) -> None:
     for origin in origins:
         provenance.origin_add(origin)
         origin.retrieve_revisions(archive)
         for revision in origin.revisions:
-            graph = HistoryGraph(archive, revision)
-            origin_add_revision(provenance, origin, graph)
-    provenance.flush()
+            if not provenance.revision_is_head(revision):
+                graph = HistoryGraph(archive, revision)
+                origin_add_revision(provenance, origin, graph)
+            # head is treated separately since it should always be added to the
+            # given origin
+            check_preferred_origin(provenance, origin, revision)
+            provenance.revision_add_to_origin(origin, revision)
+    if commit:
+        provenance.flush()
 
 
 @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "process_revision"})
@@ -66,11 +73,7 @@
     origin: OriginEntry,
     graph: HistoryGraph,
 ) -> None:
-    # head is treated separately since it should always be added to the given origin
-    check_preferred_origin(provenance, origin, graph.head)
-    provenance.revision_add_to_origin(origin, graph.head)
     visited = {graph.head}
-
     # head's history should be recursively iterated starting from its parents
     stack = list(graph.parents[graph.head])
     while stack:
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -472,6 +472,9 @@
     ) -> None:
         self.cache["revision_in_origin"].add((revision.id, origin.id))
 
+    def revision_is_head(self, revision: RevisionEntry) -> bool:
+        return bool(self.storage.relation_get(RelationType.REV_IN_ORG, [revision.id]))
+
     def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]:
         return self.get_dates("revision", [revision.id]).get(revision.id)
 