diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py --- a/swh/provenance/graph.py +++ b/swh/provenance/graph.py @@ -10,6 +10,7 @@ import os from typing import Any, Dict, Optional, Set +from swh.model.hashutil import hash_to_hex from swh.model.model import Sha1Git from .archive import ArchiveInterface @@ -21,68 +22,83 @@ class HistoryNode: def __init__( - self, entry: RevisionEntry, visited: bool = False, in_history: bool = False + self, entry: RevisionEntry, is_head: bool = False, in_history: bool = False ) -> None: self.entry = entry - # A revision is `visited` if it is directly pointed by an origin (ie. a head + # A revision is `is_head` if it is directly pointed by an origin (ie. a head # revision for some snapshot) - self.visited = visited + self.is_head = is_head # A revision is `in_history` if it appears in the history graph of an already # processed revision in the provenance database self.in_history = in_history - self.parents: Set[HistoryNode] = set() - - def add_parent( - self, parent: RevisionEntry, visited: bool = False, in_history: bool = False - ) -> HistoryNode: - node = HistoryNode(parent, visited=visited, in_history=in_history) - self.parents.add(node) - return node + # XXX: the current simplified version of the origin-revision layer algorithm + # does not use this previous two flags at all. They are kept for now but might + # be removed in the future (hence, RevisionEntry might be used instead of + # HistoryNode). def __str__(self) -> str: - return ( - f"<{self.entry}: visited={self.visited}, in_history={self.in_history}, " - f"parents=[{', '.join(str(parent) for parent in self.parents)}]>" - ) + return f"<{self.entry}: is_head={self.is_head}, in_history={self.in_history}>" - def __eq__(self, other: Any) -> bool: - return isinstance(other, HistoryNode) and self.__dict__ == other.__dict__ + def as_dict(self) -> Dict[str, Any]: + return { + "rev": hash_to_hex(self.entry.id), + "is_head": self.is_head, + "in_history": self.in_history, + } - def __hash__(self) -> int: - return hash((self.entry, self.visited, self.in_history)) +class HistoryGraph: + def __init__( + self, + archive: ArchiveInterface, + provenance: ProvenanceInterface, + revision: RevisionEntry, + ) -> None: + self._head = HistoryNode( + revision, + is_head=provenance.revision_visited(revision), + in_history=provenance.revision_in_history(revision), + ) + self._graph: Dict[HistoryNode, Set[HistoryNode]] = {} + + stack = [self._head] + while stack: + current = stack.pop() + + if current not in self._graph: + self._graph[current] = set() + current.entry.retrieve_parents(archive) + for parent in current.entry.parents: + node = HistoryNode( + parent, + is_head=provenance.revision_visited(parent), + in_history=provenance.revision_in_history(parent), + ) + self._graph[current].add(node) + stack.append(node) -def build_history_graph( - archive: ArchiveInterface, - provenance: ProvenanceInterface, - revision: RevisionEntry, -) -> HistoryNode: - """Recursively build the history graph from the given revision""" + @property + def head(self) -> HistoryNode: + return self._head - root = HistoryNode( - revision, - visited=provenance.revision_visited(revision), - in_history=provenance.revision_in_history(revision), - ) - stack = [root] - logging.debug( - f"Recursively creating history graph for revision {revision.id.hex()}..." - ) - while stack: - current = stack.pop() + @property + def parents(self) -> Dict[HistoryNode, Set[HistoryNode]]: + return self._graph - current.entry.retrieve_parents(archive) - for rev in current.entry.parents: - node = current.add_parent( - rev, - visited=provenance.revision_visited(rev), - in_history=provenance.revision_in_history(rev), - ) - stack.append(node) - logging.debug( - f"History graph for revision {revision.id.hex()} successfully created!" - ) - return root + def __str__(self) -> str: + return f" Dict[str, Any]: + return { + "head": self.head.as_dict(), + "graph": { + hash_to_hex(node.entry.id): sorted( + [parent.as_dict() for parent in parents], + key=lambda d: d["rev"], + ) + for node, parents in self._graph.items() + }, + } class IsochroneNode: diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -11,7 +11,7 @@ from swh.model.model import Sha1Git from .archive import ArchiveInterface -from .graph import HistoryNode, build_history_graph +from .graph import HistoryGraph from .interface import ProvenanceInterface from .model import OriginEntry, RevisionEntry @@ -54,7 +54,7 @@ provenance.origin_add(origin) origin.retrieve_revisions(archive) for revision in origin.revisions: - graph = build_history_graph(archive, provenance, revision) + graph = HistoryGraph(archive, provenance, revision) origin_add_revision(provenance, origin, graph) done = time.time() provenance.flush() @@ -69,7 +69,7 @@ def origin_add_revision( provenance: ProvenanceInterface, origin: OriginEntry, - graph: HistoryNode, + graph: HistoryGraph, ) -> None: # XXX: simplified version of the origin-revision algorithm. This is generating flat # models for the history of all head revisions. No previous result is reused now! @@ -77,20 +77,22 @@ # revisions due to a wrong reuse logic. # head is treated separately since it should always be added to the given origin - head = graph.entry - check_preferred_origin(provenance, origin, head) - provenance.revision_add_to_origin(origin, head) + check_preferred_origin(provenance, origin, graph.head.entry) + provenance.revision_add_to_origin(origin, graph.head.entry) + visited = {graph.head} # head's history should be recursively iterated starting from its parents - stack = list(graph.parents) + stack = list(graph.parents[graph.head]) while stack: current = stack.pop() check_preferred_origin(provenance, origin, current.entry) # create a link between it and the head, and recursively walk its history - provenance.revision_add_before_revision(head, current.entry) - for parent in current.parents: - stack.append(parent) + provenance.revision_add_before_revision(graph.head.entry, current.entry) + visited.add(current) + for parent in graph.parents[current]: + if parent not in visited: + stack.append(parent) def check_preferred_origin( diff --git a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml --- a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml +++ b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml @@ -2,188 +2,229 @@ - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + - head: + rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: False + in_history: False + graph: + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: False + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R03 and R06 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True + - head: + rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: False + in_history: False + graph: + 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + is_head: False + in_history: False + 1c533587277731236616cac0d44f3b46c1da0f8a: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: False + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] + - head: + rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + is_head: False + in_history: False + graph: + 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R05 and R06 - origin: "https://repo_with_merges/2/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" - parents: - - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" - parents: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - visited: True - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - in_history: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - visited: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True + - head: + rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" + is_head: False + in_history: False + graph: + 65e58853df939b318c106c4c1f55acaf8b41c74c: + - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" + is_head: False + in_history: False + 0d66eadcc15e0d7f6cfd4289329a7749a1309982: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: True + in_history: False + 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + is_head: False + in_history: True + 1c533587277731236616cac0d44f3b46c1da0f8a: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] + - head: + rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + is_head: True + in_history: False + graph: + 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R06 and R07 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - visited: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" - parents: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - in_history: True - visited: True - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - in_history: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True + - head: + rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + is_head: True + in_history: False + graph: + 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] + - head: + rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" + is_head: False + in_history: False + graph: + fff0089fad98e8f5b46ec5c9025a20a602851ba6: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: True + in_history: True + 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + is_head: False + in_history: True + 1c533587277731236616cac0d44f3b46c1da0f8a: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R08 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" - parents: - - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" - visited: True - parents: - - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" - in_history: True - parents: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - in_history: True - visited: True - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - in_history: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - visited: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" - visited: True - parents: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - in_history: True - visited: True - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - in_history: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True + - head: + rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" + is_head: False + in_history: False + graph: + 7c8f29237dded4f9d265e46ec7066503e7858e87: + - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" + is_head: True + in_history: False + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + is_head: True + in_history: False + - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" + is_head: True + in_history: False + 65e58853df939b318c106c4c1f55acaf8b41c74c: + - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" + is_head: False + in_history: True + 0d66eadcc15e0d7f6cfd4289329a7749a1309982: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: True + in_history: True + 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + is_head: False + in_history: True + 1c533587277731236616cac0d44f3b46c1da0f8a: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] + 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + fff0089fad98e8f5b46ec5c9025a20a602851ba6: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: True + in_history: True # History graph for snapshot with branches: R08 - origin: "https://repo_with_merges/2/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" - visited: True - parents: - - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" - in_history: True - visited: True - parents: - - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" - in_history: True - parents: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - in_history: True - visited: True - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - in_history: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - in_history: True - visited: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True - - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" - in_history: True - visited: True - parents: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - in_history: True - visited: True - parents: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - in_history: True - parents: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - in_history: True - visited: True - parents: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - in_history: True + - head: + rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" + is_head: True + in_history: False + graph: + 7c8f29237dded4f9d265e46ec7066503e7858e87: + - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" + is_head: True + in_history: True + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + is_head: True + in_history: True + - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" + is_head: True + in_history: True + 65e58853df939b318c106c4c1f55acaf8b41c74c: + - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" + is_head: False + in_history: True + 0d66eadcc15e0d7f6cfd4289329a7749a1309982: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: True + in_history: True + 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + is_head: False + in_history: True + 1c533587277731236616cac0d44f3b46c1da0f8a: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + 1444db96cbd8cd791abe83527becee73d3c64e86: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + is_head: False + in_history: True + c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] + 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + is_head: True + in_history: True + fff0089fad98e8f5b46ec5c9025a20a602851ba6: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + is_head: True + in_history: True diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py --- a/swh/provenance/tests/test_history_graph.py +++ b/swh/provenance/tests/test_history_graph.py @@ -3,34 +3,18 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict - import pytest import yaml from swh.model.hashutil import hash_to_bytes from swh.provenance.archive import ArchiveInterface -from swh.provenance.graph import HistoryNode, build_history_graph +from swh.provenance.graph import HistoryGraph from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry, RevisionEntry from swh.provenance.origin import origin_add_revision from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data -def history_graph_from_dict(d: Dict[str, Any]) -> HistoryNode: - """Takes a dictionary representing a tree of HistoryNode objects, and - recursively builds the corresponding graph.""" - node = HistoryNode( - entry=RevisionEntry(hash_to_bytes(d["rev"])), - visited=d.get("visited", False), - in_history=d.get("in_history", False), - ) - node.parents = set( - history_graph_from_dict(parent) for parent in d.get("parents", []) - ) - return node - - @pytest.mark.parametrize( "repo, visit", (("with-merges", "visits-01"),), @@ -54,17 +38,16 @@ entry = OriginEntry(expected["origin"], hash_to_bytes(expected["snapshot"])) provenance.origin_add(entry) - for graph_as_dict in expected["graphs"]: - expected_graph = history_graph_from_dict(graph_as_dict) - print("Expected graph:", expected_graph) + for expected_graph_as_dict in expected["graphs"]: + print("Expected graph:", expected_graph_as_dict) - computed_graph = build_history_graph( + computed_graph = HistoryGraph( archive, provenance, - RevisionEntry(hash_to_bytes(graph_as_dict["rev"])), + RevisionEntry(hash_to_bytes(expected_graph_as_dict["head"]["rev"])), ) - print("Computed graph:", computed_graph) - assert computed_graph == expected_graph + print("Computed graph:", computed_graph.as_dict()) + assert computed_graph.as_dict() == expected_graph_as_dict origin_add_revision(provenance, entry, computed_graph)