diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
index a255779..d232d48 100644
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -1,264 +1,280 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from __future__ import annotations
 from datetime import datetime, timezone
 import logging
 import os
 from typing import Any, Dict, Optional, Set

+from swh.model.hashutil import hash_to_hex
 from swh.model.model import Sha1Git

 from .archive import ArchiveInterface
 from .interface import ProvenanceInterface
 from .model import DirectoryEntry, RevisionEntry

 UTCMIN = datetime.min.replace(tzinfo=timezone.utc)


 class HistoryNode:
     def __init__(
-        self, entry: RevisionEntry, visited: bool = False, in_history: bool = False
+        self, entry: RevisionEntry, is_head: bool = False, in_history: bool = False
     ) -> None:
         self.entry = entry
-        # A revision is `visited` if it is directly pointed by an origin (ie. a head
+        # A revision is `is_head` if it is directly pointed to by an origin (i.e. a head
         # revision for some snapshot)
-        self.visited = visited
+        self.is_head = is_head
         # A revision is `in_history` if it appears in the history graph of an already
         # processed revision in the provenance database
         self.in_history = in_history
-        self.parents: Set[HistoryNode] = set()
-
-    def add_parent(
-        self, parent: RevisionEntry, visited: bool = False, in_history: bool = False
-    ) -> HistoryNode:
-        node = HistoryNode(parent, visited=visited, in_history=in_history)
-        self.parents.add(node)
-        return node
+        # XXX: the current simplified version of the origin-revision layer algorithm
+        # does not use these two flags at all. They are kept for now but might be
+        # removed in the future (hence, RevisionEntry might be used instead of
+        # HistoryNode).

     def __str__(self) -> str:
-        return (
-            f"<{self.entry}: visited={self.visited}, in_history={self.in_history}, "
-            f"parents=[{', '.join(str(parent) for parent in self.parents)}]>"
-        )
+        return f"<{self.entry}: is_head={self.is_head}, in_history={self.in_history}>"

-    def __eq__(self, other: Any) -> bool:
-        return isinstance(other, HistoryNode) and self.__dict__ == other.__dict__
+    def as_dict(self) -> Dict[str, Any]:
+        return {
+            "rev": hash_to_hex(self.entry.id),
+            "is_head": self.is_head,
+            "in_history": self.in_history,
+        }

-    def __hash__(self) -> int:
-        return hash((self.entry, self.visited, self.in_history))

+class HistoryGraph:
+    def __init__(
+        self,
+        archive: ArchiveInterface,
+        provenance: ProvenanceInterface,
+        revision: RevisionEntry,
+    ) -> None:
+        self._head = HistoryNode(
+            revision,
+            is_head=provenance.revision_visited(revision),
+            in_history=provenance.revision_in_history(revision),
+        )
+        self._graph: Dict[HistoryNode, Set[HistoryNode]] = {}
+
+        stack = [self._head]
+        while stack:
+            current = stack.pop()
+
+            if current not in self._graph:
+                self._graph[current] = set()
+                current.entry.retrieve_parents(archive)
+                for parent in current.entry.parents:
+                    node = HistoryNode(
+                        parent,
+                        is_head=provenance.revision_visited(parent),
+                        in_history=provenance.revision_in_history(parent),
+                    )
+                    self._graph[current].add(node)
+                    stack.append(node)

-def build_history_graph(
-    archive: ArchiveInterface,
-    provenance: ProvenanceInterface,
-    revision: RevisionEntry,
-) -> HistoryNode:
-    """Recursively build the history graph from the given revision"""
+    @property
+    def head(self) -> HistoryNode:
+        return self._head

-    root = HistoryNode(
-        revision,
-        visited=provenance.revision_visited(revision),
-        in_history=provenance.revision_in_history(revision),
-    )
-    stack = [root]
-    logging.debug(
-        f"Recursively creating history graph for revision {revision.id.hex()}..."
-    )
-    while stack:
-        current = stack.pop()
+    @property
+    def parents(self) -> Dict[HistoryNode, Set[HistoryNode]]:
+        return self._graph

-        current.entry.retrieve_parents(archive)
-        for rev in current.entry.parents:
-            node = current.add_parent(
-                rev,
-                visited=provenance.revision_visited(rev),
-                in_history=provenance.revision_in_history(rev),
-            )
-            stack.append(node)
-    logging.debug(
-        f"History graph for revision {revision.id.hex()} successfully created!"
-    )
-    return root
+    def __str__(self) -> str:
+        return f"<HistoryGraph: {self.as_dict()}>"
+
+    def as_dict(self) -> Dict[str, Any]:
+        return {
+            "head": self.head.as_dict(),
+            "graph": {
+                hash_to_hex(node.entry.id): sorted(
+                    [parent.as_dict() for parent in parents],
+                    key=lambda d: d["rev"],
+                )
+                for node, parents in self._graph.items()
+            },
+        }


 class IsochroneNode:
     def __init__(
         self,
         entry: DirectoryEntry,
         dbdate: Optional[datetime] = None,
         depth: int = 0,
         prefix: bytes = b"",
     ) -> None:
         self.entry = entry
         self.depth = depth

         # dbdate is the maxdate for this node that comes from the DB
         self._dbdate: Optional[datetime] = dbdate

         # maxdate is set by the maxdate computation algorithm
         self.maxdate: Optional[datetime] = None

         # known is True if this node is already known in the db; either because
         # the current directory actually exists in the database, or because all
         # the content of the current directory is known (subdirectories and files)
         self.known = self.dbdate is not None
         self.invalid = False
         self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name
         self.children: Set[IsochroneNode] = set()

     @property
     def dbdate(self) -> Optional[datetime]:
         # use a property to make this attribute (mostly) read-only
         return self._dbdate

     def invalidate(self) -> None:
         self._dbdate = None
         self.maxdate = None
         self.known = False
         self.invalid = True

     def add_directory(
         self, child: DirectoryEntry, date: Optional[datetime] = None
     ) -> IsochroneNode:
         # we should not be processing this node (ie add subdirectories or files) if it's
         # actually known by the provenance DB
         assert self.dbdate is None
         node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path)
         self.children.add(node)
         return node

     def __str__(self) -> str:
         return (
             f"<{self.entry}: depth={self.depth}, "
             f"dbdate={self.dbdate}, maxdate={self.maxdate}, "
             f"known={self.known}, invalid={self.invalid}, path={self.path!r}, "
             f"children=[{', '.join(str(child) for child in self.children)}]>"
         )

     def __eq__(self, other: Any) -> bool:
         return isinstance(other, IsochroneNode) and self.__dict__ == other.__dict__

     def __hash__(self) -> int:
         # only immutable attributes are considered to compute hash
         return hash((self.entry, self.depth, self.path))


 def build_isochrone_graph(
     archive: ArchiveInterface,
     provenance: ProvenanceInterface,
     revision: RevisionEntry,
     directory: DirectoryEntry,
 ) -> IsochroneNode:
     assert revision.date is not None
     assert revision.root == directory.id

     # this function process a revision in 2 steps:
     #
     # 1. build the tree structure of IsochroneNode objects (one INode per
     #    directory under the root directory of the revision but not following
     #    known subdirectories), and gather the dates from the DB for already
     #    known objects; for files, just keep all the dates in a global 'fdates'
     #    dict; note that in this step, we will only recurse the directories
     #    that are not already known.
     #
     # 2. compute the maxdate for each node of the tree that was not found in the DB.

     # Build the nodes structure
     root_date = provenance.directory_get_date_in_isochrone_frontier(directory)
     root = IsochroneNode(directory, dbdate=root_date)
     stack = [root]
     logging.debug(
         f"Recursively creating isochrone graph for revision {revision.id.hex()}..."
     )
     fdates: Dict[Sha1Git, datetime] = {}  # map {file_id: date}
     while stack:
         current = stack.pop()
         if current.dbdate is None or current.dbdate > revision.date:
             # If current directory has an associated date in the isochrone frontier that
             # is greater or equal to the current revision's one, it should be ignored as
             # the revision is being processed out of order.
             if current.dbdate is not None and current.dbdate > revision.date:
                 logging.debug(
                     f"Invalidating frontier on {current.entry.id.hex()}"
                     f" (date {current.dbdate})"
                     f" when processing revision {revision.id.hex()}"
                     f" (date {revision.date})"
                 )
                 current.invalidate()

             # Pre-query all known dates for directories in the current directory
             # for the provenance object to have them cached and (potentially) improve
             # performance.
             current.entry.retrieve_children(archive)
             ddates = provenance.directory_get_dates_in_isochrone_frontier(
                 current.entry.dirs
             )
             for dir in current.entry.dirs:
                 # Recursively analyse subdirectory nodes
                 node = current.add_directory(dir, date=ddates.get(dir.id, None))
                 stack.append(node)

             fdates.update(provenance.content_get_early_dates(current.entry.files))

     logging.debug(
         f"Isochrone graph for revision {revision.id.hex()} successfully created!"
     )

     # Precalculate max known date for each node in the graph (only directory nodes are
     # pushed to the stack).
     logging.debug(f"Computing maxdates for revision {revision.id.hex()}...")
     stack = [root]
     while stack:
         current = stack.pop()
         # Current directory node is known if it already has an assigned date (ie. it was
         # already seen as an isochrone frontier).
         if current.known:
             assert current.maxdate is None
             current.maxdate = current.dbdate
         else:
             if any(x.maxdate is None for x in current.children):
                 # at least one child of current has no maxdate yet
                 # Current node needs to be analysed again after its children.
                 stack.append(current)
                 for child in current.children:
                     if child.maxdate is None:
                         # if child.maxdate is None, it must be processed
                         stack.append(child)
             else:
                 # all the files and directories under current have a maxdate,
                 # we can infer the maxdate for current directory
                 assert current.maxdate is None
                 # if all content is already known, update current directory info.
                 current.maxdate = max(
                     [UTCMIN]
                     + [
                         child.maxdate
                         for child in current.children
                         if child.maxdate is not None  # unnecessary, but needed for mypy
                     ]
                     + [
                         fdates.get(file.id, revision.date)
                         for file in current.entry.files
                     ]
                 )
                 if current.maxdate <= revision.date:
                     current.known = (
                         # true if all subdirectories are known
                         all(child.known for child in current.children)
                         # true if all files are in fdates, i.e. if all files were known
                         # *before building this isochrone graph node*
                         # Note: the 'all()' is lazy: will stop iterating as soon as
                         # possible
                         and all((file.id in fdates) for file in current.entry.files)
                     )
                 else:
                     # at least one content is being processed out-of-order, then current
                     # node should be treated as unknown
                     current.maxdate = revision.date
                     current.known = False

     logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!")
     return root
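
The new HistoryGraph constructor above replaces the recursive build_history_graph() helper with an iterative traversal: pop a revision off the stack, record an empty parent set for it, fetch its parents from the archive, and push each parent node. Below is a minimal standalone sketch of that traversal, with a plain dict of hex strings standing in for the archive and provenance interfaces; PARENTS and build_parent_map are illustrative names, not part of the swh.provenance API.

from typing import Dict, Set

PARENTS: Dict[str, Set[str]] = {
    "R03": {"R02"},
    "R02": {"R01"},
    "R01": set(),
}


def build_parent_map(head: str) -> Dict[str, Set[str]]:
    graph: Dict[str, Set[str]] = {}
    stack = [head]
    while stack:
        current = stack.pop()
        if current not in graph:  # expand each revision only once
            graph[current] = set(PARENTS[current])  # archive lookup stand-in
            stack.extend(PARENTS[current])
    return graph


assert build_parent_map("R03") == {"R03": {"R02"}, "R02": {"R01"}, "R01": set()}

The result has exactly the shape exposed by HistoryGraph.parents: a flat mapping from each reachable revision to the set of its parents, instead of the old tree of nested HistoryNode objects.
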
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
index f8a810a..dcb8e48 100644
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -1,105 +1,107 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from itertools import islice
 import logging
 import time
 from typing import Generator, Iterable, Iterator, List, Optional, Tuple

 from swh.model.model import Sha1Git

 from .archive import ArchiveInterface
-from .graph import HistoryNode, build_history_graph
+from .graph import HistoryGraph
 from .interface import ProvenanceInterface
 from .model import OriginEntry, RevisionEntry


 class CSVOriginIterator:
     """Iterator over origin visit statuses typically present in the given CSV
     file.

     The input is an iterator that produces 2 elements per row:

       (url, snap)

     where:
     - url: is the origin url of the visit
     - snap: sha1_git of the snapshot pointed by the visit status
     """

     def __init__(
         self,
         statuses: Iterable[Tuple[str, Sha1Git]],
         limit: Optional[int] = None,
     ) -> None:
         self.statuses: Iterator[Tuple[str, Sha1Git]]
         if limit is not None:
             self.statuses = islice(statuses, limit)
         else:
             self.statuses = iter(statuses)

     def __iter__(self) -> Generator[OriginEntry, None, None]:
         return (OriginEntry(url, snapshot) for url, snapshot in self.statuses)


 def origin_add(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     origins: List[OriginEntry],
 ) -> None:
     start = time.time()
     for origin in origins:
         provenance.origin_add(origin)
         origin.retrieve_revisions(archive)
         for revision in origin.revisions:
-            graph = build_history_graph(archive, provenance, revision)
+            graph = HistoryGraph(archive, provenance, revision)
             origin_add_revision(provenance, origin, graph)
     done = time.time()
     provenance.flush()
     stop = time.time()
     logging.debug(
         "Origins "
         ";".join([origin.id.hex() + ":" + origin.snapshot.hex() for origin in origins])
         + f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
     )


 def origin_add_revision(
     provenance: ProvenanceInterface,
     origin: OriginEntry,
-    graph: HistoryNode,
+    graph: HistoryGraph,
 ) -> None:
     # XXX: simplified version of the origin-revision algorithm. This is generating flat
     # models for the history of all head revisions. No previous result is reused now!
     # The previous implementation was missing some paths from origins to certain
     # revisions due to a wrong reuse logic.

     # head is treated separately since it should always be added to the given origin
-    head = graph.entry
-    check_preferred_origin(provenance, origin, head)
-    provenance.revision_add_to_origin(origin, head)
+    check_preferred_origin(provenance, origin, graph.head.entry)
+    provenance.revision_add_to_origin(origin, graph.head.entry)
+    visited = {graph.head}

     # head's history should be recursively iterated starting from its parents
-    stack = list(graph.parents)
+    stack = list(graph.parents[graph.head])
     while stack:
         current = stack.pop()
         check_preferred_origin(provenance, origin, current.entry)

         # create a link between it and the head, and recursively walk its history
-        provenance.revision_add_before_revision(head, current.entry)
-        for parent in current.parents:
-            stack.append(parent)
+        provenance.revision_add_before_revision(graph.head.entry, current.entry)
+        visited.add(current)
+        for parent in graph.parents[current]:
+            if parent not in visited:
+                stack.append(parent)


 def check_preferred_origin(
     provenance: ProvenanceInterface,
     origin: OriginEntry,
     revision: RevisionEntry,
 ) -> None:
     # if the revision has no preferred origin just set the given origin as the
     # preferred one. TODO: this should be improved in the future!
     preferred = provenance.revision_get_preferred_origin(revision)
     if preferred is None:
         provenance.revision_set_preferred_origin(origin, revision)
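
origin_add_revision now walks the flat parents mapping with an explicit visited set, so ancestry shared between merge paths is not re-walked. A standalone sketch of that walk over the same graph shape follows; walk_history and the toy data are illustrative, and the sketch additionally guards at pop time (the code above only checks at push time), which keeps it duplicate-free even on diamond-shaped histories.

from typing import Dict, List, Set, Tuple


def walk_history(graph: Dict[str, Set[str]], head: str) -> List[Tuple[str, str]]:
    links: List[Tuple[str, str]] = []  # (head, ancestor) pairs, standing in for
    visited = {head}                   # the revision_add_before_revision calls
    stack = list(graph[head])
    while stack:
        current = stack.pop()
        if current in visited:  # a merge may have pushed this node twice
            continue
        visited.add(current)
        links.append((head, current))
        stack.extend(p for p in graph[current] if p not in visited)
    return links


# A small merge: R03 has parents R02a and R02b, both children of R01.
graph = {"R03": {"R02a", "R02b"}, "R02a": {"R01"}, "R02b": {"R01"}, "R01": set()}
assert sorted(walk_history(graph, "R03")) == [
    ("R03", "R01"), ("R03", "R02a"), ("R03", "R02b")
]

Every ancestor is linked to the head exactly once, which is the flat model the XXX comment above describes.
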
diff --git a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml
index 79a2d0e..8d2ad11 100644
--- a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml
+++ b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml
@@ -1,189 +1,230 @@
 # History graph for snapshot with branches: R01
 - origin: "https://repo_with_merges/1/"
   snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
   graphs:
-    - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-      parents:
-        - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+    - head:
+        rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+        is_head: False
+        in_history: False
+      graph:
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: False
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []

 # History graph for snapshot with branches: R03 and R06
 - origin: "https://repo_with_merges/1/"
   snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
   graphs:
-    - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-      parents:
-        - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-          parents:
-            - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-              visited: True
-              parents:
-                - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                  in_history: True
-    - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
-      parents:
-        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-              in_history: True
+    - head:
+        rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+        is_head: False
+        in_history: False
+      graph:
+        20f4da0f48609d9f7f908ebbcac3b3741a0f25cb:
+          - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
+            is_head: False
+            in_history: False
+        1c533587277731236616cac0d44f3b46c1da0f8a:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: False
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []
+    - head:
+        rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
+        is_head: False
+        in_history: False
+      graph:
+        72d92d41a9095db2dd6b8fb1c62d92c8251753ff:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []

 # History graph for snapshot with branches: R05 and R06
 - origin: "https://repo_with_merges/2/"
   snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
   graphs:
-    - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c"
-      parents:
-        - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982"
-          parents:
-            - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-              visited: True
-              parents:
-                - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-                  in_history: True
-                  parents:
-                    - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-                      in_history: True
-                      visited: True
-                      parents:
-                        - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                          in_history: True
-    - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
-      visited: True
-      parents:
-        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-              in_history: True
+    - head:
+        rev: "65e58853df939b318c106c4c1f55acaf8b41c74c"
+        is_head: False
+        in_history: False
+      graph:
+        65e58853df939b318c106c4c1f55acaf8b41c74c:
+          - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982"
+            is_head: False
+            in_history: False
+        0d66eadcc15e0d7f6cfd4289329a7749a1309982:
+          - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+            is_head: True
+            in_history: False
+        20f4da0f48609d9f7f908ebbcac3b3741a0f25cb:
+          - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
+            is_head: False
+            in_history: True
+        1c533587277731236616cac0d44f3b46c1da0f8a:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []
+    - head:
+        rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
+        is_head: True
+        in_history: False
+      graph:
+        72d92d41a9095db2dd6b8fb1c62d92c8251753ff:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []

 # History graph for snapshot with branches: R06 and R07
 - origin: "https://repo_with_merges/1/"
   snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
   graphs:
-    - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
-      visited: True
-      parents:
-        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-              in_history: True
-    - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6"
-      parents:
-        - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-              in_history: True
-              parents:
-                - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-                  in_history: True
-                  visited: True
-                  parents:
-                    - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                      in_history: True
+    - head:
+        rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
+        is_head: True
+        in_history: False
+      graph:
+        72d92d41a9095db2dd6b8fb1c62d92c8251753ff:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []
+    - head:
+        rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6"
+        is_head: False
+        in_history: False
+      graph:
+        fff0089fad98e8f5b46ec5c9025a20a602851ba6:
+          - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+            is_head: True
+            in_history: True
+        20f4da0f48609d9f7f908ebbcac3b3741a0f25cb:
+          - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
+            is_head: False
+            in_history: True
+        1c533587277731236616cac0d44f3b46c1da0f8a:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []

 # History graph for snapshot with branches: R08
 - origin: "https://repo_with_merges/1/"
   snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
   graphs:
-    - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87"
-      parents:
-        - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c"
-          visited: True
-          parents:
-            - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982"
-              in_history: True
-              parents:
-                - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-                  in_history: True
-                  visited: True
-                  parents:
-                    - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-                      in_history: True
-                      parents:
-                        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-                          in_history: True
-                          visited: True
-                          parents:
-                            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                              in_history: True
-    - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
-      visited: True
-      parents:
-        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-              in_history: True
-    - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6"
-      visited: True
-      parents:
-        - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-              in_history: True
-              parents:
-                - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-                  in_history: True
-                  visited: True
-                  parents:
-                    - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                      in_history: True
+    - head:
+        rev: "7c8f29237dded4f9d265e46ec7066503e7858e87"
+        is_head: False
+        in_history: False
+      graph:
+        7c8f29237dded4f9d265e46ec7066503e7858e87:
+          - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c"
+            is_head: True
+            in_history: False
+          - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
+            is_head: True
+            in_history: False
+          - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6"
+            is_head: True
+            in_history: False
+        65e58853df939b318c106c4c1f55acaf8b41c74c:
+          - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982"
+            is_head: False
+            in_history: True
+        0d66eadcc15e0d7f6cfd4289329a7749a1309982:
+          - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+            is_head: True
+            in_history: True
+        20f4da0f48609d9f7f908ebbcac3b3741a0f25cb:
+          - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
+            is_head: False
+            in_history: True
+        1c533587277731236616cac0d44f3b46c1da0f8a:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []
+        72d92d41a9095db2dd6b8fb1c62d92c8251753ff:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        fff0089fad98e8f5b46ec5c9025a20a602851ba6:
+          - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+            is_head: True
+            in_history: True

 # History graph for snapshot with branches: R08
 - origin: "https://repo_with_merges/2/"
   snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
   graphs:
-    - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87"
-      visited: True
-      parents:
-        - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982"
-              in_history: True
-              parents:
-                - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-                  in_history: True
-                  visited: True
-                  parents:
-                    - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-                      in_history: True
-                      parents:
-                        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-                          in_history: True
-                          visited: True
-                          parents:
-                            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                              in_history: True
-    - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
-      in_history: True
-      visited: True
-      parents:
-        - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-              in_history: True
-    - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6"
-      in_history: True
-      visited: True
-      parents:
-        - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
-          in_history: True
-          visited: True
-          parents:
-            - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
-              in_history: True
-              parents:
-                - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
-                  in_history: True
-                  visited: True
-                  parents:
-                    - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
-                      in_history: True
+    - head:
+        rev: "7c8f29237dded4f9d265e46ec7066503e7858e87"
+        is_head: True
+        in_history: False
+      graph:
+        7c8f29237dded4f9d265e46ec7066503e7858e87:
+          - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c"
+            is_head: True
+            in_history: True
+          - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff"
+            is_head: True
+            in_history: True
+          - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6"
+            is_head: True
+            in_history: True
+        65e58853df939b318c106c4c1f55acaf8b41c74c:
+          - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982"
+            is_head: False
+            in_history: True
+        0d66eadcc15e0d7f6cfd4289329a7749a1309982:
+          - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+            is_head: True
+            in_history: True
+        20f4da0f48609d9f7f908ebbcac3b3741a0f25cb:
+          - rev: "1c533587277731236616cac0d44f3b46c1da0f8a"
+            is_head: False
+            in_history: True
+        1c533587277731236616cac0d44f3b46c1da0f8a:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        1444db96cbd8cd791abe83527becee73d3c64e86:
+          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
+            is_head: False
+            in_history: True
+        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []
+        72d92d41a9095db2dd6b8fb1c62d92c8251753ff:
+          - rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
+            is_head: True
+            in_history: True
+        fff0089fad98e8f5b46ec5c9025a20a602851ba6:
+          - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb"
+            is_head: True
+            in_history: True
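
The fixture now mirrors HistoryGraph.as_dict() directly: each entry under graphs is a head node plus a flat mapping from revision hex to its sorted parent entries, instead of the old nested parents trees. As a quick sanity sketch of how that layout parses, here is yaml.full_load applied to an inline copy of the first graph above (doc is just a local variable for illustration):

import yaml

doc = """\
- origin: "https://repo_with_merges/1/"
  snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868"
  graphs:
    - head:
        rev: "1444db96cbd8cd791abe83527becee73d3c64e86"
        is_head: False
        in_history: False
      graph:
        1444db96cbd8cd791abe83527becee73d3c64e86:
          - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4"
            is_head: False
            in_history: False
        c0d8929936631ecbcf9147be6b8aa13b13b014e4: []
"""

[entry] = yaml.full_load(doc)
[graph] = entry["graphs"]
assert graph["head"]["rev"] == "1444db96cbd8cd791abe83527becee73d3c64e86"
assert graph["head"]["is_head"] is False
assert graph["graph"]["c0d8929936631ecbcf9147be6b8aa13b13b014e4"] == []

The parsed dictionaries can be compared directly against HistoryGraph.as_dict(), which is exactly what the test below relies on.
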
diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py
index 0aadb9f..808c4f2 100644
--- a/swh/provenance/tests/test_history_graph.py
+++ b/swh/provenance/tests/test_history_graph.py
@@ -1,72 +1,55 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-from typing import Any, Dict
-
 import pytest
 import yaml

 from swh.model.hashutil import hash_to_bytes
 from swh.provenance.archive import ArchiveInterface
-from swh.provenance.graph import HistoryNode, build_history_graph
+from swh.provenance.graph import HistoryGraph
 from swh.provenance.interface import ProvenanceInterface
 from swh.provenance.model import OriginEntry, RevisionEntry
 from swh.provenance.origin import origin_add_revision
 from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data


-def history_graph_from_dict(d: Dict[str, Any]) -> HistoryNode:
-    """Takes a dictionary representing a tree of HistoryNode objects, and
-    recursively builds the corresponding graph."""
-    node = HistoryNode(
-        entry=RevisionEntry(hash_to_bytes(d["rev"])),
-        visited=d.get("visited", False),
-        in_history=d.get("in_history", False),
-    )
-    node.parents = set(
-        history_graph_from_dict(parent) for parent in d.get("parents", [])
-    )
-    return node
-
-
 @pytest.mark.parametrize(
     "repo, visit",
     (("with-merges", "visits-01"),),
 )
 @pytest.mark.parametrize("batch", (True, False))
 def test_history_graph(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     repo: str,
     visit: str,
     batch: bool,
 ) -> None:
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data(repo)
     fill_storage(archive.storage, data)

     filename = f"history_graphs_{repo}_{visit}.yaml"

     with open(get_datafile(filename)) as file:
         for expected in yaml.full_load(file):
             entry = OriginEntry(expected["origin"], hash_to_bytes(expected["snapshot"]))
             provenance.origin_add(entry)

-            for graph_as_dict in expected["graphs"]:
-                expected_graph = history_graph_from_dict(graph_as_dict)
-                print("Expected graph:", expected_graph)
+            for expected_graph_as_dict in expected["graphs"]:
+                print("Expected graph:", expected_graph_as_dict)

-                computed_graph = build_history_graph(
+                computed_graph = HistoryGraph(
                     archive,
                     provenance,
-                    RevisionEntry(hash_to_bytes(graph_as_dict["rev"])),
+                    RevisionEntry(hash_to_bytes(expected_graph_as_dict["head"]["rev"])),
                 )
-                print("Computed graph:", computed_graph)
-                assert computed_graph == expected_graph
+                print("Computed graph:", computed_graph.as_dict())
+                assert computed_graph.as_dict() == expected_graph_as_dict

                 origin_add_revision(provenance, entry, computed_graph)

             if not batch:
                 provenance.flush()
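
With history_graph_from_dict gone, the test compares plain dictionaries, and that comparison is only deterministic because HistoryGraph.as_dict() sorts each parent list by its "rev" key before emitting it. A tiny standalone illustration of that normalization (the tuples below are made-up values, not real revision data):

# HistoryGraph stores parents in a set, so iteration order is arbitrary;
# as_dict() makes the output stable by sorting parent entries on "rev".
parents = {
    ("ff" * 20, True, False),
    ("aa" * 20, False, True),
}

as_dicts = sorted(
    ({"rev": rev, "is_head": h, "in_history": i} for rev, h, i in parents),
    key=lambda d: d["rev"],
)
assert [d["rev"] for d in as_dicts] == ["aa" * 20, "ff" * 20]
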