diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py --- a/swh/provenance/graph.py +++ b/swh/provenance/graph.py @@ -1,10 +1,7 @@ -from collections import Counter from datetime import datetime, timezone import logging import os -from typing import Dict, List, Optional - -from swh.model.hashutil import hash_to_hex +from typing import Dict, Optional, Set from .archive import ArchiveInterface from .model import DirectoryEntry, RevisionEntry @@ -13,6 +10,73 @@ UTCMIN = datetime.min.replace(tzinfo=timezone.utc) +class HistoryNode: + def __init__( + self, entry: RevisionEntry, visited: bool = False, in_history: bool = False + ): + self.entry = entry + # A revision is `visited` if it is directly pointed by an origin (ie. a head + # revision for some snapshot) + self.visited = visited + # A revision is `in_history` if it appears in the history graph of an already + # processed revision in the provenance database + self.in_history = in_history + self.parents: Set[HistoryNode] = set() + + def add_parent( + self, parent: RevisionEntry, visited: bool = False, in_history: bool = False + ) -> "HistoryNode": + node = HistoryNode(parent, visited=visited, in_history=in_history) + self.parents.add(node) + return node + + def __str__(self): + return ( + f"<{self.entry}: visited={self.visited}, in_history={self.in_history}, " + f"parents=[{', '.join(str(parent) for parent in self.parents)}]>" + ) + + def __eq__(self, other): + return isinstance(other, HistoryNode) and self.__dict__ == other.__dict__ + + def __hash__(self): + return hash((self.entry, self.visited, self.in_history)) + + +def build_history_graph( + archive: ArchiveInterface, + provenance: ProvenanceInterface, + revision: RevisionEntry, +) -> HistoryNode: + """Recursively build the history graph on the given revision""" + + root = HistoryNode( + revision, + visited=provenance.revision_visited(revision), + in_history=provenance.revision_in_history(revision), + ) + stack = [root] + logging.debug( + f"Recursively creating history graph for revision {revision.id.hex()}..." + ) + while stack: + current = stack.pop() + if not current.visited: + current.entry.retrieve_parents(archive) + + for rev in current.entry.parents: + node = current.add_parent( + rev, + visited=provenance.revision_visited(rev), + in_history=provenance.revision_in_history(rev), + ) + stack.append(node) + logging.debug( + f"History graph for revision {revision.id.hex()} successfully created!" + ) + return root + + class IsochroneNode: def __init__( self, @@ -36,7 +100,7 @@ self.known = self.dbdate is not None self.invalid = False self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name - self.children: List[IsochroneNode] = [] + self.children: Set[IsochroneNode] = set() @property def dbdate(self): @@ -52,56 +116,27 @@ def add_directory( self, child: DirectoryEntry, date: Optional[datetime] = None ) -> "IsochroneNode": - # we should not be processing this node (ie add subdirectories or - # files) if it's actually known by the provenance DB + # we should not be processing this node (ie add subdirectories or files) if it's + # actually known by the provenance DB assert self.dbdate is None node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path) - self.children.append(node) + self.children.add(node) return node def __str__(self): return ( - f"<{self.entry}: dbdate={self.dbdate}, maxdate={self.maxdate}, " + f"<{self.entry}: depth={self.depth}, " + f"dbdate={self.dbdate}, maxdate={self.maxdate}, " f"known={self.known}, invalid={self.invalid}, path={self.path}, " f"children=[{', '.join(str(child) for child in self.children)}]>" ) def __eq__(self, other): - return ( - isinstance(other, IsochroneNode) - and ( - self.entry, - self.depth, - self._dbdate, - self.maxdate, - self.known, - self.invalid, - self.path, - ) - == ( - other.entry, - other.depth, - other._dbdate, - other.maxdate, - other.known, - other.invalid, - other.path, - ) - and Counter(self.children) == Counter(other.children) - ) + return isinstance(other, IsochroneNode) and self.__dict__ == other.__dict__ def __hash__(self): - return hash( - ( - self.entry, - self.depth, - self._dbdate, - self.maxdate, - self.known, - self.invalid, - self.path, - ) - ) + # only immutable attributes are considered to compute hash + return hash((self.entry, self.depth, self.path)) def build_isochrone_graph( @@ -129,7 +164,7 @@ root = IsochroneNode(directory, dbdate=root_date) stack = [root] logging.debug( - f"Recursively creating graph for revision {hash_to_hex(revision.id)}..." + f"Recursively creating isochrone graph for revision {revision.id.hex()}..." ) fdates: Dict[bytes, datetime] = {} # map {file_id: date} while stack: @@ -140,9 +175,9 @@ # the revision is being processed out of order. if current.dbdate is not None and current.dbdate > revision.date: logging.debug( - f"Invalidating frontier on {hash_to_hex(current.entry.id)}" + f"Invalidating frontier on {current.entry.id.hex()}" f" (date {current.dbdate})" - f" when processing revision {hash_to_hex(revision.id)}" + f" when processing revision {revision.id.hex()}" f" (date {revision.date})" ) current.invalidate() @@ -162,11 +197,11 @@ fdates.update(provenance.content_get_early_dates(current.entry.files)) logging.debug( - f"Isochrone graph for revision {hash_to_hex(revision.id)} successfully created!" + f"Isochrone graph for revision {revision.id.hex()} successfully created!" ) # Precalculate max known date for each node in the graph (only directory nodes are # pushed to the stack). - logging.debug(f"Computing maxdates for revision {hash_to_hex(revision.id)}...") + logging.debug(f"Computing maxdates for revision {revision.id.hex()}...") stack = [root] while stack: @@ -217,7 +252,5 @@ # node should be treated as unknown current.maxdate = revision.date current.known = False - logging.debug( - f"Maxdates for revision {hash_to_hex(revision.id)} successfully computed!" - ) + logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!") return root diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -6,9 +6,8 @@ import iso8601 -from swh.model.hashutil import hash_to_hex - from .archive import ArchiveInterface +from .graph import HistoryNode, build_history_graph from .model import OriginEntry, RevisionEntry from .provenance import ProvenanceInterface @@ -48,76 +47,60 @@ provenance: ProvenanceInterface, archive: ArchiveInterface, origins: List[OriginEntry], -) -> None: +): start = time.time() for origin in origins: origin.retrieve_revisions(archive) for revision in origin.revisions: - origin_add_revision(provenance, archive, origin, revision) + graph = build_history_graph(archive, provenance, revision) + origin_add_revision(provenance, origin, graph) done = time.time() provenance.commit() stop = time.time() logging.debug( "Origins " - ";".join( - [origin.url + ":" + hash_to_hex(origin.snapshot) for origin in origins] - ) + ";".join([origin.url + ":" + origin.snapshot.hex() for origin in origins]) + f" were processed in {stop - start} secs (commit took {stop - done} secs)!" ) def origin_add_revision( provenance: ProvenanceInterface, - archive: ArchiveInterface, origin: OriginEntry, - revision: RevisionEntry, -) -> None: - stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] + graph: HistoryNode, +): origin.id = provenance.origin_get_id(origin) - while stack: - relative, current = stack.pop() - - # Check if current revision has no preferred origin and update if necessary. - preferred = provenance.revision_get_preferred_origin(current) - - if preferred is None: - provenance.revision_set_preferred_origin(origin, current) - ######################################################################## - - if relative is None: - # This revision is pointed directly by the origin. - visited = provenance.revision_visited(current) - provenance.revision_add_to_origin(origin, current) + # head is treated separately since it should always be added to the given origin + head = graph.entry + check_preferred_origin(provenance, origin, head) + provenance.revision_add_to_origin(origin, head) - if not visited: - stack.append((current, current)) + # head's history should be recursively iterated starting from its parents + stack = list(graph.parents) + while stack: + current = stack.pop() + check_preferred_origin(provenance, origin, current.entry) + if current.visited: + # if current revision was already visited just add it to the current origin + # and stop recursion (its history has already been flattened) + provenance.revision_add_to_origin(origin, current.entry) else: - # This revision is a parent of another one in the history of the - # relative revision. - current.retrieve_parents(archive) + # if current revision was not visited before create a link between it and + # the head, and recursively walk its history + provenance.revision_add_before_revision(head, current.entry) for parent in current.parents: - visited = provenance.revision_visited(parent) - - if not visited: - # The parent revision has never been seen before pointing - # directly to an origin. - known = provenance.revision_in_history(parent) - - if known: - # The parent revision is already known in some other - # revision's history. We should point it directly to - # the origin and (eventually) walk its history. - stack.append((None, parent)) - else: - # The parent revision was never seen before. We should - # walk its history and associate it with the same - # relative revision. - provenance.revision_add_before_revision(relative, parent) - stack.append((relative, parent)) - else: - # The parent revision already points to an origin, so its - # history was properly processed before. We just need to - # make sure it points to the current origin as well. - provenance.revision_add_to_origin(origin, parent) + stack.append(parent) + + +def check_preferred_origin( + provenance: ProvenanceInterface, + origin: OriginEntry, + revision: RevisionEntry, +): + # if the revision has no preferred origin just set the given origin as the + # preferred one. TODO: this should be improved in the future! + preferred = provenance.revision_get_preferred_origin(revision) + if preferred is None: + provenance.revision_set_preferred_origin(origin, revision) diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -7,7 +7,7 @@ import iso8601 -from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.hashutil import hash_to_bytes from .archive import ArchiveInterface from .graph import IsochroneNode, build_isochrone_graph @@ -71,7 +71,7 @@ date = provenance.revision_get_early_date(revision) if date is None or revision.date < date: logging.debug( - f"Processing revisions {hash_to_hex(revision.id)}" + f"Processing revisions {revision.id.hex()}" f" (known date {date} / revision date {revision.date})..." ) graph = build_isochrone_graph( @@ -95,11 +95,11 @@ provenance.commit() stop = time.time() logging.debug( - f"Revisions {';'.join([hash_to_hex(revision.id) for revision in revisions])} " + f"Revisions {';'.join([revision.id.hex() for revision in revisions])} " f" were processed in {stop - start} secs (commit took {stop - done} secs)!" ) # logging.critical( - # ";".join([hash_to_hex(revision.id) for revision in revisions]) + # ";".join([revision.id.hex() for revision in revisions]) # + f",{stop - start},{stop - done}" # ) diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py --- a/swh/provenance/tests/test_isochrone_graph.py +++ b/swh/provenance/tests/test_isochrone_graph.py @@ -40,9 +40,9 @@ node.known = d.get("known", False) node.invalid = d.get("invalid", False) node.path = bytes(d["path"], encoding="utf-8") - node.children = [ + node.children = set( isochrone_graph_from_dict(child, depth=depth + 1) for child in children - ] + ) return node