diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py index d474c6e..a255779 100644 --- a/swh/provenance/graph.py +++ b/swh/provenance/graph.py @@ -1,265 +1,264 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from datetime import datetime, timezone import logging import os from typing import Any, Dict, Optional, Set from swh.model.model import Sha1Git from .archive import ArchiveInterface from .interface import ProvenanceInterface from .model import DirectoryEntry, RevisionEntry UTCMIN = datetime.min.replace(tzinfo=timezone.utc) class HistoryNode: def __init__( self, entry: RevisionEntry, visited: bool = False, in_history: bool = False ) -> None: self.entry = entry # A revision is `visited` if it is directly pointed by an origin (ie. a head # revision for some snapshot) self.visited = visited # A revision is `in_history` if it appears in the history graph of an already # processed revision in the provenance database self.in_history = in_history self.parents: Set[HistoryNode] = set() def add_parent( self, parent: RevisionEntry, visited: bool = False, in_history: bool = False ) -> HistoryNode: node = HistoryNode(parent, visited=visited, in_history=in_history) self.parents.add(node) return node def __str__(self) -> str: return ( f"<{self.entry}: visited={self.visited}, in_history={self.in_history}, " f"parents=[{', '.join(str(parent) for parent in self.parents)}]>" ) def __eq__(self, other: Any) -> bool: return isinstance(other, HistoryNode) and self.__dict__ == other.__dict__ def __hash__(self) -> int: return hash((self.entry, self.visited, self.in_history)) def build_history_graph( archive: ArchiveInterface, provenance: ProvenanceInterface, revision: RevisionEntry, ) -> HistoryNode: """Recursively build the history graph from the given revision""" root = HistoryNode( revision, visited=provenance.revision_visited(revision), in_history=provenance.revision_in_history(revision), ) stack = [root] logging.debug( f"Recursively creating history graph for revision {revision.id.hex()}..." ) while stack: current = stack.pop() - if not current.visited: - current.entry.retrieve_parents(archive) - - for rev in current.entry.parents: - node = current.add_parent( - rev, - visited=provenance.revision_visited(rev), - in_history=provenance.revision_in_history(rev), - ) - stack.append(node) + + current.entry.retrieve_parents(archive) + for rev in current.entry.parents: + node = current.add_parent( + rev, + visited=provenance.revision_visited(rev), + in_history=provenance.revision_in_history(rev), + ) + stack.append(node) logging.debug( f"History graph for revision {revision.id.hex()} successfully created!" ) return root class IsochroneNode: def __init__( self, entry: DirectoryEntry, dbdate: Optional[datetime] = None, depth: int = 0, prefix: bytes = b"", ) -> None: self.entry = entry self.depth = depth # dbdate is the maxdate for this node that comes from the DB self._dbdate: Optional[datetime] = dbdate # maxdate is set by the maxdate computation algorithm self.maxdate: Optional[datetime] = None # known is True if this node is already known in the db; either because # the current directory actually exists in the database, or because all # the content of the current directory is known (subdirectories and files) self.known = self.dbdate is not None self.invalid = False self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name self.children: Set[IsochroneNode] = set() @property def dbdate(self) -> Optional[datetime]: # use a property to make this attribute (mostly) read-only return self._dbdate def invalidate(self) -> None: self._dbdate = None self.maxdate = None self.known = False self.invalid = True def add_directory( self, child: DirectoryEntry, date: Optional[datetime] = None ) -> IsochroneNode: # we should not be processing this node (ie add subdirectories or files) if it's # actually known by the provenance DB assert self.dbdate is None node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path) self.children.add(node) return node def __str__(self) -> str: return ( f"<{self.entry}: depth={self.depth}, " f"dbdate={self.dbdate}, maxdate={self.maxdate}, " f"known={self.known}, invalid={self.invalid}, path={self.path!r}, " f"children=[{', '.join(str(child) for child in self.children)}]>" ) def __eq__(self, other: Any) -> bool: return isinstance(other, IsochroneNode) and self.__dict__ == other.__dict__ def __hash__(self) -> int: # only immutable attributes are considered to compute hash return hash((self.entry, self.depth, self.path)) def build_isochrone_graph( archive: ArchiveInterface, provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry, ) -> IsochroneNode: assert revision.date is not None assert revision.root == directory.id # this function process a revision in 2 steps: # # 1. build the tree structure of IsochroneNode objects (one INode per # directory under the root directory of the revision but not following # known subdirectories), and gather the dates from the DB for already # known objects; for files, just keep all the dates in a global 'fdates' # dict; note that in this step, we will only recurse the directories # that are not already known. # # 2. compute the maxdate for each node of the tree that was not found in the DB. # Build the nodes structure root_date = provenance.directory_get_date_in_isochrone_frontier(directory) root = IsochroneNode(directory, dbdate=root_date) stack = [root] logging.debug( f"Recursively creating isochrone graph for revision {revision.id.hex()}..." ) fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date} while stack: current = stack.pop() if current.dbdate is None or current.dbdate > revision.date: # If current directory has an associated date in the isochrone frontier that # is greater or equal to the current revision's one, it should be ignored as # the revision is being processed out of order. if current.dbdate is not None and current.dbdate > revision.date: logging.debug( f"Invalidating frontier on {current.entry.id.hex()}" f" (date {current.dbdate})" f" when processing revision {revision.id.hex()}" f" (date {revision.date})" ) current.invalidate() # Pre-query all known dates for directories in the current directory # for the provenance object to have them cached and (potentially) improve # performance. current.entry.retrieve_children(archive) ddates = provenance.directory_get_dates_in_isochrone_frontier( current.entry.dirs ) for dir in current.entry.dirs: # Recursively analyse subdirectory nodes node = current.add_directory(dir, date=ddates.get(dir.id, None)) stack.append(node) fdates.update(provenance.content_get_early_dates(current.entry.files)) logging.debug( f"Isochrone graph for revision {revision.id.hex()} successfully created!" ) # Precalculate max known date for each node in the graph (only directory nodes are # pushed to the stack). logging.debug(f"Computing maxdates for revision {revision.id.hex()}...") stack = [root] while stack: current = stack.pop() # Current directory node is known if it already has an assigned date (ie. it was # already seen as an isochrone frontier). if current.known: assert current.maxdate is None current.maxdate = current.dbdate else: if any(x.maxdate is None for x in current.children): # at least one child of current has no maxdate yet # Current node needs to be analysed again after its children. stack.append(current) for child in current.children: if child.maxdate is None: # if child.maxdate is None, it must be processed stack.append(child) else: # all the files and directories under current have a maxdate, # we can infer the maxdate for current directory assert current.maxdate is None # if all content is already known, update current directory info. current.maxdate = max( [UTCMIN] + [ child.maxdate for child in current.children if child.maxdate is not None # unnecessary, but needed for mypy ] + [ fdates.get(file.id, revision.date) for file in current.entry.files ] ) if current.maxdate <= revision.date: current.known = ( # true if all subdirectories are known all(child.known for child in current.children) # true if all files are in fdates, i.e. if all files were known # *before building this isochrone graph node* # Note: the 'all()' is lazy: will stop iterating as soon as # possible and all((file.id in fdates) for file in current.entry.files) ) else: # at least one content is being processed out-of-order, then current # node should be treated as unknown current.maxdate = revision.date current.known = False logging.debug(f"Maxdates for revision {revision.id.hex()} successfully computed!") return root diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py index dfd596f..f8a810a 100644 --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -1,106 +1,105 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from itertools import islice import logging import time from typing import Generator, Iterable, Iterator, List, Optional, Tuple from swh.model.model import Sha1Git from .archive import ArchiveInterface from .graph import HistoryNode, build_history_graph from .interface import ProvenanceInterface from .model import OriginEntry, RevisionEntry class CSVOriginIterator: """Iterator over origin visit statuses typically present in the given CSV file. The input is an iterator that produces 2 elements per row: (url, snap) where: - url: is the origin url of the visit - snap: sha1_git of the snapshot pointed by the visit status """ def __init__( self, statuses: Iterable[Tuple[str, Sha1Git]], limit: Optional[int] = None, ) -> None: self.statuses: Iterator[Tuple[str, Sha1Git]] if limit is not None: self.statuses = islice(statuses, limit) else: self.statuses = iter(statuses) def __iter__(self) -> Generator[OriginEntry, None, None]: return (OriginEntry(url, snapshot) for url, snapshot in self.statuses) def origin_add( provenance: ProvenanceInterface, archive: ArchiveInterface, origins: List[OriginEntry], ) -> None: start = time.time() for origin in origins: provenance.origin_add(origin) origin.retrieve_revisions(archive) for revision in origin.revisions: graph = build_history_graph(archive, provenance, revision) origin_add_revision(provenance, origin, graph) done = time.time() provenance.flush() stop = time.time() logging.debug( "Origins " ";".join([origin.id.hex() + ":" + origin.snapshot.hex() for origin in origins]) + f" were processed in {stop - start} secs (commit took {stop - done} secs)!" ) def origin_add_revision( provenance: ProvenanceInterface, origin: OriginEntry, graph: HistoryNode, ) -> None: + # XXX: simplified version of the origin-revision algorithm. This is generating flat + # models for the history of all head revisions. No previous result is reused now! + # The previous implementation was missing some paths from origins to certain + # revisions due to a wrong reuse logic. + # head is treated separately since it should always be added to the given origin head = graph.entry check_preferred_origin(provenance, origin, head) provenance.revision_add_to_origin(origin, head) # head's history should be recursively iterated starting from its parents stack = list(graph.parents) while stack: current = stack.pop() check_preferred_origin(provenance, origin, current.entry) - if current.visited: - # if current revision was already visited just add it to the current origin - # and stop recursion (its history has already been flattened) - provenance.revision_add_to_origin(origin, current.entry) - else: - # if current revision was not visited before create a link between it and - # the head, and recursively walk its history - provenance.revision_add_before_revision(head, current.entry) - for parent in current.parents: - stack.append(parent) + # create a link between it and the head, and recursively walk its history + provenance.revision_add_before_revision(head, current.entry) + for parent in current.parents: + stack.append(parent) def check_preferred_origin( provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry, ) -> None: # if the revision has no preferred origin just set the given origin as the # preferred one. TODO: this should be improved in the future! preferred = provenance.revision_get_preferred_origin(revision) if preferred is None: provenance.revision_set_preferred_origin(origin, revision) diff --git a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml index d3aa69d..79a2d0e 100644 --- a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml +++ b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml @@ -1,61 +1,189 @@ # History graph for snapshot with branches: R01 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" parents: - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" # History graph for snapshot with branches: R03 and R06 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" parents: - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" parents: - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" parents: - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True # History graph for snapshot with branches: R05 and R06 - origin: "https://repo_with_merges/2/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" parents: - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" parents: - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" visited: True + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + in_history: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" visited: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True # History graph for snapshot with branches: R06 and R07 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" visited: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" parents: - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + in_history: True visited: True + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + in_history: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True # History graph for snapshot with branches: R08 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" parents: - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" visited: True + parents: + - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" + in_history: True + parents: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + in_history: True + visited: True + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + in_history: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" visited: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" visited: True + parents: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + in_history: True + visited: True + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + in_history: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True # History graph for snapshot with branches: R08 - origin: "https://repo_with_merges/2/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" visited: True + parents: + - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" + in_history: True + visited: True + parents: + - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" + in_history: True + parents: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + in_history: True + visited: True + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + in_history: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + in_history: True + visited: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True + - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" + in_history: True + visited: True + parents: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + in_history: True + visited: True + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + in_history: True + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + in_history: True + visited: True + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" + in_history: True