diff --git a/swh/provenance/model.py b/swh/provenance/model.py
index 69d9b17..ee7a01d 100644
--- a/swh/provenance/model.py
+++ b/swh/provenance/model.py
@@ -1,171 +1,169 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime
 from typing import Iterable, Iterator, List, Optional, Set

 from swh.core.utils import grouper
 from swh.model.model import ObjectType, TargetType

 from .archive import ArchiveInterface


 class OriginEntry:
     def __init__(
         self, url: str, date: datetime, snapshot: bytes, id: Optional[int] = None
     ):
         self.url = url
         self.date = date
         self.snapshot = snapshot
         self.id = id
         self._revisions: Optional[List[RevisionEntry]] = None

     def retrieve_revisions(self, archive: ArchiveInterface):
         if self._revisions is None:
             snapshot = archive.snapshot_get_all_branches(self.snapshot)
             assert snapshot is not None
             targets_set = set()
             releases_set = set()
             if snapshot is not None:
                 for branch in snapshot.branches:
                     if snapshot.branches[branch].target_type == TargetType.REVISION:
                         targets_set.add(snapshot.branches[branch].target)
                     elif snapshot.branches[branch].target_type == TargetType.RELEASE:
                         releases_set.add(snapshot.branches[branch].target)

             batchsize = 100
             for releases in grouper(releases_set, batchsize):
                 targets_set.update(
                     release.target
                     for release in archive.revision_get(releases)
                     if release is not None
                     and release.target_type == ObjectType.REVISION
                 )

             revisions: Set[RevisionEntry] = set()
             for targets in grouper(targets_set, batchsize):
                 revisions.update(
                     RevisionEntry(revision.id)
                     for revision in archive.revision_get(targets)
                     if revision is not None
                 )

             self._revisions = list(revisions)

     @property
     def revisions(self) -> Iterator["RevisionEntry"]:
         if self._revisions is None:
             raise RuntimeError(
                 "Revisions of this node have not yet been retrieved. "
                 "Please call retrieve_revisions() before using this property."
             )
         return (x for x in self._revisions)


 class RevisionEntry:
     def __init__(
         self,
         id: bytes,
         date: Optional[datetime] = None,
         root: Optional[bytes] = None,
         parents: Optional[Iterable[bytes]] = None,
     ):
         self.id = id
         self.date = date
         assert self.date is None or self.date.tzinfo is not None
         self.root = root
         self._parents = parents
         self._nodes: List[RevisionEntry] = []

     def parents(self, archive: ArchiveInterface):
         if self._parents is None:
             revision = archive.revision_get([self.id])
             if revision:
                 self._parents = list(revision)[0].parents
         if self._parents and not self._nodes:
             self._nodes = [
                 RevisionEntry(
                     id=rev.id,
                     root=rev.directory,
                     date=rev.date,
                     parents=rev.parents,
                 )
                 for rev in archive.revision_get(self._parents)
                 if rev
             ]
         yield from self._nodes

     def __str__(self):
         return f""


 class DirectoryEntry:
     def __init__(self, id: bytes, name: bytes = b""):
         self.id = id
         self.name = name
         self._files: Optional[List[FileEntry]] = None
         self._dirs: Optional[List[DirectoryEntry]] = None

     def retrieve_children(self, archive: ArchiveInterface):
         if self._files is None and self._dirs is None:
             self._files = []
             self._dirs = []
             for child in archive.directory_ls(self.id):
                 if child["type"] == "dir":
                     self._dirs.append(
                         DirectoryEntry(child["target"], name=child["name"])
                     )
                 elif child["type"] == "file":
                     self._files.append(FileEntry(child["target"], child["name"]))

     @property
     def files(self) -> Iterator["FileEntry"]:
         if self._files is None:
             raise RuntimeError(
                 "Children of this node have not yet been retrieved. "
                 "Please call retrieve_children() before using this property."
             )
         return (x for x in self._files)

     @property
     def dirs(self) -> Iterator["DirectoryEntry"]:
         if self._dirs is None:
             raise RuntimeError(
                 "Children of this node have not yet been retrieved. "
                 "Please call retrieve_children() before using this property."
             )
         return (x for x in self._dirs)

     def __str__(self):
         return f""

     def __eq__(self, other):
-        sameFiles = (self._files is None and other._files is None) or (
-            set(self._files) == set(other._files)
-        )
-        sameDirs = (self._dirs is None and other._dirs is None) or (
-            set(self._dirs) == set(other._dirs)
-        )
-        return (
-            isinstance(other, DirectoryEntry)
-            and (self.id, self.name) == (other.id, other.name)
-            and sameFiles
-            and sameDirs
+        return isinstance(other, DirectoryEntry) and (self.id, self.name) == (
+            other.id,
+            other.name,
         )

+    def __hash__(self):
+        return hash((self.id, self.name))
+

 class FileEntry:
     def __init__(self, id: bytes, name: bytes):
         self.id = id
         self.name = name

     def __str__(self):
         return f""

     def __eq__(self, other):
         return isinstance(other, FileEntry) and (self.id, self.name) == (
             other.id,
             other.name,
         )
+
+    def __hash__(self):
+        return hash((self.id, self.name))
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
index f483bce..4094b5f 100644
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -1,569 +1,576 @@
 from datetime import datetime, timezone
 import logging
 import os
 import time
 from typing import Dict, Generator, Iterable, List, Optional, Tuple

 from typing_extensions import Protocol, runtime_checkable

 from swh.model.hashutil import hash_to_hex

 from .archive import ArchiveInterface
 from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry

 UTCMIN = datetime.min.replace(tzinfo=timezone.utc)


 @runtime_checkable
 class ProvenanceInterface(Protocol):
     raise_on_commit: bool = False

     def commit(self):
         """Commit currently ongoing transactions in the backend DB"""
         ...

     def content_add_to_directory(
         self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes
     ) -> None:
         ...

     def content_add_to_revision(
         self, revision: RevisionEntry, blob: FileEntry, prefix: bytes
     ) -> None:
         ...

     def content_find_first(
         self, blobid: bytes
     ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
         ...

     def content_find_all(
         self, blobid: bytes, limit: Optional[int] = None
     ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
         ...

     def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
         ...

     def content_get_early_dates(
         self, blobs: Iterable[FileEntry]
     ) -> Dict[bytes, datetime]:
         ...

     def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
         ...

     def directory_add_to_revision(
         self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes
     ) -> None:
         ...

     def directory_get_date_in_isochrone_frontier(
         self, directory: DirectoryEntry
     ) -> Optional[datetime]:
         ...

     def directory_get_dates_in_isochrone_frontier(
         self, dirs: Iterable[DirectoryEntry]
     ) -> Dict[bytes, datetime]:
         ...

     def directory_invalidate_in_isochrone_frontier(
         self, directory: DirectoryEntry
     ) -> None:
         ...

     def directory_set_date_in_isochrone_frontier(
         self, directory: DirectoryEntry, date: datetime
     ) -> None:
         ...

     def origin_get_id(self, origin: OriginEntry) -> int:
         ...

     def revision_add(self, revision: RevisionEntry) -> None:
         ...

     def revision_add_before_revision(
         self, relative: RevisionEntry, revision: RevisionEntry
     ) -> None:
         ...

     def revision_add_to_origin(
         self, origin: OriginEntry, revision: RevisionEntry
     ) -> None:
         ...

     def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
         ...

     def revision_get_preferred_origin(self, revision: RevisionEntry) -> int:
         ...

     def revision_in_history(self, revision: RevisionEntry) -> bool:
         ...

     def revision_set_preferred_origin(
         self, origin: OriginEntry, revision: RevisionEntry
     ) -> None:
         ...

     def revision_visited(self, revision: RevisionEntry) -> bool:
         ...


 def flatten_directory(
     archive: ArchiveInterface,
     provenance: ProvenanceInterface,
     directory: DirectoryEntry,
 ) -> None:
     """Recursively retrieve all the files of 'directory' and insert them in the
     'provenance' database in the 'content_to_directory' table.
     """
     stack = [(directory, b"")]
     while stack:
         current, prefix = stack.pop()
         current.retrieve_children(archive)
         for f_child in current.files:
             # Add content to the directory with the computed prefix.
             provenance.content_add_to_directory(directory, f_child, prefix)
         for d_child in current.dirs:
             # Recursively walk the child directory.
             stack.append((d_child, os.path.join(prefix, d_child.name)))


 def origin_add(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     origins: List[OriginEntry],
 ) -> None:
     start = time.time()
     for origin in origins:
         origin.retrieve_revisions(archive)
         for revision in origin.revisions:
             origin_add_revision(provenance, archive, origin, revision)
     done = time.time()
     provenance.commit()
     stop = time.time()
     logging.debug(
         "Origins "
         + ";".join(
             [origin.url + ":" + hash_to_hex(origin.snapshot) for origin in origins]
         )
         + f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
     )


 def origin_add_revision(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     origin: OriginEntry,
     revision: RevisionEntry,
 ) -> None:
     stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)]
     while stack:
         relative, current = stack.pop()

         # Check if current revision has no preferred origin and update if necessary.
         preferred = provenance.revision_get_preferred_origin(current)

         if preferred is None:
             provenance.revision_set_preferred_origin(origin, current)
         ########################################################################

         if relative is None:
             # This revision is pointed directly by the origin.
             visited = provenance.revision_visited(current)
             provenance.revision_add_to_origin(origin, current)

             if not visited:
                 stack.append((current, current))
         else:
             # This revision is a parent of another one in the history of the
             # relative revision.
             for parent in current.parents(archive):
                 visited = provenance.revision_visited(parent)

                 if not visited:
                     # The parent revision has never been seen before pointing
                     # directly to an origin.
                     known = provenance.revision_in_history(parent)

                     if known:
                         # The parent revision is already known in some other
                         # revision's history. We should point it directly to
                         # the origin and (eventually) walk its history.
                         stack.append((None, parent))
                     else:
                         # The parent revision was never seen before. We should
                         # walk its history and associate it with the same
                         # relative revision.
                         provenance.revision_add_before_revision(relative, parent)
                         stack.append((relative, parent))
                 else:
                     # The parent revision already points to an origin, so its
                     # history was properly processed before. We just need to
                     # make sure it points to the current origin as well.
                     provenance.revision_add_to_origin(origin, parent)


 def revision_add(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     revisions: List[RevisionEntry],
     trackall: bool = True,
     lower: bool = True,
     mindepth: int = 1,
 ) -> None:
     start = time.time()
     for revision in revisions:
         assert revision.date is not None
         assert revision.root is not None
         # Process content starting from the revision's root directory.
         date = provenance.revision_get_early_date(revision)
         if date is None or revision.date < date:
             logging.debug(
                 f"Processing revision {hash_to_hex(revision.id)}"
                 f" (known date {date} / revision date {revision.date})..."
             )
             graph = build_isochrone_graph(
                 archive,
                 provenance,
                 revision,
                 DirectoryEntry(revision.root),
             )
             # TODO: add file size filtering
             revision_process_content(
                 archive,
                 provenance,
                 revision,
                 graph,
                 trackall=trackall,
                 lower=lower,
                 mindepth=mindepth,
             )
     done = time.time()
     # TODO: improve this! Maybe using a max attempt counter?
     # Ideally the Provenance class should guarantee that a commit never fails.
     while not provenance.commit():
         logging.warning(
             "Could not commit revisions "
             + ";".join([hash_to_hex(revision.id) for revision in revisions])
             + ". Retrying..."
         )
     stop = time.time()
     logging.debug(
         f"Revisions {';'.join([hash_to_hex(revision.id) for revision in revisions])}"
         f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
     )
     # logging.critical(
     #     ";".join([hash_to_hex(revision.id) for revision in revisions])
     #     + f",{stop - start},{stop - done}"
     # )


 class IsochroneNode:
     def __init__(
         self,
         entry: DirectoryEntry,
         dbdate: Optional[datetime] = None,
         depth: int = 0,
         prefix: bytes = b"",
     ):
         self.entry = entry
         self.depth = depth

         # dbdate is the maxdate for this node that comes from the DB
         self._dbdate: Optional[datetime] = dbdate

         # maxdate is set by the maxdate computation algorithm
         self.maxdate: Optional[datetime] = None

         # known is True if this node is already known in the db; either because
         # the current directory actually exists in the database, or because all
         # the content of the current directory is known (subdirectories and files)
         self.known: bool = self.dbdate is not None
         self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name
         self.children: List[IsochroneNode] = []

-    def __str__(self):
-        return (
-            f"<{self.entry.__class__.__name__}[{self.entry.name}]: "
-            f"known={self.known}, maxdate={self.maxdate}, dbdate={self.dbdate}>"
-        )
-
     @property
     def dbdate(self):
         # use a property to make this attribute (mostly) read-only
         return self._dbdate

     def invalidate(self):
         self._dbdate = None
         self.maxdate = None
         self.known = False

     def add_directory(
         self, child: DirectoryEntry, date: Optional[datetime] = None
     ) -> "IsochroneNode":
         # we should not be processing this node (i.e. adding subdirectories or
         # files) if it's actually known by the provenance DB
         assert self.dbdate is None
         node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path)
         self.children.append(node)
         return node

+    def __str__(self):
+        return (
+            f"<{self.entry}: "
+            f"known={self.known}, maxdate={self.maxdate}, "
+            f"dbdate={self.dbdate}, path={self.path}, "
+            f"children=[{', '.join(str(child) for child in self.children)}]>"
+        )
+
     def __eq__(self, other):
         sameDbDate = (
             self._dbdate is None and other._dbdate is None
         ) or self._dbdate == other._dbdate
         sameMaxdate = (
             self.maxdate is None and other.maxdate is None
         ) or self.maxdate == other.maxdate
         return (
             isinstance(other, IsochroneNode)
             and (self.entry, self.depth, self.known, self.path)
             == (other.entry, other.depth, other.known, other.path)
             and sameDbDate
             and sameMaxdate
             and set(self.children) == set(other.children)
         )

+    def __hash__(self):
+        return hash(
+            (self.entry, self.depth, self._dbdate, self.maxdate, self.known, self.path)
+        )
+

 def build_isochrone_graph(
     archive: ArchiveInterface,
     provenance: ProvenanceInterface,
     revision: RevisionEntry,
     directory: DirectoryEntry,
 ) -> IsochroneNode:
     assert revision.date is not None
     assert revision.root == directory.id

     # this function processes a revision in 2 steps:
     #
     # 1. build the tree structure of IsochroneNode objects (one INode per
     #    directory under the root directory of the revision but not following
     #    known subdirectories), and gather the dates from the DB for already
     #    known objects; for files, just keep all the dates in a global 'fdates'
     #    dict; note that in this step, we will only recurse the directories
     #    that are not already known.
     #
     # 2. compute the maxdate for each node of the tree that was not found in the DB.

     # Build the nodes structure
     root_date = provenance.directory_get_date_in_isochrone_frontier(directory)
     root = IsochroneNode(directory, dbdate=root_date)
     stack = [root]
     logging.debug(
         f"Recursively creating graph for revision {hash_to_hex(revision.id)}..."
     )
     fdates: Dict[bytes, datetime] = {}  # map {file_id: date}
     while stack:
         current = stack.pop()
         if current.dbdate is None or current.dbdate > revision.date:
             # If the current directory has an associated date in the isochrone
             # frontier that is greater or equal to the current revision's one, it
             # should be ignored as the revision is being processed out of order.
             if current.dbdate is not None and current.dbdate > revision.date:
                 logging.debug(
                     f"Invalidating frontier on {hash_to_hex(current.entry.id)}"
                     f" (date {current.dbdate})"
                     f" when processing revision {hash_to_hex(revision.id)}"
                     f" (date {revision.date})"
                 )
                 provenance.directory_invalidate_in_isochrone_frontier(current.entry)
                 current.invalidate()

             # Pre-query all known dates for directories in the current directory
             # for the provenance object to have them cached and (potentially)
             # improve performance.
             current.entry.retrieve_children(archive)
             ddates = provenance.directory_get_dates_in_isochrone_frontier(
                 current.entry.dirs
             )
             for dir in current.entry.dirs:
                 # Recursively analyse subdirectory nodes
                 node = current.add_directory(dir, date=ddates.get(dir.id, None))
                 stack.append(node)

             fdates.update(provenance.content_get_early_dates(current.entry.files))

     logging.debug(
         f"Isochrone graph for revision {hash_to_hex(revision.id)} successfully created!"
     )
     # Precalculate max known date for each node in the graph (only directory nodes are
     # pushed to the stack).
     logging.debug(f"Computing maxdates for revision {hash_to_hex(revision.id)}...")
     stack = [root]
     while stack:
         current = stack.pop()
         # Current directory node is known if it already has an assigned date (i.e. it
         # was already seen as an isochrone frontier).
         if current.known:
             assert current.maxdate is None
             current.maxdate = current.dbdate
         else:
             if any(x.maxdate is None for x in current.children):
                 # at least one child of current has no maxdate yet
                 # Current node needs to be analysed again after its children.
                 stack.append(current)
                 for child in current.children:
                     if child.maxdate is None:
                         # if child.maxdate is None, it must be processed
                         stack.append(child)
             else:
                 # all the files and directories under current have a maxdate,
                 # we can infer the maxdate for the current directory
                 assert current.maxdate is None
                 # If all content is already known, update current directory info.
                 current.maxdate = max(
                     [UTCMIN]
                     + [
                         child.maxdate
                         for child in current.children
                         if child.maxdate is not None  # unnecessary, but needed for mypy
                     ]
                     + [
                         fdates.get(file.id, revision.date)
                         for file in current.entry.files
                     ]
                 )
                 current.known = (
                     # true if all subdirectories are known
                     all(child.known for child in current.children)
                     # true if all files are in fdates, i.e. if all files were known
                     # *before building this isochrone graph node*
                     # Note: the 'all()' is lazy: will stop iterating as soon as possible
                     and all((file.id in fdates) for file in current.entry.files)
                 )
     logging.debug(
         f"Maxdates for revision {hash_to_hex(revision.id)} successfully computed!"
     )
     return root


 def revision_process_content(
     archive: ArchiveInterface,
     provenance: ProvenanceInterface,
     revision: RevisionEntry,
     graph: IsochroneNode,
     trackall: bool = True,
     lower: bool = True,
     mindepth: int = 1,
 ):
     assert revision.date is not None
     provenance.revision_add(revision)

     stack = [graph]
     while stack:
         current = stack.pop()
         if current.dbdate is not None:
             assert current.dbdate <= revision.date
             if trackall:
                 # Current directory is an outer isochrone frontier for a previously
                 # processed revision. It should be reused as is.
                 provenance.directory_add_to_revision(
                     revision, current.entry, current.path
                 )
         else:
             # Current directory is not an outer isochrone frontier for any previous
             # revision. It might be eligible for this one.
             if is_new_frontier(
                 current,
                 revision=revision,
                 trackall=trackall,
                 lower=lower,
                 mindepth=mindepth,
             ):
                 assert current.maxdate is not None
                 # Outer frontier should be moved to the current position in the
                 # isochrone graph. This is the first time this directory is found
                 # in the isochrone frontier.
                 provenance.directory_set_date_in_isochrone_frontier(
                     current.entry, current.maxdate
                 )
                 if trackall:
                     provenance.directory_add_to_revision(
                         revision, current.entry, current.path
                     )
                     flatten_directory(archive, provenance, current.entry)
             else:
                 # No point moving the frontier here. Either there are no files or they
                 # are being seen for the first time here. Add all blobs to the current
                 # revision, updating dates if necessary, and recursively analyse
                 # subdirectories as candidates for the outer frontier.
                 for blob in current.entry.files:
                     date = provenance.content_get_early_date(blob)
                     if date is None or revision.date < date:
                         provenance.content_set_early_date(blob, revision.date)
                     provenance.content_add_to_revision(revision, blob, current.path)
                 for child in current.children:
                     stack.append(child)


 def is_new_frontier(
     node: IsochroneNode,
     revision: RevisionEntry,
     trackall: bool = True,
     lower: bool = True,
     mindepth: int = 1,
 ) -> bool:
     assert node.maxdate is not None  # for mypy
     assert revision.date is not None  # idem
     if trackall:
         # The only real condition for a directory to be a frontier is that its
         # content is already known and its maxdate is less than (or equal to) the
         # current revision's date. Checking mindepth is meant to skip root
         # directories (or any arbitrary depth) to improve the result. The
         # option lower tries to maximize the reuse rate of previously defined
         # frontiers by keeping them low in the directory tree.
         return (
             node.known
             and node.maxdate <= revision.date  # all content is earlier than revision
             and node.depth >= mindepth  # deeper than the min allowed depth
             and (has_blobs(node) if lower else True)  # there is at least one blob in it
         )
     else:
         # If we are only tracking first occurrences, we want to ensure that all first
         # occurrences end up in the content_early_in_rev relation. Thus, we force
         # every blob outside a frontier to have a strictly earlier date.
         return (
             node.maxdate < revision.date  # all content is earlier than revision
             and node.depth >= mindepth  # deeper than the min allowed depth
             and (has_blobs(node) if lower else True)  # there is at least one blob
         )


 def has_blobs(node: IsochroneNode) -> bool:
     # We may want to look for files in different ways to decide whether to define a
     # frontier or not:
     # 1. Only files in the current node:
     return any(node.entry.files)
     # 2. Files anywhere in the isochrone graph
     # stack = [node]
     # while stack:
     #     current = stack.pop()
     #     if any(
     #         map(lambda child: isinstance(child.entry, FileEntry), current.children)
     #     ):
     #         return True
     #     else:
     #         # All children are directory entries.
     #         stack.extend(current.children)
     # return False
     # 3. Files in the intermediate directories between current node and any previously
     #    defined frontier:
     # TODO: complete this case!
     # return any(
     #     map(lambda child: isinstance(child.entry, FileEntry), node.children)
     # ) or all(
     #     map(
     #         lambda child: (
     #             not (isinstance(child.entry, DirectoryEntry) and child.date is None)
     #         )
     #         or has_blobs(child),
     #         node.children,
     #     )
     # )
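To make the maxdate pass in `build_isochrone_graph` easier to follow, here is a toy, self-contained sketch of the same iterative post-order idea. It is only an illustration: `Node` is a stand-in for `IsochroneNode`, integer dates replace datetimes, and `file_dates` holds the already-resolved per-file dates that the patch obtains via `fdates.get(file.id, revision.date)`.

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Node:
    children: List["Node"] = field(default_factory=list)
    file_dates: List[int] = field(default_factory=list)
    maxdate: Optional[int] = None


def compute_maxdates(root: Node) -> None:
    stack = [root]
    while stack:
        current = stack.pop()
        pending = [child for child in current.children if child.maxdate is None]
        if pending:
            # Re-push the node behind its unfinished children so it is only
            # finalized after all of them (iterative post-order, as in the patch).
            stack.append(current)
            stack.extend(pending)
        else:
            # 0 plays the role of UTCMIN for empty directories.
            current.maxdate = max(
                [0]
                + [c.maxdate for c in current.children if c.maxdate is not None]
                + current.file_dates
            )


leaf = Node(file_dates=[5])
root = Node(children=[leaf], file_dates=[7])
compute_maxdates(root)
assert leaf.maxdate == 5 and root.maxdate == 7
```

The re-push trick avoids recursion entirely, which matters because directory trees in the archive can be arbitrarily deep.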
"cea28838ec1fb757e44b724fe1365d64c6a94e24" + name: "" + dbdate: null + maxdate: 1000000010.0 + known: True + path: "" + children: + - entry: + id: "48007c961cc734d1f63886d0413a6dc605e3e2ea" + name: "A" + dbdate: null + maxdate: 1000000010.0 + known: True + path: "A" + children: + - entry: + id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf" + name: "C" + dbdate: 1000000010.0 + maxdate: 1000000010.0 + known: True + path: "A/C" + children: [] +# Isochrone graph for R04 +17ed10db0612c9b46ba340943cb6b48b25431419: + entry: + id: "195601c98c28f04e0d19c218434738006990db72" + name: "" + dbdate: null + maxdate: 1000000020.0 + known: True + path: "" + children: + - entry: + id: "d591b308488541aabffd854eae85a9bf83a9d9f5" + name: "A" + dbdate: null + maxdate: 1000000020.0 + known: True + path: "A" + children: + - entry: + id: "0e540a8ebea2f5de3e62b92e2139902cf6f46e92" + name: "B" + dbdate: null + maxdate: 1000000020.0 + known: True + path: "A/B" + children: + - entry: + id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf" + name: "C" + dbdate: 1000000010.0 + maxdate: 1000000010.0 + known: True + path: "A/B/C" + children: [] +# Isochrone graph for R05 +c8bef45193355db33d64f375b4a4e4f23ac2a4f6: + entry: + id: "fa63f03d67d1a15563afe9f8ba97832dfb20f42a" + name: "" + dbdate: null + maxdate: 1000000050.0 + known: False + path: "" + children: + - entry: + id: "12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e" + name: "D" + dbdate: null + maxdate: 1000000050.0 + known: False + path: "D" + children: [] +# Isochrone graph for R06 +f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c: + entry: + id: "c86d2f588234098642ef6f33ca662a6a9de865bc" + name: "" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "" + children: + - entry: + id: "8a3993f4efa9385ce993775cab5ec4dc2c78d7f6" + name: "D" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "D" + children: + - entry: + id: "fa63f03d67d1a15563afe9f8ba97832dfb20f42a" + name: "E" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "D/E" + children: + - entry: + id: "12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e" + name: "D" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "D/E/D" + children: [] +# Isochrone graph for R07 +91ed6a03c80b61e0d63d328f7a4325230e7a0237: + entry: + id: "641baf6738fa5ebb3c5eb39af45f62ff52f8cc62" + name: "" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "" + children: + - entry: + id: "b0ae56ed5ca7daa34fd7a91a28db443ab3c389a0" + name: "F" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "F" + children: + - entry: + id: "fa63f03d67d1a15563afe9f8ba97832dfb20f42a" + name: "E" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "F/E" + children: + - entry: + id: "12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e" + name: "D" + dbdate: 1000000050.0 + maxdate: 1000000050.0 + known: True + path: "F/E/D" + children: [] +# Isochrone graph for R08 +a97e5c8a626510eefaa637091924cf800b1e8b06: + entry: + id: "79e219827e12f40e7146cc6834ee04b617a8073a" + name: "" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "" + children: + - entry: + id: "9a7b5762e20b11735b93a635cda451c75bd31270" + name: "F" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "F" + children: + - entry: + id: "81b84d8fd8ceebd47f51896d19ce1aa286629225" + name: "E" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "F/E" + children: + - entry: + id: "cb211f2d9dfee6c3968837a07960afd6ab09506c" + name: "D" + dbdate: null + maxdate: 1000000050.0 + known: True + path: "F/E/D" + children: [] +# Isochrone graph for R09 
+3c5ad6be812b182ee2a01e84884b8ab7d384a4a0: + entry: + id: "53a71b331248f2144f4f012fd7e05f86b8ee62a0" + name: "" + dbdate: null + maxdate: 1000000090.0 + known: False + path: "" + children: + - entry: + id: "16cb311fc491b0b6dfade153191ee1c09d2152cf" + name: "F" + dbdate: null + maxdate: 1000000090.0 + known: False + path: "F" + children: + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "E" + dbdate: null + maxdate: 1000000090.0 + known: False + path: "F/E" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: null + maxdate: 1000000090.0 + known: False + path: "F/E/D" + children: [] +# Isochrone graph for R10 +b7c52e28d441ca0cb736fdbe49e39eae3847ad0f: + entry: + id: "8c61bb233c89936b310d8b269a35c24bff432227" + name: "" + dbdate: null + maxdate: 1000000100.0 + known: False + path: "" + children: + - entry: + id: "db2b00211f77c6c7f1f742020e483b506b82b5d6" + name: "F" + dbdate: null + maxdate: 1000000100.0 + known: False + path: "F" + children: + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "E" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "F/E" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "F/E/D" + children: [] +# Isochrone graph for R11 +f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2: + entry: + id: "b29a1c3fee0057016af424c41d58a8811b8c3a41" + name: "" + dbdate: null + maxdate: 1000000110.0 + known: False + path: "" + children: + - entry: + id: "74fb9789d162f02deabbdfbc3c8daa97f31559a1" + name: "G" + dbdate: null + maxdate: 1000000110.0 + known: False + path: "G" + children: + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "E" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/E" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/E/D" + children: [] +# Isochrone graph for R12 +99bd98e1803343ecfabe4b05d0218475c2b1bf74: + entry: + id: "6b2d11dd7bc6c7d7dcf59afed80f57413d929cf5" + name: "" + dbdate: null + maxdate: 1000000120.0 + known: False + path: "" + children: + - entry: + id: "5aa1d185e7e32bb53a16ba0db1b06d3a6243b36f" + name: "G" + dbdate: null + maxdate: 1000000120.0 + known: False + path: "G" + children: + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "H" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/H" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/H/D" + children: [] +# Isochrone graph for R13 +10287882c7ed1b7c96f43da269e6a868b98291ff: + entry: + id: "148f08e057416af1e471abb3dcd594d27233085d" + name: "" + dbdate: null + maxdate: 1000000130.0 + known: False + path: "" + children: + - entry: + id: "8084b999790aab88e5119915ea1083e747a3f42f" + name: "G" + dbdate: null + maxdate: 1000000130.0 + known: False + path: "G" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/D" + children: [] + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "I" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/I" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/I/D" + children: [] + - entry: + id: 
"8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "H" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/H" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/H/D" + children: [] diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py new file mode 100644 index 0000000..8af6a3a --- /dev/null +++ b/swh/provenance/tests/test_isochrone_graph.py @@ -0,0 +1,88 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime, timezone +import pytest +import yaml + +from swh.model.hashutil import hash_to_bytes +from swh.provenance.model import DirectoryEntry, RevisionEntry +from swh.provenance.provenance import IsochroneNode, build_isochrone_graph, revision_add +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data +from swh.provenance.tests.test_provenance_db import ts2dt + + +def isochrone_graph_from_dict(d, depth=0) -> IsochroneNode: + """Takes a dictionary representing a tree of IsochroneNode objects, and + recursively builds the corresponding graph.""" + d = d.copy() + + d["entry"]["id"] = hash_to_bytes(d["entry"]["id"]) + d["entry"]["name"] = bytes(d["entry"]["name"], encoding="utf-8") + + if d["dbdate"] is not None: + d["dbdate"] = datetime.fromtimestamp(d["dbdate"], timezone.utc) + + if d["maxdate"] is not None: + d["maxdate"] = datetime.fromtimestamp(d["maxdate"], timezone.utc) + + node = IsochroneNode( + entry=DirectoryEntry(**d["entry"]), + dbdate=d["dbdate"], + depth=depth, + ) + node.maxdate = d["maxdate"] + node.known = d["known"] + node.path = bytes(d["path"], encoding="utf-8") + node.children = [ + isochrone_graph_from_dict(child, depth=depth + 1) for child in d["children"] + ] + return node + + +@pytest.mark.parametrize( + "repo, lower, mindepth", + ( + ("cmdbts2", True, 1), + # ("cmdbts2", False, 1), + # ("cmdbts2", True, 2), + # ("cmdbts2", False, 2), + # ("out-of-order", True, 1), + ), +) +def test_isochrone_graph(provenance, swh_storage, archive, repo, lower, mindepth): + # read data/README.md for more details on how these datasets are generated + data = load_repo_data(repo) + fill_storage(swh_storage, data) + + revisions = {rev["id"]: rev for rev in data["revision"]} + filename = f"graphs_{repo}_{'lower' if lower else 'upper'}_{mindepth}.yaml" + + with open(get_datafile(filename)) as file: + expected = yaml.full_load(file) + + for rev, graph_as_dict in expected.items(): + revision = revisions[hash_to_bytes(rev)] + entry = RevisionEntry( + id=revision["id"], + date=ts2dt(revision["date"]), + root=revision["directory"], + ) + expected_graph = isochrone_graph_from_dict(graph_as_dict) + print("Expected", expected_graph) + + # Create graph for current revision and check it has the expected structure. + computed_graph = build_isochrone_graph( + archive, + provenance, + entry, + DirectoryEntry(entry.root), + ) + print("Computed", computed_graph) + assert computed_graph == expected_graph + + # Add current revision so that provenance info is kept up to date for the + # following ones. + revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)