diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py index e80cdca..2293b22 100644 --- a/swh/provenance/graph.py +++ b/swh/provenance/graph.py @@ -1,254 +1,216 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from datetime import datetime, timezone import os from typing import Any, Dict, Optional, Set from swh.core.statsd import statsd from swh.model.hashutil import hash_to_hex from swh.model.model import Sha1Git from .archive import ArchiveInterface from .interface import ProvenanceInterface from .model import DirectoryEntry, RevisionEntry GRAPH_DURATION_METRIC = "swh_provenance_graph_duration_seconds" GRAPH_OPERATIONS_METRIC = "swh_provenance_graph_operations_total" UTCMIN = datetime.min.replace(tzinfo=timezone.utc) -class HistoryNode: - def __init__( - self, entry: RevisionEntry, is_head: bool = False, in_history: bool = False - ) -> None: - self.entry = entry - # A revision is `is_head` if it is directly pointed by an origin (ie. a head - # revision for some snapshot) - self.is_head = is_head - # A revision is `in_history` if it appears in the history graph of an already - # processed revision in the provenance database - self.in_history = in_history - # XXX: the current simplified version of the origin-revision layer algorithm - # does not use this previous two flags at all. They are kept for now but might - # be removed in the future (hence, RevisionEntry might be used instead of - # HistoryNode). - - def __str__(self) -> str: - return f"<{self.entry}: is_head={self.is_head}, in_history={self.in_history}>" - - def as_dict(self) -> Dict[str, Any]: - return { - "rev": hash_to_hex(self.entry.id), - "is_head": self.is_head, - "in_history": self.in_history, - } - - class HistoryGraph: @statsd.timed(metric=GRAPH_DURATION_METRIC, tags={"method": "build_history_graph"}) def __init__( self, - provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry, ) -> None: - self._head = HistoryNode( - revision, - is_head=provenance.revision_visited(revision), - in_history=provenance.revision_in_history(revision), - ) - self._graph: Dict[HistoryNode, Set[HistoryNode]] = {} + self._head = revision + self._graph: Dict[RevisionEntry, Set[RevisionEntry]] = {} stack = [self._head] while stack: current = stack.pop() if current not in self._graph: self._graph[current] = set() - current.entry.retrieve_parents(archive) - for parent in current.entry.parents: - node = HistoryNode( - parent, - is_head=provenance.revision_visited(parent), - in_history=provenance.revision_in_history(parent), - ) - self._graph[current].add(node) - stack.append(node) + current.retrieve_parents(archive) + for parent in current.parents: + self._graph[current].add(parent) + stack.append(parent) @property - def head(self) -> HistoryNode: + def head(self) -> RevisionEntry: return self._head @property - def parents(self) -> Dict[HistoryNode, Set[HistoryNode]]: + def parents(self) -> Dict[RevisionEntry, Set[RevisionEntry]]: return self._graph def __str__(self) -> str: return f" Dict[str, Any]: return { - "head": self.head.as_dict(), + "head": hash_to_hex(self.head.id), "graph": { - hash_to_hex(node.entry.id): sorted( - [parent.as_dict() for parent in parents], - key=lambda d: d["rev"], + hash_to_hex(node.id): sorted( + [hash_to_hex(parent.id) for parent in parents] ) for node, parents in self._graph.items() }, } class IsochroneNode: def __init__( self, entry: DirectoryEntry, dbdate: Optional[datetime] = None, depth: int = 0, prefix: bytes = b"", ) -> None: self.entry = entry self.depth = depth # dbdate is the maxdate for this node that comes from the DB self._dbdate: Optional[datetime] = dbdate # maxdate is set by the maxdate computation algorithm self.maxdate: Optional[datetime] = None self.invalid = False self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name self.children: Set[IsochroneNode] = set() @property def dbdate(self) -> Optional[datetime]: # use a property to make this attribute (mostly) read-only return self._dbdate def invalidate(self) -> None: statsd.increment( metric=GRAPH_OPERATIONS_METRIC, tags={"method": "invalidate_frontier"} ) self._dbdate = None self.maxdate = None self.invalid = True def add_directory( self, child: DirectoryEntry, date: Optional[datetime] = None ) -> IsochroneNode: # we should not be processing this node (ie add subdirectories or files) if it's # actually known by the provenance DB assert self.dbdate is None node = IsochroneNode(child, dbdate=date, depth=self.depth + 1, prefix=self.path) self.children.add(node) return node def __str__(self) -> str: return ( f"<{self.entry}: depth={self.depth}, dbdate={self.dbdate}, " f"maxdate={self.maxdate}, invalid={self.invalid}, path={self.path!r}, " f"children=[{', '.join(str(child) for child in self.children)}]>" ) def __eq__(self, other: Any) -> bool: return isinstance(other, IsochroneNode) and self.__dict__ == other.__dict__ def __hash__(self) -> int: # only immutable attributes are considered to compute hash return hash((self.entry, self.depth, self.path)) @statsd.timed(metric=GRAPH_DURATION_METRIC, tags={"method": "build_isochrone_graph"}) def build_isochrone_graph( provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry, directory: DirectoryEntry, minsize: int = 0, ) -> IsochroneNode: assert revision.date is not None assert revision.root == directory.id # this function process a revision in 2 steps: # # 1. build the tree structure of IsochroneNode objects (one INode per # directory under the root directory of the revision but not following # known subdirectories), and gather the dates from the DB for already # known objects; for files, just keep all the dates in a global 'fdates' # dict; note that in this step, we will only recurse the directories # that are not already known. # # 2. compute the maxdate for each node of the tree that was not found in the DB. # Build the nodes structure root_date = provenance.directory_get_date_in_isochrone_frontier(directory) root = IsochroneNode(directory, dbdate=root_date) stack = [root] fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date} while stack: current = stack.pop() if current.dbdate is None or current.dbdate >= revision.date: # If current directory has an associated date in the isochrone frontier that # is greater or equal to the current revision's one, it should be ignored as # the revision is being processed out of order. if current.dbdate is not None and current.dbdate >= revision.date: current.invalidate() # Pre-query all known dates for directories in the current directory # for the provenance object to have them cached and (potentially) improve # performance. current.entry.retrieve_children(archive, minsize=minsize) ddates = provenance.directory_get_dates_in_isochrone_frontier( current.entry.dirs ) for dir in current.entry.dirs: # Recursively analyse subdirectory nodes node = current.add_directory(dir, date=ddates.get(dir.id, None)) stack.append(node) fdates.update(provenance.content_get_early_dates(current.entry.files)) # Precalculate max known date for each node in the graph (only directory nodes are # pushed to the stack). stack = [root] while stack: current = stack.pop() # Current directory node is known if it already has an assigned date (ie. it was # already seen as an isochrone frontier). if current.dbdate is not None: assert current.maxdate is None current.maxdate = current.dbdate else: if any(x.maxdate is None for x in current.children): # at least one child of current has no maxdate yet # Current node needs to be analysed again after its children. stack.append(current) for child in current.children: if child.maxdate is None: # if child.maxdate is None, it must be processed stack.append(child) else: # all the files and directories under current have a maxdate, # we can infer the maxdate for current directory assert current.maxdate is None # if all content is already known, update current directory info. current.maxdate = max( [UTCMIN] + [ child.maxdate for child in current.children if child.maxdate is not None # for mypy ] + [ fdates.get(file.id, revision.date) for file in current.entry.files ] ) return root diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py index 78a7279..784b920 100644 --- a/swh/provenance/interface.py +++ b/swh/provenance/interface.py @@ -1,396 +1,384 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from dataclasses import dataclass from datetime import datetime import enum from types import TracebackType from typing import Dict, Generator, Iterable, Optional, Set, Type, Union from typing_extensions import Protocol, runtime_checkable from swh.core.api import remote_api_endpoint from swh.model.model import Sha1Git from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry class EntityType(enum.Enum): CONTENT = "content" DIRECTORY = "directory" REVISION = "revision" ORIGIN = "origin" class RelationType(enum.Enum): CNT_EARLY_IN_REV = "content_in_revision" CNT_IN_DIR = "content_in_directory" DIR_IN_REV = "directory_in_revision" REV_IN_ORG = "revision_in_origin" REV_BEFORE_REV = "revision_before_revision" @dataclass(eq=True, frozen=True) class ProvenanceResult: content: Sha1Git revision: Sha1Git date: datetime origin: Optional[str] path: bytes @dataclass(eq=True, frozen=True) class DirectoryData: """Object representing the data associated to a directory in the provenance model, where `date` is the date of the directory in the isochrone frontier, and `flat` is a flag acknowledging that a flat model for the elements outside the frontier has already been created. """ date: datetime flat: bool @dataclass(eq=True, frozen=True) class RevisionData: """Object representing the data associated to a revision in the provenance model, where `date` is the optional date of the revision (specifying it acknowledges that the revision was already processed by the revision-content algorithm); and `origin` identifies the preferred origin for the revision, if any. """ date: Optional[datetime] origin: Optional[Sha1Git] @dataclass(eq=True, frozen=True) class RelationData: """Object representing a relation entry in the provenance model, where `src` and `dst` are the sha1 ids of the entities being related, and `path` is optional depending on the relation being represented. """ dst: Sha1Git path: Optional[bytes] @runtime_checkable class ProvenanceStorageInterface(Protocol): def __enter__(self) -> ProvenanceStorageInterface: ... def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: ... @remote_api_endpoint("close") def close(self) -> None: """Close connection to the storage and release resources.""" ... @remote_api_endpoint("content_add") def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool: """Add blobs identified by sha1 ids, with an associated date (as paired in `cnts`) to the provenance storage. Return a boolean stating whether the information was successfully stored. """ ... @remote_api_endpoint("content_find_first") def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: """Retrieve the first occurrence of the blob identified by `id`.""" ... @remote_api_endpoint("content_find_all") def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: """Retrieve all the occurrences of the blob identified by `id`.""" ... @remote_api_endpoint("content_get") def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: """Retrieve the associated date for each blob sha1 in `ids`.""" ... @remote_api_endpoint("directory_add") def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool: """Add directories identified by sha1 ids, with associated date and (optional) flatten flag (as paired in `dirs`) to the provenance storage. If the flatten flag is set to None, the previous value present in the storage is preserved. Return a boolean stating if the information was successfully stored. """ ... @remote_api_endpoint("directory_get") def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]: """Retrieve the associated date and (optional) flatten flag for each directory sha1 in `ids`. If some directories has no associated date, it is not present in the resulting dictionary. """ ... @remote_api_endpoint("entity_get_all") def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: """Retrieve all sha1 ids for entities of type `entity` present in the provenance model. This method is used only in tests. """ ... @remote_api_endpoint("location_add") def location_add(self, paths: Iterable[bytes]) -> bool: """Register the given `paths` in the storage.""" ... @remote_api_endpoint("location_get_all") def location_get_all(self) -> Set[bytes]: """Retrieve all paths present in the provenance model. This method is used only in tests.""" ... @remote_api_endpoint("open") def open(self) -> None: """Open connection to the storage and allocate necessary resources.""" ... @remote_api_endpoint("origin_add") def origin_add(self, orgs: Dict[Sha1Git, str]) -> bool: """Add origins identified by sha1 ids, with their corresponding url (as paired in `orgs`) to the provenance storage. Return a boolean stating if the information was successfully stored. """ ... @remote_api_endpoint("origin_get") def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]: """Retrieve the associated url for each origin sha1 in `ids`.""" ... @remote_api_endpoint("revision_add") def revision_add( self, revs: Union[Iterable[Sha1Git], Dict[Sha1Git, RevisionData]] ) -> bool: """Add revisions identified by sha1 ids, with optional associated date or origin (as paired in `revs`) to the provenance storage. Return a boolean stating if the information was successfully stored. """ ... @remote_api_endpoint("revision_get") def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: """Retrieve the associated date and origin for each revision sha1 in `ids`. If some revision has no associated date nor origin, it is not present in the resulting dictionary. """ ... @remote_api_endpoint("relation_add") def relation_add( self, relation: RelationType, data: Dict[Sha1Git, Set[RelationData]] ) -> bool: """Add entries in the selected `relation`. This method assumes all entities being related are already registered in the storage. See `content_add`, `directory_add`, `origin_add`, and `revision_add`. """ ... @remote_api_endpoint("relation_get") def relation_get( self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False ) -> Dict[Sha1Git, Set[RelationData]]: """Retrieve all entries in the selected `relation` whose source entities are identified by some sha1 id in `ids`. If `reverse` is set, destination entities are matched instead. """ ... @remote_api_endpoint("relation_get_all") def relation_get_all( self, relation: RelationType ) -> Dict[Sha1Git, Set[RelationData]]: """Retrieve all entries in the selected `relation` that are present in the provenance model. This method is used only in tests. """ ... @remote_api_endpoint("with_path") def with_path(self) -> bool: ... @runtime_checkable class ProvenanceInterface(Protocol): storage: ProvenanceStorageInterface def __enter__(self) -> ProvenanceInterface: ... def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: ... def close(self) -> None: """Close connection to the underlying `storage` and release resources.""" ... def flush(self) -> None: """Flush internal cache to the underlying `storage`.""" ... def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ) -> None: """Associate `blob` with `directory` in the provenance model. `prefix` is the relative path from `directory` to `blob` (excluding `blob`'s name). """ ... def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ) -> None: """Associate `blob` with `revision` in the provenance model. `prefix` is the absolute path from `revision`'s root directory to `blob` (excluding `blob`'s name). """ ... def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: """Retrieve the first occurrence of the blob identified by `id`.""" ... def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: """Retrieve all the occurrences of the blob identified by `id`.""" ... def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: """Retrieve the earliest known date of `blob`.""" ... def content_get_early_dates( self, blobs: Iterable[FileEntry] ) -> Dict[Sha1Git, datetime]: """Retrieve the earliest known date for each blob in `blobs`. If some blob has no associated date, it is not present in the resulting dictionary. """ ... def content_set_early_date(self, blob: FileEntry, date: datetime) -> None: """Associate `date` to `blob` as it's earliest known date.""" ... def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ) -> None: """Associate `directory` with `revision` in the provenance model. `path` is the absolute path from `revision`'s root directory to `directory` (including `directory`'s name). """ ... def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]: """Check if the directory is already flattenned in the provenance model. If the directory is unknown for the model, the methods returns None. """ ... def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None: """Mark the directory as flattenned in the provenance model. If the directory is unknown for the model, this method has no effect. """ ... def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: """Retrieve the earliest known date of `directory` as an isochrone frontier in the provenance model. """ ... def directory_get_dates_in_isochrone_frontier( self, dirs: Iterable[DirectoryEntry] ) -> Dict[Sha1Git, datetime]: """Retrieve the earliest known date for each directory in `dirs` as isochrone frontiers provenance model. If some directory has no associated date, it is not present in the resulting dictionary. """ ... def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ) -> None: """Associate `date` to `directory` as it's earliest known date as an isochrone frontier in the provenance model. """ ... def open(self) -> None: """Open connection to the underlying `storage` and allocate necessary resources. """ ... def origin_add(self, origin: OriginEntry) -> None: """Add `origin` to the provenance model.""" ... def revision_add(self, revision: RevisionEntry) -> None: """Add `revision` to the provenance model. This implies storing `revision`'s date in the model, thus `revision.date` must be a valid date. """ ... def revision_add_before_revision( self, head: RevisionEntry, revision: RevisionEntry ) -> None: """Associate `revision` to `head` as an ancestor of the latter.""" ... def revision_add_to_origin( self, origin: OriginEntry, revision: RevisionEntry ) -> None: """Associate `revision` to `origin` as a head revision of the latter (ie. the target of an snapshot for `origin` in the archive).""" ... def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]: """Retrieve the date associated to `revision`.""" ... def revision_get_preferred_origin( self, revision: RevisionEntry ) -> Optional[Sha1Git]: """Retrieve the preferred origin associated to `revision`.""" ... - def revision_in_history(self, revision: RevisionEntry) -> bool: - """Check if `revision` is known to be an ancestor of some head revision in the - provenance model. - """ - ... - def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ) -> None: """Associate `origin` as the preferred origin for `revision`.""" ... - - def revision_visited(self, revision: RevisionEntry) -> bool: - """Check if `revision` is known to be a head revision for some origin in the - provenance model. - """ - ... diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py index 5d24568..73180b7 100644 --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -1,103 +1,98 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from itertools import islice from typing import Generator, Iterable, Iterator, List, Optional, Tuple from swh.core.statsd import statsd from swh.model.model import Sha1Git from .archive import ArchiveInterface from .graph import HistoryGraph from .interface import ProvenanceInterface from .model import OriginEntry, RevisionEntry ORIGIN_DURATION_METRIC = "swh_provenance_origin_revision_layer_duration_seconds" class CSVOriginIterator: """Iterator over origin visit statuses typically present in the given CSV file. The input is an iterator that produces 2 elements per row: (url, snap) where: - url: is the origin url of the visit - snap: sha1_git of the snapshot pointed by the visit status """ def __init__( self, statuses: Iterable[Tuple[str, Sha1Git]], limit: Optional[int] = None, ) -> None: self.statuses: Iterator[Tuple[str, Sha1Git]] if limit is not None: self.statuses = islice(statuses, limit) else: self.statuses = iter(statuses) def __iter__(self) -> Generator[OriginEntry, None, None]: return (OriginEntry(url, snapshot) for url, snapshot in self.statuses) @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "main"}) def origin_add( provenance: ProvenanceInterface, archive: ArchiveInterface, origins: List[OriginEntry], ) -> None: for origin in origins: provenance.origin_add(origin) origin.retrieve_revisions(archive) for revision in origin.revisions: - graph = HistoryGraph(provenance, archive, revision) + graph = HistoryGraph(archive, revision) origin_add_revision(provenance, origin, graph) provenance.flush() @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "process_revision"}) def origin_add_revision( provenance: ProvenanceInterface, origin: OriginEntry, graph: HistoryGraph, ) -> None: - # XXX: simplified version of the origin-revision algorithm. This is generating flat - # models for the history of all head revisions. No previous result is reused now! - # The previous implementation was missing some paths from origins to certain - # revisions due to a wrong reuse logic. - # head is treated separately since it should always be added to the given origin - check_preferred_origin(provenance, origin, graph.head.entry) - provenance.revision_add_to_origin(origin, graph.head.entry) + check_preferred_origin(provenance, origin, graph.head) + provenance.revision_add_to_origin(origin, graph.head) visited = {graph.head} # head's history should be recursively iterated starting from its parents stack = list(graph.parents[graph.head]) while stack: current = stack.pop() - check_preferred_origin(provenance, origin, current.entry) + check_preferred_origin(provenance, origin, current) # create a link between it and the head, and recursively walk its history - provenance.revision_add_before_revision(graph.head.entry, current.entry) + provenance.revision_add_before_revision(graph.head, current) visited.add(current) for parent in graph.parents[current]: if parent not in visited: stack.append(parent) @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "check_preferred_origin"}) def check_preferred_origin( provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry, ) -> None: # if the revision has no preferred origin just set the given origin as the # preferred one. TODO: this should be improved in the future! preferred = provenance.revision_get_preferred_origin(revision) if preferred is None: provenance.revision_set_preferred_origin(origin, revision) diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py index b0f43c6..b78e327 100644 --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -1,504 +1,494 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime import logging import os from types import TracebackType from typing import Dict, Generator, Iterable, Optional, Set, Tuple, Type from typing_extensions import Literal, TypedDict from swh.core.statsd import statsd from swh.model.model import Sha1Git from .interface import ( DirectoryData, ProvenanceInterface, ProvenanceResult, ProvenanceStorageInterface, RelationData, RelationType, RevisionData, ) from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry from .util import path_normalize LOGGER = logging.getLogger(__name__) BACKEND_DURATION_METRIC = "swh_provenance_backend_duration_seconds" BACKEND_OPERATIONS_METRIC = "swh_provenance_backend_operations_total" class DatetimeCache(TypedDict): data: Dict[Sha1Git, Optional[datetime]] # None means unknown added: Set[Sha1Git] class OriginCache(TypedDict): data: Dict[Sha1Git, str] added: Set[Sha1Git] class RevisionCache(TypedDict): data: Dict[Sha1Git, Sha1Git] added: Set[Sha1Git] class ProvenanceCache(TypedDict): content: DatetimeCache directory: DatetimeCache directory_flatten: Dict[Sha1Git, Optional[bool]] # None means unknown revision: DatetimeCache # below are insertion caches only content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]] content_in_directory: Set[Tuple[Sha1Git, Sha1Git, bytes]] directory_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]] # these two are for the origin layer origin: OriginCache revision_origin: RevisionCache revision_before_revision: Dict[Sha1Git, Set[Sha1Git]] revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]] def new_cache() -> ProvenanceCache: return ProvenanceCache( content=DatetimeCache(data={}, added=set()), directory=DatetimeCache(data={}, added=set()), directory_flatten={}, revision=DatetimeCache(data={}, added=set()), content_in_revision=set(), content_in_directory=set(), directory_in_revision=set(), origin=OriginCache(data={}, added=set()), revision_origin=RevisionCache(data={}, added=set()), revision_before_revision={}, revision_in_origin=set(), ) class Provenance: def __init__(self, storage: ProvenanceStorageInterface) -> None: self.storage = storage self.cache = new_cache() def __enter__(self) -> ProvenanceInterface: self.open() return self def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: self.close() def clear_caches(self) -> None: self.cache = new_cache() def close(self) -> None: self.storage.close() @statsd.timed(metric=BACKEND_DURATION_METRIC, tags={"method": "flush"}) def flush(self) -> None: self.flush_revision_content_layer() self.flush_origin_revision_layer() self.clear_caches() @statsd.timed( metric=BACKEND_DURATION_METRIC, tags={"method": "flush_origin_revision"} ) def flush_origin_revision_layer(self) -> None: # Origins and revisions should be inserted first so that internal ids' # resolution works properly. urls = { sha1: url for sha1, url in self.cache["origin"]["data"].items() if sha1 in self.cache["origin"]["added"] } if urls: while not self.storage.origin_add(urls): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_origin_revision_retry_origin"}, ) LOGGER.warning( "Unable to write origins urls to the storage. Retrying..." ) rev_orgs = { # Destinations in this relation should match origins in the next one **{ src: RevisionData(date=None, origin=None) for src in self.cache["revision_before_revision"] }, **{ # This relation comes second so that non-None origins take precedence src: RevisionData(date=None, origin=org) for src, org in self.cache["revision_in_origin"] }, } if rev_orgs: while not self.storage.revision_add(rev_orgs): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_origin_revision_retry_revision"}, ) LOGGER.warning( "Unable to write revision entities to the storage. Retrying..." ) # Second, flat models for revisions' histories (ie. revision-before-revision). if self.cache["revision_before_revision"]: rev_before_rev = { src: {RelationData(dst=dst, path=None) for dst in dsts} for src, dsts in self.cache["revision_before_revision"].items() } while not self.storage.relation_add( RelationType.REV_BEFORE_REV, rev_before_rev ): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={ "method": "flush_origin_revision_retry_revision_before_revision" }, ) LOGGER.warning( "Unable to write %s rows to the storage. Retrying...", RelationType.REV_BEFORE_REV, ) # Heads (ie. revision-in-origin entries) should be inserted once flat models for # their histories were already added. This is to guarantee consistent results if # something needs to be reprocessed due to a failure: already inserted heads # won't get reprocessed in such a case. if self.cache["revision_in_origin"]: rev_in_org: Dict[Sha1Git, Set[RelationData]] = {} for src, dst in self.cache["revision_in_origin"]: rev_in_org.setdefault(src, set()).add(RelationData(dst=dst, path=None)) while not self.storage.relation_add(RelationType.REV_IN_ORG, rev_in_org): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_origin_revision_retry_revision_in_origin"}, ) LOGGER.warning( "Unable to write %s rows to the storage. Retrying...", RelationType.REV_IN_ORG, ) @statsd.timed( metric=BACKEND_DURATION_METRIC, tags={"method": "flush_revision_content"} ) def flush_revision_content_layer(self) -> None: # Register in the storage all entities, to ensure the coming relations can # properly resolve any internal reference if needed. Content and directory # entries may safely be registered with their associated dates. In contrast, # revision entries should be registered without date, as it is used to # acknowledge that the flushing was successful. Also, directories are # registered with their flatten flag not set. cnt_dates = { sha1: date for sha1, date in self.cache["content"]["data"].items() if sha1 in self.cache["content"]["added"] and date is not None } if cnt_dates: while not self.storage.content_add(cnt_dates): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_content_date"}, ) LOGGER.warning( "Unable to write content dates to the storage. Retrying..." ) dir_dates = { sha1: DirectoryData(date=date, flat=False) for sha1, date in self.cache["directory"]["data"].items() if sha1 in self.cache["directory"]["added"] and date is not None } if dir_dates: while not self.storage.directory_add(dir_dates): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_directory_date"}, ) LOGGER.warning( "Unable to write directory dates to the storage. Retrying..." ) revs = { sha1 for sha1, date in self.cache["revision"]["data"].items() if sha1 in self.cache["revision"]["added"] and date is not None } if revs: while not self.storage.revision_add(revs): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_revision_none"}, ) LOGGER.warning( "Unable to write revision entities to the storage. Retrying..." ) paths = { path for _, _, path in self.cache["content_in_revision"] | self.cache["content_in_directory"] | self.cache["directory_in_revision"] } if paths: while not self.storage.location_add(paths): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_location"}, ) LOGGER.warning( "Unable to write locations entities to the storage. Retrying..." ) # For this layer, relations need to be inserted first so that, in case of # failure, reprocessing the input does not generated an inconsistent database. if self.cache["content_in_revision"]: cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {} for src, dst, path in self.cache["content_in_revision"]: cnt_in_rev.setdefault(src, set()).add(RelationData(dst=dst, path=path)) while not self.storage.relation_add( RelationType.CNT_EARLY_IN_REV, cnt_in_rev ): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_content_in_revision"}, ) LOGGER.warning( "Unable to write %s rows to the storage. Retrying...", RelationType.CNT_EARLY_IN_REV, ) if self.cache["content_in_directory"]: cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {} for src, dst, path in self.cache["content_in_directory"]: cnt_in_dir.setdefault(src, set()).add(RelationData(dst=dst, path=path)) while not self.storage.relation_add(RelationType.CNT_IN_DIR, cnt_in_dir): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={ "method": "flush_revision_content_retry_content_in_directory" }, ) LOGGER.warning( "Unable to write %s rows to the storage. Retrying...", RelationType.CNT_IN_DIR, ) if self.cache["directory_in_revision"]: dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {} for src, dst, path in self.cache["directory_in_revision"]: dir_in_rev.setdefault(src, set()).add(RelationData(dst=dst, path=path)) while not self.storage.relation_add(RelationType.DIR_IN_REV, dir_in_rev): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={ "method": "flush_revision_content_retry_directory_in_revision" }, ) LOGGER.warning( "Unable to write %s rows to the storage. Retrying...", RelationType.DIR_IN_REV, ) # After relations, flatten flags for directories can be safely set (if # applicable) acknowledging those directories that have already be flattened. # Similarly, dates for the revisions are set to acknowledge that these revisions # won't need to be reprocessed in case of failure. dir_acks = { sha1: DirectoryData( date=date, flat=self.cache["directory_flatten"].get(sha1) or False ) for sha1, date in self.cache["directory"]["data"].items() if self.cache["directory_flatten"].get(sha1) and date is not None } if dir_acks: while not self.storage.directory_add(dir_acks): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_directory_ack"}, ) LOGGER.warning( "Unable to write directory dates to the storage. Retrying..." ) rev_dates = { sha1: RevisionData(date=date, origin=None) for sha1, date in self.cache["revision"]["data"].items() if sha1 in self.cache["revision"]["added"] and date is not None } if rev_dates: while not self.storage.revision_add(rev_dates): statsd.increment( metric=BACKEND_OPERATIONS_METRIC, tags={"method": "flush_revision_content_retry_revision_date"}, ) LOGGER.warning( "Unable to write revision dates to the storage. Retrying..." ) def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ) -> None: self.cache["content_in_directory"].add( (blob.id, directory.id, path_normalize(os.path.join(prefix, blob.name))) ) def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ) -> None: self.cache["content_in_revision"].add( (blob.id, revision.id, path_normalize(os.path.join(prefix, blob.name))) ) def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: return self.storage.content_find_first(id) def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: yield from self.storage.content_find_all(id, limit=limit) def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: return self.get_dates("content", [blob.id]).get(blob.id) def content_get_early_dates( self, blobs: Iterable[FileEntry] ) -> Dict[Sha1Git, datetime]: return self.get_dates("content", [blob.id for blob in blobs]) def content_set_early_date(self, blob: FileEntry, date: datetime) -> None: self.cache["content"]["data"][blob.id] = date self.cache["content"]["added"].add(blob.id) def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ) -> None: self.cache["directory_in_revision"].add( (directory.id, revision.id, path_normalize(path)) ) def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]: cache = self.cache["directory_flatten"] if directory.id not in cache: cache.setdefault(directory.id, None) ret = self.storage.directory_get([directory.id]) if directory.id in ret: dir = ret[directory.id] cache[directory.id] = dir.flat # date is kept to ensure we have it available when flushing self.cache["directory"]["data"][directory.id] = dir.date return cache.get(directory.id) def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None: self.cache["directory_flatten"][directory.id] = True def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: return self.get_dates("directory", [directory.id]).get(directory.id) def directory_get_dates_in_isochrone_frontier( self, dirs: Iterable[DirectoryEntry] ) -> Dict[Sha1Git, datetime]: return self.get_dates("directory", [directory.id for directory in dirs]) def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ) -> None: self.cache["directory"]["data"][directory.id] = date self.cache["directory"]["added"].add(directory.id) def get_dates( self, entity: Literal["content", "directory", "revision"], ids: Iterable[Sha1Git], ) -> Dict[Sha1Git, datetime]: cache = self.cache[entity] missing_ids = set(id for id in ids if id not in cache) if missing_ids: if entity == "content": cache["data"].update(self.storage.content_get(missing_ids)) elif entity == "directory": cache["data"].update( { id: dir.date for id, dir in self.storage.directory_get(missing_ids).items() } ) elif entity == "revision": cache["data"].update( { id: rev.date for id, rev in self.storage.revision_get(missing_ids).items() } ) dates: Dict[Sha1Git, datetime] = {} for sha1 in ids: date = cache["data"].setdefault(sha1, None) if date is not None: dates[sha1] = date return dates def open(self) -> None: self.storage.open() def origin_add(self, origin: OriginEntry) -> None: self.cache["origin"]["data"][origin.id] = origin.url self.cache["origin"]["added"].add(origin.id) def revision_add(self, revision: RevisionEntry) -> None: self.cache["revision"]["data"][revision.id] = revision.date self.cache["revision"]["added"].add(revision.id) def revision_add_before_revision( self, head: RevisionEntry, revision: RevisionEntry ) -> None: self.cache["revision_before_revision"].setdefault(revision.id, set()).add( head.id ) def revision_add_to_origin( self, origin: OriginEntry, revision: RevisionEntry ) -> None: self.cache["revision_in_origin"].add((revision.id, origin.id)) def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]: return self.get_dates("revision", [revision.id]).get(revision.id) def revision_get_preferred_origin( self, revision: RevisionEntry ) -> Optional[Sha1Git]: cache = self.cache["revision_origin"]["data"] if revision.id not in cache: ret = self.storage.revision_get([revision.id]) if revision.id in ret: origin = ret[revision.id].origin if origin is not None: cache[revision.id] = origin return cache.get(revision.id) - def revision_in_history(self, revision: RevisionEntry) -> bool: - return revision.id in self.cache["revision_before_revision"] or bool( - self.storage.relation_get(RelationType.REV_BEFORE_REV, [revision.id]) - ) - def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ) -> None: self.cache["revision_origin"]["data"][revision.id] = origin.id self.cache["revision_origin"]["added"].add(revision.id) - - def revision_visited(self, revision: RevisionEntry) -> bool: - return revision.id in dict(self.cache["revision_in_origin"]) or bool( - self.storage.relation_get(RelationType.REV_IN_ORG, [revision.id]) - ) diff --git a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml index 8d2ad11..d544d3c 100644 --- a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml +++ b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml @@ -1,230 +1,125 @@ # History graph for snapshot with branches: R01 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - head: - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: False - in_history: False + - head: "1444db96cbd8cd791abe83527becee73d3c64e86" graph: 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: False + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R03 and R06 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - head: - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: False - in_history: False + - head: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" graph: 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - is_head: False - in_history: False + - "1c533587277731236616cac0d44f3b46c1da0f8a" 1c533587277731236616cac0d44f3b46c1da0f8a: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: False + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] - - head: - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - is_head: False - in_history: False + - head: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" graph: 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R05 and R06 - origin: "https://repo_with_merges/2/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - head: - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" - is_head: False - in_history: False + - head: "65e58853df939b318c106c4c1f55acaf8b41c74c" graph: 65e58853df939b318c106c4c1f55acaf8b41c74c: - - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" - is_head: False - in_history: False + - "0d66eadcc15e0d7f6cfd4289329a7749a1309982" 0d66eadcc15e0d7f6cfd4289329a7749a1309982: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: True - in_history: False + - "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - is_head: False - in_history: True + - "1c533587277731236616cac0d44f3b46c1da0f8a" 1c533587277731236616cac0d44f3b46c1da0f8a: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] - - head: - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - is_head: True - in_history: False + - head: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" graph: 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R06 and R07 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - head: - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - is_head: True - in_history: False + - head: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" graph: 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] - - head: - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" - is_head: False - in_history: False + - head: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" graph: fff0089fad98e8f5b46ec5c9025a20a602851ba6: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: True - in_history: True + - "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - is_head: False - in_history: True + - "1c533587277731236616cac0d44f3b46c1da0f8a" 1c533587277731236616cac0d44f3b46c1da0f8a: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] # History graph for snapshot with branches: R08 - origin: "https://repo_with_merges/1/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - head: - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" - is_head: False - in_history: False + - head: "7c8f29237dded4f9d265e46ec7066503e7858e87" graph: 7c8f29237dded4f9d265e46ec7066503e7858e87: - - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" - is_head: True - in_history: False - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - is_head: True - in_history: False - - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" - is_head: True - in_history: False + - "65e58853df939b318c106c4c1f55acaf8b41c74c" + - "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + - "fff0089fad98e8f5b46ec5c9025a20a602851ba6" 65e58853df939b318c106c4c1f55acaf8b41c74c: - - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" - is_head: False - in_history: True + - "0d66eadcc15e0d7f6cfd4289329a7749a1309982" 0d66eadcc15e0d7f6cfd4289329a7749a1309982: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: True - in_history: True + - "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - is_head: False - in_history: True + - "1c533587277731236616cac0d44f3b46c1da0f8a" 1c533587277731236616cac0d44f3b46c1da0f8a: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" fff0089fad98e8f5b46ec5c9025a20a602851ba6: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: True - in_history: True + - "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" # History graph for snapshot with branches: R08 - origin: "https://repo_with_merges/2/" snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" graphs: - - head: - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" - is_head: True - in_history: False + - head: "7c8f29237dded4f9d265e46ec7066503e7858e87" graph: 7c8f29237dded4f9d265e46ec7066503e7858e87: - - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" - is_head: True - in_history: True - - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" - is_head: True - in_history: True - - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" - is_head: True - in_history: True + - "65e58853df939b318c106c4c1f55acaf8b41c74c" + - "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + - "fff0089fad98e8f5b46ec5c9025a20a602851ba6" 65e58853df939b318c106c4c1f55acaf8b41c74c: - - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" - is_head: False - in_history: True + - "0d66eadcc15e0d7f6cfd4289329a7749a1309982" 0d66eadcc15e0d7f6cfd4289329a7749a1309982: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: True - in_history: True + - "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb: - - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" - is_head: False - in_history: True + - "1c533587277731236616cac0d44f3b46c1da0f8a" 1c533587277731236616cac0d44f3b46c1da0f8a: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" 1444db96cbd8cd791abe83527becee73d3c64e86: - - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" - is_head: False - in_history: True + - "c0d8929936631ecbcf9147be6b8aa13b13b014e4" c0d8929936631ecbcf9147be6b8aa13b13b014e4: [] 72d92d41a9095db2dd6b8fb1c62d92c8251753ff: - - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" - is_head: True - in_history: True + - "1444db96cbd8cd791abe83527becee73d3c64e86" fff0089fad98e8f5b46ec5c9025a20a602851ba6: - - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" - is_head: True - in_history: True + - "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py index ca721e1..8aaf489 100644 --- a/swh/provenance/tests/test_history_graph.py +++ b/swh/provenance/tests/test_history_graph.py @@ -1,55 +1,54 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import yaml from swh.model.hashutil import hash_to_bytes from swh.provenance.archive import ArchiveInterface from swh.provenance.graph import HistoryGraph from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry, RevisionEntry from swh.provenance.origin import origin_add_revision from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data @pytest.mark.parametrize( "repo, visit", (("with-merges", "visits-01"),), ) @pytest.mark.parametrize("batch", (True, False)) def test_history_graph( provenance: ProvenanceInterface, archive: ArchiveInterface, repo: str, visit: str, batch: bool, ) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(archive.storage, data) filename = f"history_graphs_{repo}_{visit}.yaml" with open(get_datafile(filename)) as file: for expected in yaml.full_load(file): entry = OriginEntry(expected["origin"], hash_to_bytes(expected["snapshot"])) provenance.origin_add(entry) for expected_graph_as_dict in expected["graphs"]: print("Expected graph:", expected_graph_as_dict) computed_graph = HistoryGraph( - provenance, archive, - RevisionEntry(hash_to_bytes(expected_graph_as_dict["head"]["rev"])), + RevisionEntry(hash_to_bytes(expected_graph_as_dict["head"])), ) print("Computed graph:", computed_graph.as_dict()) assert computed_graph.as_dict() == expected_graph_as_dict origin_add_revision(provenance, entry, computed_graph) if not batch: provenance.flush()