diff --git a/swh/provenance/model.py b/swh/provenance/model.py index a731e5e..9464271 100644 --- a/swh/provenance/model.py +++ b/swh/provenance/model.py @@ -1,141 +1,161 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime from typing import Iterable, Iterator, List, Optional from .archive import ArchiveInterface class OriginEntry: def __init__( self, url: str, date: datetime, snapshot: bytes, id: Optional[int] = None ): self.url = url # TODO: this is probably not needed and will be removed! # self.date = date self.snapshot = snapshot self.id = id self._revisions: Optional[List[RevisionEntry]] = None def retrieve_revisions(self, archive: ArchiveInterface): if self._revisions is None: self._revisions = [ RevisionEntry(rev) for rev in archive.snapshot_get_heads(self.snapshot) ] @property def revisions(self) -> Iterator["RevisionEntry"]: if self._revisions is None: raise RuntimeError( "Revisions of this node has not yet been retrieved. " "Please call retrieve_revisions() before using this property." ) return (x for x in self._revisions) + def __str__(self): + return ( + f"" + ) + class RevisionEntry: def __init__( self, id: bytes, date: Optional[datetime] = None, root: Optional[bytes] = None, parents: Optional[Iterable[bytes]] = None, ): self.id = id self.date = date assert self.date is None or self.date.tzinfo is not None self.root = root - self._parents = parents - self._nodes: List[RevisionEntry] = [] - - def parents(self, archive: ArchiveInterface): - if self._parents is None: - revision = list(archive.revision_get([self.id])) - if revision: - self._parents = revision[0].parents - if self._parents and not self._nodes: - self._nodes = [ + self._parents_ids = parents + self._parents_entries: Optional[List[RevisionEntry]] = None + + def retrieve_parents(self, archive: ArchiveInterface): + if self._parents_entries is None: + if self._parents_ids is None: + revision = list(archive.revision_get([self.id])) + if revision: + self._parents_ids = revision[0].parents + else: + self._parents_ids = [] + + self._parents_entries = [ RevisionEntry( id=rev.id, root=rev.directory, date=rev.date.to_datetime(), parents=rev.parents, ) - for rev in archive.revision_get(self._parents) + for rev in archive.revision_get(self._parents_ids) if rev.date is not None ] - yield from self._nodes + + @property + def parents(self) -> Iterator["RevisionEntry"]: + if self._parents_entries is None: + raise RuntimeError( + "Parents of this node has not yet been retrieved. " + "Please call retrieve_parents() before using this property." + ) + return (x for x in self._parents_entries) def __str__(self): - return f"" + return ( + f"" + ) class DirectoryEntry: def __init__(self, id: bytes, name: bytes = b""): self.id = id self.name = name self._files: Optional[List[FileEntry]] = None self._dirs: Optional[List[DirectoryEntry]] = None def retrieve_children(self, archive: ArchiveInterface): if self._files is None and self._dirs is None: self._files = [] self._dirs = [] for child in archive.directory_ls(self.id): if child["type"] == "dir": self._dirs.append( DirectoryEntry(child["target"], name=child["name"]) ) elif child["type"] == "file": self._files.append(FileEntry(child["target"], child["name"])) @property def files(self) -> Iterator["FileEntry"]: if self._files is None: raise RuntimeError( "Children of this node has not yet been retrieved. " "Please call retrieve_children() before using this property." ) return (x for x in self._files) @property def dirs(self) -> Iterator["DirectoryEntry"]: if self._dirs is None: raise RuntimeError( "Children of this node has not yet been retrieved. " "Please call retrieve_children() before using this property." ) return (x for x in self._dirs) def __str__(self): return f"" def __eq__(self, other): return isinstance(other, DirectoryEntry) and (self.id, self.name) == ( other.id, other.name, ) def __hash__(self): return hash((self.id, self.name)) class FileEntry: def __init__(self, id: bytes, name: bytes): self.id = id self.name = name def __str__(self): return f"" def __eq__(self, other): return isinstance(other, FileEntry) and (self.id, self.name) == ( other.id, other.name, ) def __hash__(self): return hash((self.id, self.name)) diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py index 3317873..623c86a 100644 --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -1,122 +1,123 @@ from datetime import datetime, timezone from itertools import islice import logging import time from typing import Iterable, Iterator, List, Optional, Tuple import iso8601 from swh.model.hashutil import hash_to_hex from .archive import ArchiveInterface from .model import OriginEntry, RevisionEntry from .provenance import ProvenanceInterface class CSVOriginIterator: """Iterator over origin visit statuses typically present in the given CSV file. The input is an iterator that produces 3 elements per row: (url, date, snap) where: - url: is the origin url of the visit - date: is the date of the visit - snap: sha1_git of the snapshot pointed by the visit status """ def __init__( self, statuses: Iterable[Tuple[str, datetime, bytes]], limit: Optional[int] = None, ): self.statuses: Iterator[Tuple[str, datetime, bytes]] if limit is not None: self.statuses = islice(statuses, limit) else: self.statuses = iter(statuses) def __iter__(self): for url, date, snap in self.statuses: date = iso8601.parse_date(date, default_timezone=timezone.utc) yield OriginEntry(url, date, snap) def origin_add( provenance: ProvenanceInterface, archive: ArchiveInterface, origins: List[OriginEntry], ) -> None: start = time.time() for origin in origins: origin.retrieve_revisions(archive) for revision in origin.revisions: origin_add_revision(provenance, archive, origin, revision) done = time.time() provenance.commit() stop = time.time() logging.debug( "Origins " ";".join( [origin.url + ":" + hash_to_hex(origin.snapshot) for origin in origins] ) + f" were processed in {stop - start} secs (commit took {stop - done} secs)!" ) def origin_add_revision( provenance: ProvenanceInterface, archive: ArchiveInterface, origin: OriginEntry, revision: RevisionEntry, ) -> None: stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] origin.id = provenance.origin_get_id(origin) while stack: relative, current = stack.pop() # Check if current revision has no preferred origin and update if necessary. preferred = provenance.revision_get_preferred_origin(current) if preferred is None: provenance.revision_set_preferred_origin(origin, current) ######################################################################## if relative is None: # This revision is pointed directly by the origin. visited = provenance.revision_visited(current) provenance.revision_add_to_origin(origin, current) if not visited: stack.append((current, current)) else: # This revision is a parent of another one in the history of the # relative revision. - for parent in current.parents(archive): + current.retrieve_parents(archive) + for parent in current.parents: visited = provenance.revision_visited(parent) if not visited: # The parent revision has never been seen before pointing # directly to an origin. known = provenance.revision_in_history(parent) if known: # The parent revision is already known in some other # revision's history. We should point it directly to # the origin and (eventually) walk its history. stack.append((None, parent)) else: # The parent revision was never seen before. We should # walk its history and associate it with the same # relative revision. provenance.revision_add_before_revision(relative, parent) stack.append((relative, parent)) else: # The parent revision already points to an origin, so its # history was properly processed before. We just need to # make sure it points to the current origin as well. provenance.revision_add_to_origin(origin, parent)