diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -137,7 +137,7 @@
     revisions_provider = (
         line.strip().split(",") for line in open(filename, "r") if line.strip()
     )
-    revisions = CSVRevisionIterator(revisions_provider, archive, limit=limit)
+    revisions = CSVRevisionIterator(revisions_provider, limit=limit)
 
     for revision in revisions:
         revision_add(
@@ -157,14 +157,18 @@
 def iter_origins(ctx, filename, limit):
     """Process a provided list of origins."""
     from . import get_archive, get_provenance
-    from .origin import FileOriginIterator
+    from .origin import CSVOriginIterator
     from .provenance import origin_add
 
     archive = get_archive(**ctx.obj["config"]["archive"])
     provenance = get_provenance(**ctx.obj["config"]["provenance"])
+    origins_provider = (
+        line.strip().split(",") for line in open(filename, "r") if line.strip()
+    )
+    origins = CSVOriginIterator(origins_provider, limit=limit)
 
-    for origin in FileOriginIterator(filename, archive, limit=limit):
-        origin_add(archive, provenance, origin)
+    for origin in origins:
+        origin_add(provenance, archive, [origin])
 
 
 @cli.command(name="find-first")
diff --git a/swh/provenance/model.py b/swh/provenance/model.py
--- a/swh/provenance/model.py
+++ b/swh/provenance/model.py
@@ -6,14 +6,57 @@
 from datetime import datetime
 from typing import Iterable, Iterator, List, Optional
 
+from swh.model.model import ObjectType, TargetType
+
 from .archive import ArchiveInterface
 
 
 class OriginEntry:
-    def __init__(self, url, revisions: Iterable["RevisionEntry"], id=None):
-        self.id = id
+    def __init__(
+        self, url: str, date: datetime, snapshot: bytes, id: Optional[int] = None
+    ):
         self.url = url
-        self.revisions = revisions
+        self.date = date
+        self.snapshot = snapshot
+        self.id = id
+        self._revisions: Optional[List[RevisionEntry]] = None
+
+    def revisions(self, archive: ArchiveInterface):
+        if self._revisions is None:
+            snapshot = archive.snapshot_get_all_branches(self.snapshot)
+            assert snapshot is not None
+            targets_set = set()
+            releases_set = set()
+            if snapshot is not None:
+                for branch in snapshot.branches:
+                    if snapshot.branches[branch].target_type == TargetType.REVISION:
+                        targets_set.add(snapshot.branches[branch].target)
+                    elif snapshot.branches[branch].target_type == TargetType.RELEASE:
+                        releases_set.add(snapshot.branches[branch].target)
+
+            # This is done to keep the query in release_get small, hence avoiding
+            # a timeout.
+            batchsize = 100
+            while releases_set:
+                releases = [
+                    releases_set.pop() for i in range(batchsize) if releases_set
+                ]
+                for release in archive.release_get(releases):
+                    if release is not None:
+                        if release.target_type == ObjectType.REVISION:
+                            targets_set.add(release.target)
+
+            # This is done to keep the query in revision_get small, hence avoiding
+            # a timeout.
+            revisions = set()
+            while targets_set:
+                targets = [targets_set.pop() for i in range(batchsize) if targets_set]
+                for revision in archive.revision_get(targets):
+                    if revision is not None:
+                        revisions.add(RevisionEntry(revision.id))
+            self._revisions = list(revisions)
+
+        yield from self._revisions
 
 
 class RevisionEntry:
@@ -35,7 +78,7 @@
         if self._parents is None:
             revision = archive.revision_get([self.id])
             if revision:
-                self._parents = revision[0].parents
+                self._parents = list(revision)[0].parents
             if self._parents and not self._nodes:
                 self._nodes = [
                     RevisionEntry(
@@ -93,6 +136,15 @@
     def __str__(self):
         return f"<DirectoryEntry: id={self.id.hex()}, name={self.name!r}>"
 
+    def __eq__(self, other):
+        return isinstance(other, DirectoryEntry) and (self.id, self.name) == (
+            other.id,
+            other.name,
+        )
+
+    def __hash__(self):
+        return hash((self.id, self.name))
+
 
 class FileEntry:
     def __init__(self, id: bytes, name: bytes):
@@ -101,3 +153,12 @@
     def __str__(self):
         return f"<FileEntry: id={self.id.hex()}, name={self.name!r}>"
+
+    def __eq__(self, other):
+        return isinstance(other, FileEntry) and (self.id, self.name) == (
+            other.id,
+            other.name,
+        )
+
+    def __hash__(self):
+        return hash((self.id, self.name))
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -1,91 +1,44 @@
-from typing import List, Optional
+from datetime import datetime, timezone
+from itertools import islice
+from typing import Iterable, Iterator, Optional, Tuple
 
-from swh.model.model import ObjectType, Origin, TargetType
+import iso8601
 
-from .archive import ArchiveInterface
-from .model import OriginEntry, RevisionEntry
+from .model import OriginEntry
 
 ################################################################################
 ################################################################################
 
 
-class FileOriginIterator:
-    """Iterator over origins present in the given CSV file."""
+class CSVOriginIterator:
+    """Iterator over origin visit statuses, typically read from a given CSV
+    file.
 
-    def __init__(
-        self, filename: str, archive: ArchiveInterface, limit: Optional[int] = None
-    ):
-        self.file = open(filename)
-        self.limit = limit
-        self.archive = archive
-
-    def __iter__(self):
-        yield from iterate_statuses(
-            [Origin(url.strip()) for url in self.file], self.archive, self.limit
-        )
+    The input is an iterator that produces 3 elements per row:
+
+      (url, date, snap)
 
-class ArchiveOriginIterator:
-    """Iterator over origins present in the given storage."""
+    where:
+    - url: is the origin url of the visit
+    - date: is the date of the visit
+    - snap: sha1_git of the snapshot pointed to by the visit status
+    """
 
-    def __init__(self, archive: ArchiveInterface, limit: Optional[int] = None):
-        self.limit = limit
-        self.archive = archive
+    def __init__(
+        self,
+        statuses: Iterable[Tuple[str, datetime, bytes]],
+        limit: Optional[int] = None,
+    ):
+        self.statuses: Iterator[Tuple[str, datetime, bytes]]
+        if limit is not None:
+            self.statuses = islice(statuses, limit)
+        else:
+            self.statuses = iter(statuses)
 
     def __iter__(self):
-        yield from iterate_statuses(
-            self.archive.iter_origins(), self.archive, self.limit
-        )
-
-
-def iterate_statuses(
-    origins: List[Origin], archive: ArchiveInterface, limit: Optional[int] = None
-):
-    idx = 0
-    for origin in origins:
-        for visit in archive.iter_origin_visits(origin.url):
-            for status in archive.iter_origin_visit_statuses(origin.url, visit.visit):
-                snapshot = archive.snapshot_get_all_branches(status.snapshot)
-                if snapshot is None:
-                    continue
-                # TODO: may filter only those whose status is 'full'??
-                targets_set = set()
-                releases_set = set()
-                if snapshot is not None:
-                    for branch in snapshot.branches:
-                        if snapshot.branches[branch].target_type == TargetType.REVISION:
-                            targets_set.add(snapshot.branches[branch].target)
-                        elif (
-                            snapshot.branches[branch].target_type == TargetType.RELEASE
-                        ):
-                            releases_set.add(snapshot.branches[branch].target)
-
-                # This is done to keep the query in release_get small, hence avoiding
-                # a timeout.
-                batchsize = 100
-                while releases_set:
-                    releases = [
-                        releases_set.pop() for i in range(batchsize) if releases_set
-                    ]
-                    for release in archive.release_get(releases):
-                        if release is not None:
-                            if release.target_type == ObjectType.REVISION:
-                                targets_set.add(release.target)
-
-                # This is done to keep the query in revision_get small, hence avoiding
-                # a timeout.
-                revisions = set()
-                while targets_set:
-                    targets = [
-                        targets_set.pop() for i in range(batchsize) if targets_set
-                    ]
-                    for revision in archive.revision_get(targets):
-                        if revision is not None:
-                            revisions.add(RevisionEntry(revision.id))
-                        # target_set |= set(revision.parents)
-
-                yield OriginEntry(status.origin, list(revisions))
+        return self
 
-                idx += 1
-                if idx == limit:
-                    return
+    def __next__(self):
+        url, date, snap = next(self.statuses)
+        date = iso8601.parse_date(date, default_timezone=timezone.utc)
+        return OriginEntry(url, date, snap)
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -133,20 +133,29 @@
 def origin_add(
-    archive: ArchiveInterface, provenance: ProvenanceInterface, origin: OriginEntry
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    origins: List[OriginEntry],
 ) -> None:
-    # TODO: refactor to iterate over origin visit statuses and commit only once
-    # per status.
-    origin.id = provenance.origin_get_id(origin)
-    for revision in origin.revisions:
-        origin_add_revision(archive, provenance, origin, revision)
-        # Commit after each revision
-        provenance.commit()  # TODO: verify this!
+    start = time.time()
+    for origin in origins:
+        for revision in origin.revisions(archive):
+            origin_add_revision(provenance, archive, origin, revision)
+    done = time.time()
+    provenance.commit()
+    stop = time.time()
+    logging.debug(
+        "Origins "
+        + ";".join(
+            [origin.url + ":" + hash_to_hex(origin.snapshot) for origin in origins]
+        )
+        + f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
+    )
 
 
 def origin_add_revision(
-    archive: ArchiveInterface,
     provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
     origin: OriginEntry,
     revision: RevisionEntry,
 ) -> None:
@@ -278,12 +287,6 @@
         self.path = os.path.join(prefix, self.entry.name) if prefix else self.entry.name
         self.children: List[IsochroneNode] = []
 
-    def __str__(self):
-        return (
-            f"<{self.entry.__class__.__name__}[{self.entry.name}]: "
-            f"known={self.known}, maxdate={self.maxdate}, dbdate={self.dbdate}>"
-        )
-
     @property
     def dbdate(self):
         # use a property to make this attribute (mostly) read-only
@@ -304,6 +307,36 @@
         self.children.append(node)
         return node
 
+    def __str__(self):
+        return (
+            f"<{self.entry}: "
+            f"known={self.known}, maxdate={self.maxdate}, "
+            f"dbdate={self.dbdate}, path={self.path}, "
+            f"children=[{', '.join(str(child) for child in self.children)}]>"
+        )
+
+    def __eq__(self, other):
+        if not isinstance(other, IsochroneNode):
+            return False
+        sameDbDate = (
+            self._dbdate is None and other._dbdate is None
+        ) or self._dbdate == other._dbdate
+        sameMaxdate = (
+            self.maxdate is None and other.maxdate is None
+        ) or self.maxdate == other.maxdate
+        return (
+            (self.entry, self.depth, self.known, self.path)
+            == (other.entry, other.depth, other.known, other.path)
+            and sameDbDate
+            and sameMaxdate
+            and set(self.children) == set(other.children)
+        )
+
+    def __hash__(self):
+        return hash(
+            (self.entry, self.depth, self._dbdate, self.maxdate, self.known, self.path)
+        )
+
 
 def build_isochrone_graph(
     archive: ArchiveInterface,
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -1,12 +1,10 @@
 from datetime import datetime, timezone
 from itertools import islice
-import threading
 from typing import Iterable, Iterator, Optional, Tuple
 
 import iso8601
 
 from swh.model.hashutil import hash_to_bytes
-from swh.provenance.archive import ArchiveInterface
 from swh.provenance.model import RevisionEntry
 
 ########################################################################################
 ########################################################################################
@@ -29,7 +27,6 @@
     def __init__(
         self,
         revisions: Iterable[Tuple[bytes, datetime, bytes]],
-        archive: ArchiveInterface,
         limit: Optional[int] = None,
     ):
         self.revisions: Iterator[Tuple[bytes, datetime, bytes]]
@@ -37,20 +34,17 @@
             self.revisions = islice(revisions, limit)
         else:
             self.revisions = iter(revisions)
-        self.mutex = threading.Lock()
-        self.archive = archive
 
     def __iter__(self):
         return self
 
     def __next__(self):
-        with self.mutex:
-            id, date, root = next(self.revisions)
-            date = iso8601.parse_date(date)
-            if date.tzinfo is None:
-                date = date.replace(tzinfo=timezone.utc)
-            return RevisionEntry(
-                hash_to_bytes(id),
-                date=date,
-                root=hash_to_bytes(root),
-            )
+        id, date, root = next(self.revisions)
+        date = iso8601.parse_date(date)
+        if date.tzinfo is None:
+            date = date.replace(tzinfo=timezone.utc)
+        return RevisionEntry(
+            hash_to_bytes(id),
+            date=date,
+            root=hash_to_bytes(root),
+        )
diff --git a/swh/provenance/tests/data/graphs_cmdbts2_lower_1.yaml b/swh/provenance/tests/data/graphs_cmdbts2_lower_1.yaml
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/data/graphs_cmdbts2_lower_1.yaml
@@ -0,0 +1,476 @@
+# Isochrone graph for R00
+c0d8929936631ecbcf9147be6b8aa13b13b014e4:
+  entry:
+    id: "a4cb5e6b2831f7e8eef0e6e08e43d642c97303a1"
+    name: ""
+  dbdate: null
+  maxdate: 1000000000.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "1c8d9fd9afa7e5a2cf52a3db6f05dc5c3a1ca86b"
+        name: "A"
+      dbdate: null
+      maxdate: 1000000000.0
+      known: False
+      path: "A"
+      children:
+        - entry:
+            id: "36876d475197b5ad86ad592e8e28818171455f16"
+            name: "B"
+          dbdate: null
+          maxdate: 1000000000.0
+          known: False
+          path: "A/B"
+          children:
+            - entry:
+                id: "98f7a4a23d8df1fb1a5055facae2aff9b2d0a8b3"
+                name: "C"
+              dbdate: null
+              maxdate: 1000000000.0
+              known: False
+              path: "A/B/C"
+              children: []
+# Isochrone graph for R01
+1444db96cbd8cd791abe83527becee73d3c64e86:
+  entry:
+    id: "b3cf11b22c9f93c3c494cf90ab072f394155072d"
+    name: ""
+  dbdate: null
+  maxdate: 1000000010.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "baca735bf8b8720131b4bfdb47c51631a9260348"
+        name: "A"
+      dbdate: null
+      maxdate: 1000000010.0
+      known: False
+      path: "A"
+      children:
+        - entry:
+            id: "4b28979d88ed209a09c272bcc80f69d9b18339c2"
+            name: "B"
+          dbdate: null
+          maxdate: 1000000010.0
+          known: False
+          path: "A/B"
+          children:
+            - entry:
+                id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf"
+                name: "C"
+              dbdate: null
+              maxdate: 1000000010.0
+              known: False
+              path: "A/B/C"
+              children: []
+# Isochrone graph for R02
+0d45f1ee524db8f6f0b5a267afac4e733b4b2cee:
+  entry:
+    id: "195601c98c28f04e0d19c218434738006990db72"
+    name: ""
+  dbdate: null
+  maxdate: 1000000020.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "d591b308488541aabffd854eae85a9bf83a9d9f5"
+        name: "A"
+      dbdate: null
+      maxdate: 1000000020.0
+      known: False
+      path: "A"
+      children:
+        - entry:
+            id: "0e540a8ebea2f5de3e62b92e2139902cf6f46e92"
+            name: "B"
+          dbdate: null
+          maxdate: 1000000020.0
+          known: False
+          path: "A/B"
+          children:
+            - entry:
+                id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf"
+                name: "C"
+              dbdate: null
+              maxdate: 1000000010.0
+              known: True
+              path: "A/B/C"
+              children: []
+# Isochrone graph for R03
+540bd6155a3c50cc47b2e6f43aeaace67a696d1d:
+  entry:
+    id: "cea28838ec1fb757e44b724fe1365d64c6a94e24"
+    name: ""
+  dbdate: null
+  maxdate: 1000000010.0
+  known: True
+  path: ""
+  children:
+    - entry:
+        id: "48007c961cc734d1f63886d0413a6dc605e3e2ea"
+        name: "A"
+      dbdate: null
+      maxdate: 1000000010.0
+      known: True
+      path: "A"
+      children:
+        - entry:
+            id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf"
+            name: "C"
+          dbdate: 1000000010.0
+          maxdate: 1000000010.0
+          known: True
+          path: "A/C"
+          children: []
+# Isochrone graph for R04
+17ed10db0612c9b46ba340943cb6b48b25431419:
+  entry:
+    id: "195601c98c28f04e0d19c218434738006990db72"
+    name: ""
+  dbdate: null
+  maxdate: 1000000020.0
+  known: True
+  path: ""
+  children:
+    - entry:
+        id: "d591b308488541aabffd854eae85a9bf83a9d9f5"
+        name: "A"
+      dbdate: null
+      maxdate: 1000000020.0
+      known: True
+      path: "A"
+      children:
+        - entry:
+            id: "0e540a8ebea2f5de3e62b92e2139902cf6f46e92"
+            name: "B"
+          dbdate: null
+          maxdate: 1000000020.0
+          known: True
+          path: "A/B"
+          children:
+            - entry:
+                id: "c9cabe7f49012e3fdef6ac6b929efb5654f583cf"
+                name: "C"
+              dbdate: 1000000010.0
+              maxdate: 1000000010.0
+              known: True
+              path: "A/B/C"
+              children: []
+# Isochrone graph for R05
+c8bef45193355db33d64f375b4a4e4f23ac2a4f6:
+  entry:
+    id: "fa63f03d67d1a15563afe9f8ba97832dfb20f42a"
+    name: ""
+  dbdate: null
+  maxdate: 1000000050.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e"
+        name: "D"
+      dbdate: null
+      maxdate: 1000000050.0
+      known: False
+      path: "D"
+      children: []
+# Isochrone graph for R06
+f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c:
+  entry:
+    id: "c86d2f588234098642ef6f33ca662a6a9de865bc"
+    name: ""
+  dbdate: null
+  maxdate: 1000000050.0
+  known: True
+  path: ""
+  children:
+    - entry:
+        id: "8a3993f4efa9385ce993775cab5ec4dc2c78d7f6"
+        name: "D"
+      dbdate: null
+      maxdate: 1000000050.0
+      known: True
+      path: "D"
+      children:
+        - entry:
+            id: "fa63f03d67d1a15563afe9f8ba97832dfb20f42a"
+            name: "E"
+          dbdate: null
+          maxdate: 1000000050.0
+          known: True
+          path: "D/E"
+          children:
+            - entry:
+                id: "12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e"
+                name: "D"
+              dbdate: null
+              maxdate: 1000000050.0
+              known: True
+              path: "D/E/D"
+              children: []
+# Isochrone graph for R07
+91ed6a03c80b61e0d63d328f7a4325230e7a0237:
+  entry:
+    id: "641baf6738fa5ebb3c5eb39af45f62ff52f8cc62"
+    name: ""
+  dbdate: null
+  maxdate: 1000000050.0
+  known: True
+  path: ""
+  children:
+    - entry:
+        id: "b0ae56ed5ca7daa34fd7a91a28db443ab3c389a0"
+        name: "F"
+      dbdate: null
+      maxdate: 1000000050.0
+      known: True
+      path: "F"
+      children:
+        - entry:
+            id: "fa63f03d67d1a15563afe9f8ba97832dfb20f42a"
+            name: "E"
+          dbdate: null
+          maxdate: 1000000050.0
+          known: True
+          path: "F/E"
+          children:
+            - entry:
+                id: "12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e"
+                name: "D"
+              dbdate: 1000000050.0
+              maxdate: 1000000050.0
+              known: True
+              path: "F/E/D"
+              children: []
+# Isochrone graph for R08
+a97e5c8a626510eefaa637091924cf800b1e8b06:
+  entry:
+    id: "79e219827e12f40e7146cc6834ee04b617a8073a"
+    name: ""
+  dbdate: null
+  maxdate: 1000000050.0
+  known: True
+  path: ""
+  children:
+    - entry:
+        id: "9a7b5762e20b11735b93a635cda451c75bd31270"
+        name: "F"
+      dbdate: null
+      maxdate: 1000000050.0
+      known: True
+      path: "F"
+      children:
+        - entry:
+            id: "81b84d8fd8ceebd47f51896d19ce1aa286629225"
+            name: "E"
+          dbdate: null
+          maxdate: 1000000050.0
+          known: True
+          path: "F/E"
+          children:
+            - entry:
+                id: "cb211f2d9dfee6c3968837a07960afd6ab09506c"
+                name: "D"
+              dbdate: null
+              maxdate: 1000000050.0
+              known: True
+              path: "F/E/D"
+              children: []
+# Isochrone graph for R09
+3c5ad6be812b182ee2a01e84884b8ab7d384a4a0:
+  entry:
+    id: "53a71b331248f2144f4f012fd7e05f86b8ee62a0"
+    name: ""
+  dbdate: null
+  maxdate: 1000000090.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "16cb311fc491b0b6dfade153191ee1c09d2152cf"
+        name: "F"
+      dbdate: null
+      maxdate: 1000000090.0
+      known: False
+      path: "F"
+      children:
+        - entry:
+            id: "8b4df27934ce48db6f4bdf326b3bce89d4571252"
+            name: "E"
+          dbdate: null
+          maxdate: 1000000090.0
+          known: False
+          path: "F/E"
+          children:
+            - entry:
+                id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78"
+                name: "D"
+              dbdate: null
+              maxdate: 1000000090.0
+              known: False
+              path: "F/E/D"
+              children: []
+# Isochrone graph for R10
+b7c52e28d441ca0cb736fdbe49e39eae3847ad0f:
+  entry:
+    id: "8c61bb233c89936b310d8b269a35c24bff432227"
+    name: ""
+  dbdate: null
+  maxdate: 1000000100.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "db2b00211f77c6c7f1f742020e483b506b82b5d6"
+        name: "F"
+      dbdate: null
+      maxdate: 1000000100.0
+      known: False
+      path: "F"
+      children:
+        - entry:
+            id: "8b4df27934ce48db6f4bdf326b3bce89d4571252"
+            name: "E"
+          dbdate: null
+          maxdate: 1000000090.0
+          known: True
+          path: "F/E"
+          children:
+            - entry:
+                id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78"
+                name: "D"
+              dbdate: null
+              maxdate: 1000000090.0
+              known: True
+              path: "F/E/D"
+              children: []
+# Isochrone graph for R11
+f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2:
+  entry:
+    id: "b29a1c3fee0057016af424c41d58a8811b8c3a41"
+    name: ""
+  dbdate: null
+  maxdate: 1000000110.0
+  known: False
+  path: ""
+  children:
+    - entry:
+        id: "74fb9789d162f02deabbdfbc3c8daa97f31559a1"
+        name: "G"
+      dbdate: null
+      maxdate: 1000000110.0
+      known: False
+      path: "G"
+      children:
+        - entry:
+            id: "8b4df27934ce48db6f4bdf326b3bce89d4571252"
+            name: "E"
+          dbdate: null
+          maxdate: 1000000090.0
+          known: True
+          path: "G/E"
"G/E" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/E/D" + children: [] +# Isochrone graph for R12 +99bd98e1803343ecfabe4b05d0218475c2b1bf74: + entry: + id: "6b2d11dd7bc6c7d7dcf59afed80f57413d929cf5" + name: "" + dbdate: null + maxdate: 1000000120.0 + known: False + path: "" + children: + - entry: + id: "5aa1d185e7e32bb53a16ba0db1b06d3a6243b36f" + name: "G" + dbdate: null + maxdate: 1000000120.0 + known: False + path: "G" + children: + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "H" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/H" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/H/D" + children: [] +# Isochrone graph for R13 +10287882c7ed1b7c96f43da269e6a868b98291ff: + entry: + id: "148f08e057416af1e471abb3dcd594d27233085d" + name: "" + dbdate: null + maxdate: 1000000130.0 + known: False + path: "" + children: + - entry: + id: "8084b999790aab88e5119915ea1083e747a3f42f" + name: "G" + dbdate: null + maxdate: 1000000130.0 + known: False + path: "G" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/D" + children: [] + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "I" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/I" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/I/D" + children: [] + - entry: + id: "8b4df27934ce48db6f4bdf326b3bce89d4571252" + name: "H" + dbdate: null + maxdate: 1000000090.0 + known: True + path: "G/H" + children: + - entry: + id: "2cb3ae467165716d1d0e7fa85190d753c3b76d78" + name: "D" + dbdate: 1000000090.0 + maxdate: 1000000090.0 + known: True + path: "G/H/D" + children: [] diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/test_isochrone_graph.py @@ -0,0 +1,88 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime, timezone +import pytest +import yaml + +from swh.model.hashutil import hash_to_bytes +from swh.provenance.model import DirectoryEntry, RevisionEntry +from swh.provenance.provenance import IsochroneNode, build_isochrone_graph, revision_add +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data +from swh.provenance.tests.test_provenance_db import ts2dt + + +def isochrone_graph_from_dict(d, depth=0) -> IsochroneNode: + """Takes a dictionary representing a tree of IsochroneNode objects, and + recursively builds the corresponding graph.""" + d = d.copy() + + d["entry"]["id"] = hash_to_bytes(d["entry"]["id"]) + d["entry"]["name"] = bytes(d["entry"]["name"], encoding="utf-8") + + if d["dbdate"] is not None: + d["dbdate"] = datetime.fromtimestamp(d["dbdate"], timezone.utc) + + if d["maxdate"] is not None: + d["maxdate"] = datetime.fromtimestamp(d["maxdate"], timezone.utc) + + node = IsochroneNode( + entry=DirectoryEntry(**d["entry"]), + dbdate=d["dbdate"], + depth=depth, + ) + node.maxdate = d["maxdate"] 
+    node.known = d["known"]
+    node.path = bytes(d["path"], encoding="utf-8")
+    node.children = [
+        isochrone_graph_from_dict(child, depth=depth + 1) for child in d["children"]
+    ]
+    return node
+
+
+@pytest.mark.parametrize(
+    "repo, lower, mindepth",
+    (
+        ("cmdbts2", True, 1),
+        # ("cmdbts2", False, 1),
+        # ("cmdbts2", True, 2),
+        # ("cmdbts2", False, 2),
+        # ("out-of-order", True, 1),
+    ),
+)
+def test_isochrone_graph(provenance, swh_storage, archive, repo, lower, mindepth):
+    # read data/README.md for more details on how these datasets are generated
+    data = load_repo_data(repo)
+    fill_storage(swh_storage, data)
+
+    revisions = {rev["id"]: rev for rev in data["revision"]}
+    filename = f"graphs_{repo}_{'lower' if lower else 'upper'}_{mindepth}.yaml"
+
+    with open(get_datafile(filename)) as file:
+        expected = yaml.full_load(file)
+
+    for rev, graph_as_dict in expected.items():
+        revision = revisions[hash_to_bytes(rev)]
+        entry = RevisionEntry(
+            id=revision["id"],
+            date=ts2dt(revision["date"]),
+            root=revision["directory"],
+        )
+        expected_graph = isochrone_graph_from_dict(graph_as_dict)
+        print("Expected", expected_graph)
+
+        # Create graph for current revision and check it has the expected structure.
+        computed_graph = build_isochrone_graph(
+            archive,
+            provenance,
+            entry,
+            DirectoryEntry(entry.root),
+        )
+        print("Computed", computed_graph)
+        assert computed_graph == expected_graph
+
+        # Add current revision so that provenance info is kept up to date for the
+        # following ones.
+        revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)
diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py
--- a/swh/provenance/tests/test_origin_iterator.py
+++ b/swh/provenance/tests/test_origin_iterator.py
@@ -2,23 +2,36 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
-import pytest
 
 from swh.model.tests.swh_model_data import TEST_OBJECTS
-from swh.provenance.origin import ArchiveOriginIterator
+from swh.provenance.origin import CSVOriginIterator
+from swh.storage.algos.origin import (
+    iter_origins,
+    iter_origin_visits,
+    iter_origin_visit_statuses,
+)
 
 
-def test_archive_direct_origin_iterator(swh_storage_with_objects, archive_direct):
-    """Test ArchiveOriginIterator against the ArchivePostgreSQL"""
-    # XXX
-    pytest.xfail("Iterate Origins is currently unsupported by ArchivePostgreSQL")
-    origins = list(ArchiveOriginIterator(archive_direct))
+def test_origin_iterator(swh_storage_with_objects):
+    """Test CSVOriginIterator"""
+    origins_csv = []
+    for origin in iter_origins(swh_storage_with_objects):
+        for visit in iter_origin_visits(swh_storage_with_objects, origin.url):
+            for status in iter_origin_visit_statuses(
+                swh_storage_with_objects, origin.url, visit.visit
+            ):
+                if status.snapshot is not None:
+                    origins_csv.append(
+                        (status.origin, status.date.isoformat(), status.snapshot)
+                    )
+    origins = list(CSVOriginIterator(origins_csv))
     assert origins
-    assert len(origins) == len(TEST_OBJECTS["origin"])
-
-
-def test_archive_api_origin_iterator(swh_storage_with_objects, archive_api):
-    """Test ArchiveOriginIterator against the ArchiveStorage"""
-    origins = list(ArchiveOriginIterator(archive_api))
-    assert origins
-    assert len(origins) == len(TEST_OBJECTS["origin"])
+    assert len(origins) == len(
+        list(
+            {
+                status.origin
+                for status in TEST_OBJECTS["origin_visit_status"]
+                if status.snapshot is not None
+            }
+        )
+    )
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -21,10 +21,14 @@
 def test_provenance_origin_add(provenance, swh_storage_with_objects):
-    """Test the ProvenanceDB.origin_add() method"""
-    for origin in TEST_OBJECTS["origin"]:
-        entry = OriginEntry(url=origin.url, revisions=[])
-        origin_add(ArchiveStorage(swh_storage_with_objects), provenance, entry)
+    """Test the origin_add function"""
+    archive = ArchiveStorage(swh_storage_with_objects)
+    for status in TEST_OBJECTS["origin_visit_status"]:
+        if status.snapshot is not None:
+            entry = OriginEntry(
+                url=status.origin, date=status.date, snapshot=status.snapshot
+            )
+            origin_add(provenance, archive, [entry])
     # TODO: check some facts here
diff --git a/swh/provenance/tests/test_revision_iterator.py b/swh/provenance/tests/test_revision_iterator.py
--- a/swh/provenance/tests/test_revision_iterator.py
+++ b/swh/provenance/tests/test_revision_iterator.py
@@ -7,13 +7,13 @@
 from swh.provenance.tests.test_provenance_db import ts2dt
 
 
-def test_archive_direct_revision_iterator(storage_and_CMDBTS, archive_direct):
+def test_revision_iterator(storage_and_CMDBTS):
     """Test CSVRevisionIterator"""
-    storage, data = storage_and_CMDBTS
+    _, data = storage_and_CMDBTS
     revisions_csv = [
        (rev["id"], ts2dt(rev["date"]).isoformat(), rev["directory"])
        for rev in data["revision"]
     ]
-    revisions = list(CSVRevisionIterator(revisions_csv, archive_direct))
+    revisions = list(CSVRevisionIterator(revisions_csv))
     assert revisions
     assert len(revisions) == len(data["revision"])
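
For reviewers, a minimal sketch of how the pieces introduced here compose end to end, mirroring the reworked iter-origins command: a CSV of (url, date, snapshot) visit statuses is fed through CSVOriginIterator, and the resulting OriginEntry objects are passed in batches to origin_add. CSVOriginIterator, OriginEntry, and origin_add are used exactly as defined in this patch; the get_archive/get_provenance keyword arguments, the input filename, and the batch size are illustrative assumptions, not part of the change.

    # Usage sketch (assumed config kwargs and batch size; patch APIs otherwise).
    from swh.provenance import get_archive, get_provenance
    from swh.provenance.origin import CSVOriginIterator
    from swh.provenance.provenance import origin_add

    # Hypothetical deployment config; real values come from the CLI's
    # ctx.obj["config"] as shown in cli.py above.
    archive = get_archive(cls="api", storage={"cls": "remote", "url": "http://localhost:5002/"})
    provenance = get_provenance(cls="local", db={"host": "localhost", "dbname": "provenance"})

    # Each row is a "url,date,snapshot" line, matching the CSVOriginIterator docstring.
    statuses = (
        line.strip().split(",") for line in open("origins.csv", "r") if line.strip()
    )

    # origin_add now takes a list of origins and commits once per call, so
    # batching several origins per call amortizes the commit cost (the CLI
    # currently passes a one-element list per origin).
    batch = []
    for origin in CSVOriginIterator(statuses, limit=1000):
        batch.append(origin)
        if len(batch) == 10:
            origin_add(provenance, archive, batch)
            batch = []
    if batch:
        origin_add(provenance, archive, batch)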