diff --git a/swh/provenance/model.py b/swh/provenance/model.py index 38253f9..cfe6c1f 100644 --- a/swh/provenance/model.py +++ b/swh/provenance/model.py @@ -1,144 +1,147 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime from typing import Iterable, Iterator, List, Optional from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import origin_identifier from swh.model.model import Sha1Git from .archive import ArchiveInterface class OriginEntry: def __init__(self, url: str, snapshot: Sha1Git): self.url = url self.id: Sha1Git = hash_to_bytes(origin_identifier({"url": self.url})) self.snapshot = snapshot self._revisions: Optional[List[RevisionEntry]] = None def retrieve_revisions(self, archive: ArchiveInterface): if self._revisions is None: self._revisions = [ RevisionEntry(rev) for rev in archive.snapshot_get_heads(self.snapshot) ] @property def revisions(self) -> Iterator["RevisionEntry"]: if self._revisions is None: raise RuntimeError( "Revisions of this node has not yet been retrieved. " "Please call retrieve_revisions() before using this property." ) return (x for x in self._revisions) def __str__(self): return f"" class RevisionEntry: def __init__( self, id: Sha1Git, date: Optional[datetime] = None, root: Optional[Sha1Git] = None, parents: Optional[Iterable[Sha1Git]] = None, ): self.id = id self.date = date assert self.date is None or self.date.tzinfo is not None self.root = root self._parents_ids = parents self._parents_entries: Optional[List[RevisionEntry]] = None def retrieve_parents(self, archive: ArchiveInterface): if self._parents_entries is None: if self._parents_ids is None: self._parents_ids = archive.revision_get_parents(self.id) self._parents_entries = [RevisionEntry(id) for id in self._parents_ids] @property def parents(self) -> Iterator["RevisionEntry"]: if self._parents_entries is None: raise RuntimeError( "Parents of this node has not yet been retrieved. " "Please call retrieve_parents() before using this property." ) return (x for x in self._parents_entries) def __str__(self): - return ( - f"" - ) + return f"" + + def __eq__(self, other): + return isinstance(other, RevisionEntry) and self.id == other.id + + def __hash__(self): + return hash(self.id) class DirectoryEntry: def __init__(self, id: Sha1Git, name: bytes = b""): self.id = id self.name = name self._files: Optional[List[FileEntry]] = None self._dirs: Optional[List[DirectoryEntry]] = None def retrieve_children(self, archive: ArchiveInterface): if self._files is None and self._dirs is None: self._files = [] self._dirs = [] for child in archive.directory_ls(self.id): if child["type"] == "dir": self._dirs.append( DirectoryEntry(child["target"], name=child["name"]) ) elif child["type"] == "file": self._files.append(FileEntry(child["target"], child["name"])) @property def files(self) -> Iterator["FileEntry"]: if self._files is None: raise RuntimeError( "Children of this node has not yet been retrieved. " "Please call retrieve_children() before using this property." ) return (x for x in self._files) @property def dirs(self) -> Iterator["DirectoryEntry"]: if self._dirs is None: raise RuntimeError( "Children of this node has not yet been retrieved. " "Please call retrieve_children() before using this property." ) return (x for x in self._dirs) def __str__(self): return f"" def __eq__(self, other): return isinstance(other, DirectoryEntry) and (self.id, self.name) == ( other.id, other.name, ) def __hash__(self): return hash((self.id, self.name)) class FileEntry: def __init__(self, id: Sha1Git, name: bytes): self.id = id self.name = name def __str__(self): return f"" def __eq__(self, other): return isinstance(other, FileEntry) and (self.id, self.name) == ( other.id, other.name, ) def __hash__(self): return hash((self.id, self.name)) diff --git a/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml new file mode 100644 index 0000000..a84e73b --- /dev/null +++ b/swh/provenance/tests/data/history_graphs_with-merges_visits-01.yaml @@ -0,0 +1,55 @@ +# History graph for snapshot with branches: R01 +- origin: "https://with-merges" + snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" + graphs: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + parents: + - rev: "c0d8929936631ecbcf9147be6b8aa13b13b014e4" +# History graph for snapshot with branches: R03 and R06 +- origin: "https://with-merges" + snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" + graphs: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + parents: + - rev: "1c533587277731236616cac0d44f3b46c1da0f8a" + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + visited: True + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + parents: + - rev: "1444db96cbd8cd791abe83527becee73d3c64e86" + visited: True +# History graph for snapshot with branches: R05 and R06 +- origin: "https://with-merges" + snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" + graphs: + - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" + parents: + - rev: "0d66eadcc15e0d7f6cfd4289329a7749a1309982" + parents: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + visited: True + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + visited: True +# History graph for snapshot with branches: R06 and R07 +- origin: "https://with-merges" + snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" + graphs: + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + visited: True + - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" + parents: + - rev: "20f4da0f48609d9f7f908ebbcac3b3741a0f25cb" + visited: True +# History graph for snapshot with branches: R08 +- origin: "https://with-merges" + snapshot: "e2520f0dbf34c92754f00c5a60241dfa7d612868" + graphs: + - rev: "7c8f29237dded4f9d265e46ec7066503e7858e87" + parents: + - rev: "65e58853df939b318c106c4c1f55acaf8b41c74c" + visited: True + - rev: "72d92d41a9095db2dd6b8fb1c62d92c8251753ff" + visited: True + - rev: "fff0089fad98e8f5b46ec5c9025a20a602851ba6" + visited: True diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py new file mode 100644 index 0000000..3dc0037 --- /dev/null +++ b/swh/provenance/tests/test_history_graph.py @@ -0,0 +1,62 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest +import yaml + +from swh.model.hashutil import hash_to_bytes +from swh.provenance.graph import HistoryNode, build_history_graph +from swh.provenance.model import OriginEntry, RevisionEntry +from swh.provenance.origin import origin_add_revision +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data + + +def history_graph_from_dict(d) -> HistoryNode: + """Takes a dictionary representing a tree of HistoryNode objects, and + recursively builds the corresponding graph.""" + node = HistoryNode( + entry=RevisionEntry(hash_to_bytes(d["rev"])), + visited=d.get("visited", False), + in_history=d.get("in_history", False), + ) + node.parents = set( + history_graph_from_dict(parent) for parent in d.get("parents", []) + ) + return node + + +@pytest.mark.parametrize( + "repo, visit", + (("with-merges", "visits-01"),), +) +@pytest.mark.parametrize("batch", (True, False)) +def test_history_graph(provenance, swh_storage, archive, repo, visit, batch): + # read data/README.md for more details on how these datasets are generated + data = load_repo_data(repo) + fill_storage(swh_storage, data) + + filename = f"history_graphs_{repo}_{visit}.yaml" + + with open(get_datafile(filename)) as file: + for expected in yaml.full_load(file): + entry = OriginEntry(expected["origin"], hash_to_bytes(expected["snapshot"])) + provenance.origin_add(entry) + + for graph_as_dict in expected["graphs"]: + expected_graph = history_graph_from_dict(graph_as_dict) + print("Expected graph:", expected_graph) + + computed_graph = build_history_graph( + archive, + provenance, + RevisionEntry(hash_to_bytes(graph_as_dict["rev"])), + ) + print("Computed graph:", computed_graph) + assert computed_graph == expected_graph + + origin_add_revision(provenance, entry, computed_graph) + + if not batch: + provenance.commit()