diff --git a/requirements-swh-graph.txt b/requirements-swh-graph.txt
new file mode 100644
--- /dev/null
+++ b/requirements-swh-graph.txt
@@ -0,0 +1 @@
+swh.graph >= 0.3.2
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -36,9 +36,6 @@
     return requirements
 
 
-# Edit this part to match your module.
-# Full sample:
-# https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py
 setup(
     name="swh.provenance",
     description="Software Heritage code provenance",
@@ -48,12 +45,15 @@
     author="Software Heritage developers",
     author_email="swh-devel@inria.fr",
     url="https://forge.softwareheritage.org/diffusion/222/",
-    packages=find_packages(),  # packages's modules
+    packages=find_packages(),
     install_requires=parse_requirements() + parse_requirements("swh"),
     tests_require=parse_requirements("test"),
     setup_requires=["setuptools-scm"],
     use_scm_version=True,
-    extras_require={"testing": parse_requirements("test")},
+    extras_require={
+        "testing": parse_requirements("test"),
+        "graph": parse_requirements("swh-graph"),
+    },
     include_package_data=True,
     entry_points="""
         [swh.cli.subcommands]
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -17,7 +17,7 @@
     """Get an archive object of class ``cls`` with arguments ``args``.
 
     Args:
-        cls: archive's class, either 'api' or 'direct'
+        cls: archive's class, either 'api', 'direct' or 'graph'
         args: dictionary of arguments passed to the archive class constructor
 
     Returns:
@@ -33,12 +33,28 @@
         from .storage.archive import ArchiveStorage
 
         return ArchiveStorage(get_storage(**kwargs["storage"]))
+
     elif cls == "direct":
         from swh.core.db import BaseDb
 
         from .postgresql.archive import ArchivePostgreSQL
 
         return ArchivePostgreSQL(BaseDb.connect(**kwargs["db"]).conn)
+
+    elif cls == "graph":
+        # Guard only the imports: a ModuleNotFoundError raised while building
+        # the storage must not be reported as a missing graph module.
+        try:
+            from swh.graph.client import RemoteGraphClient
+
+            from .swhgraph.archive import ArchiveGraph
+        except ModuleNotFoundError as exc:
+            raise EnvironmentError(
+                "Graph configuration required but module is not installed."
+            ) from exc
+
+        graph = RemoteGraphClient(kwargs.get("url"))
+        return ArchiveGraph(graph, get_storage(**kwargs["storage"]))
     else:
         raise ValueError
 
diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py
--- a/swh/provenance/archive.py
+++ b/swh/provenance/archive.py
@@ -47,8 +47,7 @@
             id: sha1 id of the snapshot.
 
         Yields:
-            sha1 ids of revisions that are a target of such snapshot. Revisions are
-            guaranteed to be retrieved in chronological order
+            sha1 ids of revisions that are a target of such snapshot.
 
         """
         ...
 
diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py
--- a/swh/provenance/postgresql/archive.py
+++ b/swh/provenance/postgresql/archive.py
@@ -135,7 +135,7 @@
                       ON (RL.target=RV.id)
                     WHERE B.target_type='release'::snapshot_target
                       AND RL.target_type='revision'::object_type)
-                ORDER BY date, id)
+                )
                 SELECT id FROM heads
                 """,
                 (id,),
diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py
--- a/swh/provenance/storage/archive.py
+++ b/swh/provenance/storage/archive.py
@@ -70,4 +70,4 @@
             if revision is not None and revision.date is not None
         )
 
-        yield from (head for _, head in sorted(revisions))
+        yield from (head for _, head in revisions)
diff --git a/swh/provenance/swhgraph/__init__.py b/swh/provenance/swhgraph/__init__.py
new file mode 100644
diff --git a/swh/provenance/swhgraph/archive.py b/swh/provenance/swhgraph/archive.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/swhgraph/archive.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, Dict, Iterable
+
+from swh.core.statsd import statsd
+from swh.model.model import Sha1Git
+from swh.model.swhids import CoreSWHID, ObjectType
+from swh.storage.interface import StorageInterface
+
+ARCHIVE_DURATION_METRIC = "swh_provenance_archive_graph_duration_seconds"
+
+
+class ArchiveGraph:
+    """Archive implementation answering history queries through a graph client."""
+
+    def __init__(self, graph, storage: StorageInterface) -> None:
+        self.graph = graph
+        self.storage = storage  # required by ArchiveInterface
+
+    @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "directory_ls"})
+    def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]:
+        """Not supported by this implementation: always raises."""
+        raise NotImplementedError
+
+    @statsd.timed(
+        metric=ARCHIVE_DURATION_METRIC, tags={"method": "revision_get_parents"}
+    )
+    def revision_get_parents(self, id: Sha1Git) -> Iterable[Sha1Git]:
+        """Yield the ids of the revisions one ``rev:rev`` edge away from ``id``."""
+        src = CoreSWHID(object_type=ObjectType.REVISION, object_id=id)
+        request = self.graph.neighbors(str(src), edges="rev:rev", return_types="rev")
+
+        yield from (CoreSWHID.from_string(swhid).object_id for swhid in request)
+
+    @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "snapshot_get_heads"})
+    def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
+        """Yield the revision ids visited from snapshot ``id``.
+
+        The visit only follows snapshot-to-revision, snapshot-to-release and
+        release-to-revision edges.
+        """
+        src = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=id)
+        request = self.graph.visit_nodes(
+            str(src), edges="snp:rev,snp:rel,rel:rev", return_types="rev"
+        )
+
+        yield from (CoreSWHID.from_string(swhid).object_id for swhid in request)
diff --git a/swh/provenance/tests/test_archive_interface.py b/swh/provenance/tests/test_archive_interface.py
--- a/swh/provenance/tests/test_archive_interface.py
+++ b/swh/provenance/tests/test_archive_interface.py
@@ -6,56 +6,208 @@
 from collections import Counter
 from operator import itemgetter
 from typing import Counter as TCounter
+from typing import Dict, List, Set, Tuple, Type, Union
 
 import pytest
 
 from swh.core.db import BaseDb
-from swh.model.model import Sha1Git
+from swh.graph.naive_client import NaiveClient
+from swh.model.model import (
+    BaseModel,
+    Content,
+    Directory,
+    DirectoryEntry,
+    Origin,
+    OriginVisit,
+    OriginVisitStatus,
+    Revision,
+    Sha1Git,
+    Snapshot,
+    SnapshotBranch,
+    TargetType,
+)
+from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID
+from swh.provenance.archive import ArchiveInterface
 from swh.provenance.postgresql.archive import ArchivePostgreSQL
 from swh.provenance.storage.archive import ArchiveStorage
+from swh.provenance.swhgraph.archive import ArchiveGraph
 from swh.provenance.tests.conftest import fill_storage, load_repo_data
-from swh.storage.interface import StorageInterface
 from swh.storage.postgresql.storage import Storage
 
 
+def check_directory_ls(
+    reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]]
+) -> None:
+    """Assert both archives list the same entries for every directory in data."""
+    for directory in data["directory"]:
+        entries_ref = sorted(
+            reference.directory_ls(directory["id"]), key=itemgetter("name")
+        )
+        entries = sorted(archive.directory_ls(directory["id"]), key=itemgetter("name"))
+        assert entries_ref == entries
+
+
+def check_revision_get_parents(
+    reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]]
+) -> None:
+    """Assert both archives report the same parents for every revision in data."""
+    for revision in data["revision"]:
+        parents_ref: TCounter[Sha1Git] = Counter(
+            reference.revision_get_parents(revision["id"])
+        )
+        parents: TCounter[Sha1Git] = Counter(
+            archive.revision_get_parents(revision["id"])
+        )
+        assert parents_ref == parents
+
+
+def check_snapshot_get_heads(
+    reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]]
+) -> None:
+    """Assert both archives report the same heads for every snapshot in data."""
+    for snapshot in data["snapshot"]:
+        heads_ref: TCounter[Sha1Git] = Counter(
+            reference.snapshot_get_heads(snapshot["id"])
+        )
+        heads: TCounter[Sha1Git] = Counter(archive.snapshot_get_heads(snapshot["id"]))
+        assert heads_ref == heads
+
+
+def get_object_class(object_type: str) -> Type[BaseModel]:
+    """Return the model class for ``object_type``; raise ValueError if unknown."""
+    if object_type == "origin":
+        return Origin
+    elif object_type == "origin_visit":
+        return OriginVisit
+    elif object_type == "origin_visit_status":
+        return OriginVisitStatus
+    elif object_type == "content":
+        return Content
+    elif object_type == "directory":
+        return Directory
+    elif object_type == "revision":
+        return Revision
+    elif object_type == "snapshot":
+        return Snapshot
+    raise ValueError(f"unknown object type: {object_type}")
+
+
+def data_to_model(data: Dict[str, List[dict]]) -> Dict[str, List[BaseModel]]:
+    """Convert each raw dict of ``data`` into its model object, keyed by type."""
+    model: Dict[str, List[BaseModel]] = {}
+    for object_type, objects in data.items():
+        for obj in objects:
+            model.setdefault(object_type, []).append(
+                get_object_class(object_type).from_dict(obj)
+            )
+    return model
+
+
+def add_link(
+    edges: Set[Tuple[Union[CoreSWHID, ExtendedSWHID], Union[CoreSWHID, ExtendedSWHID]]],
+    src_obj: Union[Origin, Snapshot, Revision, Directory, Content],
+    dst_id: bytes,
+    dst_type: ExtendedObjectType,
+) -> None:
+    """Record an edge from ``src_obj`` to the object ``dst_id`` of ``dst_type``."""
+    swhid = ExtendedSWHID(object_type=dst_type, object_id=dst_id)
+    edges.add((src_obj.swhid(), swhid))
+
+
+def get_graph_data(
+    data: Dict[str, List[dict]]
+) -> Tuple[List[str], List[Tuple[str, str]]]:
+    """Return the SWHID nodes and (source, target) edges described by ``data``."""
+    nodes: Set[Union[CoreSWHID, ExtendedSWHID]] = set()
+    edges: Set[
+        Tuple[Union[CoreSWHID, ExtendedSWHID], Union[CoreSWHID, ExtendedSWHID]]
+    ] = set()
+
+    model = data_to_model(data)
+
+    for origin in model["origin"]:
+        assert isinstance(origin, Origin)
+        nodes.add(origin.swhid())
+        for status in model["origin_visit_status"]:
+            assert isinstance(status, OriginVisitStatus)
+            if status.origin == origin.url and status.snapshot is not None:
+                add_link(edges, origin, status.snapshot, ExtendedObjectType.SNAPSHOT)
+
+    for snapshot in model["snapshot"]:
+        assert isinstance(snapshot, Snapshot)
+        nodes.add(snapshot.swhid())
+        for branch in snapshot.branches.values():
+            assert isinstance(branch, SnapshotBranch)
+            if branch.target_type in [TargetType.RELEASE, TargetType.REVISION]:
+                target_type = (
+                    ExtendedObjectType.RELEASE
+                    if branch.target_type == TargetType.RELEASE
+                    else ExtendedObjectType.REVISION
+                )
+                add_link(edges, snapshot, branch.target, target_type)
+
+    for revision in model["revision"]:
+        assert isinstance(revision, Revision)
+        nodes.add(revision.swhid())
+        # root directory
+        add_link(edges, revision, revision.directory, ExtendedObjectType.DIRECTORY)
+        # parent
+        for parent in revision.parents:
+            add_link(edges, revision, parent, ExtendedObjectType.REVISION)
+
+    for directory in model["directory"]:
+        assert isinstance(directory, Directory)
+        nodes.add(directory.swhid())
+        for entry in directory.entries:
+            assert isinstance(entry, DirectoryEntry)
+            if entry.type == "file":
+                target_type = ExtendedObjectType.CONTENT
+            elif entry.type == "dir":
+                target_type = ExtendedObjectType.DIRECTORY
+            elif entry.type == "rev":
+                target_type = ExtendedObjectType.REVISION
+            else:
+                raise ValueError(f"unexpected entry type: {entry.type}")
+            add_link(edges, directory, entry.target, target_type)
+
+    for content in model["content"]:
+        assert isinstance(content, Content)
+        nodes.add(content.swhid())
+
+    return [str(node) for node in nodes], [(str(src), str(dst)) for src, dst in edges]
+
+
 @pytest.mark.parametrize(
     "repo",
     ("cmdbts2", "out-of-order", "with-merges"),
 )
-def test_archive_interface(repo: str, swh_storage: StorageInterface) -> None:
-    archive_api = ArchiveStorage(swh_storage)
-    assert isinstance(swh_storage, Storage)
-    dsn = swh_storage.get_db().conn.dsn
+def test_archive_interface(repo: str, archive: ArchiveInterface) -> None:
+    """Compare each archive implementation against the ``archive`` fixture."""
+    # read data/README.md for more details on how these datasets are generated
+    data = load_repo_data(repo)
+    fill_storage(archive.storage, data)
+
+    # test against ArchiveStorage
+    archive_api = ArchiveStorage(archive.storage)
+    check_directory_ls(archive, archive_api, data)
+    check_revision_get_parents(archive, archive_api, data)
+    check_snapshot_get_heads(archive, archive_api, data)
+
+    # test against ArchivePostgreSQL
+    assert isinstance(archive.storage, Storage)
+    dsn = archive.storage.get_db().conn.dsn
     with BaseDb.connect(dsn).conn as conn:
         BaseDb.adapt_conn(conn)
         archive_direct = ArchivePostgreSQL(conn)
-        # read data/README.md for more details on how these datasets are generated
-        data = load_repo_data(repo)
-        fill_storage(swh_storage, data)
+        check_directory_ls(archive, archive_direct, data)
+        check_revision_get_parents(archive, archive_direct, data)
+        check_snapshot_get_heads(archive, archive_direct, data)
 
-        for directory in data["directory"]:
-            entries_api = sorted(
-                archive_api.directory_ls(directory["id"]), key=itemgetter("name")
-            )
-            entries_direct = sorted(
-                archive_direct.directory_ls(directory["id"]), key=itemgetter("name")
-            )
-            assert entries_api == entries_direct
-
-        for revision in data["revision"]:
-            parents_api: TCounter[Sha1Git] = Counter(
-                archive_api.revision_get_parents(revision["id"])
-            )
-            parents_direct: TCounter[Sha1Git] = Counter(
-                archive_direct.revision_get_parents(revision["id"])
-            )
-            assert parents_api == parents_direct
-
-        for snapshot in data["snapshot"]:
-            heads_api: TCounter[Sha1Git] = Counter(
-                archive_api.snapshot_get_heads(snapshot["id"])
-            )
-            heads_direct: TCounter[Sha1Git] = Counter(
-                archive_direct.snapshot_get_heads(snapshot["id"])
-            )
-            assert heads_api == heads_direct
+    # test against ArchiveGraph
+    nodes, edges = get_graph_data(data)
+    graph = NaiveClient(nodes=nodes, edges=edges)
+    archive_graph = ArchiveGraph(graph, archive.storage)
+    with pytest.raises(NotImplementedError):
+        check_directory_ls(archive, archive_graph, data)
+    check_revision_get_parents(archive, archive_graph, data)
+    check_snapshot_get_heads(archive, archive_graph, data)
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -4,6 +4,7 @@
 [testenv]
 extras =
   testing
+  graph
 deps =
   pytest-cov
 commands =
@@ -30,6 +31,7 @@
 [testenv:mypy]
 extras =
   testing
+  graph
 deps =
   mypy==0.920
 commands =
@@ -43,6 +45,7 @@
 usedevelop = true
 extras =
   testing
+  graph
 deps =
   # fetch and install swh-docs in develop mode
   -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs
@@ -60,6 +63,7 @@
 usedevelop = true
 extras =
   testing
+  graph
 deps =
   # install swh-docs in develop mode
   -e ../swh-docs