diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,5 @@
 # Add here internal Software Heritage dependencies, one per line.
 swh.core[db,http] >= 0.14
+swh.graph
 swh.model >= 2.6.1
 swh.storage
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -17,7 +17,7 @@
     """Get an archive object of class ``cls`` with arguments ``args``.
 
     Args:
-        cls: archive's class, either 'api' or 'direct'
+        cls: archive's class, either 'api', 'direct' or 'graph'
         args: dictionary of arguments passed to the archive class constructor
 
     Returns:
@@ -39,6 +39,15 @@
         from .postgresql.archive import ArchivePostgreSQL
 
         return ArchivePostgreSQL(BaseDb.connect(**kwargs["db"]).conn)
+
+    elif cls == "graph":
+        from .swhgraph.archive import ArchiveGraph
+
+        url = kwargs.get("url")
+        if url is None:
+            # explicit check: an ``assert`` would be stripped under ``python -O``
+            raise ValueError("the 'graph' archive requires a 'url' argument")
+        return ArchiveGraph(url, get_storage(**kwargs["storage"]))
     else:
         raise ValueError
 
diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py
--- a/swh/provenance/archive.py
+++ b/swh/provenance/archive.py
@@ -47,8 +47,7 @@
             id: sha1 id of the snapshot.
 
         Yields:
-            sha1 ids of revisions that are a target of such snapshot. Revisions are
-            guaranteed to be retrieved in chronological order
+            sha1 ids of revisions that are a target of such snapshot.
 
         """
         ...
diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py
--- a/swh/provenance/postgresql/archive.py
+++ b/swh/provenance/postgresql/archive.py
@@ -135,7 +135,7 @@
                               ON (RL.target=RV.id)
                             WHERE B.target_type='release'::snapshot_target
                               AND RL.target_type='revision'::object_type)
-                          ORDER BY date, id)
+                          )
                 SELECT id FROM heads
                 """,
                 (id,),
diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py
--- a/swh/provenance/storage/archive.py
+++ b/swh/provenance/storage/archive.py
@@ -70,4 +70,4 @@
             if revision is not None and revision.date is not None
         )
 
-        yield from (head for _, head in sorted(revisions))
+        yield from (head for _, head in revisions)
diff --git a/swh/provenance/swhgraph/__init__.py b/swh/provenance/swhgraph/__init__.py
new file mode 100644
diff --git a/swh/provenance/swhgraph/archive.py b/swh/provenance/swhgraph/archive.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/swhgraph/archive.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, Dict, Iterable
+
+from swh.core.statsd import statsd
+from swh.graph.client import RemoteGraphClient
+from swh.model.model import Sha1Git
+from swh.model.swhids import CoreSWHID, ObjectType
+from swh.storage.interface import StorageInterface
+
+ARCHIVE_DURATION_METRIC = "swh_provenance_archive_graph_duration_seconds"
+
+
+class ArchiveGraph:
+    """Archive backend that answers queries through a remote swh.graph server."""
+
+    def __init__(self, url: str, storage: StorageInterface) -> None:
+        """Create a graph client for ``url`` and keep ``storage`` around."""
+        self.graph = RemoteGraphClient(url)
+        self.storage = storage  # required by ArchiveInterface
+
+    @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "directory_ls"})
+    def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]:
+        """Not supported by this backend: always raises ``NotImplementedError``."""
+        raise NotImplementedError
+
+    @statsd.timed(
+        metric=ARCHIVE_DURATION_METRIC, tags={"method": "revision_get_parents"}
+    )
+    def revision_get_parents(self, id: Sha1Git) -> Iterable[Sha1Git]:
+        """Yield the sha1 ids of the direct parents of revision ``id``."""
+        src = CoreSWHID(object_type=ObjectType.REVISION, object_id=id)
+        # ``neighbors`` stops after one hop; a full traversal (``visit_nodes``)
+        # would also return every transitive ancestor of the revision.
+        request = self.graph.neighbors(str(src), edges="rev:rev", return_types="rev")
+
+        yield from (CoreSWHID.from_string(swhid).object_id for swhid in request)
+
+    @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "snapshot_get_heads"})
+    def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
+        """Yield the sha1 ids of the revisions targeted by snapshot ``id``.
+
+        Revisions are reached either directly from a branch or through a
+        release; no ordering is guaranteed.
+        """
+        src = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=id)
+        # ``snp:rel`` is needed so the traversal can reach releases at all;
+        # without it the ``rel:rev`` edges are never followed.
+        request = self.graph.visit_nodes(
+            str(src), edges="snp:rev,snp:rel,rel:rev", return_types="rev"
+        )
+
+        yield from (CoreSWHID.from_string(swhid).object_id for swhid in request)