diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py --- a/swh/provenance/archive.py +++ b/swh/provenance/archive.py @@ -2,7 +2,7 @@ from typing_extensions import Protocol, runtime_checkable -from swh.model.model import Revision, Sha1Git +from swh.model.model import Sha1Git @runtime_checkable @@ -14,32 +14,32 @@ id: sha1 id of the directory to list entries from. Yields: - directory entries for such directory. + dictionaries of entries in such directory containing only the keys "name", + "target" and "type". """ ... - def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]: - """Given a list of sha1, return the revisions' information + def revision_get_parents(self, id: Sha1Git) -> Iterable[Sha1Git]: + """List parents of one revision. Args: - revisions: list of sha1s for the revisions to be retrieved + id: sha1 id of the revision to list parents from. Yields: - revisions matching the identifiers. If a revision does - not exist, the provided sha1 is simply ignored. + sha1 ids for the parents of such revision. """ ... def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]: - """List all revisions pointed by one snapshot. + """List all revisions targeted by one snapshot. Args: id: sha1 id of the snapshot. Yields: - sha1 ids of revisions that are a target of such snapshot. """ ... 
diff --git a/swh/provenance/model.py b/swh/provenance/model.py --- a/swh/provenance/model.py +++ b/swh/provenance/model.py @@ -57,22 +57,8 @@ def retrieve_parents(self, archive: ArchiveInterface): if self._parents_entries is None: if self._parents_ids is None: - revision = list(archive.revision_get([self.id])) - if revision: - self._parents_ids = revision[0].parents - else: - self._parents_ids = [] - - self._parents_entries = [ - RevisionEntry( - id=rev.id, - root=rev.directory, - date=rev.date.to_datetime(), - parents=rev.parents, - ) - for rev in archive.revision_get(self._parents_ids) - if rev.date is not None - ] + self._parents_ids = archive.revision_get_parents(self.id) + self._parents_entries = [RevisionEntry(id) for id in self._parents_ids] @property def parents(self) -> Iterator["RevisionEntry"]: diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py --- a/swh/provenance/postgresql/archive.py +++ b/swh/provenance/postgresql/archive.py @@ -1,9 +1,9 @@ -from typing import Any, Dict, Iterable, List, Set +from typing import Any, Dict, Iterable, List from methodtools import lru_cache import psycopg2 -from swh.model.model import ObjectType, Revision, Sha1Git, TargetType +from swh.model.model import Sha1Git from swh.storage.postgresql.storage import Storage @@ -13,8 +13,6 @@ self.storage = Storage(conn, objstorage={"cls": "memory"}) def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]: - # TODO: only call directory_ls_internal if the id is not being queried by - # someone else. Otherwise wait until results get properly cached. 
entries = self.directory_ls_internal(id) yield from entries @@ -23,12 +21,13 @@ # TODO: add file size filtering with self.conn.cursor() as cursor: cursor.execute( - """WITH - dir AS (SELECT id AS dir_id, dir_entries, file_entries, rev_entries + """ + WITH + dir AS (SELECT id AS dir_id, dir_entries, file_entries, rev_entries FROM directory WHERE id=%s), - ls_d AS (SELECT dir_id, UNNEST(dir_entries) AS entry_id FROM dir), + ls_d AS (SELECT dir_id, UNNEST(dir_entries) AS entry_id FROM dir), ls_f AS (SELECT dir_id, UNNEST(file_entries) AS entry_id FROM dir), - ls_r AS (SELECT dir_id, UNNEST(rev_entries) AS entry_id FROM dir) + ls_r AS (SELECT dir_id, UNNEST(rev_entries) AS entry_id FROM dir) (SELECT 'dir'::directory_entry_type AS type, e.target, e.name, NULL::sha1_git FROM ls_d @@ -53,7 +52,6 @@ ) ) ) - ORDER BY name """, (id,), ) @@ -62,73 +60,44 @@ for row in cursor.fetchall() ] - def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]: + def revision_get_parents(self, id: Sha1Git) -> Iterable[Sha1Git]: with self.conn.cursor() as cursor: - psycopg2.extras.execute_values( - cursor, + cursor.execute( """ - SELECT t.id, revision.date, revision.directory, - ARRAY( - SELECT rh.parent_id::bytea - FROM revision_history rh - WHERE rh.id = t.id - ORDER BY rh.parent_rank + SELECT ARRAY( + SELECT RH.parent_id::bytea + FROM revision_history AS RH + WHERE RH.id=R.id + ORDER BY RH.parent_rank ) - FROM (VALUES %s) as t(sortkey, id) - LEFT JOIN revision ON t.id = revision.id - LEFT JOIN person author ON revision.author = author.id - LEFT JOIN person committer ON revision.committer = committer.id - ORDER BY sortkey + FROM revision AS R WHERE R.id=%s """, - ((sortkey, id) for sortkey, id in enumerate(ids)), + (id,), ) + # There should be at most one row anyway for row in cursor.fetchall(): - parents = [] - for parent in row[3]: - if parent: - parents.append(parent) - yield Revision.from_dict( - { - "id": row[0], - "date": row[1], - "directory": row[2], - "parents": 
tuple(parents), - } - ) + yield from row[0] def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]: - # TODO: this code is duplicated here (same as in swh.provenance.storage.archive) - # but it's just temporary. This method should actually perform a direct query to - # the SQL db of the archive. - from swh.core.utils import grouper - from swh.storage.algos.snapshot import snapshot_get_all_branches - - snapshot = snapshot_get_all_branches(self.storage, id) - assert snapshot is not None - - targets_set = set() - releases_set = set() - if snapshot is not None: - for branch in snapshot.branches: - if snapshot.branches[branch].target_type == TargetType.REVISION: - targets_set.add(snapshot.branches[branch].target) - elif snapshot.branches[branch].target_type == TargetType.RELEASE: - releases_set.add(snapshot.branches[branch].target) - - batchsize = 100 - for releases in grouper(releases_set, batchsize): - targets_set.update( - release.target - for release in self.storage.release_get(releases) - if release is not None and release.target_type == ObjectType.REVISION - ) - - revisions: Set[Sha1Git] = set() - for targets in grouper(targets_set, batchsize): - revisions.update( - revision.id - for revision in self.storage.revision_get(targets) - if revision is not None + with self.conn.cursor() as cursor: + cursor.execute( + """ + WITH S AS (SELECT object_id FROM snapshot WHERE snapshot.id=%s) + (SELECT B.target AS head + FROM S + JOIN snapshot_branches AS BS ON (S.object_id=BS.snapshot_id) + JOIN snapshot_branch AS B ON (BS.branch_id=B.object_id) + WHERE B.target_type='revision'::snapshot_target) + UNION + (SELECT R.target AS head + FROM S + JOIN snapshot_branches AS BS ON (S.object_id=BS.snapshot_id) + JOIN snapshot_branch AS B ON (BS.branch_id=B.object_id) + JOIN release AS R ON (B.target=R.id) + WHERE B.target_type='release'::snapshot_target + AND R.target_type='revision'::object_type) + """, + (id,), ) - - yield from revisions + for row in cursor.fetchall(): + 
yield row[0] diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py --- a/swh/provenance/storage/archive.py +++ b/swh/provenance/storage/archive.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Iterable, Set -from swh.model.model import ObjectType, Revision, Sha1Git, TargetType +from swh.model.model import ObjectType, Sha1Git, TargetType from swh.storage.interface import StorageInterface @@ -9,14 +9,18 @@ self.storage = storage def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]: - # TODO: filter unused fields - yield from self.storage.directory_ls(id) - - def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]: - # TODO: filter unused fields - yield from ( - rev for rev in self.storage.revision_get(list(ids)) if rev is not None - ) + # TODO: add file size filtering + for entry in self.storage.directory_ls(id): + yield { + "name": entry["name"], + "target": entry["target"], + "type": entry["type"], + } + + def revision_get_parents(self, id: Sha1Git) -> Iterable[Sha1Git]: + rev = self.storage.revision_get([id])[0] + if rev is not None: + yield from rev.parents def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]: from swh.core.utils import grouper diff --git a/swh/provenance/tests/test_archive_interface.py b/swh/provenance/tests/test_archive_interface.py new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/test_archive_interface.py @@ -0,0 +1,40 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from collections import Counter +from operator import itemgetter + +import pytest + +from swh.provenance.tests.conftest import fill_storage, load_repo_data + + +@pytest.mark.parametrize( + "repo", + ("cmdbts2", "out-of-order", "with-merges"), +) +def test_archive_interface(swh_storage, repo, archive_api, 
archive_direct): + # read data/README.md for more details on how these datasets are generated + data = load_repo_data(repo) + fill_storage(swh_storage, data) + + for directory in data["directory"]: + entries_api = sorted( + archive_api.directory_ls(directory["id"]), key=itemgetter("name") + ) + entries_direct = sorted( + archive_direct.directory_ls(directory["id"]), key=itemgetter("name") + ) + assert entries_api == entries_direct + + for revision in data["revision"]: + parents_api = Counter(archive_api.revision_get_parents(revision["id"])) + parents_direct = Counter(archive_direct.revision_get_parents(revision["id"])) + assert parents_api == parents_direct + + for snapshot in data["snapshot"]: + heads_api = Counter(archive_api.snapshot_get_heads(snapshot["id"])) + heads_direct = Counter(archive_direct.snapshot_get_heads(snapshot["id"])) + assert heads_api == heads_direct