diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
index 53bf275..6336483 100644
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -1,164 +1,165 @@
 # Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from typing import Dict, List, Set
 
 from _pytest.monkeypatch import MonkeyPatch
 from click.testing import CliRunner
 import psycopg2.extensions
 import pytest
 
 from swh.core.cli import swh as swhmain
 import swh.core.cli.db  # noqa ; ensure cli is loaded
 from swh.core.db import BaseDb
 from swh.core.db.db_utils import init_admin_extensions
 from swh.model.hashutil import MultiHash
 import swh.provenance.cli  # noqa ; ensure cli is loaded
 from swh.provenance.tests.conftest import fill_storage, load_repo_data
 from swh.storage.interface import StorageInterface
 
 from .conftest import get_datafile
 from .test_utils import invoke, write_configuration_path
 
 
 def test_cli_swh_db_help() -> None:
     # swhmain.add_command(provenance_cli)
     result = CliRunner().invoke(swhmain, ["provenance", "-h"])
     assert result.exit_code == 0
     assert "Commands:" in result.output
     commands = result.output.split("Commands:")[1]
     for command in (
         "find-all",
         "find-first",
         "iter-frontiers",
         "iter-origins",
         "iter-revisions",
     ):
         assert f" {command} " in commands
 
 
 TABLES = {
     "dbflavor",
     "dbmodule",
     "dbversion",
     "content",
     "content_in_revision",
     "content_in_directory",
     "directory",
     "directory_in_revision",
     "location",
     "origin",
     "revision",
     "revision_before_revision",
     "revision_in_origin",
 }
 
 
 @pytest.mark.parametrize(
     "flavor, dbtables", (("with-path", TABLES), ("without-path", TABLES))
 )
 def test_cli_db_create_and_init_db_with_flavor(
     monkeypatch: MonkeyPatch,
     postgresql: psycopg2.extensions.connection,
     flavor: str,
     dbtables: Set[str],
 ) -> None:
     """Test that 'swh db init provenance' works with both the with-path and
     without-path flavors"""
     dbname = f"{flavor}-db"
 
     # DB creation using 'swh db create'
     db_params = postgresql.get_dsn_parameters()
     monkeypatch.setenv("PGHOST", db_params["host"])
     monkeypatch.setenv("PGUSER", db_params["user"])
     monkeypatch.setenv("PGPORT", db_params["port"])
     result = CliRunner().invoke(swhmain, ["db", "create", "-d", dbname, "provenance"])
     assert result.exit_code == 0, result.output
 
     # DB init using 'swh db init'
     result = CliRunner().invoke(
         swhmain, ["db", "init", "-d", dbname, "--flavor", flavor, "provenance"]
     )
     assert result.exit_code == 0, result.output
     assert f"(flavor {flavor})" in result.output
 
     db_params["dbname"] = dbname
     cnx = BaseDb.connect(**db_params).conn
     # check the DB looks OK (check for db_flavor and expected tables)
     with cnx.cursor() as cur:
         cur.execute("select swh_get_dbflavor()")
         assert cur.fetchone() == (flavor,)
 
         cur.execute(
             "select table_name from information_schema.tables "
             "where table_schema = 'public' "
             f"and table_catalog = '{dbname}'"
         )
         tables = set(x for (x,) in cur.fetchall())
         assert tables == dbtables
 
 
 def test_cli_init_db_default_flavor(postgresql: psycopg2.extensions.connection) -> None:
     "Test that 'swh db init provenance' defaults to a with-path flavored DB"
     dbname = postgresql.dsn
     init_admin_extensions("swh.provenance", dbname)
     result = CliRunner().invoke(swhmain, ["db", "init", "-d", dbname, "provenance"])
     assert result.exit_code == 0, result.output
     with postgresql.cursor() as cur:
         cur.execute("select swh_get_dbflavor()")
         assert cur.fetchone() == ("with-path",)
 
 
+@pytest.mark.origin_layer
 @pytest.mark.parametrize(
     "subcommand",
     (["origin", "from-csv"], ["iter-origins"]),
 )
 def test_cli_origin_from_csv(
     swh_storage: StorageInterface,
     subcommand: List[str],
     swh_storage_backend_config: Dict,
     provenance,
     tmp_path,
 ):
     repo = "cmdbts2"
     origin_url = f"https://{repo}"
     data = load_repo_data(repo)
     fill_storage(swh_storage, data)
 
     assert len(data["origin"]) >= 1
     assert origin_url in [o["url"] for o in data["origin"]]
 
     cfg = {
         "provenance": {
             "archive": {
                 "cls": "api",
                 "storage": swh_storage_backend_config,
             },
             "storage": {
                 "cls": "postgresql",
                 # "db": provenance.storage.conn.dsn,
                 "db": provenance.storage.conn.get_dsn_parameters(),
             },
         },
     }
 
     config_path = write_configuration_path(cfg, tmp_path)
 
     csv_filepath = get_datafile("origins.csv")
     subcommand = subcommand + [csv_filepath]
 
     result = invoke(subcommand, config_path)
     assert result.exit_code == 0, f"Unexpected result: {result.output}"
 
     origin_sha1 = MultiHash.from_data(
         origin_url.encode(), hash_names=["sha1"]
     ).digest()["sha1"]
     actual_result = provenance.storage.origin_get([origin_sha1])
     assert actual_result == {origin_sha1: origin_url}
diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py
index 8aaf489..92fbedc 100644
--- a/swh/provenance/tests/test_history_graph.py
+++ b/swh/provenance/tests/test_history_graph.py
@@ -1,54 +1,55 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import pytest
 import yaml
 
 from swh.model.hashutil import hash_to_bytes
 from swh.provenance.archive import ArchiveInterface
 from swh.provenance.graph import HistoryGraph
 from swh.provenance.interface import ProvenanceInterface
 from swh.provenance.model import OriginEntry, RevisionEntry
 from swh.provenance.origin import origin_add_revision
 from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data
 
 
+@pytest.mark.origin_layer
 @pytest.mark.parametrize(
     "repo, visit",
     (("with-merges", "visits-01"),),
 )
 @pytest.mark.parametrize("batch", (True, False))
 def test_history_graph(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     repo: str,
     visit: str,
     batch: bool,
 ) -> None:
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data(repo)
     fill_storage(archive.storage, data)
 
     filename = f"history_graphs_{repo}_{visit}.yaml"
 
     with open(get_datafile(filename)) as file:
         for expected in yaml.full_load(file):
             entry = OriginEntry(expected["origin"], hash_to_bytes(expected["snapshot"]))
             provenance.origin_add(entry)
 
             for expected_graph_as_dict in expected["graphs"]:
                 print("Expected graph:", expected_graph_as_dict)
 
                 computed_graph = HistoryGraph(
                     archive,
                     RevisionEntry(hash_to_bytes(expected_graph_as_dict["head"])),
                 )
                 print("Computed graph:", computed_graph.as_dict())
                 assert computed_graph.as_dict() == expected_graph_as_dict
 
                 origin_add_revision(provenance, entry, computed_graph)
 
             if not batch:
                 provenance.flush()
diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py
index 4fd6854..27bb7af 100644
--- a/swh/provenance/tests/test_journal_client.py
+++ b/swh/provenance/tests/test_journal_client.py
@@ -1,135 +1,136 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from typing import Dict
 
 from confluent_kafka import Consumer
 import pytest
 
 from swh.model.hashutil import MultiHash
 from swh.provenance.tests.conftest import fill_storage, load_repo_data
 from swh.storage.interface import StorageInterface
 
 from .test_utils import invoke, write_configuration_path
 
 
 @pytest.fixture
 def swh_storage_backend_config(swh_storage_backend_config, kafka_server, kafka_prefix):
     writer_config = {
         "cls": "kafka",
         "brokers": [kafka_server],
         "client_id": "kafka_writer",
         "prefix": kafka_prefix,
         "anonymize": False,
     }
     yield {**swh_storage_backend_config, "journal_writer": writer_config}
 
 
+@pytest.mark.origin_layer
 def test_cli_origin_from_journal_client(
     swh_storage: StorageInterface,
     swh_storage_backend_config: Dict,
     kafka_prefix: str,
     kafka_server: str,
     consumer: Consumer,
     tmp_path: str,
     provenance,
     postgres_provenance,
 ) -> None:
     """Test origin journal client cli"""
 
     # Prepare storage data
     data = load_repo_data("cmdbts2")
     assert len(data["origin"]) >= 1
     origin_url = data["origin"][0]["url"]
     fill_storage(swh_storage, data)
 
     # Prepare configuration for cli call
     swh_storage_backend_config.pop("journal_writer", None)  # no need for that config
     storage_config_dict = swh_storage_backend_config
     cfg = {
         "journal_client": {
             "cls": "kafka",
             "brokers": [kafka_server],
             "group_id": "toto",
             "prefix": kafka_prefix,
             "stop_on_eof": True,
         },
         "provenance": {
             "archive": {
                 "cls": "api",
                 "storage": storage_config_dict,
             },
             "storage": {
                 "cls": "postgresql",
                 "db": postgres_provenance.get_dsn_parameters(),
             },
         },
     }
     config_path = write_configuration_path(cfg, tmp_path)
 
     # call the cli 'swh provenance origin from-journal'
     result = invoke(["origin", "from-journal"], config_path)
     assert result.exit_code == 0, f"Unexpected result: {result.output}"
 
     origin_sha1 = MultiHash.from_data(
         origin_url.encode(), hash_names=["sha1"]
     ).digest()["sha1"]
     actual_result = provenance.storage.origin_get([origin_sha1])
     assert actual_result == {origin_sha1: origin_url}
 
 
 def test_cli_revision_from_journal_client(
     swh_storage: StorageInterface,
     swh_storage_backend_config: Dict,
     kafka_prefix: str,
     kafka_server: str,
     consumer: Consumer,
     tmp_path: str,
     provenance,
     postgres_provenance,
 ) -> None:
     """Test revision journal client cli"""
 
     # Prepare storage data
     data = load_repo_data("cmdbts2")
     assert len(data["origin"]) >= 1
     fill_storage(swh_storage, data)
 
     # Prepare configuration for cli call
     swh_storage_backend_config.pop("journal_writer", None)  # no need for that config
     storage_config_dict = swh_storage_backend_config
     cfg = {
         "journal_client": {
             "cls": "kafka",
             "brokers": [kafka_server],
             "group_id": "toto",
             "prefix": kafka_prefix,
             "stop_on_eof": True,
         },
         "provenance": {
             "archive": {
                 "cls": "api",
                 "storage": storage_config_dict,
             },
             "storage": {
                 "cls": "postgresql",
                 "db": postgres_provenance.get_dsn_parameters(),
             },
         },
     }
     config_path = write_configuration_path(cfg, tmp_path)
 
     revisions = [rev["id"] for rev in data["revision"]]
     result = provenance.storage.revision_get(revisions)
     assert not result
 
     # call the cli 'swh provenance revision from-journal'
     cli_result = invoke(["revision", "from-journal"], config_path)
     assert cli_result.exit_code == 0, f"Unexpected result: {cli_result.output}"
 
     result = provenance.storage.revision_get(revisions)
     assert set(result.keys()) == set(revisions)
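
Both journal-client tests above verify results by hashing the origin URL by hand.
The same identifier is produced by Origin.id from swh.model.model, which the storage
tests later in this series use interchangeably against the same storage; a minimal
sketch of the equivalence, using an origin URL from the cmdbts2 dataset:

    from swh.model.hashutil import MultiHash
    from swh.model.model import Origin

    url = "https://cmdbts2"
    sha1 = MultiHash.from_data(url.encode(), hash_names=["sha1"]).digest()["sha1"]
    assert sha1 == Origin(url=url).id  # both are the SHA1 of the URL bytes
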
diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py
index 020a07d..581aaed 100644
--- a/swh/provenance/tests/test_origin_iterator.py
+++ b/swh/provenance/tests/test_origin_iterator.py
@@ -1,46 +1,47 @@
 # Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import pytest
 
 from swh.provenance.origin import CSVOriginIterator
 from swh.provenance.tests.conftest import fill_storage, load_repo_data
 from swh.storage.algos.origin import (
     iter_origin_visit_statuses,
     iter_origin_visits,
     iter_origins,
 )
 from swh.storage.interface import StorageInterface
 
 
+@pytest.mark.origin_layer
 @pytest.mark.parametrize(
     "repo",
     (
         "cmdbts2",
         "out-of-order",
     ),
 )
 def test_origin_iterator(swh_storage: StorageInterface, repo: str) -> None:
     """Test CSVOriginIterator"""
     data = load_repo_data(repo)
     fill_storage(swh_storage, data)
 
     origins_csv = []
     for origin in iter_origins(swh_storage):
         for visit in iter_origin_visits(swh_storage, origin.url):
             if visit.visit is not None:
                 for status in iter_origin_visit_statuses(
                     swh_storage, origin.url, visit.visit
                 ):
                     if status.snapshot is not None:
                         origins_csv.append((status.origin, status.snapshot))
     origins = list(CSVOriginIterator(origins_csv))
     assert origins
     # there can be more origins, depending on the additional extra visits.yaml
     # file used during dataset generation (see data/generate_storage_from_git)
     assert len(origins) >= len(data["origin"])
     # but we can check it's a subset
     assert set(o.url for o in origins) <= set(o["url"] for o in data["origin"])
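
For reference, CSVOriginIterator (exercised above) consumes (url, snapshot-id) pairs
and yields entries exposing at least a url attribute, as the assertions above rely on.
A minimal standalone sketch, assuming that interface; the pair below is hypothetical
and the snapshot id is a placeholder, not a hash from the test datasets:

    from swh.model.hashutil import hash_to_bytes
    from swh.provenance.origin import CSVOriginIterator

    pairs = [("https://cmdbts2", hash_to_bytes("0" * 40))]  # placeholder snapshot id
    for entry in CSVOriginIterator(pairs):
        print(entry.url)
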
diff --git a/swh/provenance/tests/test_origin_revision_layer.py b/swh/provenance/tests/test_origin_revision_layer.py
index 1bbdf41..38c4bff 100644
--- a/swh/provenance/tests/test_origin_revision_layer.py
+++ b/swh/provenance/tests/test_origin_revision_layer.py
@@ -1,194 +1,195 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import re
 from typing import Any, Dict, Iterable, Iterator, List, Set
 
 import pytest
 from typing_extensions import TypedDict
 
 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import Sha1Git
 from swh.provenance.archive import ArchiveInterface
 from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType
 from swh.provenance.model import OriginEntry
 from swh.provenance.origin import origin_add
 from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data
 
 
 class SynthRelation(TypedDict):
     src: Sha1Git
     dst: Sha1Git
     name: str
 
 
 class SynthOrigin(TypedDict):
     sha1: Sha1Git
     url: str
     snap: Sha1Git
     O_R: List[SynthRelation]
     R_R: List[SynthRelation]
 
 
 def synthetic_origin_revision_result(filename: str) -> Iterator[SynthOrigin]:
     """Generates dict representations of synthetic origin visits found in the
     synthetic file (from the data/ directory) given as argument of the generator.
 
     Each generated SynthOrigin (typed dict) contains the following elements:
       "sha1": (Sha1Git) sha1 of the origin,
       "url": (str) url of the origin,
       "snap": (Sha1Git) sha1 of the visit's snapshot,
       "O_R": (list) new O-R relations added by this origin visit
       "R_R": (list) new R-R relations added by this origin visit
 
     Each relation above is a SynthRelation typed dict with:
       "src": (Sha1Git) sha1 of the source of the relation
       "dst": (Sha1Git) sha1 of the destination of the relation
     """
     with open(get_datafile(filename), "r") as fobj:
         yield from _parse_synthetic_origin_revision_file(fobj)
 
 
 def _parse_synthetic_origin_revision_file(fobj: Iterable[str]) -> Iterator[SynthOrigin]:
     """Read a 'synthetic' file and generate a dict representation of the synthetic
     origin visit for each snapshot listed in the synthetic file.
     """
     regs = [
         "(?P<url>[^ ]+)?",
         "(?P<reltype>[^| ]*)",
         "(?P<revname>R[0-9]{2,4})?",
         "(?P<type>[ORS]) (?P<sha1>[0-9a-f]{40})",
     ]
     regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$")
     current_org: List[dict] = []
     for m in (regex.match(line) for line in fobj):
         if m:
             d = m.groupdict()
             if d["url"]:
                 if current_org:
                     yield _mk_synth_org(current_org)
                 current_org.clear()
             current_org.append(d)
     if current_org:
         yield _mk_synth_org(current_org)
 
 
 def _mk_synth_org(synth_org: List[Dict[str, str]]) -> SynthOrigin:
     assert synth_org[0]["type"] == "O"
     assert synth_org[1]["type"] == "S"
     org = SynthOrigin(
         sha1=hash_to_bytes(synth_org[0]["sha1"]),
         url=synth_org[0]["url"],
         snap=hash_to_bytes(synth_org[1]["sha1"]),
         O_R=[],
         R_R=[],
     )
 
     for row in synth_org[2:]:
         if row["reltype"] == "O-R":
             assert row["type"] == "R"
             org["O_R"].append(
                 SynthRelation(
                     src=org["sha1"],
                     dst=hash_to_bytes(row["sha1"]),
                     name=row["revname"],
                 )
             )
         elif row["reltype"] == "R-R":
             assert row["type"] == "R"
             org["R_R"].append(
                 SynthRelation(
                     src=org["O_R"][-1]["dst"],
                     dst=hash_to_bytes(row["sha1"]),
                     name=row["revname"],
                 )
             )
     return org
 
 
+@pytest.mark.origin_layer
 @pytest.mark.parametrize(
     "repo, visit",
     (("with-merges", "visits-01"),),
 )
 def test_origin_revision_layer(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     repo: str,
     visit: str,
 ) -> None:
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data(repo)
     fill_storage(archive.storage, data)
     syntheticfile = get_datafile(f"origin-revision_{repo}_{visit}.txt")
 
     origins = [
         {"url": status["origin"], "snap": status["snapshot"]}
         for status in data["origin_visit_status"]
         if status["snapshot"] is not None
     ]
 
     rows: Dict[str, Set[Any]] = {
         "origin": set(),
         "revision_in_origin": set(),
         "revision_before_revision": set(),
         "revision": set(),
     }
 
     for synth_org in synthetic_origin_revision_result(syntheticfile):
         for origin in (
             org
             for org in origins
             if org["url"] == synth_org["url"] and org["snap"] == synth_org["snap"]
         ):
             entry = OriginEntry(url=origin["url"], snapshot=origin["snap"])
             origin_add(provenance, archive, [entry])
 
             # each "entry" in the synth file is one new origin visit
             rows["origin"].add(synth_org["sha1"])
             assert rows["origin"] == provenance.storage.entity_get_all(
                 EntityType.ORIGIN
             ), synth_org["url"]
 
             # check the url of the origin
             assert (
                 provenance.storage.origin_get([synth_org["sha1"]])[synth_org["sha1"]]
                 == synth_org["url"]
             ), synth_org["snap"]
 
             # this origin visit might have added new revision objects
             rows["revision"] |= set(x["dst"] for x in synth_org["O_R"])
             rows["revision"] |= set(x["dst"] for x in synth_org["R_R"])
             assert rows["revision"] == provenance.storage.entity_get_all(
                 EntityType.REVISION
             ), synth_org["snap"]
 
             # check for O-R (head) entries
             # these are added in the revision_in_origin relation
             rows["revision_in_origin"] |= set(
                 (x["dst"], x["src"], None) for x in synth_org["O_R"]
             )
             assert rows["revision_in_origin"] == {
                 (src, rel.dst, rel.path)
                 for src, rels in provenance.storage.relation_get_all(
                     RelationType.REV_IN_ORG
                 ).items()
                 for rel in rels
             }, synth_org["snap"]
 
             # check for R-R entries
             # these are added in the revision_before_revision relation
             rows["revision_before_revision"] |= set(
                 (x["dst"], x["src"], None) for x in synth_org["R_R"]
             )
             assert rows["revision_before_revision"] == {
                 (src, rel.dst, rel.path)
                 for src, rels in provenance.storage.relation_get_all(
                     RelationType.REV_BEFORE_REV
                 ).items()
                 for rel in rels
             }, synth_org["snap"]
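
A line of the synthetic origin-revision files parsed above has the shape
"url | reltype | revname | type sha1", with the leading fields optional. A minimal
sketch of the reconstructed regex against a hypothetical line (the sha1 is a
placeholder, not a hash from the datasets):

    import re

    regs = [
        "(?P<url>[^ ]+)?",
        "(?P<reltype>[^| ]*)",
        "(?P<revname>R[0-9]{2,4})?",
        "(?P<type>[ORS]) (?P<sha1>[0-9a-f]{40})",
    ]
    regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$")

    m = regex.match("https://cmdbts2 |  |  | O " + "0" * 40)
    assert m is not None
    assert m.group("url") == "https://cmdbts2" and m.group("type") == "O"
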
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
index fee0a88..d4ec7a4 100644
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -1,470 +1,521 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timezone
 import inspect
 import os
 from typing import Any, Dict, Iterable, Optional, Set, Tuple
 
+import pytest
+
 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import Origin, Sha1Git
 from swh.provenance.archive import ArchiveInterface
 from swh.provenance.interface import (
     DirectoryData,
     EntityType,
     ProvenanceInterface,
     ProvenanceResult,
     ProvenanceStorageInterface,
     RelationData,
     RelationType,
     RevisionData,
 )
 from swh.provenance.model import OriginEntry, RevisionEntry
 from swh.provenance.origin import origin_add
 from swh.provenance.provenance import Provenance
 from swh.provenance.revision import revision_add
 from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
 
 
 class TestProvenanceStorage:
     def test_provenance_storage_content(
         self,
         provenance_storage: ProvenanceStorageInterface,
     ) -> None:
         """Tests content methods for every `ProvenanceStorageInterface`
         implementation."""
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
 
         # Add all content present in the current repo to the storage, just assigning
         # their creation dates. Then check that the returned results when querying are
         # the same.
         cnt_dates = {
             cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
         }
         assert provenance_storage.content_add(cnt_dates)
         assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
         assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
             cnt_dates.keys()
         )
 
     def test_provenance_storage_directory(
         self,
         provenance_storage: ProvenanceStorageInterface,
     ) -> None:
         """Tests directory methods for every `ProvenanceStorageInterface`
         implementation."""
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
 
         # Of all directories present in the current repo, only assign a date to those
         # containing blobs (picking the max date among the available ones). Then check
         # that the returned results when querying are the same.
         def getmaxdate(
             directory: Dict[str, Any], contents: Iterable[Dict[str, Any]]
         ) -> Optional[datetime]:
             dates = [
                 content["ctime"]
                 for entry in directory["entries"]
                 for content in contents
                 if entry["type"] == "file" and entry["target"] == content["sha1_git"]
             ]
             return max(dates) if dates else None
 
         flat_values = (False, True)
         dir_dates = {}
         for idx, dir in enumerate(data["directory"]):
             date = getmaxdate(dir, data["content"])
             if date is not None:
                 dir_dates[dir["id"]] = DirectoryData(
                     date=date, flat=flat_values[idx % 2]
                 )
         assert provenance_storage.directory_add(dir_dates)
         assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
         assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
             dir_dates.keys()
         )
 
     def test_provenance_storage_location(
         self,
         provenance_storage: ProvenanceStorageInterface,
     ) -> None:
         """Tests location methods for every `ProvenanceStorageInterface`
         implementation."""
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
 
         # Add all names of entries present in the directories of the current repo as
         # paths to the storage. Then check that the returned results when querying are
         # the same.
         paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]}
         assert provenance_storage.location_add(paths)
 
         if provenance_storage.with_path():
             assert provenance_storage.location_get_all() == paths
         else:
             assert provenance_storage.location_get_all() == set()
 
+    @pytest.mark.origin_layer
     def test_provenance_storage_origin(
         self,
         provenance_storage: ProvenanceStorageInterface,
     ) -> None:
         """Tests origin methods for every `ProvenanceStorageInterface`
         implementation."""
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
 
         # Test origin methods.
         # Add all origins present in the current repo to the storage. Then check that
         # the returned results when querying are the same.
         orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
         assert orgs
         assert provenance_storage.origin_add(orgs)
         assert provenance_storage.origin_get(set(orgs.keys())) == orgs
         assert provenance_storage.entity_get_all(EntityType.ORIGIN) == set(orgs.keys())
 
     def test_provenance_storage_revision(
         self,
         provenance_storage: ProvenanceStorageInterface,
     ) -> None:
         """Tests revision methods for every `ProvenanceStorageInterface`
         implementation."""
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
 
         # Test revision methods.
         # Add all revisions present in the current repo to the storage, assigning their
         # dates and an arbitrary origin to each one. Then check that the returned
         # results when querying are the same.
         origin = Origin(url=next(iter(data["origin"]))["url"])
         # Origin must be inserted in advance.
         assert provenance_storage.origin_add({origin.id: origin.url})
 
         revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0}
         rev_data = {
             rev["id"]: RevisionData(
                 date=ts2dt(rev["date"]) if idx % 2 != 0 else None,
                 origin=origin.id if idx % 3 != 0 else None,
             )
             for idx, rev in enumerate(data["revision"])
             if idx % 6 != 0
         }
         assert revs
         assert provenance_storage.revision_add(revs)
         assert provenance_storage.revision_add(rev_data)
         assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data
         assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set(
             rev_data.keys()
         )
 
-    def test_provenance_storage_relation(
+    def test_provenance_storage_relation_revision_layer(
         self,
         provenance_storage: ProvenanceStorageInterface,
     ) -> None:
         """Tests relation methods for every `ProvenanceStorageInterface`
         implementation."""
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
 
         # Test content-in-revision relation.
         # Create flat models of every root directory for the revisions in the dataset.
         cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
         for rev in data["revision"]:
             root = next(
                 subdir
                 for subdir in data["directory"]
                 if subdir["id"] == rev["directory"]
             )
             for cnt, rel in dircontent(data, rev["id"], root):
                 cnt_in_rev.setdefault(cnt, set()).add(rel)
         relation_add_and_compare_result(
             provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev
         )
 
         # Test content-in-directory relation.
         # Create flat models for every directory in the dataset.
         cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {}
         for dir in data["directory"]:
             for cnt, rel in dircontent(data, dir["id"], dir):
                 cnt_in_dir.setdefault(cnt, set()).add(rel)
         relation_add_and_compare_result(
             provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir
         )
 
         # Test directory-in-revision relation.
         # Add root directories to their corresponding revision in the dataset.
         dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {}
         for rev in data["revision"]:
             dir_in_rev.setdefault(rev["directory"], set()).add(
                 RelationData(dst=rev["id"], path=b".")
             )
         relation_add_and_compare_result(
             provenance_storage, RelationType.DIR_IN_REV, dir_in_rev
         )
 
+    @pytest.mark.origin_layer
+    def test_provenance_storage_relation_origin_layer(
+        self,
+        provenance_storage: ProvenanceStorageInterface,
+    ) -> None:
+        """Tests relation methods for every `ProvenanceStorageInterface`
+        implementation."""
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+
         # Test revision-in-origin relation.
         # Origins must be inserted in advance (cannot be done by `entity_add` inside
         # `relation_add_and_compare_result`).
         orgs = {Origin(url=org["url"]).id: org["url"] for org in data["origin"]}
         assert provenance_storage.origin_add(orgs)
         # Add all revisions that are head of some snapshot branch to the corresponding
         # origin.
         rev_in_org: Dict[Sha1Git, Set[RelationData]] = {}
         for status in data["origin_visit_status"]:
             if status["snapshot"] is not None:
                 for snapshot in data["snapshot"]:
                     if snapshot["id"] == status["snapshot"]:
                         for branch in snapshot["branches"].values():
                             if branch["target_type"] == "revision":
                                 rev_in_org.setdefault(branch["target"], set()).add(
                                     RelationData(
                                         dst=Origin(url=status["origin"]).id,
                                         path=None,
                                     )
                                 )
         relation_add_and_compare_result(
             provenance_storage, RelationType.REV_IN_ORG, rev_in_org
         )
 
         # Test revision-before-revision relation.
         # For each revision in the data set add an entry for each parent to the
         # relation.
         rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {}
         for rev in data["revision"]:
             for parent in rev["parents"]:
                 rev_before_rev.setdefault(parent, set()).add(
                     RelationData(dst=rev["id"], path=None)
                 )
         relation_add_and_compare_result(
             provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev
         )
 
-    def test_provenance_storage_find(
+    def test_provenance_storage_find_revision_layer(
         self,
         provenance: ProvenanceInterface,
         provenance_storage: ProvenanceStorageInterface,
         archive: ArchiveInterface,
     ) -> None:
         """Tests `content_find_first` and `content_find_all` methods for every
         `ProvenanceStorageInterface` implementation.
         """
 
         # Read data/README.md for more details on how these datasets are generated.
         data = load_repo_data("cmdbts2")
         fill_storage(archive.storage, data)
 
         # Test content_find_first and content_find_all, first only executing the
         # revision-content algorithm, then adding the origin-revision layer.
         def adapt_result(
             result: Optional[ProvenanceResult], with_path: bool
         ) -> Optional[ProvenanceResult]:
             if result is not None:
                 return ProvenanceResult(
                     result.content,
                     result.revision,
                     result.date,
                     result.origin,
                     result.path if with_path else b"",
                 )
             return result
 
         # Execute the revision-content algorithm on both storages.
         revisions = [
             RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
             for rev in data["revision"]
         ]
         revision_add(provenance, archive, revisions)
         revision_add(Provenance(provenance_storage), archive, revisions)
 
         assert adapt_result(
             ProvenanceResult(
                 content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
                 revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
                 date=datetime.fromtimestamp(1000000000.0, timezone.utc),
                 origin=None,
                 path=b"A/B/C/a",
             ),
             provenance_storage.with_path(),
         ) == provenance_storage.content_find_first(
             hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
         )
 
         for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
             assert adapt_result(
                 provenance.storage.content_find_first(cnt),
                 provenance_storage.with_path(),
             ) == provenance_storage.content_find_first(cnt)
             assert {
                 adapt_result(occur, provenance_storage.with_path())
                 for occur in provenance.storage.content_find_all(cnt)
             } == set(provenance_storage.content_find_all(cnt))
 
+    @pytest.mark.origin_layer
+    def test_provenance_storage_find_origin_layer(
+        self,
+        provenance: ProvenanceInterface,
+        provenance_storage: ProvenanceStorageInterface,
+        archive: ArchiveInterface,
+    ) -> None:
+        """Tests `content_find_first` and `content_find_all` methods for every
+        `ProvenanceStorageInterface` implementation.
+        """
+
+        # Read data/README.md for more details on how these datasets are generated.
+        data = load_repo_data("cmdbts2")
+        fill_storage(archive.storage, data)
+
+        # Execute the revision-content algorithm on both storages.
+        revisions = [
+            RevisionEntry(id=rev["id"], date=ts2dt(rev["date"]), root=rev["directory"])
+            for rev in data["revision"]
+        ]
+        revision_add(provenance, archive, revisions)
+        revision_add(Provenance(provenance_storage), archive, revisions)
+
+        # Test content_find_first and content_find_all, first only executing the
+        # revision-content algorithm, then adding the origin-revision layer.
+        def adapt_result(
+            result: Optional[ProvenanceResult], with_path: bool
+        ) -> Optional[ProvenanceResult]:
+            if result is not None:
+                return ProvenanceResult(
+                    result.content,
+                    result.revision,
+                    result.date,
+                    result.origin,
+                    result.path if with_path else b"",
+                )
+            return result
+
         # Execute the origin-revision algorithm on both storages.
         origins = [
             OriginEntry(url=sta["origin"], snapshot=sta["snapshot"])
             for sta in data["origin_visit_status"]
             if sta["snapshot"] is not None
         ]
         origin_add(provenance, archive, origins)
         origin_add(Provenance(provenance_storage), archive, origins)
 
         assert adapt_result(
             ProvenanceResult(
                 content=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
                 revision=hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
                 date=datetime.fromtimestamp(1000000000.0, timezone.utc),
                 origin="https://cmdbts2",
                 path=b"A/B/C/a",
             ),
             provenance_storage.with_path(),
         ) == provenance_storage.content_find_first(
             hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494")
         )
 
         for cnt in {cnt["sha1_git"] for cnt in data["content"]}:
             assert adapt_result(
                 provenance.storage.content_find_first(cnt),
                 provenance_storage.with_path(),
             ) == provenance_storage.content_find_first(cnt)
             assert {
                 adapt_result(occur, provenance_storage.with_path())
                 for occur in provenance.storage.content_find_all(cnt)
             } == set(provenance_storage.content_find_all(cnt))
 
     def test_types(self, provenance_storage: ProvenanceStorageInterface) -> None:
         """Checks all methods of ProvenanceStorageInterface are implemented by
         this backend, and that they have the same signature."""
         # Create an instance of the protocol (which cannot be instantiated
         # directly, so this creates a subclass, then instantiates it)
         interface = type("_", (ProvenanceStorageInterface,), {})()
 
         assert "content_find_first" in dir(interface)
 
         missing_methods = []
         for meth_name in dir(interface):
             if meth_name.startswith("_"):
                 continue
             interface_meth = getattr(interface, meth_name)
             try:
                 concrete_meth = getattr(provenance_storage, meth_name)
             except AttributeError:
                 if not getattr(interface_meth, "deprecated_endpoint", False):
                     # The backend is missing a (non-deprecated) endpoint
                     missing_methods.append(meth_name)
                 continue
 
             expected_signature = inspect.signature(interface_meth)
             actual_signature = inspect.signature(concrete_meth)
 
             assert expected_signature == actual_signature, meth_name
 
         assert missing_methods == []
 
         # If all the assertions above succeed, then this one should too.
         # But there's no harm in double-checking.
         # And we could replace the assertions above by this one, but unlike
         # the assertions above, it doesn't explain what is missing.
         assert isinstance(provenance_storage, ProvenanceStorageInterface)
 
 
 def dircontent(
     data: Dict[str, Any],
     ref: Sha1Git,
     dir: Dict[str, Any],
     prefix: bytes = b"",
 ) -> Iterable[Tuple[Sha1Git, RelationData]]:
     content = {
         (
             entry["target"],
             RelationData(dst=ref, path=os.path.join(prefix, entry["name"])),
         )
         for entry in dir["entries"]
         if entry["type"] == "file"
     }
     for entry in dir["entries"]:
         if entry["type"] == "dir":
             child = next(
                 subdir
                 for subdir in data["directory"]
                 if subdir["id"] == entry["target"]
             )
             content.update(
                 dircontent(data, ref, child, os.path.join(prefix, entry["name"]))
             )
     return content
 
 
 def entity_add(
     storage: ProvenanceStorageInterface, entity: EntityType, ids: Set[Sha1Git]
 ) -> bool:
     now = datetime.now(tz=timezone.utc)
     if entity == EntityType.CONTENT:
         return storage.content_add({sha1: now for sha1 in ids})
     elif entity == EntityType.DIRECTORY:
         return storage.directory_add(
             {sha1: DirectoryData(date=now, flat=False) for sha1 in ids}
         )
     else:  # entity == EntityType.REVISION:
         return storage.revision_add(
             {sha1: RevisionData(date=None, origin=None) for sha1 in ids}
         )
 
 
 def relation_add_and_compare_result(
     storage: ProvenanceStorageInterface,
     relation: RelationType,
     data: Dict[Sha1Git, Set[RelationData]],
 ) -> None:
     # Source, destinations and locations must be added in advance.
     src, *_, dst = relation.value.split("_")
     srcs = {sha1 for sha1 in data}
     if src != "origin":
         assert entity_add(storage, EntityType(src), srcs)
     dsts = {rel.dst for rels in data.values() for rel in rels}
     if dst != "origin":
         assert entity_add(storage, EntityType(dst), dsts)
     if storage.with_path():
         assert storage.location_add(
             {rel.path for rels in data.values() for rel in rels if rel.path is not None}
         )
 
     assert data
     assert storage.relation_add(relation, data)
 
     for src_sha1 in srcs:
         relation_compare_result(
             storage.relation_get(relation, [src_sha1]),
             {src_sha1: data[src_sha1]},
             storage.with_path(),
         )
     for dst_sha1 in dsts:
         relation_compare_result(
             storage.relation_get(relation, [dst_sha1], reverse=True),
             {
                 src_sha1: {
                     RelationData(dst=dst_sha1, path=rel.path)
                     for rel in rels
                     if dst_sha1 == rel.dst
                 }
                 for src_sha1, rels in data.items()
                 if dst_sha1 in {rel.dst for rel in rels}
             },
             storage.with_path(),
         )
     relation_compare_result(
         storage.relation_get_all(relation), data, storage.with_path()
     )
 
 
 def relation_compare_result(
     computed: Dict[Sha1Git, Set[RelationData]],
     expected: Dict[Sha1Git, Set[RelationData]],
     with_path: bool,
 ) -> None:
     assert {
         src_sha1: {
             RelationData(dst=rel.dst, path=rel.path if with_path else None)
             for rel in rels
         }
         for src_sha1, rels in expected.items()
     } == computed
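
The dircontent helper above flattens a directory tree into (content, RelationData)
pairs whose paths are relative to the given root. A toy walkthrough with hypothetical
ids standing in for real sha1s (POSIX path joining assumed):

    from swh.provenance.interface import RelationData
    from swh.provenance.tests.test_provenance_storage import dircontent

    # root dir D1 holds file "a" (content A) and subdir "B" (D2) holding file "b" (B)
    dataset = {
        "directory": [
            {"id": b"D2", "entries": [{"type": "file", "name": b"b", "target": b"B"}]},
        ]
    }
    root = {
        "id": b"D1",
        "entries": [
            {"type": "file", "name": b"a", "target": b"A"},
            {"type": "dir", "name": b"B", "target": b"D2"},
        ],
    }
    assert dircontent(dataset, b"R1", root) == {
        (b"A", RelationData(dst=b"R1", path=b"a")),
        (b"B", RelationData(dst=b"R1", path=b"B/b")),
    }
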