diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -5,9 +5,10 @@ from os import path import re -from typing import Iterable, Iterator, List, Optional +from typing import Any, Dict, Iterable, Iterator, List, Optional import msgpack +import psycopg2 import pytest from typing_extensions import TypedDict @@ -17,13 +18,20 @@ from swh.model.model import Sha1Git from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance import get_provenance +from swh.provenance.archive import ArchiveInterface from swh.provenance.postgresql.archive import ArchivePostgreSQL +from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase +from swh.provenance.provenance import ProvenanceInterface from swh.provenance.storage.archive import ArchiveStorage +from swh.storage.postgresql.storage import Storage from swh.storage.replay import process_replay_objects @pytest.fixture(params=["with-path", "without-path"]) -def provenance(request, postgresql): +def provenance( + request, # TODO: add proper type annotation + postgresql: psycopg2.extensions.connection, +) -> ProvenanceInterface: """return a working and initialized provenance db""" from swh.core.cli.db import populate_database_for_package @@ -32,9 +40,13 @@ BaseDb.adapt_conn(postgresql) - args = dict(tuple(item.split("=")) for item in postgresql.dsn.split()) - args.pop("options") + args: Dict[str, str] = { + item.split("=")[0]: item.split("=")[1] + for item in postgresql.dsn.split() + if item.split("=")[0] != "options" + } prov = get_provenance(cls="local", db=args) + assert isinstance(prov.storage, ProvenanceDBBase) assert prov.storage.flavor == flavor # in test sessions, we DO want to raise any exception occurring at commit time prov.storage.raise_on_commit = True @@ -42,7 +54,7 @@ @pytest.fixture -def swh_storage_with_objects(swh_storage): +def swh_storage_with_objects(swh_storage: Storage) -> Storage: """return a Storage object (postgresql-based by default) with a few of each object type in it @@ -64,22 +76,22 @@ @pytest.fixture -def archive_direct(swh_storage_with_objects): +def archive_direct(swh_storage_with_objects: Storage) -> ArchiveInterface: return ArchivePostgreSQL(swh_storage_with_objects.get_db().conn) @pytest.fixture -def archive_api(swh_storage_with_objects): +def archive_api(swh_storage_with_objects: Storage) -> ArchiveInterface: return ArchiveStorage(swh_storage_with_objects) @pytest.fixture(params=["archive", "db"]) -def archive(request, swh_storage_with_objects): +def archive(request, swh_storage_with_objects: Storage) -> Iterator[ArchiveInterface]: """Return a ArchivePostgreSQL based StorageInterface object""" # this is a workaround to prevent tests from hanging because of an unclosed # transaction. # TODO: refactor the ArchivePostgreSQL to properly deal with - # transactions and get rif of this fixture + # transactions and get rid of this fixture if request.param == "db": archive = ArchivePostgreSQL(conn=swh_storage_with_objects.get_db().conn) yield archive @@ -88,12 +100,12 @@ yield ArchiveStorage(swh_storage_with_objects) -def get_datafile(fname): +def get_datafile(fname: str) -> str: return path.join(path.dirname(__file__), "data", fname) -def load_repo_data(repo): - data = {} +def load_repo_data(repo: str) -> Dict[str, Any]: + data: Dict[str, Any] = {} with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj: unpacker = msgpack.Unpacker( fobj, @@ -107,11 +119,11 @@ return data -def filter_dict(d, keys): +def filter_dict(d: Dict[Any, Any], keys: Iterable[Any]) -> Dict[Any, Any]: return {k: v for (k, v) in d.items() if k in keys} -def fill_storage(storage, data): +def fill_storage(storage: Storage, data: Dict[str, Any]) -> None: process_replay_objects(data, storage=storage) @@ -184,7 +196,7 @@ yield _mk_synth_rev(current_rev) -def _mk_synth_rev(synth_rev) -> SynthRevision: +def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision: assert synth_rev[0]["type"] == "R" rev = SynthRevision( sha1=hash_to_bytes(synth_rev[0]["sha1"]), diff --git a/swh/provenance/tests/test_archive_interface.py b/swh/provenance/tests/test_archive_interface.py --- a/swh/provenance/tests/test_archive_interface.py +++ b/swh/provenance/tests/test_archive_interface.py @@ -12,13 +12,14 @@ from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage from swh.provenance.tests.conftest import fill_storage, load_repo_data +from swh.storage.postgresql.storage import Storage @pytest.mark.parametrize( "repo", ("cmdbts2", "out-of-order", "with-merges"), ) -def test_archive_interface(repo, swh_storage): +def test_archive_interface(repo: str, swh_storage: Storage) -> None: archive_api = ArchiveStorage(swh_storage) dsn = swh_storage.get_db().conn.dsn with BaseDb.connect(dsn).conn as conn: diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -3,7 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Set + from click.testing import CliRunner +import psycopg2 import pytest from swh.core.cli import swh as swhmain @@ -12,7 +15,7 @@ import swh.provenance.cli # noqa ; ensure cli is loaded -def test_cli_swh_db_help(): +def test_cli_swh_db_help() -> None: # swhmain.add_command(provenance_cli) result = CliRunner().invoke(swhmain, ["provenance", "-h"]) assert result.exit_code == 0 @@ -47,8 +50,11 @@ "flavor, dbtables", (("with-path", TABLES | {"location"}), ("without-path", TABLES)) ) def test_cli_db_create_and_init_db_with_flavor( - monkeypatch, postgresql, flavor, dbtables -): + monkeypatch, # TODO: add proper type annotation + postgresql: psycopg2.extensions.connection, + flavor: str, + dbtables: Set[str], +) -> None: """Test that 'swh db init provenance' works with flavors for both with-path and without-path flavors""" @@ -86,7 +92,7 @@ assert tables == dbtables -def test_cli_init_db_default_flavor(postgresql): +def test_cli_init_db_default_flavor(postgresql: psycopg2.extensions.connection) -> None: "Test that 'swh db init provenance' defaults to a with-path flavored DB" dbname = postgresql.dsn result = CliRunner().invoke(swhmain, ["db", "init", "-d", dbname, "provenance"]) diff --git a/swh/provenance/tests/test_conftest.py b/swh/provenance/tests/test_conftest.py --- a/swh/provenance/tests/test_conftest.py +++ b/swh/provenance/tests/test_conftest.py @@ -3,14 +3,17 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from swh.provenance.provenance import ProvenanceInterface +from swh.storage.postgresql.storage import Storage -def test_provenance_fixture(provenance): + +def test_provenance_fixture(provenance: ProvenanceInterface) -> None: """Check the 'provenance' fixture produce a working ProvenanceDB object""" assert provenance provenance.flush() # should be a noop -def test_storage(swh_storage_with_objects): +def test_storage(swh_storage_with_objects: Storage) -> None: """Check the 'swh_storage_with_objects' fixture produce a working Storage object with at least some Content, Revision and Directory in it""" assert swh_storage_with_objects diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py --- a/swh/provenance/tests/test_history_graph.py +++ b/swh/provenance/tests/test_history_graph.py @@ -3,17 +3,22 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Dict + import pytest import yaml from swh.model.hashutil import hash_to_bytes +from swh.provenance.archive import ArchiveInterface from swh.provenance.graph import HistoryNode, build_history_graph from swh.provenance.model import OriginEntry, RevisionEntry from swh.provenance.origin import origin_add_revision +from swh.provenance.provenance import ProvenanceInterface from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data +from swh.storage.postgresql.storage import Storage -def history_graph_from_dict(d) -> HistoryNode: +def history_graph_from_dict(d: Dict[str, Any]) -> HistoryNode: """Takes a dictionary representing a tree of HistoryNode objects, and recursively builds the corresponding graph.""" node = HistoryNode( @@ -32,7 +37,14 @@ (("with-merges", "visits-01"),), ) @pytest.mark.parametrize("batch", (True, False)) -def test_history_graph(provenance, swh_storage, archive, repo, visit, batch): +def test_history_graph( + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + visit: str, + batch: bool, +) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py --- a/swh/provenance/tests/test_isochrone_graph.py +++ b/swh/provenance/tests/test_isochrone_graph.py @@ -5,19 +5,23 @@ from copy import deepcopy from datetime import datetime, timezone +from typing import Any, Dict import pytest import yaml from swh.model.hashutil import hash_to_bytes +from swh.provenance.archive import ArchiveInterface from swh.provenance.graph import IsochroneNode, build_isochrone_graph from swh.provenance.model import DirectoryEntry, RevisionEntry +from swh.provenance.provenance import ProvenanceInterface from swh.provenance.revision import revision_add from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data from swh.provenance.tests.test_provenance_db import ts2dt +from swh.storage.postgresql.storage import Storage -def isochrone_graph_from_dict(d, depth=0) -> IsochroneNode: +def isochrone_graph_from_dict(d: Dict[str, Any], depth: int = 0) -> IsochroneNode: """Takes a dictionary representing a tree of IsochroneNode objects, and recursively builds the corresponding graph.""" d = deepcopy(d) @@ -58,8 +62,14 @@ ) @pytest.mark.parametrize("batch", (True, False)) def test_isochrone_graph( - provenance, swh_storage, archive, repo, lower, mindepth, batch -): + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + lower: bool, + mindepth: int, + batch: bool, +) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) @@ -80,6 +90,7 @@ print("Expected graph:", expected_graph) # Create graph for current revision and check it has the expected structure. + assert entry.root is not None computed_graph = build_isochrone_graph( archive, provenance, diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py --- a/swh/provenance/tests/test_origin_iterator.py +++ b/swh/provenance/tests/test_origin_iterator.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from swh.model.model import OriginVisitStatus from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance.origin import CSVOriginIterator from swh.storage.algos.origin import ( @@ -10,18 +11,20 @@ iter_origin_visits, iter_origins, ) +from swh.storage.postgresql.storage import Storage -def test_origin_iterator(swh_storage_with_objects): +def test_origin_iterator(swh_storage_with_objects: Storage) -> None: """Test CSVOriginIterator""" origins_csv = [] for origin in iter_origins(swh_storage_with_objects): for visit in iter_origin_visits(swh_storage_with_objects, origin.url): - for status in iter_origin_visit_statuses( - swh_storage_with_objects, origin.url, visit.visit - ): - if status.snapshot is not None: - origins_csv.append((status.origin, status.snapshot)) + if visit.visit is not None: + for status in iter_origin_visit_statuses( + swh_storage_with_objects, origin.url, visit.visit + ): + if status.snapshot is not None: + origins_csv.append((status.origin, status.snapshot)) origins = list(CSVOriginIterator(origins_csv)) assert origins assert len(origins) == len( @@ -29,7 +32,7 @@ { status.origin for status in TEST_OBJECTS["origin_visit_status"] - if status.snapshot is not None + if isinstance(status, OriginVisitStatus) and status.snapshot is not None } ) ) diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -3,36 +3,47 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime +from datetime import datetime, timedelta, timezone +from typing import Type +from swh.model.model import OriginVisitStatus from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance.model import OriginEntry from swh.provenance.origin import origin_add +from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase from swh.provenance.postgresql.provenancedb_with_path import ProvenanceWithPathDB from swh.provenance.postgresql.provenancedb_without_path import ProvenanceWithoutPathDB +from swh.provenance.provenance import ProvenanceInterface, ProvenanceStorageInterface from swh.provenance.storage.archive import ArchiveStorage +from swh.storage.postgresql.storage import Storage -def ts2dt(ts: dict) -> datetime.datetime: - timestamp = datetime.datetime.fromtimestamp( - ts["timestamp"]["seconds"], - datetime.timezone(datetime.timedelta(minutes=ts["offset"])), +# TODO: remove this function in favour of TimestampWithTimezone.to_datetime +# from swh.model.model +def ts2dt(ts: dict) -> datetime: + timestamp = datetime.fromtimestamp( + ts["timestamp"]["seconds"], timezone(timedelta(minutes=ts["offset"])) ) return timestamp.replace(microsecond=ts["timestamp"]["microseconds"]) -def test_provenance_origin_add(provenance, swh_storage_with_objects): +def test_provenance_origin_add( + provenance: ProvenanceInterface, swh_storage_with_objects: Storage +) -> None: """Test the origin_add function""" archive = ArchiveStorage(swh_storage_with_objects) for status in TEST_OBJECTS["origin_visit_status"]: + assert isinstance(status, OriginVisitStatus) if status.snapshot is not None: entry = OriginEntry(url=status.origin, snapshot=status.snapshot) origin_add(provenance, archive, [entry]) # TODO: check some facts here -def test_provenance_flavor(provenance): +def test_provenance_flavor(provenance: ProvenanceInterface) -> None: + assert isinstance(provenance.storage, ProvenanceDBBase) assert provenance.storage.flavor in ("with-path", "without-path") + backend_class: Type[ProvenanceStorageInterface] if provenance.storage.flavor == "with-path": backend_class = ProvenanceWithPathDB else: diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -3,12 +3,18 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Dict, List, Tuple +from datetime import datetime +from typing import Any, Dict, List, Optional, Set, Tuple +import psycopg2 import pytest from swh.model.hashutil import hash_to_bytes +from swh.model.model import Sha1Git +from swh.provenance.archive import ArchiveInterface from swh.provenance.model import RevisionEntry +from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase +from swh.provenance.provenance import ProvenanceInterface from swh.provenance.revision import revision_add from swh.provenance.tests.conftest import ( fill_storage, @@ -17,9 +23,10 @@ synthetic_result, ) from swh.provenance.tests.test_provenance_db import ts2dt +from swh.storage.postgresql.storage import Storage -def sha1s(cur, table): +def sha1s(cur: psycopg2.extensions.cursor, table: str) -> Set[Sha1Git]: """return the 'sha1' column from the DB 'table' (as hex) 'cur' is a cursor to the provenance index DB. @@ -28,7 +35,7 @@ return set(row["sha1"].hex() for row in cur.fetchall()) -def locations(cur): +def locations(cur: psycopg2.extensions.cursor) -> Set[bytes]: """return the 'path' column from the DB location table 'cur' is a cursor to the provenance index DB. @@ -37,7 +44,9 @@ return set(row["path"] for row in cur.fetchall()) -def relations(cur, src, dst): +def relations( + cur: psycopg2.extensions.cursor, src: str, dst: str +) -> Set[Tuple[Sha1Git, Sha1Git, bytes]]: """return the triplets ('sha1', 'sha1', 'path') from the DB for the relation between 'src' table and 'dst' table @@ -77,13 +86,13 @@ return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall()) -def get_timestamp(cur, table, sha1): +def get_timestamp( + cur: psycopg2.extensions.cursor, table: str, sha1: Sha1Git +) -> List[datetime]: """return the date for the 'sha1' from the DB 'table' (as hex) 'cur' is a cursor to the provenance index DB. """ - if isinstance(sha1, str): - sha1 = hash_to_bytes(sha1) cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,)) return [row["date"].timestamp() for row in cur.fetchall()] @@ -98,7 +107,14 @@ ("out-of-order", True, 1), ), ) -def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): +def test_provenance_heuristics( + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + lower: bool, + mindepth: int, +) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) @@ -108,7 +124,7 @@ revisions = {rev["id"]: rev for rev in data["revision"]} - rows = { + rows: Dict[str, Set[Any]] = { "content": set(), "content_in_directory": set(), "content_in_revision": set(), @@ -117,9 +133,11 @@ "location": set(), "revision": set(), } + assert isinstance(provenance.storage, ProvenanceDBBase) cursor = provenance.storage.cursor def maybe_path(path: str) -> str: + assert isinstance(provenance.storage, ProvenanceDBBase) if provenance.storage.with_path: return path return "" @@ -138,7 +156,7 @@ assert rows["revision"] == sha1s(cursor, "revision"), synth_rev["msg"] # check the timestamp of the revision rev_ts = synth_rev["date"] - assert get_timestamp(cursor, "revision", synth_rev["sha1"].hex()) == [ + assert get_timestamp(cursor, "revision", synth_rev["sha1"]) == [ rev_ts ], synth_rev["msg"] @@ -218,8 +236,13 @@ ), ) def test_provenance_heuristics_content_find_all( - provenance, swh_storage, archive, repo, lower, mindepth -): + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + lower: bool, + mindepth: int, +) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) @@ -233,6 +256,7 @@ ] def maybe_path(path: str) -> str: + assert isinstance(provenance.storage, ProvenanceDBBase) if provenance.storage.with_path: return path return "" @@ -246,7 +270,7 @@ syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) - expected_occurrences = {} + expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {} for synth_rev in synthetic_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] @@ -261,6 +285,7 @@ (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) ) + assert isinstance(provenance.storage, ProvenanceDBBase) for content_id, results in expected_occurrences.items(): expected = [(content_id, *result) for result in results] db_occurrences = [ @@ -292,8 +317,13 @@ ), ) def test_provenance_heuristics_content_find_first( - provenance, swh_storage, archive, repo, lower, mindepth -): + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + lower: bool, + mindepth: int, +) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) @@ -315,7 +345,7 @@ syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) - expected_first: Dict[str, Tuple[str, str, List[str]]] = {} + expected_first: Dict[str, Tuple[str, float, List[str]]] = {} # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path # is a list because a content can be added at several places in a single # revision, in which case the result of content_find_first() is one of @@ -333,15 +363,17 @@ if rev_ts == expected_first[sha1][1]: expected_first[sha1][2].append(rc["path"]) elif rev_ts < expected_first[sha1][1]: - expected_first[sha1] = (rev_id, rev_ts, rc["path"]) + expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) for dc in synth_rev["D_C"]: sha1 = rc["dst"].hex() assert sha1 in expected_first # nothing to do there, this content cannot be a "first seen file" + assert isinstance(provenance.storage, ProvenanceDBBase) for content_id, (rev_id, ts, paths) in expected_first.items(): occur = provenance.content_find_first(hash_to_bytes(content_id)) + assert occur is not None assert occur.content.hex() == content_id assert occur.revision.hex() == rev_id assert occur.date.timestamp() == ts diff --git a/swh/provenance/tests/test_revision_iterator.py b/swh/provenance/tests/test_revision_iterator.py --- a/swh/provenance/tests/test_revision_iterator.py +++ b/swh/provenance/tests/test_revision_iterator.py @@ -8,6 +8,7 @@ from swh.provenance.revision import CSVRevisionIterator from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.provenance.tests.test_provenance_db import ts2dt +from swh.storage.postgresql.storage import Storage @pytest.mark.parametrize( @@ -17,7 +18,7 @@ "out-of-order", ), ) -def test_archive_direct_revision_iterator(swh_storage, repo): +def test_archive_direct_revision_iterator(swh_storage: Storage, repo: str) -> None: """Test CSVRevisionIterator""" data = load_repo_data(repo) fill_storage(swh_storage, data)