diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py index aa54503..495b528 100644 --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -1,283 +1,157 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os import path -import re -from typing import Any, Dict, Iterable, Iterator, List, Optional +from typing import Any, Dict, Iterable, Iterator import msgpack import psycopg2 import pytest -from typing_extensions import TypedDict from swh.journal.serializers import msgpack_ext_hook -from swh.model.hashutil import hash_to_bytes -from swh.model.model import Sha1Git from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance import get_provenance, get_provenance_storage from swh.provenance.api.client import RemoteProvenanceStorage import swh.provenance.api.server as server from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage from swh.storage.postgresql.storage import Storage from swh.storage.replay import process_replay_objects @pytest.fixture( params=[ "with-path", "without-path", "with-path-denormalized", "without-path-denormalized", ] ) def populated_db( request, # TODO: add proper type annotation postgresql: psycopg2.extensions.connection, ) -> Dict[str, str]: """return a working and initialized provenance db""" from swh.core.cli.db import populate_database_for_package # flavor = "with-path" if request.param == "client-server" else request.param populate_database_for_package( "swh.provenance", postgresql.dsn, flavor=request.param ) return { k: v for (k, v) in (item.split("=") for item in postgresql.dsn.split()) if k != "options" } # the Flask app used as server in these tests @pytest.fixture def app(populated_db: Dict[str, str]): assert hasattr(server, "storage") server.storage = get_provenance_storage(cls="local", db=populated_db) yield server.app # the RPCClient class used as client in these tests @pytest.fixture def swh_rpc_client_class(): return RemoteProvenanceStorage @pytest.fixture(params=["local", "remote"]) def provenance( request, # TODO: add proper type annotation populated_db: Dict[str, str], swh_rpc_client: RemoteProvenanceStorage, ) -> ProvenanceInterface: """return a working and initialized ProvenanceInterface object""" if request.param == "remote": from swh.provenance.provenance import Provenance assert isinstance(swh_rpc_client, ProvenanceStorageInterface) return Provenance(swh_rpc_client) else: # in test sessions, we DO want to raise any exception occurring at commit time prov = get_provenance(cls=request.param, db=populated_db, raise_on_commit=True) return prov
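For reference, the "local" branch of the `provenance` fixture boils down to a single `get_provenance` call. A minimal sketch, assuming only that a provenance database matching the DSN dict exists (the dict below is a placeholder standing in for what `populated_db` returns):

```python
# Illustrative sketch of what the "local" case of the fixture resolves to.
from swh.provenance import get_provenance

prov = get_provenance(
    cls="local",
    db={"dbname": "provenance", "host": "localhost"},  # placeholder DSN dict
    raise_on_commit=True,  # make commit-time errors fail the test loudly
)
```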
""" for obj_type in ( "content", "skipped_content", "directory", "revision", "release", "snapshot", "origin", "origin_visit", "origin_visit_status", ): getattr(swh_storage, f"{obj_type}_add")(TEST_OBJECTS[obj_type]) return swh_storage @pytest.fixture def archive_direct(swh_storage_with_objects: Storage) -> ArchiveInterface: return ArchivePostgreSQL(swh_storage_with_objects.get_db().conn) @pytest.fixture def archive_api(swh_storage_with_objects: Storage) -> ArchiveInterface: return ArchiveStorage(swh_storage_with_objects) @pytest.fixture(params=["archive", "db"]) def archive(request, swh_storage_with_objects: Storage) -> Iterator[ArchiveInterface]: """Return a ArchivePostgreSQL based StorageInterface object""" # this is a workaround to prevent tests from hanging because of an unclosed # transaction. # TODO: refactor the ArchivePostgreSQL to properly deal with # transactions and get rid of this fixture if request.param == "db": archive = ArchivePostgreSQL(conn=swh_storage_with_objects.get_db().conn) yield archive archive.conn.rollback() else: yield ArchiveStorage(swh_storage_with_objects) def get_datafile(fname: str) -> str: return path.join(path.dirname(__file__), "data", fname) def load_repo_data(repo: str) -> Dict[str, Any]: data: Dict[str, Any] = {} with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj: unpacker = msgpack.Unpacker( fobj, raw=False, ext_hook=msgpack_ext_hook, strict_map_key=False, timestamp=3, # convert Timestamp in datetime objects (tz UTC) ) for objtype, objd in unpacker: data.setdefault(objtype, []).append(objd) return data def filter_dict(d: Dict[Any, Any], keys: Iterable[Any]) -> Dict[Any, Any]: return {k: v for (k, v) in d.items() if k in keys} def fill_storage(storage: Storage, data: Dict[str, Any]) -> None: process_replay_objects(data, storage=storage) - - -class SynthRelation(TypedDict): - prefix: Optional[str] - path: str - src: Sha1Git - dst: Sha1Git - rel_ts: float - - -class SynthRevision(TypedDict): - sha1: Sha1Git - date: float - msg: str - R_C: List[SynthRelation] - R_D: List[SynthRelation] - D_C: List[SynthRelation] - - -def synthetic_result(filename: str) -> Iterator[SynthRevision]: - """Generates dict representations of synthetic revisions found in the synthetic - file (from the data/ directory) given as argument of the generator. - - Generated SynthRevision (typed dict) with the following elements: - - "sha1": (Sha1Git) sha1 of the revision, - "date": (float) timestamp of the revision, - "msg": (str) commit message of the revision, - "R_C": (list) new R---C relations added by this revision - "R_D": (list) new R-D relations added by this revision - "D_C": (list) new D-C relations added by this revision - - Each relation above is a SynthRelation typed dict with: - - "path": (str) location - "src": (Sha1Git) sha1 of the source of the relation - "dst": (Sha1Git) sha1 of the destination of the relation - "rel_ts": (float) timestamp of the target of the relation - (related to the timestamp of the revision) - - """ - - with open(get_datafile(filename), "r") as fobj: - yield from _parse_synthetic_file(fobj) - - -def _parse_synthetic_file(fobj: Iterable[str]) -> Iterator[SynthRevision]: - """Read a 'synthetic' file and generate a dict representation of the synthetic - revision for each revision listed in the synthetic file. 
- """ - regs = [ - "(?PR[0-9]{2,4})?", - "(?P[^| ]*)", - "([+] )?(?P[^| +]*?)[/]?", - "(?P[RDC]) (?P[0-9a-z]{40})", - "(?P-?[0-9]+(.[0-9]+)?)", - ] - regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") - current_rev: List[dict] = [] - for m in (regex.match(line) for line in fobj): - if m: - d = m.groupdict() - if d["revname"]: - if current_rev: - yield _mk_synth_rev(current_rev) - current_rev.clear() - current_rev.append(d) - if current_rev: - yield _mk_synth_rev(current_rev) - - -def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision: - assert synth_rev[0]["type"] == "R" - rev = SynthRevision( - sha1=hash_to_bytes(synth_rev[0]["sha1"]), - date=float(synth_rev[0]["ts"]), - msg=synth_rev[0]["revname"], - R_C=[], - R_D=[], - D_C=[], - ) - current_path = None - # path of the last R-D relation we parsed, used a prefix for next D-C - # relations - - for row in synth_rev[1:]: - if row["reltype"] == "R---C": - assert row["type"] == "C" - rev["R_C"].append( - SynthRelation( - prefix=None, - path=row["path"], - src=rev["sha1"], - dst=hash_to_bytes(row["sha1"]), - rel_ts=float(row["ts"]), - ) - ) - current_path = None - elif row["reltype"] == "R-D": - assert row["type"] == "D" - rev["R_D"].append( - SynthRelation( - prefix=None, - path=row["path"], - src=rev["sha1"], - dst=hash_to_bytes(row["sha1"]), - rel_ts=float(row["ts"]), - ) - ) - current_path = row["path"] - elif row["reltype"] == "D-C": - assert row["type"] == "C" - rev["D_C"].append( - SynthRelation( - prefix=current_path, - path=row["path"], - src=rev["R_D"][-1]["dst"], - dst=hash_to_bytes(row["sha1"]), - rel_ts=float(row["ts"]), - ) - ) - return rev diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md index 2eb0da0..81dff87 100644 --- a/swh/provenance/tests/data/README.md +++ b/swh/provenance/tests/data/README.md @@ -1,166 +1,166 @@ # Provenance Index Test Dataset This directory contains datasets used by `test_provenance_heurstics` tests of the provenance index database. Each dataset `xxx` consist in several parts: - a description of a git repository as a yaml file named `xxx_repo.yaml`, - a msgpack file containing storage objects for the given repository, from which the storage is filled before each test using these data, and - a set of synthetic files, named `synthetic_xxx_(lower|upper)_.txt`, describing the expected result in the provenance database if ingested with the flag `lower` set or not set, and the `mindepth` value (integer, most often `1` or `2`). ## Git repos description file The description of a git repository is a yaml file which contains a list dicts, each one representing a git revision to add (linearly) in the git repo used a base for the dataset. Each dict consist in a structure like: ``` yaml - msg: R00 date: 1000000000 content: A/B/C/a: "content a" ``` this example will generate a git commit with the commit message "R00", the author and committer date 1000000000 (given as a unix timestamp), and a one file which path is `A/B/C/a` and content is "content a". The file is parsed to create git revisions in a temporary git repository, in order of appearance in the yaml file (so one may create an git repository with 'out-of-order' commits). There is no way of creating branches and merges for now. The tool to generate this git repo is `generate_repo.py`: ``` python generate_repo.py --help Usage: generate_repo.py [OPTIONS] INPUT_FILE OUTPUT_DIR Options: -C, --clean-output / --no-clean-output --help Show this message and exit. 
The `generate_repo.py` tool generates a git repository in `OUTPUT_DIR` and prints a template `synthetic` file on its standard output, which can be used to ease writing the expected `synthetic` files. Typical usage will be: ``` python generate_repo.py repo2_repo.yaml repo2 > synthetic_repo2_template.txt ``` Note that the hashes of the git objects (revisions, directories and contents) depend only on the content of the input yaml file. Calling the tool twice on the same input file should generate the exact same git repo twice. Also note that the tool will add a branch at each revision (using the commit message as branch name), to make it easier to reference any point in the git history. ## Msgpack dump of the storage This file contains a set of storage objects (`Revision`, `Content` and `Directory`) and is usually generated from a local git repository (typically the one generated by the previous command) using the `generate_storage_from_git.py` tool: ``` python generate_storage_from_git.py --help Usage: generate_storage_from_git.py [OPTIONS] GIT_REPO simple tool to generate the CMDBTS.msgpack dataset filed used in tests Options: -r, --head TEXT head revision to start from -o, --output TEXT output file --help Show this message and exit. ``` Typical usage would be, using the git repository `repo2` created previously: ``` python generate_storage_from_git.py repo2 Revision hash for master is 8363e8e98751dc9f264d2fedd6b829ad4b1218b0 Wrote 86 objects in repo2.msgpack ``` ### Adding extra visits/snapshots It is also possible to generate a storage from a git repo with extra origin -visits, using the `--visit` option of the `generate_repo_from_git` tool. +visits, using the `--visit` option of the `generate_storage_from_git` tool. This option expects a yaml file as argument, describing the extra visits (and snapshots) you want to add to the storage. The format is simple, for example: ``` # a visit pattern scenario for the 'repo_with_merges' repo - origin: http://repo_with_merges/1/ date: 1000000015 branches: - R01 ``` will create an OriginVisit (at the given date) for the given origin URL (the Origin will be created as well), with a `Snapshot` including the listed branches. ## Synthetic files These files describe the expected content of the provenance database for each revision (in order of ingestion). The `generate_repo.py` tool will produce a template synthetic file like: ``` 1000000000.0 b582a17b3fc37f72fc57877616f85c3f0abed064 R00 R00 | | | R b582a17b3fc37f72fc57877616f85c3f0abed064 | 1000000000.0 | | . | D a4cb5e6b2831f7e8eef0e6e08e43d642c97303a1 | 0.0 | | A | D 1c8d9fd9afa7e5a2cf52a3db6f05dc5c3a1ca86b | 0.0 | | A/B | D 36876d475197b5ad86ad592e8e28818171455f16 | 0.0 | | A/B/C | D 98f7a4a23d8df1fb1a5055facae2aff9b2d0a8b3 | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 1000000010.0 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe R01 R01 | | | R 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe | 1000000010.0 | | . | D b3cf11b22c9f93c3c494cf90ab072f394155072d | 0.0 | | A | D baca735bf8b8720131b4bfdb47c51631a9260348 | 0.0 | | A/B | D 4b28979d88ed209a09c272bcc80f69d9b18339c2 | 0.0 | | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 | | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0.0 [...] ``` where all the contents and directories of each revision are listed; it's then the responsibility of the user to create the expected synthetic file for a given heuristics configuration.
For example, the 2 revisions above are to be adapted, for the `(lower=True, mindepth=1)` case, as: ``` 1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 ``` diff --git a/swh/provenance/tests/data/origin-revision_with-merges_visits-01.txt b/swh/provenance/tests/data/origin-revision_with-merges_visits-01.txt new file mode 100644 index 0000000..87a99c3 --- /dev/null +++ b/swh/provenance/tests/data/origin-revision_with-merges_visits-01.txt @@ -0,0 +1,67 @@ +1000000015 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S 8d6b9ac022cae46a59b4f5b9285f0eea9736dae4 + | O-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000025 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S 537a4f35fc36eb08a065952210958c8095fea3c7 + | O-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + | O-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000035 ed02cb79ed65a650cee6907323f986cff4a8428f http://repo_with_merges/2/ +http://repo_with_merges/2/ | | | O ed02cb79ed65a650cee6907323f986cff4a8428f + | | | S 5aa910026ddbbea2971fe1b89725bc5f076637e3 + | O-R | R05 | R 65e58853df939b318c106c4c1f55acaf8b41c74c + | R-R | R04 | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + | O-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000045 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S 8193e0324e7a21181331c13588b3e348022b3268 + | O-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + | O-R | R07 | R fff0089fad98e8f5b46ec5c9025a20a602851ba6 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000055 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S ee9ea8b0ba40c9012d7d8103234946a21b66a729 + | O-R | R08 | R 7c8f29237dded4f9d265e46ec7066503e7858e87 + | R-R | R07 | R fff0089fad98e8f5b46ec5c9025a20a602851ba6 
+ | R-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R05 | R 65e58853df939b318c106c4c1f55acaf8b41c74c + | R-R | R04 | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000065 ed02cb79ed65a650cee6907323f986cff4a8428f http://repo_with_merges/2/ +http://repo_with_merges/2/ | | | O ed02cb79ed65a650cee6907323f986cff4a8428f + | | | S ee9ea8b0ba40c9012d7d8103234946a21b66a729 + | O-R | R08 | R 7c8f29237dded4f9d265e46ec7066503e7858e87 + | R-R | R07 | R fff0089fad98e8f5b46ec5c9025a20a602851ba6 + | R-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R05 | R 65e58853df939b318c106c4c1f55acaf8b41c74c + | R-R | R04 | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 diff --git a/swh/provenance/tests/data/with-merges.msgpack b/swh/provenance/tests/data/with-merges.msgpack index 3accba0..52ff778 100644 Binary files a/swh/provenance/tests/data/with-merges.msgpack and b/swh/provenance/tests/data/with-merges.msgpack differ diff --git a/swh/provenance/tests/test_origin_revision_layer.py b/swh/provenance/tests/test_origin_revision_layer.py new file mode 100644 index 0000000..9bb66d8 --- /dev/null +++ b/swh/provenance/tests/test_origin_revision_layer.py @@ -0,0 +1,192 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re +from typing import Any, Dict, Iterable, Iterator, List, Set + +import pytest +from typing_extensions import TypedDict + +from swh.model.hashutil import hash_to_bytes +from swh.model.model import Sha1Git +from swh.provenance.archive import ArchiveInterface +from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType +from swh.provenance.model import OriginEntry +from swh.provenance.origin import origin_add +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data +from swh.storage.postgresql.storage import Storage + + +class SynthRelation(TypedDict): + src: Sha1Git + dst: Sha1Git + name: str + + +class SynthOrigin(TypedDict): + sha1: Sha1Git + url: str + snap: Sha1Git + O_R: List[SynthRelation] + R_R: List[SynthRelation] + + +def synthetic_origin_revision_result(filename: str) -> Iterator[SynthOrigin]: + """Generates dict representations of synthetic origin visits found in the + synthetic file (from the data/ directory) given as argument of the generator. 
+ + Generated SynthOrigin (typed dict) with the following elements: + + "sha1": (Sha1Git) sha1 of the origin, + "url": (str) url of the origin, + "snap": (Sha1Git) sha1 of the visit's snapshot, + "O_R": (list) new O-R relations added by this origin visit + "R_R": (list) new R-R relations added by this origin visit + + Each relation above is a SynthRelation typed dict with: + + "src": (Sha1Git) sha1 of the source of the relation + "dst": (Sha1Git) sha1 of the destination of the relation + + """ + + with open(get_datafile(filename), "r") as fobj: + yield from _parse_synthetic_origin_revision_file(fobj) + + +def _parse_synthetic_origin_revision_file(fobj: Iterable[str]) -> Iterator[SynthOrigin]: + """Read a 'synthetic' file and generate a dict representation of the synthetic + origin visit for each snapshot listed in the synthetic file. + """ + regs = [ + "(?P<url>[^ ]+)?", + "(?P<reltype>[^| ]*)", + "(?P<revname>R[0-9]{2,4})?", + "(?P<type>[ORS]) (?P<sha1>[0-9a-f]{40})", + ] + regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") + current_org: List[dict] = [] + for m in (regex.match(line) for line in fobj): + if m: + d = m.groupdict() + if d["url"]: + if current_org: + yield _mk_synth_org(current_org) + current_org.clear() + current_org.append(d) + if current_org: + yield _mk_synth_org(current_org) + + +def _mk_synth_org(synth_org: List[Dict[str, str]]) -> SynthOrigin: + assert synth_org[0]["type"] == "O" + assert synth_org[1]["type"] == "S" + org = SynthOrigin( + sha1=hash_to_bytes(synth_org[0]["sha1"]), + url=synth_org[0]["url"], + snap=hash_to_bytes(synth_org[1]["sha1"]), + O_R=[], + R_R=[], + ) + + for row in synth_org[2:]: + if row["reltype"] == "O-R": + assert row["type"] == "R" + org["O_R"].append( + SynthRelation( + src=org["sha1"], + dst=hash_to_bytes(row["sha1"]), + name=row["revname"], + ) + ) + elif row["reltype"] == "R-R": + assert row["type"] == "R" + org["R_R"].append( + SynthRelation( + src=org["O_R"][-1]["dst"], + dst=hash_to_bytes(row["sha1"]), + name=row["revname"], + ) + ) + return org + + +@pytest.mark.parametrize( + "repo, visit", + (("with-merges", "visits-01"),), +) +def test_origin_revision_layer( + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + visit: str, +) -> None: + # read data/README.md for more details on how these datasets are generated + data = load_repo_data(repo) + fill_storage(swh_storage, data) + syntheticfile = get_datafile(f"origin-revision_{repo}_{visit}.txt") + + origins = [ + {"url": status["origin"], "snap": status["snapshot"]} + for status in data["origin_visit_status"] + if status["snapshot"] is not None + ] + + rows: Dict[str, Set[Any]] = { + "origin": set(), + "revision_in_origin": set(), + "revision_before_revision": set(), + "revision": set(), + } + + for synth_org in synthetic_origin_revision_result(syntheticfile): + for origin in ( + org + for org in origins + if org["url"] == synth_org["url"] and org["snap"] == synth_org["snap"] + ): + entry = OriginEntry(url=origin["url"], snapshot=origin["snap"]) + origin_add(provenance, archive, [entry]) + + # each "entry" in the synth file is one new origin visit + rows["origin"].add(synth_org["sha1"]) + assert rows["origin"] == provenance.storage.entity_get_all( + EntityType.ORIGIN + ), synth_org["url"] + # check the url of the origin + assert ( + provenance.storage.origin_get([synth_org["sha1"]])[synth_org["sha1"]] + == synth_org["url"] + ), synth_org["snap"] + + # this origin visit might have added new revision objects + rows["revision"] |= set(x["dst"] for x in
synth_org["O_R"]) + rows["revision"] |= set(x["dst"] for x in synth_org["R_R"]) + assert rows["revision"] == provenance.storage.entity_get_all( + EntityType.REVISION + ), synth_org["snap"] + + # check for O-R (head) entries + # these are added in the revision_in_origin relation + rows["revision_in_origin"] |= set( + (x["dst"], x["src"], None) for x in synth_org["O_R"] + ) + assert rows["revision_in_origin"] == { + (rel.src, rel.dst, rel.path) + for rel in provenance.storage.relation_get_all(RelationType.REV_IN_ORG) + }, synth_org["snap"] + + # check for R-R entries + # these are added in the revision_before_revision relation + rows["revision_before_revision"] |= set( + (x["dst"], x["src"], None) for x in synth_org["R_R"] + ) + assert rows["revision_before_revision"] == { + (rel.src, rel.dst, rel.path) + for rel in provenance.storage.relation_get_all( + RelationType.REV_BEFORE_REV + ) + }, synth_org["snap"] diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_revision_content_layer.py similarity index 71% rename from swh/provenance/tests/test_provenance_heuristics.py rename to swh/provenance/tests/test_revision_content_layer.py index 7eb1cd9..77e36d3 100644 --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_revision_content_layer.py @@ -1,325 +1,447 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, List, Optional, Set, Tuple +import re +from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple import pytest +from typing_extensions import TypedDict from swh.model.hashutil import hash_to_bytes +from swh.model.model import Sha1Git from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType from swh.provenance.model import RevisionEntry from swh.provenance.revision import revision_add -from swh.provenance.tests.conftest import ( - fill_storage, - get_datafile, - load_repo_data, - synthetic_result, -) +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data from swh.provenance.tests.test_provenance_db import ts2dt from swh.storage.postgresql.storage import Storage +class SynthRelation(TypedDict): + prefix: Optional[str] + path: str + src: Sha1Git + dst: Sha1Git + rel_ts: float + + +class SynthRevision(TypedDict): + sha1: Sha1Git + date: float + msg: str + R_C: List[SynthRelation] + R_D: List[SynthRelation] + D_C: List[SynthRelation] + + +def synthetic_revision_content_result(filename: str) -> Iterator[SynthRevision]: + """Generates dict representations of synthetic revisions found in the synthetic + file (from the data/ directory) given as argument of the generator. 
+ + Generated SynthRevision (typed dict) with the following elements: + + "sha1": (Sha1Git) sha1 of the revision, + "date": (float) timestamp of the revision, + "msg": (str) commit message of the revision, + "R_C": (list) new R---C relations added by this revision + "R_D": (list) new R-D relations added by this revision + "D_C": (list) new D-C relations added by this revision + + Each relation above is a SynthRelation typed dict with: + + "path": (str) location + "src": (Sha1Git) sha1 of the source of the relation + "dst": (Sha1Git) sha1 of the destination of the relation + "rel_ts": (float) timestamp of the target of the relation + (related to the timestamp of the revision) + + """ + + with open(get_datafile(filename), "r") as fobj: + yield from _parse_synthetic_revision_content_file(fobj) + + +def _parse_synthetic_revision_content_file( + fobj: Iterable[str], +) -> Iterator[SynthRevision]: + """Read a 'synthetic' file and generate a dict representation of the synthetic + revision for each revision listed in the synthetic file. + """ + regs = [ + "(?P<revname>R[0-9]{2,4})?", + "(?P<reltype>[^| ]*)", + "([+] )?(?P<path>[^| +]*?)[/]?", + "(?P<type>[RDC]) (?P<sha1>[0-9a-f]{40})", + "(?P<ts>-?[0-9]+(.[0-9]+)?)", + ] + regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") + current_rev: List[dict] = [] + for m in (regex.match(line) for line in fobj): + if m: + d = m.groupdict() + if d["revname"]: + if current_rev: + yield _mk_synth_rev(current_rev) + current_rev.clear() + current_rev.append(d) + if current_rev: + yield _mk_synth_rev(current_rev) + + +def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision: + assert synth_rev[0]["type"] == "R" + rev = SynthRevision( + sha1=hash_to_bytes(synth_rev[0]["sha1"]), + date=float(synth_rev[0]["ts"]), + msg=synth_rev[0]["revname"], + R_C=[], + R_D=[], + D_C=[], + ) + current_path = None + # path of the last R-D relation we parsed, used as a prefix for next D-C + # relations + + for row in synth_rev[1:]: + if row["reltype"] == "R---C": + assert row["type"] == "C" + rev["R_C"].append( + SynthRelation( + prefix=None, + path=row["path"], + src=rev["sha1"], + dst=hash_to_bytes(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + current_path = None + elif row["reltype"] == "R-D": + assert row["type"] == "D" + rev["R_D"].append( + SynthRelation( + prefix=None, + path=row["path"], + src=rev["sha1"], + dst=hash_to_bytes(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + current_path = row["path"] + elif row["reltype"] == "D-C": + assert row["type"] == "C" + rev["D_C"].append( + SynthRelation( + prefix=current_path, + path=row["path"], + src=rev["R_D"][-1]["dst"], + dst=hash_to_bytes(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + return rev + + @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) -def test_provenance_heuristics( +def test_revision_content_result( provenance: ProvenanceInterface, swh_storage: Storage, archive: ArchiveInterface, repo: str, lower: bool, mindepth: int, ) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) revisions = {rev["id"]: rev for rev in data["revision"]} rows: Dict[str, Set[Any]] = { "content": set(), "content_in_directory": set(), "content_in_revision": set(), "directory": set(), "directory_in_revision": set(), "location": set(),
"revision": set(), } def maybe_path(path: str) -> Optional[bytes]: if provenance.storage.with_path(): return path.encode("utf-8") return None - for synth_rev in synthetic_result(syntheticfile): + for synth_rev in synthetic_revision_content_result(syntheticfile): revision = revisions[synth_rev["sha1"]] entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) # each "entry" in the synth file is one new revision rows["revision"].add(synth_rev["sha1"]) assert rows["revision"] == provenance.storage.entity_get_all( EntityType.REVISION ), synth_rev["msg"] # check the timestamp of the revision rev_ts = synth_rev["date"] rev_data = provenance.storage.revision_get([synth_rev["sha1"]])[ synth_rev["sha1"] ] assert ( rev_data.date is not None and rev_ts == rev_data.date.timestamp() ), synth_rev["msg"] # this revision might have added new content objects rows["content"] |= set(x["dst"] for x in synth_rev["R_C"]) rows["content"] |= set(x["dst"] for x in synth_rev["D_C"]) assert rows["content"] == provenance.storage.entity_get_all( EntityType.CONTENT ), synth_rev["msg"] # check for R-C (direct) entries # these are added directly in the content_early_in_rev table rows["content_in_revision"] |= set( (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_C"] ) assert rows["content_in_revision"] == { (rel.src, rel.dst, rel.path) for rel in provenance.storage.relation_get_all( RelationType.CNT_EARLY_IN_REV ) }, synth_rev["msg"] # check timestamps for rc in synth_rev["R_C"]: assert ( rev_ts + rc["rel_ts"] == provenance.storage.content_get([rc["dst"]])[rc["dst"]].timestamp() ), synth_rev["msg"] # check directories # each directory stored in the provenance index is an entry # in the "directory" table... rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"]) assert rows["directory"] == provenance.storage.entity_get_all( EntityType.DIRECTORY ), synth_rev["msg"] # ... + a number of rows in the "directory_in_rev" table... # check for R-D entries rows["directory_in_revision"] |= set( (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_D"] ) assert rows["directory_in_revision"] == { (rel.src, rel.dst, rel.path) for rel in provenance.storage.relation_get_all(RelationType.DIR_IN_REV) }, synth_rev["msg"] # check timestamps for rd in synth_rev["R_D"]: assert ( rev_ts + rd["rel_ts"] == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp() ), synth_rev["msg"] # ... + a number of rows in the "content_in_dir" table # for content of the directory. 
# check for D-C entries rows["content_in_directory"] |= set( (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["D_C"] ) assert rows["content_in_directory"] == { (rel.src, rel.dst, rel.path) for rel in provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) }, synth_rev["msg"] # check timestamps for dc in synth_rev["D_C"]: assert ( rev_ts + dc["rel_ts"] == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp() ), synth_rev["msg"] if provenance.storage.with_path(): # check for location entries rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) assert rows["location"] == provenance.storage.location_get(), synth_rev[ "msg" ] @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) @pytest.mark.parametrize("batch", (True, False)) def test_provenance_heuristics_content_find_all( provenance: ProvenanceInterface, swh_storage: Storage, archive: ArchiveInterface, repo: str, lower: bool, mindepth: int, batch: bool, ) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) revisions = [ RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) for revision in data["revision"] ] def maybe_path(path: str) -> str: if provenance.storage.with_path(): return path return "" if batch: revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) else: for revision in revisions: revision_add( provenance, archive, [revision], lower=lower, mindepth=mindepth ) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {} - for synth_rev in synthetic_result(syntheticfile): + for synth_rev in synthetic_revision_content_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] for rc in synth_rev["R_C"]: expected_occurrences.setdefault(rc["dst"].hex(), []).append( (rev_id, rev_ts, None, maybe_path(rc["path"])) ) for dc in synth_rev["D_C"]: assert dc["prefix"] is not None # to please mypy expected_occurrences.setdefault(dc["dst"].hex(), []).append( (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) ) for content_id, results in expected_occurrences.items(): expected = [(content_id, *result) for result in results] db_occurrences = [ ( occur.content.hex(), occur.revision.hex(), occur.date.timestamp(), occur.origin, occur.path.decode(), ) for occur in provenance.content_find_all(hash_to_bytes(content_id)) ] if provenance.storage.with_path(): # this is not true if the db stores no path, because the same content # that appears several times in a given revision may be reported # only once by content_find_all() assert len(db_occurrences) == len(expected) assert set(db_occurrences) == set(expected) @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) @pytest.mark.parametrize("batch", (True, False)) def test_provenance_heuristics_content_find_first( provenance: ProvenanceInterface, swh_storage: Storage, archive: ArchiveInterface, repo: str, lower: bool, mindepth: int, batch: bool, ) -> None: # read data/README.md for more details on how
these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) revisions = [ RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) for revision in data["revision"] ] if batch: revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) else: for revision in revisions: revision_add( provenance, archive, [revision], lower=lower, mindepth=mindepth ) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) expected_first: Dict[str, Tuple[str, float, List[str]]] = {} # dict mapping a blob sha1 to a tuple (rev_id, rev_ts, [path, ...]); the # third element is a list because a content can be added at several places # in a single revision, in which case the result of content_find_first() # is one of those paths, but we have no guarantee which one it will return. - for synth_rev in synthetic_result(syntheticfile): + for synth_rev in synthetic_revision_content_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] for rc in synth_rev["R_C"]: sha1 = rc["dst"].hex() if sha1 not in expected_first: assert rc["rel_ts"] == 0 expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) else: if rev_ts == expected_first[sha1][1]: expected_first[sha1][2].append(rc["path"]) elif rev_ts < expected_first[sha1][1]: expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) for dc in synth_rev["D_C"]: sha1 = dc["dst"].hex() assert sha1 in expected_first # nothing to do there, this content cannot be a "first seen file" for content_id, (rev_id, ts, paths) in expected_first.items(): occur = provenance.content_find_first(hash_to_bytes(content_id)) assert occur is not None assert occur.content.hex() == content_id assert occur.revision.hex() == rev_id assert occur.date.timestamp() == ts assert occur.origin is None if provenance.storage.with_path(): assert occur.path.decode() in paths
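The dataset-regeneration workflow described in data/README.md can also be scripted end to end. A hedged sketch, assuming it is run from the data/ directory where the two generator tools and the `repo2_repo.yaml` example live:

```python
# Sketch: regenerate the `repo2` dataset following the README workflow.
import subprocess

# 1. build the git repository and capture the synthetic-file template
with open("synthetic_repo2_template.txt", "w") as template:
    subprocess.run(
        ["python", "generate_repo.py", "repo2_repo.yaml", "repo2"],
        stdout=template,
        check=True,
    )

# 2. dump the resulting git objects as a msgpack storage snapshot
subprocess.run(
    ["python", "generate_storage_from_git.py", "-o", "repo2.msgpack", "repo2"],
    check=True,
)
```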