diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py index 08c078b..82128a4 100644 --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -1,163 +1,162 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime, timedelta, timezone +from datetime import datetime from os import path from typing import Any, Dict, Generator, List from _pytest.fixtures import SubRequest import msgpack import psycopg2.extensions import pytest from pytest_postgresql.factories import postgresql from swh.journal.serializers import msgpack_ext_hook -from swh.model.model import BaseModel +from swh.model.model import BaseModel, TimestampWithTimezone from swh.provenance import get_provenance, get_provenance_storage from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface from swh.provenance.storage.archive import ArchiveStorage from swh.storage.interface import StorageInterface from swh.storage.replay import OBJECT_CONVERTERS, OBJECT_FIXERS, process_replay_objects @pytest.fixture( params=[ "with-path", "without-path", "with-path-denormalized", "without-path-denormalized", ] ) def provenance_postgresqldb( request: SubRequest, postgresql: psycopg2.extensions.connection, ) -> Dict[str, str]: """return a working and initialized provenance db""" from swh.core.db.db_utils import ( init_admin_extensions, populate_database_for_package, ) init_admin_extensions("swh.provenance", postgresql.dsn) populate_database_for_package( "swh.provenance", postgresql.dsn, flavor=request.param ) return postgresql.get_dsn_parameters() @pytest.fixture(params=["postgresql", "rabbitmq"]) def provenance_storage( request: SubRequest, provenance_postgresqldb: Dict[str, str], ) -> Generator[ProvenanceStorageInterface, None, None]: """Return a working and initialized ProvenanceStorageInterface object""" if request.param == "rabbitmq": from swh.provenance.api.server import ProvenanceStorageRabbitMQServer rabbitmq = request.getfixturevalue("rabbitmq") host = rabbitmq.args["host"] port = rabbitmq.args["port"] rabbitmq_params: Dict[str, Any] = { "url": f"amqp://guest:guest@{host}:{port}/%2f", "storage_config": { "cls": "postgresql", "db": provenance_postgresqldb, "raise_on_commit": True, }, } server = ProvenanceStorageRabbitMQServer( url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"] ) server.start() with get_provenance_storage(cls=request.param, **rabbitmq_params) as storage: yield storage server.stop() else: # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance_storage( cls=request.param, db=provenance_postgresqldb, raise_on_commit=True ) as storage: yield storage provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests") @pytest.fixture def provenance( provenance_postgresql: psycopg2.extensions.connection, ) -> Generator[ProvenanceInterface, None, None]: """Return a working and initialized ProvenanceInterface object""" from swh.core.db.db_utils import ( init_admin_extensions, populate_database_for_package, ) init_admin_extensions("swh.provenance", provenance_postgresql.dsn) populate_database_for_package( "swh.provenance", provenance_postgresql.dsn, 
flavor="with-path" ) # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance( cls="postgresql", db=provenance_postgresql.get_dsn_parameters(), raise_on_commit=True, ) as provenance: yield provenance @pytest.fixture def archive(swh_storage: StorageInterface) -> ArchiveInterface: """Return an ArchiveStorage-based ArchiveInterface object""" return ArchiveStorage(swh_storage) def fill_storage(storage: StorageInterface, data: Dict[str, List[dict]]) -> None: objects = { objtype: [objs_from_dict(objtype, d) for d in dicts] for objtype, dicts in data.items() } process_replay_objects(objects, storage=storage) def get_datafile(fname: str) -> str: return path.join(path.dirname(__file__), "data", fname) # TODO: this should return Dict[str, List[BaseModel]] directly, but it requires # refactoring several tests def load_repo_data(repo: str) -> Dict[str, List[dict]]: data: Dict[str, List[dict]] = {} with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj: unpacker = msgpack.Unpacker( fobj, raw=False, ext_hook=msgpack_ext_hook, strict_map_key=False, timestamp=3, # convert Timestamp in datetime objects (tz UTC) ) - for objtype, objd in unpacker: + for msg in unpacker: + if len(msg) == 2: # old format + objtype, objd = msg + else: # now we should have a triplet (type, key, value) + objtype, _, objd = msg data.setdefault(objtype, []).append(objd) return data def objs_from_dict(object_type: str, dict_repr: dict) -> BaseModel: if object_type in OBJECT_FIXERS: dict_repr = OBJECT_FIXERS[object_type](dict_repr) obj = OBJECT_CONVERTERS[object_type](dict_repr) return obj -# TODO: remove this function in favour of TimestampWithTimezone.to_datetime -# from swh.model.model def ts2dt(ts: Dict[str, Any]) -> datetime: - timestamp = datetime.fromtimestamp( - ts["timestamp"]["seconds"], timezone(timedelta(minutes=ts["offset"])) - ) - return timestamp.replace(microsecond=ts["timestamp"]["microseconds"]) + return TimestampWithTimezone.from_dict(ts).to_datetime() diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md index 81dff87..5fc5a33 100644 --- a/swh/provenance/tests/data/README.md +++ b/swh/provenance/tests/data/README.md @@ -1,166 +1,184 @@ # Provenance Index Test Dataset This directory contains datasets used by `test_provenance_heurstics` tests of the provenance index database. +## Datasets + +There are currently 3 dataset: + +- cmdbts2: original dataset +- out-of-order: with unsorted revisions +- with-merge: with merge revisions + Each dataset `xxx` consist in several parts: - a description of a git repository as a yaml file named `xxx_repo.yaml`, - a msgpack file containing storage objects for the given repository, from which the storage is filled before each test using these data, and - a set of synthetic files, named `synthetic_xxx_(lower|upper)_.txt`, describing the expected result in the provenance database if ingested with the flag `lower` set or not set, and the `mindepth` value (integer, most often `1` or `2`). 
+### Generating the dataset files +To (re)generate the files for each dataset, run: + +``` +for dataset in cmdbts2 out-of-order with-merges; do + python generate_repo.py -C ${dataset}_repo.yaml $dataset > synthetic_${dataset}_template.txt + # you may want to edit/update synthetic files from this template, see below + python generate_storage_from_git.py $dataset +done +``` ## Git repos description file The description of a git repository is a yaml file containing a list of dicts, each one representing a git revision to add (linearly) to the git repo used as a base for the dataset. Each dict consists of a structure like: ``` yaml - msg: R00 date: 1000000000 content: A/B/C/a: "content a" ``` This example will generate a git commit with the commit message "R00", the author and committer date 1000000000 (given as a unix timestamp), and one file whose path is `A/B/C/a` and whose content is "content a". The file is parsed to create git revisions in a temporary git repository, in order of appearance in the yaml file (so one may create a git repository with 'out-of-order' commits). Merge revisions can be described with the optional `parents` attribute, which lists the parent revisions by commit message; a revision can also be tagged by giving it a `tag` attribute. The tool to generate this git repo is `generate_repo.py`: ``` python generate_repo.py --help Usage: generate_repo.py [OPTIONS] INPUT_FILE OUTPUT_DIR Options: -C, --clean-output / --no-clean-output --help Show this message and exit. ``` It generates a git repository in `OUTPUT_DIR` and prints a template `synthetic` file on its standard output, which can be used as a starting point for writing the expected `synthetic` files. Typical usage would be: ``` python generate_repo.py repo2_repo.yaml repo2 > synthetic_repo2_template.txt ``` Note that the hashes of the git objects (revisions, directories and contents) depend only on the content of the input yaml file: calling the tool twice on the same input file should generate the exact same git repo twice. Also note that the tool will add a branch at each revision (using the commit message as branch name), to make it easier to reference any point in the git history. ## Msgpack dump of the storage This file contains a set of storage objects (`Revision`, `Content` and `Directory`) and is usually generated from a local git repository (typically the one generated by the previous command) using the `generate_storage_from_git.py` tool: ``` python generate_storage_from_git.py --help Usage: generate_storage_from_git.py [OPTIONS] GIT_REPO simple tool to generate the git_repo.msgpack dataset file used in some tests Options: -o, --output TEXT output file -v, --visits FILENAME additional visits to generate. --help Show this message and exit. ``` Typical usage would be, using the git repository `repo2` created previously: ``` python generate_storage_from_git.py repo2 Serialized the storage made from repo2 in repo2.msgpack ``` ### Adding extra visits/snapshots It is also possible to generate a storage from a git repo with extra origin visits, using the `--visits` option of the `generate_storage_from_git` tool. This option expects a yaml file as argument, describing the extra visits (and snapshots) you want to add to the storage. The format is simple, for example: ``` # a visit pattern scenario for the 'repo_with_merges' repo - origin: http://repo_with_merges/1/ date: 1000000015 branches: - R01 ``` will create an OriginVisit (at the given date) for the given origin URL (the Origin will be created as well), with a `Snapshot` including the listed branches.
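Note that the msgpack file is really a dump of the storage journal: `generate_storage_from_git.py` attaches a `stream` journal writer to an in-memory storage, so every object ends up serialized as an `(object_type, key, object)` triplet (older dataset files contained `(object_type, object)` pairs, which is why `load_repo_data` in `conftest.py` accepts both forms). A minimal reading sketch, assuming a `cmdbts2.msgpack` file in the current directory:

```python
from collections import Counter

import msgpack

from swh.journal.serializers import msgpack_ext_hook

counts: Counter = Counter()
with open("cmdbts2.msgpack", "rb") as fobj:
    unpacker = msgpack.Unpacker(
        fobj,
        raw=False,
        ext_hook=msgpack_ext_hook,
        strict_map_key=False,
        timestamp=3,  # decode msgpack Timestamps as (UTC) datetime objects
    )
    for msg in unpacker:
        # the first element is the object type, the last the object itself
        objtype, objd = msg[0], msg[-1]
        counts[objtype] += 1
print(dict(counts))  # e.g. {'revision': 14, 'directory': ..., 'content': ...}
```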
## Synthetic files These files describe the expected content of the provenance database for each revision (in order of ingestion). The `generate_repo.py` tool will produce a template of synthetic file like: ``` 1000000000.0 b582a17b3fc37f72fc57877616f85c3f0abed064 R00 R00 | | | R b582a17b3fc37f72fc57877616f85c3f0abed064 | 1000000000.0 | | . | D a4cb5e6b2831f7e8eef0e6e08e43d642c97303a1 | 0.0 | | A | D 1c8d9fd9afa7e5a2cf52a3db6f05dc5c3a1ca86b | 0.0 | | A/B | D 36876d475197b5ad86ad592e8e28818171455f16 | 0.0 | | A/B/C | D 98f7a4a23d8df1fb1a5055facae2aff9b2d0a8b3 | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 1000000010.0 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe R01 R01 | | | R 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe | 1000000010.0 | | . | D b3cf11b22c9f93c3c494cf90ab072f394155072d | 0.0 | | A | D baca735bf8b8720131b4bfdb47c51631a9260348 | 0.0 | | A/B | D 4b28979d88ed209a09c272bcc80f69d9b18339c2 | 0.0 | | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 | | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0.0 [...] ``` where all the content and directories of each revision are listed; it's then the responsibility of the user to create the expected synthetic file for a given heuristics configuration. For example, the 2 revisions above are to be adapted, for the `(lower=True, mindepth=1)` case, as: ``` 1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 ``` diff --git a/swh/provenance/tests/data/cmdbts2.msgpack b/swh/provenance/tests/data/cmdbts2.msgpack index 5dfd186..f98be7d 100644 Binary files a/swh/provenance/tests/data/cmdbts2.msgpack and b/swh/provenance/tests/data/cmdbts2.msgpack differ diff --git a/swh/provenance/tests/data/cmdbts2_repo.yaml b/swh/provenance/tests/data/cmdbts2_repo.yaml index d58e8d2..1832c75 100644 --- a/swh/provenance/tests/data/cmdbts2_repo.yaml +++ b/swh/provenance/tests/data/cmdbts2_repo.yaml @@ -1,80 +1,82 @@ - msg: R00 date: 1000000000 content: A/B/C/a: "content a" - msg: R01 date: 1000000010 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R02 date: 1000000020 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" - msg: R03 date: 1000000030 content: A/C/a: "content a" A/C/b: "content b" A/a: "content a" - msg: R04 date: 1000000040 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" - msg: R05 date: 1000000050 content: D/d: "content d" - msg: R06 date: 1000000060 content: D/E/D/d: "content d" - msg: R07 date: 1000000070 content: F/E/D/d: "content d" F/d: "content d" - msg: R08 date: 1000000080 content: F/E/D/d: "content d" F/E/D/e: "content d" # /!\ same content as d F/a: "content a" + tag: "0.1" - msg: R09 date: 1000000090 content: F/E/D/f: "content f" F/E/D/g: "content f" # /!\ same content as f F/g: "content f" - msg: R10 date: 1000000100 content: F/E/D/f: "content f" F/E/D/g: "content f" # /!\ F/h: "content h" - msg: R11 date: 1000000110 content: G/E/D/f: "content f" G/E/D/g: "content f" # /!\ G/i: "content i" - msg: R12 date: 1000000120 content: G/H/D/f: "content f" G/H/D/g: "content f" # /!\ G/j: "content j" - msg: R13 date: 
1000000130 content: G/H/D/f: "content f" G/H/D/g: "content f" G/I/D/f: "content f" G/I/D/g: "content f" G/D/f: "content f" G/D/g: "content f" G/k: "content k" + tag: "1.0" diff --git a/swh/provenance/tests/data/cmdbts2_visits-01.yaml b/swh/provenance/tests/data/cmdbts2_visits-01.yaml new file mode 100644 index 0000000..4d4d3ea --- /dev/null +++ b/swh/provenance/tests/data/cmdbts2_visits-01.yaml @@ -0,0 +1,39 @@ +# a visit pattern scenario for the 'cmdbts2' repo + +- origin: https://cmdbts2 + date: 1000000150 + branches: + - R13 + +- origin: http://cmdbts2/1 + date: 1000000015 + branches: + - R01 + +- origin: http://cmdbts2/1 + date: 1000000025 + branches: + - R03 + - R06 + +- origin: http://cmdbts2/2 + date: 1000000035 + branches: + - R05 + - R06 + +- origin: http://cmdbts2/1 + date: 1000000045 + branches: + - R06 + - R07 + +- origin: http://cmdbts2/1 + date: 1000000055 + branches: + - R08 + +- origin: http://cmdbts2/2 + date: 1000000065 + branches: + - R08 diff --git a/swh/provenance/tests/data/generate_repo.py b/swh/provenance/tests/data/generate_repo.py index 0a2e024..5b1ec2f 100644 --- a/swh/provenance/tests/data/generate_repo.py +++ b/swh/provenance/tests/data/generate_repo.py @@ -1,116 +1,122 @@ import os import pathlib import shutil from subprocess import PIPE, check_call, check_output from typing import Any, Dict, List import click import yaml def clean_wd() -> None: _, dirnames, filenames = next(os.walk(".")) for d in dirnames: if not d.startswith(".git"): shutil.rmtree(d) for f in filenames: if not f.startswith(".git"): os.unlink(f) def print_ids() -> None: revid = check_output(["git", "rev-parse", "HEAD"]).decode().strip() ts, msg = ( check_output(["git", "log", "-1", '--format="%at %s"']) .decode() .strip()[1:-1] .split() ) print(f"{ts}.0 {revid} {msg}") print(f"{msg:<5} | {'':>5} | {'':>20} | R {revid} | {ts}.0") for currentpath, dirnames, filenames in os.walk("."): if currentpath == ".": output = check_output(["git", "cat-file", "-p", "HEAD"]).decode() dirhash = output.splitlines()[0].split()[1] else: currentpath = currentpath[2:] output = check_output(["git", "ls-tree", "HEAD", currentpath]).decode() dirhash = output.split()[2] print(f"{'':>5} | {'':>5} | {currentpath:<20} | D {dirhash} | 0.0") for fname in filenames: fname = os.path.join(currentpath, fname) output = check_output(["git", "ls-tree", "HEAD", fname]).decode() fhash = output.split()[2] print(f"{'':>5} | {'':>5} | {fname:<20} | C {fhash} | 0.0") if ".git" in dirnames: dirnames.remove(".git") def generate_repo(repo_desc: List[Dict[str, Any]], output_dir: str) -> None: check_call(["git", "init", output_dir], stdout=PIPE, stderr=PIPE) os.chdir(output_dir) os.environ.update( { "GIT_AUTHOR_NAME": "SWH", "GIT_AUTHOR_EMAIL": "contact@softwareheritage.org", "GIT_COMMITTER_NAME": "SWH", "GIT_COMMITTER_EMAIL": "contact@softwareheritage.org", } ) for rev_d in repo_desc: parents = rev_d.get("parents") if parents: # move at the proper (first) parent position, if any check_call(["git", "checkout", parents[0]], stdout=PIPE) # give a branch name (the msg) to each commit to make it easier to # navigate in history check_call(["git", "checkout", "-b", rev_d["msg"]], stdout=PIPE) if parents and len(parents) > 1: # it's a merge check_call(["git", "merge", "--no-commit", *parents[1:]], stdout=PIPE) clean_wd() for path, content in rev_d["content"].items(): p = pathlib.Path(path) p.parent.mkdir(parents=True, exist_ok=True) p.write_text(content) os.environ.update( { "GIT_AUTHOR_DATE": str(rev_d["date"]), "GIT_COMMITTER_DATE": 
str(rev_d["date"]), } ) check_call(["git", "add", "."], stdout=PIPE) check_call( [ "git", "commit", "--all", "--allow-empty", "-m", rev_d["msg"], ], stdout=PIPE, ) + if rev_d.get("tag"): + check_call( + ["git", "tag", "-a", str(rev_d["tag"]), "-m", "tag message"], + stdout=PIPE, + ) + print_ids() @click.command(name="generate-repo") @click.argument("input-file") @click.argument("output-dir") @click.option("-C", "--clean-output/--no-clean-output", default=False) def main(input_file: str, output_dir: str, clean_output: bool) -> None: repo_desc = yaml.load(open(input_file), Loader=yaml.Loader) if clean_output and os.path.exists(output_dir): shutil.rmtree(output_dir) generate_repo(repo_desc, output_dir) if __name__ == "__main__": main() diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py index e31c54a..14cb178 100644 --- a/swh/provenance/tests/data/generate_storage_from_git.py +++ b/swh/provenance/tests/data/generate_storage_from_git.py @@ -1,119 +1,121 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import os from subprocess import check_output from typing import Dict, Optional import click import yaml from swh.loader.git.from_disk import GitLoaderFromDisk from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.storage import get_storage from swh.storage.interface import StorageInterface def load_git_repo( url: str, directory: str, storage: StorageInterface ) -> Dict[str, str]: visit_date = datetime.now(tz=timezone.utc) loader = GitLoaderFromDisk( url=url, directory=directory, visit_date=visit_date, storage=storage, ) return loader.load() @click.command() @click.option("-o", "--output", default=None, help="output file") @click.option( "-v", "--visits", type=click.File(mode="rb"), default=None, help="additional visits to generate.", ) @click.argument("git-repo", type=click.Path(exists=True, file_okay=False)) def main(output: Optional[str], visits: bytes, git_repo: str) -> None: "simple tool to generate the git_repo.msgpack dataset file used in some tests" if output is None: output = f"{git_repo}.msgpack" with open(output, "wb") as outstream: sto = get_storage( cls="memory", journal_writer={"cls": "stream", "output_stream": outstream} ) if git_repo.endswith("/"): git_repo = git_repo[:-1] reponame = os.path.basename(git_repo) load_git_repo(f"https://{reponame}", git_repo, sto) if visits: # retrieve all branches from the actual git repo all_branches = { ref: sha1 for sha1, ref in ( line.strip().split() for line in check_output(["git", "-C", git_repo, "show-ref"]) .decode() .splitlines() ) } for visit in yaml.full_load(visits): # add the origin (if it already exists, this is a noop) sto.origin_add([Origin(url=visit["origin"])]) # add a new visit for this origin visit_id = list( sto.origin_visit_add( [ OriginVisit( origin=visit["origin"], date=datetime.fromtimestamp( visit["date"], tz=timezone.utc ), type="git", ) ] ) )[0].visit assert visit_id is not None # add a snapshot with branches from the input file branches = { f"refs/heads/{name}".encode(): SnapshotBranch( target=hash_to_bytes(all_branches[f"refs/heads/{name}"]), 
target_type=TargetType.REVISION, ) for name in visit["branches"] } snap = Snapshot(branches=branches) sto.snapshot_add([snap]) # add a "closing" origin visit status update referencing the snapshot status = OriginVisitStatus( origin=visit["origin"], visit=visit_id, - date=datetime.fromtimestamp(visit["date"], tz=timezone.utc), + # +1 required otherwise this second visit status is ignored + # (storage sql query with ON CONFLICT DO NOTHING) + date=datetime.fromtimestamp(visit["date"] + 1, tz=timezone.utc), status="full", snapshot=snap.id, ) sto.origin_visit_status_add([status]) click.echo(f"Serialized the storage made from {reponame} in {output}") if __name__ == "__main__": main() diff --git a/swh/provenance/tests/data/origins.csv b/swh/provenance/tests/data/origins.csv index e7e44bc..e246ec2 100644 --- a/swh/provenance/tests/data/origins.csv +++ b/swh/provenance/tests/data/origins.csv @@ -1 +1 @@ -https://cmdbts2,5f577c4d4e5a1d0bca64f78facfb891933b17d94 +https://cmdbts2,b2bd799f3c32f1a53b0370f0ea7afc78757b4f0d diff --git a/swh/provenance/tests/data/out-of-order.msgpack b/swh/provenance/tests/data/out-of-order.msgpack index 660fe16..624bf27 100644 Binary files a/swh/provenance/tests/data/out-of-order.msgpack and b/swh/provenance/tests/data/out-of-order.msgpack differ diff --git a/swh/provenance/tests/data/out-of-order_repo.yaml b/swh/provenance/tests/data/out-of-order_repo.yaml index 9d13f7a..9926d0c 100644 --- a/swh/provenance/tests/data/out-of-order_repo.yaml +++ b/swh/provenance/tests/data/out-of-order_repo.yaml @@ -1,35 +1,36 @@ - msg: R00 date: 1000000000 content: A/B/C/a: "content a" - msg: R01 date: 1000000010 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R02 date: 1000000020 content: A/C/a: "content a" A/C/b: "content b" - msg: R03 date: 1000000030 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R04 date: 1000000040 content: A/C/a: "content a" A/C/b: "content b" - msg: R05 date: 1000000005 # /!\ we add an earlier version of the 'b' file content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R06 date: 1000000050 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" + tag: "1.0" diff --git a/swh/provenance/tests/data/with-merges.msgpack b/swh/provenance/tests/data/with-merges.msgpack index 52ff778..6501325 100644 Binary files a/swh/provenance/tests/data/with-merges.msgpack and b/swh/provenance/tests/data/with-merges.msgpack differ diff --git a/swh/provenance/tests/data/with-merges_repo.yaml b/swh/provenance/tests/data/with-merges_repo.yaml index 74745c7..f1d841c 100644 --- a/swh/provenance/tests/data/with-merges_repo.yaml +++ b/swh/provenance/tests/data/with-merges_repo.yaml @@ -1,73 +1,76 @@ # generate a git history with a multi-merge revision # *-. 
R08 # |\ \ # | * | R07 # | | | # | | * R06 # | | | # * | | R05 # | | | # * | | R04 # |/ | # * | R03 # | / # * / R02 # |/ # * R01 # | # * R00 - msg: R00 date: 1000000000 content: A/B/C/a: "content a" - msg: R01 date: 1000000010 content: A/B/C/a: "content a" A/B/C/b: "content b" + tag: "0.0" - msg: R02 date: 1000000020 content: A/C/a: "content a" A/C/b: "content b" - msg: R03 date: 1000000030 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R04 date: 1000000040 content: A/C/a: "content a" A/C/b: "content b" - msg: R05 date: 1000000050 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" - msg: R06 parents: - R01 date: 1000000005 # /!\ we add an earlier version of the 'b' file content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R07 parents: - R03 date: 1000000035 # /!\ we add an earlier version of the 'b' file content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" + tag: "0.9" - msg: R08 parents: - R05 - R06 - R07 date: 1000000060 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" + tag: "1.0" diff --git a/swh/provenance/tests/test_archive_interface.py b/swh/provenance/tests/test_archive_interface.py index 624cee8..9d673d8 100644 --- a/swh/provenance/tests/test_archive_interface.py +++ b/swh/provenance/tests/test_archive_interface.py @@ -1,255 +1,254 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter from operator import itemgetter from typing import Any from typing import Counter as TCounter from typing import Dict, Iterable, List, Set, Tuple, Type, Union import pytest from swh.core.db import BaseDb from swh.graph.naive_client import NaiveClient from swh.model.model import ( + SWH_MODEL_OBJECT_TYPES, BaseModel, Content, Directory, DirectoryEntry, + ObjectType, Origin, - OriginVisit, OriginVisitStatus, + Release, Revision, Sha1Git, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID from swh.provenance.archive import ArchiveInterface from swh.provenance.multiplexer.archive import ArchiveMultiplexed from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage from swh.provenance.swhgraph.archive import ArchiveGraph from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from swh.storage.postgresql.storage import Storage class ArchiveNoop: storage: StorageInterface def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]: return [] def revision_get_some_outbound_edges( self, revision_id: Sha1Git ) -> Iterable[Tuple[Sha1Git, Sha1Git]]: return [] def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]: return [] def check_directory_ls( reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]] ) -> None: for directory in data["directory"]: entries_ref = sorted( reference.directory_ls(directory["id"]), key=itemgetter("name") ) entries = sorted(archive.directory_ls(directory["id"]), key=itemgetter("name")) assert entries_ref == entries def check_revision_get_some_outbound_edges( reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]] ) -> None: for revision in data["revision"]: parents_ref: TCounter[Tuple[Sha1Git, Sha1Git]] = Counter( 
reference.revision_get_some_outbound_edges(revision["id"]) ) parents: TCounter[Tuple[Sha1Git, Sha1Git]] = Counter( archive.revision_get_some_outbound_edges(revision["id"]) ) # Check that all the reference outbound edges are included in the other # archives's outbound edges assert set(parents_ref.items()) <= set(parents.items()) def check_snapshot_get_heads( reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]] ) -> None: for snapshot in data["snapshot"]: heads_ref: TCounter[Sha1Git] = Counter( reference.snapshot_get_heads(snapshot["id"]) ) heads: TCounter[Sha1Git] = Counter(archive.snapshot_get_heads(snapshot["id"])) assert heads_ref == heads def get_object_class(object_type: str) -> Type[BaseModel]: - if object_type == "origin": - return Origin - elif object_type == "origin_visit": - return OriginVisit - elif object_type == "origin_visit_status": - return OriginVisitStatus - elif object_type == "content": - return Content - elif object_type == "directory": - return Directory - elif object_type == "revision": - return Revision - elif object_type == "snapshot": - return Snapshot - raise ValueError + return SWH_MODEL_OBJECT_TYPES[object_type] def data_to_model(data: Dict[str, List[dict]]) -> Dict[str, List[BaseModel]]: model: Dict[str, List[BaseModel]] = {} for object_type, objects in data.items(): for object in objects: model.setdefault(object_type, []).append( get_object_class(object_type).from_dict(object) ) return model def add_link( edges: Set[ Tuple[ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ], - src_obj: Union[Origin, Snapshot, Revision, Directory, Content], + src_obj: Union[Content, Directory, Origin, Release, Revision, Snapshot], dst_id: bytes, dst_type: ExtendedObjectType, ) -> None: swhid = ExtendedSWHID(object_type=dst_type, object_id=dst_id) edges.add((src_obj.swhid(), swhid)) def get_graph_data( data: Dict[str, List[dict]] ) -> Tuple[ List[Union[CoreSWHID, ExtendedSWHID, str]], List[ Tuple[ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ], ]: nodes: Set[Union[CoreSWHID, ExtendedSWHID, str]] = set() edges: Set[ Tuple[ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ] = set() model = data_to_model(data) for origin in model["origin"]: assert isinstance(origin, Origin) nodes.add(origin.swhid()) for status in model["origin_visit_status"]: assert isinstance(status, OriginVisitStatus) if status.origin == origin.url and status.snapshot is not None: add_link(edges, origin, status.snapshot, ExtendedObjectType.SNAPSHOT) for snapshot in model["snapshot"]: assert isinstance(snapshot, Snapshot) nodes.add(snapshot.swhid()) for branch in snapshot.branches.values(): assert isinstance(branch, SnapshotBranch) if branch.target_type in [TargetType.RELEASE, TargetType.REVISION]: target_type = ( ExtendedObjectType.RELEASE if branch.target_type == TargetType.RELEASE else ExtendedObjectType.REVISION ) add_link(edges, snapshot, branch.target, target_type) for revision in model["revision"]: assert isinstance(revision, Revision) nodes.add(revision.swhid()) # root directory add_link(edges, revision, revision.directory, ExtendedObjectType.DIRECTORY) # parent for parent in revision.parents: add_link(edges, revision, parent, ExtendedObjectType.REVISION) + dir_entry_types = { + "file": ExtendedObjectType.CONTENT, + "dir": ExtendedObjectType.DIRECTORY, + "rev": ExtendedObjectType.REVISION, + } for directory in model["directory"]: assert isinstance(directory, Directory) 
nodes.add(directory.swhid()) for entry in directory.entries: assert isinstance(entry, DirectoryEntry) - if entry.type == "file": - target_type = ExtendedObjectType.CONTENT - elif entry.type == "dir": - target_type = ExtendedObjectType.DIRECTORY - elif entry.type == "rev": - target_type = ExtendedObjectType.REVISION - else: - assert False, "unknown directory entry type" - add_link(edges, directory, entry.target, target_type) + add_link(edges, directory, entry.target, dir_entry_types[entry.type]) for content in model["content"]: assert isinstance(content, Content) nodes.add(content.swhid()) + object_type = { + ObjectType.CONTENT: ExtendedObjectType.CONTENT, + ObjectType.DIRECTORY: ExtendedObjectType.DIRECTORY, + ObjectType.REVISION: ExtendedObjectType.REVISION, + ObjectType.RELEASE: ExtendedObjectType.RELEASE, + ObjectType.SNAPSHOT: ExtendedObjectType.SNAPSHOT, + } + for release in model["release"]: + assert isinstance(release, Release) + nodes.add(release.swhid()) + + if release.target is not None: + add_link(edges, release, release.target, object_type[release.target_type]) + return list(nodes), list(edges) @pytest.mark.parametrize( "repo", ("cmdbts2", "out-of-order", "with-merges"), ) def test_archive_interface(repo: str, archive: ArchiveInterface) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(archive.storage, data) # test against ArchiveStorage archive_api = ArchiveStorage(archive.storage) check_directory_ls(archive, archive_api, data) check_revision_get_some_outbound_edges(archive, archive_api, data) check_snapshot_get_heads(archive, archive_api, data) # test against ArchivePostgreSQL assert isinstance(archive.storage, Storage) dsn = archive.storage.get_db().conn.dsn with BaseDb.connect(dsn).conn as conn: BaseDb.adapt_conn(conn) archive_direct = ArchivePostgreSQL(conn) check_directory_ls(archive, archive_direct, data) check_revision_get_some_outbound_edges(archive, archive_direct, data) check_snapshot_get_heads(archive, archive_direct, data) # test against ArchiveGraph nodes, edges = get_graph_data(data) graph = NaiveClient(nodes=nodes, edges=edges) archive_graph = ArchiveGraph(graph, archive.storage) with pytest.raises(NotImplementedError): check_directory_ls(archive, archive_graph, data) check_revision_get_some_outbound_edges(archive, archive_graph, data) check_snapshot_get_heads(archive, archive_graph, data) # test against ArchiveMultiplexer archive_multiplexed = ArchiveMultiplexed( [("noop", ArchiveNoop()), ("graph", archive_graph), ("api", archive_api)] ) check_directory_ls(archive, archive_multiplexed, data) check_revision_get_some_outbound_edges(archive, archive_multiplexed, data) check_snapshot_get_heads(archive, archive_multiplexed, data) def test_noop_multiplexer(): archive = ArchiveMultiplexed([("noop", ArchiveNoop())]) assert not archive.directory_ls(Sha1Git(b"abcd")) assert not archive.revision_get_some_outbound_edges(Sha1Git(b"abcd")) assert not archive.snapshot_get_heads(Sha1Git(b"abcd")) diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py index 18ded81..53bf275 100644 --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -1,164 +1,164 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, List, Set from 
_pytest.monkeypatch import MonkeyPatch from click.testing import CliRunner import psycopg2.extensions import pytest from swh.core.cli import swh as swhmain import swh.core.cli.db # noqa ; ensure cli is loaded from swh.core.db import BaseDb from swh.core.db.db_utils import init_admin_extensions from swh.model.hashutil import MultiHash import swh.provenance.cli # noqa ; ensure cli is loaded from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from .conftest import get_datafile from .test_utils import invoke, write_configuration_path def test_cli_swh_db_help() -> None: # swhmain.add_command(provenance_cli) result = CliRunner().invoke(swhmain, ["provenance", "-h"]) assert result.exit_code == 0 assert "Commands:" in result.output commands = result.output.split("Commands:")[1] for command in ( "find-all", "find-first", "iter-frontiers", "iter-origins", "iter-revisions", ): assert f" {command} " in commands TABLES = { "dbflavor", "dbmodule", "dbversion", "content", "content_in_revision", "content_in_directory", "directory", "directory_in_revision", "location", "origin", "revision", "revision_before_revision", "revision_in_origin", } @pytest.mark.parametrize( "flavor, dbtables", (("with-path", TABLES), ("without-path", TABLES)) ) def test_cli_db_create_and_init_db_with_flavor( monkeypatch: MonkeyPatch, postgresql: psycopg2.extensions.connection, flavor: str, dbtables: Set[str], ) -> None: """Test that 'swh db init provenance' works with flavors for both with-path and without-path flavors""" dbname = f"{flavor}-db" # DB creation using 'swh db create' db_params = postgresql.get_dsn_parameters() monkeypatch.setenv("PGHOST", db_params["host"]) monkeypatch.setenv("PGUSER", db_params["user"]) monkeypatch.setenv("PGPORT", db_params["port"]) result = CliRunner().invoke(swhmain, ["db", "create", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output # DB init using 'swh db init' result = CliRunner().invoke( swhmain, ["db", "init", "-d", dbname, "--flavor", flavor, "provenance"] ) assert result.exit_code == 0, result.output assert f"(flavor {flavor})" in result.output db_params["dbname"] = dbname cnx = BaseDb.connect(**db_params).conn # check the DB looks OK (check for db_flavor and expected tables) with cnx.cursor() as cur: cur.execute("select swh_get_dbflavor()") assert cur.fetchone() == (flavor,) cur.execute( "select table_name from information_schema.tables " "where table_schema = 'public' " f"and table_catalog = '{dbname}'" ) tables = set(x for (x,) in cur.fetchall()) assert tables == dbtables def test_cli_init_db_default_flavor(postgresql: psycopg2.extensions.connection) -> None: "Test that 'swh db init provenance' defaults to a with-path flavored DB" dbname = postgresql.dsn init_admin_extensions("swh.provenance", dbname) result = CliRunner().invoke(swhmain, ["db", "init", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output with postgresql.cursor() as cur: cur.execute("select swh_get_dbflavor()") assert cur.fetchone() == ("with-path",) @pytest.mark.parametrize( "subcommand", (["origin", "from-csv"], ["iter-origins"]), ) def test_cli_origin_from_csv( swh_storage: StorageInterface, subcommand: List[str], swh_storage_backend_config: Dict, provenance, tmp_path, ): repo = "cmdbts2" origin_url = f"https://{repo}" data = load_repo_data(repo) fill_storage(swh_storage, data) - assert len(data["origin"]) == 1 - assert {"url": origin_url} in data["origin"] + assert len(data["origin"]) >= 1 + assert 
origin_url in [o["url"] for o in data["origin"]] cfg = { "provenance": { "archive": { "cls": "api", "storage": swh_storage_backend_config, }, "storage": { "cls": "postgresql", # "db": provenance.storage.conn.dsn, "db": provenance.storage.conn.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) csv_filepath = get_datafile("origins.csv") subcommand = subcommand + [csv_filepath] result = invoke(subcommand, config_path) assert result.exit_code == 0, f"Unexpected result: {result.output}" origin_sha1 = MultiHash.from_data( origin_url.encode(), hash_names=["sha1"] ).digest()["sha1"] actual_result = provenance.storage.origin_get([origin_sha1]) assert actual_result == {origin_sha1: origin_url} diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py index 85225b8..c0fc79c 100644 --- a/swh/provenance/tests/test_journal_client.py +++ b/swh/provenance/tests/test_journal_client.py @@ -1,135 +1,135 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict from confluent_kafka import Consumer import pytest from swh.model.hashutil import MultiHash from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from .test_utils import invoke, write_configuration_path @pytest.fixture def swh_storage_backend_config(swh_storage_backend_config, kafka_server, kafka_prefix): writer_config = { "cls": "kafka", "brokers": [kafka_server], "client_id": "kafka_writer", "prefix": kafka_prefix, "anonymize": False, } yield {**swh_storage_backend_config, "journal_writer": writer_config} def test_cli_origin_from_journal_client( swh_storage: StorageInterface, swh_storage_backend_config: Dict, kafka_prefix: str, kafka_server: str, consumer: Consumer, tmp_path: str, provenance, provenance_postgresql, ) -> None: """Test origin journal client cli""" # Prepare storage data data = load_repo_data("cmdbts2") - assert len(data["origin"]) == 1 + assert len(data["origin"]) >= 1 origin_url = data["origin"][0]["url"] fill_storage(swh_storage, data) # Prepare configuration for cli call swh_storage_backend_config.pop("journal_writer", None) # no need for that config storage_config_dict = swh_storage_backend_config cfg = { "journal_client": { "cls": "kafka", "brokers": [kafka_server], "group_id": "toto", "prefix": kafka_prefix, "stop_on_eof": True, }, "provenance": { "archive": { "cls": "api", "storage": storage_config_dict, }, "storage": { "cls": "postgresql", "db": provenance_postgresql.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) # call the cli 'swh provenance origin from-journal' result = invoke(["origin", "from-journal"], config_path) assert result.exit_code == 0, f"Unexpected result: {result.output}" origin_sha1 = MultiHash.from_data( origin_url.encode(), hash_names=["sha1"] ).digest()["sha1"] actual_result = provenance.storage.origin_get([origin_sha1]) assert actual_result == {origin_sha1: origin_url} def test_cli_revision_from_journal_client( swh_storage: StorageInterface, swh_storage_backend_config: Dict, kafka_prefix: str, kafka_server: str, consumer: Consumer, tmp_path: str, provenance, provenance_postgresql, ) -> None: """Test revision journal client cli""" # Prepare storage data data = load_repo_data("cmdbts2") - assert len(data["origin"]) == 1 
+ assert len(data["origin"]) >= 1 fill_storage(swh_storage, data) # Prepare configuration for cli call swh_storage_backend_config.pop("journal_writer", None) # no need for that config storage_config_dict = swh_storage_backend_config cfg = { "journal_client": { "cls": "kafka", "brokers": [kafka_server], "group_id": "toto", "prefix": kafka_prefix, "stop_on_eof": True, }, "provenance": { "archive": { "cls": "api", "storage": storage_config_dict, }, "storage": { "cls": "postgresql", "db": provenance_postgresql.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) revisions = [rev["id"] for rev in data["revision"]] result = provenance.storage.revision_get(revisions) assert not result # call the cli 'swh provenance revision from-journal' cli_result = invoke(["revision", "from-journal"], config_path) assert cli_result.exit_code == 0, f"Unexpected result: {cli_result.output}" result = provenance.storage.revision_get(revisions) assert set(result.keys()) == set(revisions) diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py index 2c4e3ac..020a07d 100644 --- a/swh/provenance/tests/test_origin_iterator.py +++ b/swh/provenance/tests/test_origin_iterator.py @@ -1,42 +1,46 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.provenance.origin import CSVOriginIterator from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.algos.origin import ( iter_origin_visit_statuses, iter_origin_visits, iter_origins, ) from swh.storage.interface import StorageInterface @pytest.mark.parametrize( "repo", ( "cmdbts2", "out-of-order", ), ) def test_origin_iterator(swh_storage: StorageInterface, repo: str) -> None: """Test CSVOriginIterator""" data = load_repo_data(repo) fill_storage(swh_storage, data) origins_csv = [] for origin in iter_origins(swh_storage): for visit in iter_origin_visits(swh_storage, origin.url): if visit.visit is not None: for status in iter_origin_visit_statuses( swh_storage, origin.url, visit.visit ): if status.snapshot is not None: origins_csv.append((status.origin, status.snapshot)) origins = list(CSVOriginIterator(origins_csv)) assert origins - assert len(origins) == len(data["origin"]) + # there can be more origin entries, depending on the additional visits yaml + # file used during dataset generation (see data/generate_storage_from_git) + assert len(origins) >= len(data["origin"]) + # but their URLs must all come from the dataset's origins + assert set(o.url for o in origins) <= set(o["url"] for o in data["origin"])
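A final note on the `ts2dt` rewrite in `conftest.py` at the top of this diff: delegating to `TimestampWithTimezone` preserves the behaviour of the old hand-rolled conversion. A quick equivalence sketch (the date dict below uses the legacy offset-in-minutes form; the assumption that `from_dict` still accepts it alongside the newer `offset_bytes` form is worth checking against your `swh.model` version):

```python
from datetime import datetime, timedelta, timezone

from swh.model.model import TimestampWithTimezone

# a revision date dict as found in the msgpack datasets (legacy form)
ts = {"timestamp": {"seconds": 1000000010, "microseconds": 0}, "offset": 120}

# new conversion, as now done by ts2dt()
new_dt = TimestampWithTimezone.from_dict(ts).to_datetime()

# old hand-rolled conversion removed by this diff
old_dt = datetime.fromtimestamp(
    ts["timestamp"]["seconds"], timezone(timedelta(minutes=ts["offset"]))
).replace(microsecond=ts["timestamp"]["microseconds"])

assert new_dt == old_dt  # same instant, same offset
```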