# Directory holding the SQL schema files shipped with the swh.provenance package.
SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql")
# Schema files in numeric order, leaving out the "-without-path-" flavor.
SQL_FILES = sorted(
    (
        candidate
        for candidate in glob.glob(path.join(SQL_DIR, "*.sql"))
        if "-without-path-" not in candidate
    ),
    key=sortkey,
)

provenance_db = postgresql_fact(
    "postgresql_proc", dbname="provenance", dump_files=SQL_FILES
)


@pytest.fixture
def provenance(provenance_db):
    """Return a ProvenanceWithPathDB wrapping the initialized test database."""
    from swh.provenance.postgresql.provenancedb_with_path import ProvenanceWithPathDB

    BaseDb.adapt_conn(provenance_db)
    provenance_backend = ProvenanceWithPathDB(provenance_db)
    # Test sessions must see any exception occurring at commit time.
    provenance_backend.raise_on_commit = True
    return provenance_backend


@pytest.fixture
def swh_storage_with_objects(swh_storage):
    """Populate the test storage with a few objects of each type and return it.

    The objects come from ``swh.model.tests.swh_model_data.TEST_OBJECTS`` and
    are added one object type at a time, in the order listed below.
    """
    insertion_order = (
        "content",
        "skipped_content",
        "directory",
        "revision",
        "release",
        "snapshot",
        "origin",
        "origin_visit",
        "origin_visit_status",
    )
    for kind in insertion_order:
        add_objects = getattr(swh_storage, f"{kind}_add")
        add_objects(TEST_OBJECTS[kind])
    return swh_storage


@pytest.fixture
def archive_direct(swh_storage_with_objects):
    """Return an ArchivePostgreSQL using the populated storage's db connection."""
    connection = swh_storage_with_objects.get_db().conn
    return ArchivePostgreSQL(connection)


@pytest.fixture
def archive_api(swh_storage_with_objects):
    """Return an ArchiveStorage wrapping the populated storage."""
    return ArchiveStorage(swh_storage_with_objects)


@pytest.fixture
def archive(swh_storage_with_objects):
    """Yield an ArchivePostgreSQL, rolling its connection back after the test.

    The rollback is a workaround to prevent tests from hanging because of an
    unclosed transaction.
    """
    # TODO: refactor the ArchivePostgreSQL to properly deal with
    # transactions and get rid of this fixture
    connection = swh_storage_with_objects.get_db().conn
    pg_archive = ArchivePostgreSQL(conn=connection)
    yield pg_archive
    pg_archive.conn.rollback()


def get_datafile(fname):
    """Return the path of ``fname`` in the ``data`` directory beside this module."""
    tests_dir = path.dirname(__file__)
    return path.join(tests_dir, "data", fname)
9f6e04be05297905f1275d3f4e0bb0583458b2e8 # | `- Red/Blue/c * a28fa70e725ebda781e772795ca080cd737b823c # o - [1610644101] R0003 ca6ec564c69efd2e5c70fb05486fd3f794765a04 # | |- Red/Green/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 # | |- Red/Green/b 9f6e04be05297905f1275d3f4e0bb0583458b2e8 # | `- Red/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 # o - [1610644103] R0004 fc6e10b7d41b1d56a94091134e3683ce91e80d91 # | |- Red/Blue/Green/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 # | |- Red/Blue/Green/b 9f6e04be05297905f1275d3f4e0bb0583458b2e8 # | `- Red/Blue/c a28fa70e725ebda781e772795ca080cd737b823c # o - [1610644105] R0005 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17 # | `- Purple/d * c0229d305adf3edf49f031269a70e3e87665fe88 # o - [1610644107] R0006 9a71f967ae1a125be9b6569cc4eccec0aecabb7c # | `- Purple/Brown/Purple/d c0229d305adf3edf49f031269a70e3e87665fe88 # o - [1610644109] R0007 4fde4ea4494a630030a4bda99d03961d9add00c7 # | |- Dark/Brown/Purple/d c0229d305adf3edf49f031269a70e3e87665fe88 # | `- Dark/d c0229d305adf3edf49f031269a70e3e87665fe88 # o - [1610644111] R0008 ba00e89d47dc820bb32c783af7123ffc6e58b56d # | |- Dark/Brown/Purple/d c0229d305adf3edf49f031269a70e3e87665fe88 # | |- Dark/Brown/Purple/e c0229d305adf3edf49f031269a70e3e87665fe88 # | `- Dark/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 # o - [1610644113] R0009 55d4dc9471de6144f935daf3c38878155ca274d5 # | |- Dark/Brown/Purple/f * 94ba40161084e8b80943accd9d24e1f9dd47189b # | |- Dark/Brown/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b # | `- Dark/f 94ba40161084e8b80943accd9d24e1f9dd47189b # o - [1610644116] R0010 a8939755d0be76cfea136e9e5ebce9bc51c49fef # | |- Dark/Brown/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b # | |- Dark/Brown/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b # | `- Dark/h * 5e8f9ceaee9dafae2e3210e254fdf170295f8b5b # o - [1610644118] R0011 ca1774a07b6e02c1caa7ae678924efa9259ee7c6 # | |- Paris/Brown/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b # | |- Paris/Brown/Purple/g 
def load_repo_data(repo):
    """Read the ``<repo>.msgpack`` dataset and group its objects by type.

    The file is located with ``get_datafile`` and decoded as a sequence of
    ``(object type, object dict)`` pairs. The result maps "revision",
    "directory" and "content" to the list of decoded objects of that type, in
    file order.
    """
    objects_by_type = {"revision": [], "directory": [], "content": []}
    with open(get_datafile(f"{repo}.msgpack"), "rb") as dump:
        payload = dump.read()
    for object_type, object_dict in msgpack_loads(payload):
        objects_by_type[object_type].append(object_dict)
    return objects_by_type


def filter_dict(d, keys):
    """Return a new dict with only the items of ``d`` whose key is in ``keys``."""
    selected = {}
    for key, value in d.items():
        if key in keys:
            selected[key] = value
    return selected


@pytest.fixture
def storage_and_CMDBTS(swh_storage, CMDBTS_data):
    """Return the test storage filled with the CMDBTS dataset, and the dataset."""
    fill_storage(swh_storage, CMDBTS_data)
    return swh_storage, CMDBTS_data


def fill_storage(storage, data):
    """Add the contents, directories and revisions of ``data`` to ``storage``.

    ``data`` is a dict with the keys "content", "directory" and "revision",
    each holding a list of object dicts (the shape ``load_repo_data`` returns).
    Directories are rebuilt from their entries only, each entry being reduced
    to its name, type, target and perms.
    """
    entry_fields = ("name", "type", "target", "perms")

    def as_directory(raw_dir):
        entries = tuple(
            DirectoryEntry.from_dict(filter_dict(raw_entry, entry_fields))
            for raw_entry in raw_dir["entries"]
        )
        return Directory(entries=entries)

    storage.content_add_metadata(
        Content.from_dict(raw_content) for raw_content in data["content"]
    )
    storage.directory_add([as_directory(raw_dir) for raw_dir in data["directory"]])
    storage.revision_add(Revision.from_dict(raw_rev) for raw_rev in data["revision"])
revision in CMDBTS_data["revision"] - ) - return swh_storage, CMDBTS_data + storage.revision_add(Revision.from_dict(revision) for revision in data["revision"]) class SynthRelation(TypedDict): path: str src: bytes dst: bytes rel_ts: float class SynthRevision(TypedDict): sha1: bytes date: float msg: str R_C: List[SynthRelation] R_D: List[SynthRelation] D_C: List[SynthRelation] def synthetic_result(filename: str) -> Iterator[SynthRevision]: """Generates dict representations of synthetic revisions found in the synthetic file (from the data/ directory) given as argument of the generator. Generated SynthRevision (typed dict) with the following elements: "sha1": (bytes) sha1 of the revision, "date": (float) timestamp of the revision, "msg": (str) commit message of the revision, "R_C": (list) new R---C relations added by this revision "R_D": (list) new R-D relations added by this revision "D_C": (list) new D-C relations added by this revision Each relation above is a SynthRelation typed dict with: "path": (str) location "src": (bytes) sha1 of the source of the relation "dst": (bytes) sha1 of the destination of the relation "rel_ts": (float) timestamp of the target of the relation (related to the timestamp of the revision) """ with open(get_datafile(filename), "r") as fobj: yield from _parse_synthetic_file(fobj) def _parse_synthetic_file(fobj: Iterable[str]) -> Iterator[SynthRevision]: """Read a 'synthetic' file and generate a dict representation of the synthetic revision for each revision listed in the synthetic file. 
""" regs = [ - "(?PR[0-9]{4})?", + "(?PR[0-9]{2,4})?", "(?P[^| ]*)", "([+] )?(?P[^| +]*?)[/]?", "(?P[RDC]) (?P[0-9a-z]{40})", "(?P-?[0-9]+(.[0-9]+)?)", ] - regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *$") + regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") current_rev: List[dict] = [] for m in (regex.match(line) for line in fobj): if m: d = m.groupdict() if d["revname"]: if current_rev: yield _mk_synth_rev(current_rev) current_rev.clear() current_rev.append(d) if current_rev: yield _mk_synth_rev(current_rev) def _mk_synth_rev(synth_rev) -> SynthRevision: assert synth_rev[0]["type"] == "R" rev = SynthRevision( sha1=bytes.fromhex(synth_rev[0]["sha1"]), date=float(synth_rev[0]["ts"]), msg=synth_rev[0]["revname"], R_C=[], R_D=[], D_C=[], ) for row in synth_rev[1:]: if row["reltype"] == "R---C": assert row["type"] == "C" rev["R_C"].append( SynthRelation( path=row["path"], src=rev["sha1"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) elif row["reltype"] == "R-D": assert row["type"] == "D" rev["R_D"].append( SynthRelation( path=row["path"], src=rev["sha1"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) elif row["reltype"] == "D-C": assert row["type"] == "C" rev["D_C"].append( SynthRelation( path=row["path"], src=rev["R_D"][-1]["dst"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) return rev diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md new file mode 100644 index 0000000..3e78ba5 --- /dev/null +++ b/swh/provenance/tests/data/README.md @@ -0,0 +1,138 @@ +# Provenance Index Test Dataset + +This directory contains datasets used by `test_provenance_heurstics` tests of +the provenance index database. 
+ +Each dataset `xxx` consists of several parts: + +- a description of a git repository as a yaml file named `xxx_repo.yaml`, +- a msgpack file containing storage objects for the given repository, from + which the storage is filled before each test using these data, and +- a set of synthetic files, named `synthetic_xxx_(lower|upper)_<mindepth>.txt`, + describing the expected result in the provenance database if ingested with + the flag `lower` set or not set, and the `mindepth` value (integer, most + often `1` or `2`). + + +## Git repos description file + +The description of a git repository is a yaml file which contains a list of dicts, +each one representing a git revision to add (linearly) in the git repo used as a +base for the dataset. Each dict has a structure like: + +``` yaml +- msg: R00 + date: 1000000000 + content: + A/B/C/a: "content a" + +``` + +This example will generate a git commit with the commit message "R00", the +author and committer date 1000000000 (given as a unix timestamp), and a single +file whose path is `A/B/C/a` and content is "content a". + +The file is parsed to create git revisions in a temporary git repository, in +order of appearance in the yaml file (so one may create a git repository with +'out-of-order' commits). + +There is no way of creating branches and merges for now. + +The tool to generate this git repo is `generate_repo.py`: + +``` + python generate_repo.py --help +Usage: generate_repo.py [OPTIONS] INPUT_FILE OUTPUT_DIR + +Options: + -C, --clean-output / --no-clean-output + --help Show this message and exit. +``` + +It generates a git repository in the `OUTPUT_DIR` and also produces a +template `synthetic` file on its standard output, which can be used to ease +writing the expected `synthetic` files. 
+ +Typical usage will be: + +``` +python generate_repo.py repo2_repo.yaml repo2 > synthetic_repo2_template.txt +``` + +Note that hashes (for revisions, directories and contents) of the git objects +depend only on the content of the input yaml file. Calling the tool twice on +the same input file should generate the exact same git repo twice. + + +## Msgpack dump of the storage + +This file contains a set of storage objects (`Revision`, `Content` and +`Directory`) and is usually generated from a local git repository (typically +the one generated by the previous command) using the +`generate_storage_from_git.py` tool: + +``` +python generate_storage_from_git.py --help +Usage: generate_storage_from_git.py [OPTIONS] GIT_REPO + + simple tool to generate the git_repo.msgpack dataset file used in some tests + +Options: + -r, --head TEXT head revision to start from + -o, --output TEXT output file + --help Show this message and exit. + +``` + +Typical usage would be, using the git repository `repo2` created previously: + +``` +python generate_storage_from_git.py repo2 +Revision hash for master is 8363e8e98751dc9f264d2fedd6b829ad4b1218b0 +Wrote 86 objects in repo2.msgpack +``` + +## Synthetic files + +These files describe the expected content of the provenance database for each +revision (in order of ingestion). + +The `generate_repo.py` tool will produce a template synthetic file like: + +``` +1000000000.0 b582a17b3fc37f72fc57877616f85c3f0abed064 R00 +R00 | | | R b582a17b3fc37f72fc57877616f85c3f0abed064 | 1000000000.0 + | | . | D a4cb5e6b2831f7e8eef0e6e08e43d642c97303a1 | 0.0 + | | A | D 1c8d9fd9afa7e5a2cf52a3db6f05dc5c3a1ca86b | 0.0 + | | A/B | D 36876d475197b5ad86ad592e8e28818171455f16 | 0.0 + | | A/B/C | D 98f7a4a23d8df1fb1a5055facae2aff9b2d0a8b3 | 0.0 + | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 + +1000000010.0 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe R01 +R01 | | | R 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe | 1000000010.0 + | | . 
| D b3cf11b22c9f93c3c494cf90ab072f394155072d | 0.0 + | | A | D baca735bf8b8720131b4bfdb47c51631a9260348 | 0.0 + | | A/B | D 4b28979d88ed209a09c272bcc80f69d9b18339c2 | 0.0 + | | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | 0.0 + | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 + | | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0.0 + +[...] +``` + +where all the content and directories of each revision are listed; it's then +the responsibility of the user to create the expected synthetic file for a +given heuristics configuration. For example, the 2 revisions above are to be +adapted, for the `(lower=True, mindepth=1)` case, as: + +``` +1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 +R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 + +1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 +R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 + | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 + +``` diff --git a/swh/provenance/tests/data/cmdbts2.msgpack b/swh/provenance/tests/data/cmdbts2.msgpack new file mode 100644 index 0000000..0ca8460 Binary files /dev/null and b/swh/provenance/tests/data/cmdbts2.msgpack differ diff --git a/swh/provenance/tests/data/cmdbts2_repo.yaml b/swh/provenance/tests/data/cmdbts2_repo.yaml new file mode 100644 index 0000000..d58e8d2 --- /dev/null +++ b/swh/provenance/tests/data/cmdbts2_repo.yaml @@ -0,0 +1,80 @@ +- msg: R00 + date: 1000000000 + content: + A/B/C/a: "content a" +- msg: R01 + date: 1000000010 + content: + A/B/C/a: "content a" + A/B/C/b: "content b" +- msg: R02 + date: 1000000020 + content: + A/B/C/a: "content a" + A/B/C/b: "content b" + A/B/c: "content c" +- msg: R03 + date: 1000000030 + content: + A/C/a: "content a" + A/C/b: "content b" + A/a: "content a" +- msg: R04 + date: 1000000040 + content: + 
A/B/C/a: "content a" + A/B/C/b: "content b" + A/B/c: "content c" +- msg: R05 + date: 1000000050 + content: + D/d: "content d" +- msg: R06 + date: 1000000060 + content: + D/E/D/d: "content d" +- msg: R07 + date: 1000000070 + content: + F/E/D/d: "content d" + F/d: "content d" +- msg: R08 + date: 1000000080 + content: + F/E/D/d: "content d" + F/E/D/e: "content d" # /!\ same content as d + F/a: "content a" +- msg: R09 + date: 1000000090 + content: + F/E/D/f: "content f" + F/E/D/g: "content f" # /!\ same content as f + F/g: "content f" +- msg: R10 + date: 1000000100 + content: + F/E/D/f: "content f" + F/E/D/g: "content f" # /!\ + F/h: "content h" +- msg: R11 + date: 1000000110 + content: + G/E/D/f: "content f" + G/E/D/g: "content f" # /!\ + G/i: "content i" +- msg: R12 + date: 1000000120 + content: + G/H/D/f: "content f" + G/H/D/g: "content f" # /!\ + G/j: "content j" +- msg: R13 + date: 1000000130 + content: + G/H/D/f: "content f" + G/H/D/g: "content f" + G/I/D/f: "content f" + G/I/D/g: "content f" + G/D/f: "content f" + G/D/g: "content f" + G/k: "content k" diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py new file mode 100644 index 0000000..1c7b628 --- /dev/null +++ b/swh/provenance/tests/data/generate_storage_from_git.py @@ -0,0 +1,115 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime, timezone +import os +import re +from subprocess import check_output +from typing import Dict + +import click + +from swh.core.api.serializers import msgpack_dumps +from swh.loader.git.from_disk import GitLoaderFromDisk +from swh.model.hashutil import hash_to_bytes as h2b +from swh.provenance.tests.test_provenance_db import ts2dt +from swh.storage import get_storage + + +def 
def pop_key(d, k):
    """Remove key ``k`` from dict ``d`` in place and return ``d``.

    A key that is already absent is not an error: the dict is returned as is.
    """
    d.pop(k, None)
    return d


def dump_file(hash, storage, cache):
    """Yield a ``("content", dict)`` pair for the content with sha1_git ``hash``.

    Nothing is yielded when ``hash`` is already in ``cache``; otherwise the
    content object found in ``storage`` is recorded in ``cache`` first.
    """
    if hash not in cache:
        content = storage.content_find({"sha1_git": hash})[0]
        cache[hash] = content
        # we remove ctime to make the resulting data (eg. output msgpack file)
        # independent from execution time
        yield "content", pop_key(content.to_dict(), "ctime")


def dump_directory(hash, storage, cache):
    """Yield the directory ``hash`` followed by everything it contains.

    The directory comes first, as a ``("directory", {"id", "entries"})`` pair,
    then its sub-directories and files are dumped recursively in listing
    order. Nothing is yielded when ``hash`` is already in ``cache``.

    Raises:
        ValueError: an entry is neither a "dir" nor a "file".
    """
    if hash not in cache:
        # list the directory once; the listing serves the cache, the dumped
        # object and the recursion below
        dircontent = list(storage.directory_ls(hash))
        cache[hash] = dircontent
        yield "directory", {"id": hash, "entries": dircontent}
        for direntry in dircontent:
            if direntry["type"] == "dir":
                yield from dump_directory(direntry["target"], storage, cache)
            elif direntry["type"] == "file":
                yield from dump_file(direntry["target"], storage, cache)
            else:
                raise ValueError(
                    f"Unexpected directory entry type {direntry['type']}"
                )


def dump_git_revision(hash, storage, cache):
    """Yield the objects reachable from revision ``hash``, then the revision.

    On the first visit of a revision its parents are dumped recursively, then
    its root directory; objects already in ``cache`` are not dumped again. The
    ``("revision", dict)`` pair itself is yielded each time the revision is
    reached, using the dict stored in ``cache``.
    """
    if hash not in cache:
        rev = storage.revision_get([hash])[0]
        cache[hash] = rev.to_dict()
        for parent in rev.parents:
            yield from dump_git_revision(parent, storage, cache)
        yield from dump_directory(rev.directory, storage, cache)
    yield "revision", cache[hash]
@click.command()
@click.option(
    "-r",
    "--head",
    default="master",
    help="head revision to start from",
)
@click.option("-o", "--output", default=None, help="output file")
@click.argument("git-repo")
def main(head, output, git_repo):
    "simple tool to generate the git_repo.msgpack dataset file used in some tests"
    # NOTE: the docstring above is the help text click displays; keep it short.
    #
    # Steps: load GIT_REPO into an in-memory storage, resolve HEAD to a
    # revision hash, dump everything reachable from it and write the result,
    # msgpack-encoded, to the output file (default: "<GIT_REPO>.msgpack").
    sto = get_storage(cls="memory")
    if git_repo.endswith("/"):
        git_repo = git_repo[:-1]

    reponame = os.path.basename(git_repo)
    load_git_repo(f"https://{reponame}", git_repo, sto)

    if output is None:
        output = f"{git_repo}.msgpack"

    # Only a full 40-hex-digit string is taken as a hash; anything else, even
    # a ref name that merely starts with 40 hex digits, is resolved by git.
    if re.fullmatch("[0-9a-fA-F]{40}", head):
        headhash = head
    else:
        headhash = (
            check_output(["git", "-C", git_repo, "rev-parse", head]).decode().strip()
        )
        click.echo(f"Revision hash for {head} is {headhash}")
    cache: Dict[bytes, dict] = {}
    # Dump before opening the output, so that a failure does not leave an
    # empty file behind; the file is closed as soon as it is written.
    outd = list(dump_git_revision(h2b(headhash), storage=sto, cache=cache))
    with open(output, "wb") as outf:
        outf.write(msgpack_dumps(outd))
    click.echo(f"Wrote {len(outd)} objects in {output}")


if __name__ == "__main__":
    main()
20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -20 + +1000000040 17ed10db0612c9b46ba340943cb6b48b25431419 R04 +R04 | | | R 17ed10db0612c9b46ba340943cb6b48b25431419 | 1000000040 + | R-D | A/B | D 0e540a8ebea2f5de3e62b92e2139902cf6f46e92 | -20 + | D-C | + c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | -20 + | D-C | + C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -40 + | D-C | + C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -30 + +1000000050 c8bef45193355db33d64f375b4a4e4f23ac2a4f6 R05 +R05 | | | R c8bef45193355db33d64f375b4a4e4f23ac2a4f6 | 1000000050 + | R---C | D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | 0 + +1000000060 f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c R06 +R06 | | | R f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c | 1000000060 + | R-D | D/E/D | D 12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e | -10 + | D-C | + d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -10 + +1000000070 91ed6a03c80b61e0d63d328f7a4325230e7a0237 R07 +R07 | | | R 91ed6a03c80b61e0d63d328f7a4325230e7a0237 | 1000000070 + | R-D | F | D b0ae56ed5ca7daa34fd7a91a28db443ab3c389a0 | -20 + | D-C | + d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + | D-C | + E/D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + +1000000080 a97e5c8a626510eefaa637091924cf800b1e8b06 R08 +R08 | | | R a97e5c8a626510eefaa637091924cf800b1e8b06 | 1000000080 + | R-D | F | D 9a7b5762e20b11735b93a635cda451c75bd31270 | -30 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -80 + | D-C | + E/D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + | D-C | + E/D/e | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + +1000000090 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 R09 +R09 | | | R 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 | 1000000090 + | R---C | F/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | 
R---C | F/E/D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + +1000000100 b7c52e28d441ca0cb736fdbe49e39eae3847ad0f R10 +R10 | | | R b7c52e28d441ca0cb736fdbe49e39eae3847ad0f | 1000000100 + | R---C | F/h | C 920fc6de626e6f51027590557ac8e11cccd5dbc2 | 0 + | R-D | F/E/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -10 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + +1000000110 f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 R11 +R11 | | | R f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 | 1000000110 + | R---C | G/i | C 388d789b624db360ede4c1b5a317da3f07b98e9c | 0 + | R-D | G/E/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -20 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + +1000000120 99bd98e1803343ecfabe4b05d0218475c2b1bf74 R12 +R12 | | | R 99bd98e1803343ecfabe4b05d0218475c2b1bf74 | 1000000120 + | R---C | G/j | C ab0d07c4e8650fade0f87c7380f3462c89ecfc90 | 0 + | R-D | G/H/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -30 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + +1000000130 10287882c7ed1b7c96f43da269e6a868b98291ff R13 +R13 | | | R 10287882c7ed1b7c96f43da269e6a868b98291ff | 1000000130 + | R---C | G/k | C 5d322b152c491c915079caa3d8af22ab4d02d5cb | 0 + | R-D | G/I/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/H/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C 
f82e92602fc443956f77811e1929cb7b5a9c500b | -40 diff --git a/swh/provenance/tests/data/synthetic_cmdbts2_lower_2.txt b/swh/provenance/tests/data/synthetic_cmdbts2_lower_2.txt new file mode 100644 index 0000000..7025cdc --- /dev/null +++ b/swh/provenance/tests/data/synthetic_cmdbts2_lower_2.txt @@ -0,0 +1,91 @@ +1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 +R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 + +1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 +R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 + | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 + +1000000020 0d45f1ee524db8f6f0b5a267afac4e733b4b2cee R02 +R02 | | | R 0d45f1ee524db8f6f0b5a267afac4e733b4b2cee | 1000000020 + | R---C | A/B/c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | 0 + | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -10 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -20 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -10 + +1000000030 540bd6155a3c50cc47b2e6f43aeaace67a696d1d R03 +R03 | | | R 540bd6155a3c50cc47b2e6f43aeaace67a696d1d | 1000000030 + | R---C | A/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | R-D | A/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -20 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -20 + +1000000040 17ed10db0612c9b46ba340943cb6b48b25431419 R04 +R04 | | | R 17ed10db0612c9b46ba340943cb6b48b25431419 | 1000000040 + | R-D | A/B | D 0e540a8ebea2f5de3e62b92e2139902cf6f46e92 | -20 + | D-C | + c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | -20 + | D-C | + C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -40 + | D-C | + C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -30 + +1000000050 c8bef45193355db33d64f375b4a4e4f23ac2a4f6 R05 
+R05 | | | R c8bef45193355db33d64f375b4a4e4f23ac2a4f6 | 1000000050 + | R---C | D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | 0 + +1000000060 f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c R06 +R06 | | | R f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c | 1000000060 + | R-D | D/E/D | D 12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e | -10 + | D-C | + d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -10 + +1000000070 91ed6a03c80b61e0d63d328f7a4325230e7a0237 R07 +R07 | | | R 91ed6a03c80b61e0d63d328f7a4325230e7a0237 | 1000000070 + | R---C | F/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + | R-D | F/E/D | D 12f1bc8ca9678ecc055bc65efd7fb4dd1f13457e | -20 + | D-C | + d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + +1000000080 a97e5c8a626510eefaa637091924cf800b1e8b06 R08 +R08 | | | R a97e5c8a626510eefaa637091924cf800b1e8b06 | 1000000080 + | R---C | F/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -80 + | R-D | F/E/D | D cb211f2d9dfee6c3968837a07960afd6ab09506c | -30 + | D-C | + d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + | D-C | + e | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + +1000000090 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 R09 +R09 | | | R 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 | 1000000090 + | R---C | F/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + +1000000100 b7c52e28d441ca0cb736fdbe49e39eae3847ad0f R10 +R10 | | | R b7c52e28d441ca0cb736fdbe49e39eae3847ad0f | 1000000100 + | R---C | F/h | C 920fc6de626e6f51027590557ac8e11cccd5dbc2 | 0 + | R-D | F/E/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -10 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + +1000000110 f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 R11 +R11 | | | R f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 | 1000000110 + | R---C | G/i | C 
388d789b624db360ede4c1b5a317da3f07b98e9c | 0 + | R-D | G/E/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -20 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + +1000000120 99bd98e1803343ecfabe4b05d0218475c2b1bf74 R12 +R12 | | | R 99bd98e1803343ecfabe4b05d0218475c2b1bf74 | 1000000120 + | R---C | G/j | C ab0d07c4e8650fade0f87c7380f3462c89ecfc90 | 0 + | R-D | G/H/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -30 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + +1000000130 10287882c7ed1b7c96f43da269e6a868b98291ff R13 +R13 | | | R 10287882c7ed1b7c96f43da269e6a868b98291ff | 1000000130 + | R---C | G/k | C 5d322b152c491c915079caa3d8af22ab4d02d5cb | 0 + | R-D | G/I/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/H/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 diff --git a/swh/provenance/tests/data/synthetic_cmdbts2_upper_1.txt b/swh/provenance/tests/data/synthetic_cmdbts2_upper_1.txt new file mode 100644 index 0000000..6602216 --- /dev/null +++ b/swh/provenance/tests/data/synthetic_cmdbts2_upper_1.txt @@ -0,0 +1,91 @@ +1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 +R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 + +1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 +R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 + | 
R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 + | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 + +1000000020 0d45f1ee524db8f6f0b5a267afac4e733b4b2cee R02 +R02 | | | R 0d45f1ee524db8f6f0b5a267afac4e733b4b2cee | 1000000020 + | R---C | A/B/c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | 0 + | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -10 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -20 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -10 + +1000000030 540bd6155a3c50cc47b2e6f43aeaace67a696d1d R03 +R03 | | | R 540bd6155a3c50cc47b2e6f43aeaace67a696d1d | 1000000030 + | R-D | A | D 48007c961cc734d1f63886d0413a6dc605e3e2ea | -20 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -20 + +1000000040 17ed10db0612c9b46ba340943cb6b48b25431419 R04 +R04 | | | R 17ed10db0612c9b46ba340943cb6b48b25431419 | 1000000040 + | R-D | A | D d591b308488541aabffd854eae85a9bf83a9d9f5 | -20 + | D-C | + B/c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | -20 + | D-C | + B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -40 + | D-C | + B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -30 + +1000000050 c8bef45193355db33d64f375b4a4e4f23ac2a4f6 R05 +R05 | | | R c8bef45193355db33d64f375b4a4e4f23ac2a4f6 | 1000000050 + | R---C | D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | 0 + +1000000060 f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c R06 +R06 | | | R f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c | 1000000060 + | R-D | D | D 8a3993f4efa9385ce993775cab5ec4dc2c78d7f6 | -10 + | D-C | + E/D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -10 + +1000000070 91ed6a03c80b61e0d63d328f7a4325230e7a0237 R07 +R07 | | | R 91ed6a03c80b61e0d63d328f7a4325230e7a0237 | 1000000070 + | R-D | F | D b0ae56ed5ca7daa34fd7a91a28db443ab3c389a0 | -20 + | D-C | + d | C 
86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + | D-C | + E/D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + +1000000080 a97e5c8a626510eefaa637091924cf800b1e8b06 R08 +R08 | | | R a97e5c8a626510eefaa637091924cf800b1e8b06 | 1000000080 + | R-D | F | D 9a7b5762e20b11735b93a635cda451c75bd31270 | -30 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -80 + | D-C | + E/D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + | D-C | + E/D/e | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + +1000000090 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 R09 +R09 | | | R 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 | 1000000090 + | R---C | F/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + +1000000100 b7c52e28d441ca0cb736fdbe49e39eae3847ad0f R10 +R10 | | | R b7c52e28d441ca0cb736fdbe49e39eae3847ad0f | 1000000100 + | R---C | F/h | C 920fc6de626e6f51027590557ac8e11cccd5dbc2 | 0 + | R-D | F/E | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -10 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + +1000000110 f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 R11 +R11 | | | R f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 | 1000000110 + | R---C | G/i | C 388d789b624db360ede4c1b5a317da3f07b98e9c | 0 + | R-D | G/E | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -20 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + +1000000120 99bd98e1803343ecfabe4b05d0218475c2b1bf74 R12 +R12 | | | R 99bd98e1803343ecfabe4b05d0218475c2b1bf74 | 1000000120 + | R---C | G/j | C ab0d07c4e8650fade0f87c7380f3462c89ecfc90 | 0 + | R-D | G/H | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -30 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + | D-C | + D/f | C 
f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + +1000000130 10287882c7ed1b7c96f43da269e6a868b98291ff R13 +R13 | | | R 10287882c7ed1b7c96f43da269e6a868b98291ff | 1000000130 + | R---C | G/k | C 5d322b152c491c915079caa3d8af22ab4d02d5cb | 0 + | R-D | G/I | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -40 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/H | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -40 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 diff --git a/swh/provenance/tests/data/synthetic_cmdbts2_upper_2.txt b/swh/provenance/tests/data/synthetic_cmdbts2_upper_2.txt new file mode 100644 index 0000000..78ac96a --- /dev/null +++ b/swh/provenance/tests/data/synthetic_cmdbts2_upper_2.txt @@ -0,0 +1,91 @@ +1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 +R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 + +1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 +R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 + | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 + +1000000020 0d45f1ee524db8f6f0b5a267afac4e733b4b2cee R02 +R02 | | | R 0d45f1ee524db8f6f0b5a267afac4e733b4b2cee | 1000000020 + | R---C | A/B/c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | 0 + | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -10 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -20 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -10 + +1000000030 540bd6155a3c50cc47b2e6f43aeaace67a696d1d 
R03 +R03 | | | R 540bd6155a3c50cc47b2e6f43aeaace67a696d1d | 1000000030 + | R---C | A/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | R-D | A/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -20 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -20 + +1000000040 17ed10db0612c9b46ba340943cb6b48b25431419 R04 +R04 | | | R 17ed10db0612c9b46ba340943cb6b48b25431419 | 1000000040 + | R-D | A/B | D 0e540a8ebea2f5de3e62b92e2139902cf6f46e92 | -20 + | D-C | + c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | -20 + | D-C | + C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -40 + | D-C | + C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -30 + +1000000050 c8bef45193355db33d64f375b4a4e4f23ac2a4f6 R05 +R05 | | | R c8bef45193355db33d64f375b4a4e4f23ac2a4f6 | 1000000050 + | R---C | D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | 0 + +1000000060 f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c R06 +R06 | | | R f5c16cb16dc29d9e5b25bd3d4d1e252ac7d5493c | 1000000060 + | R-D | D/E | D fa63f03d67d1a15563afe9f8ba97832dfb20f42a | -10 + | D-C | + D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -10 + +1000000070 91ed6a03c80b61e0d63d328f7a4325230e7a0237 R07 +R07 | | | R 91ed6a03c80b61e0d63d328f7a4325230e7a0237 | 1000000070 + | R---C | + F/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + | R-D | F/E | D fa63f03d67d1a15563afe9f8ba97832dfb20f42a | -20 + | D-C | + D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -20 + +1000000080 a97e5c8a626510eefaa637091924cf800b1e8b06 R08 +R08 | | | R a97e5c8a626510eefaa637091924cf800b1e8b06 | 1000000080 + | R---C | F/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -80 + | R-D | F/E | D 81b84d8fd8ceebd47f51896d19ce1aa286629225 | -30 + | D-C | + D/d | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + | D-C | + D/e | C 86d5e8e8649d6a884e2d3e994f5e99e6435f63e1 | -30 + +1000000090 3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 R09 +R09 | | | R 
3c5ad6be812b182ee2a01e84884b8ab7d384a4a0 | 1000000090 + | R---C | F/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + | R---C | F/E/D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | 0 + +1000000100 b7c52e28d441ca0cb736fdbe49e39eae3847ad0f R10 +R10 | | | R b7c52e28d441ca0cb736fdbe49e39eae3847ad0f | 1000000100 + | R---C | F/h | C 920fc6de626e6f51027590557ac8e11cccd5dbc2 | 0 + | R-D | F/E | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -10 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -10 + +1000000110 f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 R11 +R11 | | | R f4b2d6d273a6f0d9f2b1299c668b7b7ea095a6a2 | 1000000110 + | R---C | G/i | C 388d789b624db360ede4c1b5a317da3f07b98e9c | 0 + | R-D | G/E | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -20 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -20 + +1000000120 99bd98e1803343ecfabe4b05d0218475c2b1bf74 R12 +R12 | | | R 99bd98e1803343ecfabe4b05d0218475c2b1bf74 | 1000000120 + | R---C | G/j | C ab0d07c4e8650fade0f87c7380f3462c89ecfc90 | 0 + | R-D | G/H | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -30 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -30 + +1000000130 10287882c7ed1b7c96f43da269e6a868b98291ff R13 +R13 | | | R 10287882c7ed1b7c96f43da269e6a868b98291ff | 1000000130 + | R---C | G/k | C 5d322b152c491c915079caa3d8af22ab4d02d5cb | 0 + | R-D | G/I | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -40 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/D | D 2cb3ae467165716d1d0e7fa85190d753c3b76d78 | -40 + | D-C | + g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + f | C 
f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | R-D | G/H | D 8b4df27934ce48db6f4bdf326b3bce89d4571252 | -40 + | D-C | + D/g | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 + | D-C | + D/f | C f82e92602fc443956f77811e1929cb7b5a9c500b | -40 diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py index d7ef760..62222a5 100644 --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -1,367 +1,235 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime -import pytest - from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance.model import RevisionEntry from swh.provenance.origin import OriginEntry from swh.provenance.provenance import origin_add, revision_add from swh.provenance.storage.archive import ArchiveStorage -from swh.provenance.tests.conftest import synthetic_result def ts2dt(ts: dict) -> datetime.datetime: timestamp = datetime.datetime.fromtimestamp( ts["timestamp"]["seconds"], datetime.timezone(datetime.timedelta(minutes=ts["offset"])), ) return timestamp.replace(microsecond=ts["timestamp"]["microseconds"]) def test_provenance_origin_add(provenance, swh_storage_with_objects): """Test the ProvenanceDB.origin_add() method""" for origin in TEST_OBJECTS["origin"]: entry = OriginEntry(url=origin.url, revisions=[]) origin_add(ArchiveStorage(swh_storage_with_objects), provenance, entry) # TODO: check some facts here def test_provenance_add_revision(provenance, storage_and_CMDBTS, archive): storage, data = storage_and_CMDBTS for i in range(2): # do it twice, there should be no change in results for revision in data["revision"]: entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) 
revision_add(provenance, archive, [entry]) # there should be as many entries in 'revision' as revisions from the # test dataset provenance.cursor.execute("SELECT count(*) FROM revision") assert provenance.cursor.fetchone()[0] == len(data["revision"]) # there should be no 'location' for the empty path provenance.cursor.execute("SELECT count(*) FROM location WHERE path=''") assert provenance.cursor.fetchone()[0] == 0 # there should be 32 'location' for non-empty path provenance.cursor.execute("SELECT count(*) FROM location WHERE path!=''") assert provenance.cursor.fetchone()[0] == 32 # there should be as many entries in 'revision' as revisions from the # test dataset provenance.cursor.execute("SELECT count(*) FROM revision") assert provenance.cursor.fetchone()[0] == len(data["revision"]) # 7 directories provenance.cursor.execute("SELECT count(*) FROM directory") assert provenance.cursor.fetchone()[0] == 7 # 12 D-R entries provenance.cursor.execute("SELECT count(*) FROM directory_in_rev") assert provenance.cursor.fetchone()[0] == 12 provenance.cursor.execute("SELECT count(*) FROM content") assert provenance.cursor.fetchone()[0] == len(data["content"]) provenance.cursor.execute("SELECT count(*) FROM content_in_dir") assert provenance.cursor.fetchone()[0] == 16 provenance.cursor.execute("SELECT count(*) FROM content_early_in_rev") assert provenance.cursor.fetchone()[0] == 13 def test_provenance_content_find_first(provenance, storage_and_CMDBTS, archive): storage, data = storage_and_CMDBTS for revision in data["revision"]: entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) revision_add(provenance, archive, [entry]) first_expected_content = [ { "content": "43f3c871310a8e524004e91f033e7fb3b0bc8475", "rev": "35ccb8dd1b53d2d8a5c1375eb513ef2beaa79ae5", "date": 1609757158, "path": "README.md", }, { "content": "6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1", "rev": "9e36e095b79e36a3da104ce272989b39cd68aefd", "date": 1610644094, 
"path": "Red/Blue/Green/a", }, { "content": "9f6e04be05297905f1275d3f4e0bb0583458b2e8", "rev": "bfbfcc72ae7fc35d6941386c36280512e6b38440", "date": 1610644097, "path": "Red/Blue/Green/b", }, { "content": "a28fa70e725ebda781e772795ca080cd737b823c", "rev": "0a31c9d509783abfd08f9fdfcd3acae20f17dfd0", "date": 1610644099, "path": "Red/Blue/c", }, { "content": "c0229d305adf3edf49f031269a70e3e87665fe88", "rev": "1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17", "date": 1610644105, "path": "Purple/d", }, { "content": "94ba40161084e8b80943accd9d24e1f9dd47189b", "rev": "55d4dc9471de6144f935daf3c38878155ca274d5", "date": 1610644113, "path": ("Dark/Brown/Purple/f", "Dark/Brown/Purple/g", "Dark/h"), # XXX }, { "content": "5e8f9ceaee9dafae2e3210e254fdf170295f8b5b", "rev": "a8939755d0be76cfea136e9e5ebce9bc51c49fef", "date": 1610644116, "path": "Dark/h", }, { "content": "bbd54b961764094b13f10cef733e3725d0a834c3", "rev": "ca1774a07b6e02c1caa7ae678924efa9259ee7c6", "date": 1610644118, "path": "Paris/i", }, { "content": "7ce4fe9a22f589fa1656a752ea371b0ebc2106b1", "rev": "611fe71d75b6ea151b06e3845c09777acc783d82", "date": 1610644120, "path": "Paris/j", }, { "content": "cb79b39935c9392fa5193d9f84a6c35dc9c22c75", "rev": "4c5551b4969eb2160824494d40b8e1f6187fc01e", "date": 1610644122, "path": "Paris/k", }, ] for expected in first_expected_content: contentid = bytes.fromhex(expected["content"]) (blob, rev, date, path) = provenance.content_find_first(contentid) if isinstance(expected["path"], tuple): assert bytes(path).decode() in expected["path"] else: assert bytes(path).decode() == expected["path"] assert bytes(blob) == contentid assert bytes(rev).hex() == expected["rev"] assert int(date.timestamp()) == expected["date"] def test_provenance_content_find_all(provenance, storage_and_CMDBTS, archive): storage, data = storage_and_CMDBTS for revision in data["revision"]: entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) revision_add(provenance, 
archive, [entry]) expected_content = { # fmt: off '43f3c871310a8e524004e91f033e7fb3b0bc8475': [ ('35ccb8dd1b53d2d8a5c1375eb513ef2beaa79ae5', 1609757158.0, b'README.md')], '6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1': [ ('9e36e095b79e36a3da104ce272989b39cd68aefd', 1610644094.0, b'Red/Blue/Green/a'), # noqa: E501 ('bfbfcc72ae7fc35d6941386c36280512e6b38440', 1610644097.0, b'Red/Blue/Green/a'), # noqa: E501 ('0a31c9d509783abfd08f9fdfcd3acae20f17dfd0', 1610644099.0, b'Red/Blue/Green/a'), # noqa: E501 ('ca6ec564c69efd2e5c70fb05486fd3f794765a04', 1610644101.0, b'Red/Green/a'), ('ca6ec564c69efd2e5c70fb05486fd3f794765a04', 1610644101.0, b'Red/a'), ('fc6e10b7d41b1d56a94091134e3683ce91e80d91', 1610644103.0, b'Red/Blue/Green/a'), # noqa: E501 ('ba00e89d47dc820bb32c783af7123ffc6e58b56d', 1610644111.0, b'Dark/a')], '9f6e04be05297905f1275d3f4e0bb0583458b2e8': [ ('bfbfcc72ae7fc35d6941386c36280512e6b38440', 1610644097.0, b'Red/Blue/Green/b'), # noqa: E501 ('0a31c9d509783abfd08f9fdfcd3acae20f17dfd0', 1610644099.0, b'Red/Blue/Green/b'), # noqa: E501 ('ca6ec564c69efd2e5c70fb05486fd3f794765a04', 1610644101.0, b'Red/Green/b'), ('fc6e10b7d41b1d56a94091134e3683ce91e80d91', 1610644103.0, b'Red/Blue/Green/b')], # noqa: E501 'a28fa70e725ebda781e772795ca080cd737b823c': [ ('0a31c9d509783abfd08f9fdfcd3acae20f17dfd0', 1610644099.0, b'Red/Blue/c'), ('fc6e10b7d41b1d56a94091134e3683ce91e80d91', 1610644103.0, b'Red/Blue/c')], 'c0229d305adf3edf49f031269a70e3e87665fe88': [ ('1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17', 1610644105.0, b'Purple/d'), ('9a71f967ae1a125be9b6569cc4eccec0aecabb7c', 1610644107.0, b'Purple/Brown/Purple/d'), # noqa: E501 ('4fde4ea4494a630030a4bda99d03961d9add00c7', 1610644109.0, b'Dark/Brown/Purple/d'), # noqa: E501 ('4fde4ea4494a630030a4bda99d03961d9add00c7', 1610644109.0, b'Dark/d'), ('ba00e89d47dc820bb32c783af7123ffc6e58b56d', 1610644111.0, b'Dark/Brown/Purple/d'), # noqa: E501 ('ba00e89d47dc820bb32c783af7123ffc6e58b56d', 1610644111.0, b'Dark/Brown/Purple/e')], # noqa: E501 
'94ba40161084e8b80943accd9d24e1f9dd47189b': [ ('55d4dc9471de6144f935daf3c38878155ca274d5', 1610644113.0, b'Dark/Brown/Purple/f'), # noqa: E501 ('55d4dc9471de6144f935daf3c38878155ca274d5', 1610644113.0, b'Dark/Brown/Purple/g'), # noqa: E501 ('55d4dc9471de6144f935daf3c38878155ca274d5', 1610644113.0, b'Dark/f'), ('a8939755d0be76cfea136e9e5ebce9bc51c49fef', 1610644116.0, b'Dark/Brown/Purple/f'), # noqa: E501 ('a8939755d0be76cfea136e9e5ebce9bc51c49fef', 1610644116.0, b'Dark/Brown/Purple/g'), # noqa: E501 ('ca1774a07b6e02c1caa7ae678924efa9259ee7c6', 1610644118.0, b'Paris/Brown/Purple/f'), # noqa: E501 ('ca1774a07b6e02c1caa7ae678924efa9259ee7c6', 1610644118.0, b'Paris/Brown/Purple/g'), # noqa: E501 ('611fe71d75b6ea151b06e3845c09777acc783d82', 1610644120.0, b'Paris/Berlin/Purple/f'), # noqa: E501 ('611fe71d75b6ea151b06e3845c09777acc783d82', 1610644120.0, b'Paris/Berlin/Purple/g'), # noqa: E501 ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/Berlin/Purple/f'), # noqa: E501 ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/Berlin/Purple/g'), # noqa: E501 ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/Munich/Purple/f'), # noqa: E501 ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/Munich/Purple/g'), # noqa: E501 ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/Purple/f'), # noqa: E501 ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/Purple/g')], # noqa: E501 '5e8f9ceaee9dafae2e3210e254fdf170295f8b5b': [ ('a8939755d0be76cfea136e9e5ebce9bc51c49fef', 1610644116.0, b'Dark/h')], 'bbd54b961764094b13f10cef733e3725d0a834c3': [ ('ca1774a07b6e02c1caa7ae678924efa9259ee7c6', 1610644118.0, b'Paris/i')], '7ce4fe9a22f589fa1656a752ea371b0ebc2106b1': [ ('611fe71d75b6ea151b06e3845c09777acc783d82', 1610644120.0, b'Paris/j')], 'cb79b39935c9392fa5193d9f84a6c35dc9c22c75': [ ('4c5551b4969eb2160824494d40b8e1f6187fc01e', 1610644122.0, b'Paris/k')], # fmt: on } for content, results in 
expected_content.items(): contentid = bytes.fromhex(content) occurrences = [ (blob.hex(), rev.hex(), date.timestamp(), path) for blob, rev, date, path in provenance.content_find_all(contentid) ] expected = [(content, *result) for result in results] assert len(occurrences) == len(expected) assert set(occurrences) == set(expected) - - -def sha1s(cur, table): - """return the 'sha1' column from the DB 'table' (as hex) - - 'cur' is a cursor to the provenance index DB. - """ - cur.execute(f"SELECT sha1 FROM {table}") - return set(sha1.hex() for (sha1,) in cur.fetchall()) - - -def locations(cur): - """return the 'path' column from the DB location table - - 'cur' is a cursor to the provenance index DB. - """ - cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location") - return set(x for (x,) in cur.fetchall()) - - -def relations(cur, src, dst): - """return the triplets ('sha1', 'sha1', 'path') from the DB - - for the relation between 'src' table and 'dst' table - (i.e. for C-R, C-D and D-R relations). - - 'cur' is a cursor to the provenance index DB. 
- """ - relation = { - ("content", "revision"): "content_early_in_rev", - ("content", "directory"): "content_in_dir", - ("directory", "revision"): "directory_in_rev", - }[(src, dst)] - - srccol = {"content": "blob", "directory": "dir"}[src] - dstcol = {"directory": "dir", "revision": "rev"}[dst] - - cur.execute( - f"SELECT encode(src.sha1::bytea, 'hex')," - f" encode(dst.sha1::bytea, 'hex')," - f" encode(location.path::bytea, 'escape') " - f"FROM {relation} as rel, " - f" {src} as src, {dst} as dst, location " - f"WHERE rel.{srccol}=src.id AND rel.{dstcol}=dst.id AND rel.loc=location.id" - ) - return set(cur.fetchall()) - - -@pytest.mark.parametrize( - "syntheticfile, args", - ( - ("synthetic_lower_1.txt", {"lower": True, "mindepth": 1}), - ("synthetic_upper_1.txt", {"lower": False, "mindepth": 1}), - ("synthetic_lower_2.txt", {"lower": True, "mindepth": 2}), - ("synthetic_upper_2.txt", {"lower": False, "mindepth": 2}), - ), -) -def test_provenance_heuristics( - provenance, storage_and_CMDBTS, archive, syntheticfile, args -): - storage, data = storage_and_CMDBTS - - revisions = {rev["id"]: rev for rev in data["revision"]} - - rows = { - "content": set(), - "content_in_dir": set(), - "content_early_in_rev": set(), - "directory": set(), - "directory_in_rev": set(), - "location": set(), - "revision": set(), - } - - for synth_rev in synthetic_result(syntheticfile): - revision = revisions[synth_rev["sha1"]] - entry = RevisionEntry( - id=revision["id"], - date=ts2dt(revision["date"]), - root=revision["directory"], - ) - revision_add(provenance, archive, [entry], **args) - - # each "entry" in the synth file is one new revision - rows["revision"].add(synth_rev["sha1"].hex()) - assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ - "msg" - ] - - # this revision might have added new content objects - rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) - rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) - assert 
rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] - - # check for R-C (direct) entries - rows["content_early_in_rev"] |= set( - (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] - ) - assert rows["content_early_in_rev"] == relations( - provenance.cursor, "content", "revision" - ), synth_rev["msg"] - - # check directories - rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) - assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ - "msg" - ] - - # check for R-D entries - rows["directory_in_rev"] |= set( - (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] - ) - assert rows["directory_in_rev"] == relations( - provenance.cursor, "directory", "revision" - ), synth_rev["msg"] - - # check for D-C entries - rows["content_in_dir"] |= set( - (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] - ) - assert rows["content_in_dir"] == relations( - provenance.cursor, "content", "directory" - ), synth_rev["msg"] - - # check for location entries - rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) - rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) - rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) - assert rows["location"] == locations(provenance.cursor), synth_rev["msg"] diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py new file mode 100644 index 0000000..48a9685 --- /dev/null +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -0,0 +1,268 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.provenance.model import RevisionEntry +from swh.provenance.provenance import revision_add +from swh.provenance.tests.conftest import ( + 
fill_storage, + get_datafile, + load_repo_data, + synthetic_result, +) +from swh.provenance.tests.test_provenance_db import ts2dt + + +def sha1s(cur, table): + """return the 'sha1' column from the DB 'table' (as hex) + + 'cur' is a cursor to the provenance index DB. + """ + cur.execute(f"SELECT sha1 FROM {table}") + return set(sha1.hex() for (sha1,) in cur.fetchall()) + + +def locations(cur): + """return the 'path' column from the DB location table + + 'cur' is a cursor to the provenance index DB. + """ + cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location") + return set(x for (x,) in cur.fetchall()) + + +def relations(cur, src, dst): + """return the triplets ('sha1', 'sha1', 'path') from the DB + + for the relation between 'src' table and 'dst' table + (i.e. for C-R, C-D and D-R relations). + + 'cur' is a cursor to the provenance index DB. + """ + relation = { + ("content", "revision"): "content_early_in_rev", + ("content", "directory"): "content_in_dir", + ("directory", "revision"): "directory_in_rev", + }[(src, dst)] + + srccol = {"content": "blob", "directory": "dir"}[src] + dstcol = {"directory": "dir", "revision": "rev"}[dst] + + cur.execute( + f"SELECT encode(src.sha1::bytea, 'hex')," + f" encode(dst.sha1::bytea, 'hex')," + f" encode(location.path::bytea, 'escape') " + f"FROM {relation} as rel, " + f" {src} as src, {dst} as dst, location " + f"WHERE rel.{srccol}=src.id AND rel.{dstcol}=dst.id AND rel.loc=location.id" + ) + return set(cur.fetchall()) + + +def get_timestamp(cur, table, sha1): + """return the date for the 'sha1' from the DB 'table' (as hex) + + 'cur' is a cursor to the provenance index DB. 
+ """ + if isinstance(sha1, str): + sha1 = bytes.fromhex(sha1) + cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,)) + return [date.timestamp() for (date,) in cur.fetchall()] + + +@pytest.mark.parametrize( + "repo, lower, mindepth", + ( + ("cmdbts2", True, 1), + ("cmdbts2", False, 1), + ("cmdbts2", True, 2), + ("cmdbts2", False, 2), + ), +) +def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): + # read data/README.md for more details on how these datasets are generated + data = load_repo_data(repo) + fill_storage(swh_storage, data) + syntheticfile = get_datafile( + f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" + ) + + revisions = {rev["id"]: rev for rev in data["revision"]} + + rows = { + "content": set(), + "content_in_dir": set(), + "content_early_in_rev": set(), + "directory": set(), + "directory_in_rev": set(), + "location": set(), + "revision": set(), + } + + for synth_rev in synthetic_result(syntheticfile): + revision = revisions[synth_rev["sha1"]] + entry = RevisionEntry( + id=revision["id"], + date=ts2dt(revision["date"]), + root=revision["directory"], + ) + revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) + + # each "entry" in the synth file is one new revision + rows["revision"].add(synth_rev["sha1"].hex()) + assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ + "msg" + ] + # check the timestamp of the revision + rev_ts = synth_rev["date"] + assert get_timestamp( + provenance.cursor, "revision", synth_rev["sha1"].hex() + ) == [rev_ts], synth_rev["msg"] + + # this revision might have added new content objects + rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) + rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) + assert rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] + + # check for R-C (direct) entries + # these are added directly in the content_early_in_rev table + 
rows["content_early_in_rev"] |= set( + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] + ) + assert rows["content_early_in_rev"] == relations( + provenance.cursor, "content", "revision" + ), synth_rev["msg"] + # check timestamps + for rc in synth_rev["R_C"]: + assert get_timestamp(provenance.cursor, "content", rc["dst"]) == [ + rev_ts + rc["rel_ts"] + ], synth_rev["msg"] + + # check directories + # each directory stored in the provenance index is an entry + # in the "directory" table... + rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) + assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ + "msg" + ] + + # ... + a number of rows in the "directory_in_rev" table... + # check for R-D entries + rows["directory_in_rev"] |= set( + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] + ) + assert rows["directory_in_rev"] == relations( + provenance.cursor, "directory", "revision" + ), synth_rev["msg"] + # check timestamps + for rd in synth_rev["R_D"]: + assert get_timestamp(provenance.cursor, "directory", rd["dst"]) == [ + rev_ts + rd["rel_ts"] + ], synth_rev["msg"] + + # ... + a number of rows in the "content_in_dir" table + # for content of the directory. 
+ # check for D-C entries + rows["content_in_dir"] |= set( + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] + ) + assert rows["content_in_dir"] == relations( + provenance.cursor, "content", "directory" + ), synth_rev["msg"] + # check timestamps + for dc in synth_rev["D_C"]: + assert get_timestamp(provenance.cursor, "content", dc["dst"]) == [ + rev_ts + dc["rel_ts"] + ], synth_rev["msg"] + + # check for location entries + rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) + assert rows["location"] == locations(provenance.cursor), synth_rev["msg"] + + +@pytest.mark.parametrize( + "syntheticfile, args", + ( + ("synthetic_lower_1.txt", {"lower": True, "mindepth": 1}), + ("synthetic_upper_1.txt", {"lower": False, "mindepth": 1}), + ("synthetic_lower_2.txt", {"lower": True, "mindepth": 2}), + ("synthetic_upper_2.txt", {"lower": False, "mindepth": 2}), + ), +) +def test_provenance_heuristics_CMDBTS( + provenance, storage_and_CMDBTS, archive, syntheticfile, args +): + storage, data = storage_and_CMDBTS + + revisions = {rev["id"]: rev for rev in data["revision"]} + + rows = { + "content": set(), + "content_in_dir": set(), + "content_early_in_rev": set(), + "directory": set(), + "directory_in_rev": set(), + "location": set(), + "revision": set(), + } + + for synth_rev in synthetic_result(syntheticfile): + revision = revisions[synth_rev["sha1"]] + entry = RevisionEntry( + id=revision["id"], + date=ts2dt(revision["date"]), + root=revision["directory"], + ) + revision_add(provenance, archive, [entry], **args) + + # each "entry" in the synth file is one new revision + rows["revision"].add(synth_rev["sha1"].hex()) + assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ + "msg" + ] + + # this revision might have added new content objects + rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) 
+ rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) + assert rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] + + # check for R-C (direct) entries + rows["content_early_in_rev"] |= set( + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] + ) + assert rows["content_early_in_rev"] == relations( + provenance.cursor, "content", "revision" + ), synth_rev["msg"] + + # check directories + rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) + assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ + "msg" + ] + + # check for R-D entries + rows["directory_in_rev"] |= set( + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] + ) + assert rows["directory_in_rev"] == relations( + provenance.cursor, "directory", "revision" + ), synth_rev["msg"] + + # check for D-C entries + rows["content_in_dir"] |= set( + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] + ) + assert rows["content_in_dir"] == relations( + provenance.cursor, "content", "directory" + ), synth_rev["msg"] + + # check for location entries + rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) + assert rows["location"] == locations(provenance.cursor), synth_rev["msg"]