diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
index e1693a5..cf0a39a 100644
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -1,310 +1,320 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import glob
 from os import path
 import re
-from typing import Iterable, Iterator, List
+from typing import Iterable, Iterator, List, Optional
 
 import pytest
 from typing_extensions import TypedDict
 
 from swh.core.api.serializers import msgpack_loads
 from swh.core.db import BaseDb
 from swh.core.db.pytest_plugin import postgresql_fact
 from swh.core.utils import numfile_sortkey as sortkey
 from swh.model.model import Content, Directory, DirectoryEntry, Revision
 from swh.model.tests.swh_model_data import TEST_OBJECTS
 import swh.provenance
 from swh.provenance.postgresql.archive import ArchivePostgreSQL
 from swh.provenance.storage.archive import ArchiveStorage
 
 SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql")
 SQL_FILES = [
     sqlfile
     for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey)
     if "-without-path-" not in sqlfile
 ]
 
 provenance_db = postgresql_fact(
     "postgresql_proc", dbname="provenance", dump_files=SQL_FILES
 )
 
 
 @pytest.fixture
 def provenance(provenance_db):
     """return a working and initialized provenance db"""
     from swh.provenance.postgresql.provenancedb_with_path import (
         ProvenanceWithPathDB as ProvenanceDB,
     )
 
     BaseDb.adapt_conn(provenance_db)
     prov = ProvenanceDB(provenance_db)
     # in test sessions, we DO want to raise any exception occurring at commit time
     prov.raise_on_commit = True
     return prov
 
 
 @pytest.fixture
 def swh_storage_with_objects(swh_storage):
     """return a Storage object (postgresql-based by default) with a few of each
     object type in it
 
     The inserted content comes from swh.model.tests.swh_model_data.
     """
     for obj_type in (
         "content",
         "skipped_content",
         "directory",
         "revision",
         "release",
         "snapshot",
         "origin",
         "origin_visit",
         "origin_visit_status",
     ):
         getattr(swh_storage, f"{obj_type}_add")(TEST_OBJECTS[obj_type])
     return swh_storage
 
 
 @pytest.fixture
 def archive_direct(swh_storage_with_objects):
     return ArchivePostgreSQL(swh_storage_with_objects.get_db().conn)
 
 
 @pytest.fixture
 def archive_api(swh_storage_with_objects):
     return ArchiveStorage(swh_storage_with_objects)
 
 
 @pytest.fixture
 def archive(swh_storage_with_objects):
     """Return an ArchivePostgreSQL-based StorageInterface object"""
     # this is a workaround to prevent tests from hanging because of an unclosed
     # transaction.
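+    # (the fixture yields the archive and rolls the storage connection back on
+    # teardown, so the transaction is always closed at the end of a test)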
     # TODO: refactor the ArchivePostgreSQL to properly deal with
     # transactions and get rid of this fixture
     archive = ArchivePostgreSQL(conn=swh_storage_with_objects.get_db().conn)
     yield archive
     archive.conn.rollback()
 
 
 def get_datafile(fname):
     return path.join(path.dirname(__file__), "data", fname)
 
 
 @pytest.fixture
 def CMDBTS_data():
     # imported git tree is https://github.com/grouss/CMDBTS rev 4c5551b496
     # ([xxx] is the timestamp):
     # o - [1609757158] first commit 35ccb8dd1b53d2d8a5c1375eb513ef2beaa79ae5
     # |   `- README.md * 43f3c871310a8e524004e91f033e7fb3b0bc8475
     # o - [1610644094] Reset Empty repository 840b91df68e9549c156942ddd5002111efa15604
     # |
     # o - [1610644094] R0000 9e36e095b79e36a3da104ce272989b39cd68aefd
     # |   `- Red/Blue/Green/a * 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # o - [1610644097] R0001 bfbfcc72ae7fc35d6941386c36280512e6b38440
     # |   |- Red/Blue/Green/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # |   `- Red/Blue/Green/b * 9f6e04be05297905f1275d3f4e0bb0583458b2e8
     # o - [1610644099] R0002 0a31c9d509783abfd08f9fdfcd3acae20f17dfd0
     # |   |- Red/Blue/Green/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # |   |- Red/Blue/Green/b 9f6e04be05297905f1275d3f4e0bb0583458b2e8
     # |   `- Red/Blue/c * a28fa70e725ebda781e772795ca080cd737b823c
     # o - [1610644101] R0003 ca6ec564c69efd2e5c70fb05486fd3f794765a04
     # |   |- Red/Green/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # |   |- Red/Green/b 9f6e04be05297905f1275d3f4e0bb0583458b2e8
     # |   `- Red/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # o - [1610644103] R0004 fc6e10b7d41b1d56a94091134e3683ce91e80d91
     # |   |- Red/Blue/Green/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # |   |- Red/Blue/Green/b 9f6e04be05297905f1275d3f4e0bb0583458b2e8
     # |   `- Red/Blue/c a28fa70e725ebda781e772795ca080cd737b823c
     # o - [1610644105] R0005 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17
     # |   `- Purple/d * c0229d305adf3edf49f031269a70e3e87665fe88
     # o - [1610644107] R0006 9a71f967ae1a125be9b6569cc4eccec0aecabb7c
     # |   `- Purple/Brown/Purple/d c0229d305adf3edf49f031269a70e3e87665fe88
     # o - [1610644109] R0007 4fde4ea4494a630030a4bda99d03961d9add00c7
     # |   |- Dark/Brown/Purple/d c0229d305adf3edf49f031269a70e3e87665fe88
     # |   `- Dark/d c0229d305adf3edf49f031269a70e3e87665fe88
     # o - [1610644111] R0008 ba00e89d47dc820bb32c783af7123ffc6e58b56d
     # |   |- Dark/Brown/Purple/d c0229d305adf3edf49f031269a70e3e87665fe88
     # |   |- Dark/Brown/Purple/e c0229d305adf3edf49f031269a70e3e87665fe88
     # |   `- Dark/a 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
     # o - [1610644113] R0009 55d4dc9471de6144f935daf3c38878155ca274d5
     # |   |- Dark/Brown/Purple/f * 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   |- Dark/Brown/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   `- Dark/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     # o - [1610644116] R0010 a8939755d0be76cfea136e9e5ebce9bc51c49fef
     # |   |- Dark/Brown/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   |- Dark/Brown/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   `- Dark/h * 5e8f9ceaee9dafae2e3210e254fdf170295f8b5b
     # o - [1610644118] R0011 ca1774a07b6e02c1caa7ae678924efa9259ee7c6
     # |   |- Paris/Brown/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   |- Paris/Brown/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   `- Paris/i * bbd54b961764094b13f10cef733e3725d0a834c3
     # o - [1610644120] R0012 611fe71d75b6ea151b06e3845c09777acc783d82
     # |   |- Paris/Berlin/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   |- Paris/Berlin/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     # |   `- Paris/j * 7ce4fe9a22f589fa1656a752ea371b0ebc2106b1
     # o - [1610644122] R0013 4c5551b4969eb2160824494d40b8e1f6187fc01e
     #     |- Paris/Berlin/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     #     |- Paris/Berlin/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     #     |- Paris/Munich/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     #     |- Paris/Munich/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     #     |- Paris/Purple/f 94ba40161084e8b80943accd9d24e1f9dd47189b
     #     |- Paris/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b
     #     `- Paris/k * cb79b39935c9392fa5193d9f84a6c35dc9c22c75
     return load_repo_data("CMDBTS")
 
 
 def load_repo_data(repo):
     data = {"revision": [], "directory": [], "content": []}
     with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj:
         for etype, value in msgpack_loads(fobj.read()):
             data[etype].append(value)
     return data
 
 
 def filter_dict(d, keys):
     return {k: v for (k, v) in d.items() if k in keys}
 
 
 @pytest.fixture
 def storage_and_CMDBTS(swh_storage, CMDBTS_data):
     fill_storage(swh_storage, CMDBTS_data)
     return swh_storage, CMDBTS_data
 
 
 def fill_storage(storage, data):
     storage.content_add_metadata(
         Content.from_dict(content) for content in data["content"]
     )
     storage.directory_add(
         [
             Directory(
                 entries=tuple(
                     [
                         DirectoryEntry.from_dict(
                             filter_dict(entry, ("name", "type", "target", "perms"))
                         )
                         for entry in dir["entries"]
                     ]
                 )
             )
             for dir in data["directory"]
         ]
     )
     storage.revision_add(Revision.from_dict(revision) for revision in data["revision"])
 
 
 class SynthRelation(TypedDict):
+    prefix: Optional[str]
     path: str
     src: bytes
     dst: bytes
     rel_ts: float
 
 
 class SynthRevision(TypedDict):
     sha1: bytes
     date: float
     msg: str
     R_C: List[SynthRelation]
     R_D: List[SynthRelation]
     D_C: List[SynthRelation]
 
 
 def synthetic_result(filename: str) -> Iterator[SynthRevision]:
     """Generates dict representations of synthetic revisions found in the synthetic
     file (from the data/ directory) given as argument of the generator.
 
     Each generated SynthRevision (typed dict) has the following elements:
 
       "sha1": (bytes) sha1 of the revision,
       "date": (float) timestamp of the revision,
       "msg": (str) commit message of the revision,
       "R_C": (list) new R---C relations added by this revision
       "R_D": (list) new R-D relations added by this revision
       "D_C": (list) new D-C relations added by this revision
 
     Each relation above is a SynthRelation typed dict with:
 
       "path": (str) location
       "src": (bytes) sha1 of the source of the relation
       "dst": (bytes) sha1 of the destination of the relation
       "rel_ts": (float) timestamp of the target of the relation
                 (relative to the timestamp of the revision)
 
     """
     with open(get_datafile(filename), "r") as fobj:
         yield from _parse_synthetic_file(fobj)
 
 
 def _parse_synthetic_file(fobj: Iterable[str]) -> Iterator[SynthRevision]:
     """Read a 'synthetic' file and generate a dict representation of the synthetic
     revision for each revision listed in the synthetic file.
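+
+    Relevant lines are "|"-separated columns with an optional leading revision
+    label (R followed by digits) that starts a new revision block, and an
+    optional trailing "#" comment (see the regex below). A made-up line matching
+    that shape (not taken from an actual synthetic file) would look like:
+
+        R0000 | R---C | + Red/Blue/Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | 0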
""" regs = [ "(?PR[0-9]{2,4})?", "(?P[^| ]*)", "([+] )?(?P[^| +]*?)[/]?", "(?P[RDC]) (?P[0-9a-z]{40})", "(?P-?[0-9]+(.[0-9]+)?)", ] regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") current_rev: List[dict] = [] for m in (regex.match(line) for line in fobj): if m: d = m.groupdict() if d["revname"]: if current_rev: yield _mk_synth_rev(current_rev) current_rev.clear() current_rev.append(d) if current_rev: yield _mk_synth_rev(current_rev) def _mk_synth_rev(synth_rev) -> SynthRevision: assert synth_rev[0]["type"] == "R" rev = SynthRevision( sha1=bytes.fromhex(synth_rev[0]["sha1"]), date=float(synth_rev[0]["ts"]), msg=synth_rev[0]["revname"], R_C=[], R_D=[], D_C=[], ) + current_path = None + # path of the last R-D relation we parsed, used a prefix for next D-C + # relations + for row in synth_rev[1:]: if row["reltype"] == "R---C": assert row["type"] == "C" rev["R_C"].append( SynthRelation( + prefix=None, path=row["path"], src=rev["sha1"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) + current_path = None elif row["reltype"] == "R-D": assert row["type"] == "D" rev["R_D"].append( SynthRelation( + prefix=None, path=row["path"], src=rev["sha1"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) + current_path = row["path"] elif row["reltype"] == "D-C": assert row["type"] == "C" rev["D_C"].append( SynthRelation( + prefix=current_path, path=row["path"], src=rev["R_D"][-1]["dst"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) return rev diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py index 84c3f82..10c44f7 100644 --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -1,186 +1,247 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.provenance.model import RevisionEntry from swh.provenance.provenance import revision_add from swh.provenance.tests.conftest import ( fill_storage, get_datafile, load_repo_data, synthetic_result, ) from swh.provenance.tests.test_provenance_db import ts2dt def sha1s(cur, table): """return the 'sha1' column from the DB 'table' (as hex) 'cur' is a cursor to the provenance index DB. """ cur.execute(f"SELECT sha1 FROM {table}") return set(sha1.hex() for (sha1,) in cur.fetchall()) def locations(cur): """return the 'path' column from the DB location table 'cur' is a cursor to the provenance index DB. """ cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location") return set(x for (x,) in cur.fetchall()) def relations(cur, src, dst): """return the triplets ('sha1', 'sha1', 'path') from the DB for the relation between 'src' table and 'dst' table (i.e. for C-R, C-D and D-R relations). 'cur' is a cursor to the provenance index DB. 
""" relation = { ("content", "revision"): "content_early_in_rev", ("content", "directory"): "content_in_dir", ("directory", "revision"): "directory_in_rev", }[(src, dst)] srccol = {"content": "blob", "directory": "dir"}[src] dstcol = {"directory": "dir", "revision": "rev"}[dst] cur.execute( f"SELECT encode(src.sha1::bytea, 'hex')," f" encode(dst.sha1::bytea, 'hex')," f" encode(location.path::bytea, 'escape') " f"FROM {relation} as rel, " f" {src} as src, {dst} as dst, location " f"WHERE rel.{srccol}=src.id AND rel.{dstcol}=dst.id AND rel.loc=location.id" ) return set(cur.fetchall()) def get_timestamp(cur, table, sha1): """return the date for the 'sha1' from the DB 'table' (as hex) 'cur' is a cursor to the provenance index DB. """ if isinstance(sha1, str): sha1 = bytes.fromhex(sha1) cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,)) return [date.timestamp() for (date,) in cur.fetchall()] @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) revisions = {rev["id"]: rev for rev in data["revision"]} rows = { "content": set(), "content_in_dir": set(), "content_early_in_rev": set(), "directory": set(), "directory_in_rev": set(), "location": set(), "revision": set(), } for synth_rev in synthetic_result(syntheticfile): revision = revisions[synth_rev["sha1"]] entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) # each "entry" in the synth file is one new revision rows["revision"].add(synth_rev["sha1"].hex()) assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ "msg" ] # check the timestamp of the revision rev_ts = synth_rev["date"] assert get_timestamp( provenance.cursor, "revision", synth_rev["sha1"].hex() ) == [rev_ts], synth_rev["msg"] # this revision might have added new content objects rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) assert rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] # check for R-C (direct) entries # these are added directly in the content_early_in_rev table rows["content_early_in_rev"] |= set( (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] ) assert rows["content_early_in_rev"] == relations( provenance.cursor, "content", "revision" ), synth_rev["msg"] # check timestamps for rc in synth_rev["R_C"]: assert get_timestamp(provenance.cursor, "content", rc["dst"]) == [ rev_ts + rc["rel_ts"] ], synth_rev["msg"] # check directories # each directory stored in the provenance index is an entry # in the "directory" table... rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ "msg" ] # ... + a number of rows in the "directory_in_rev" table... 
         # check for R-D entries
         rows["directory_in_rev"] |= set(
             (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"]
         )
         assert rows["directory_in_rev"] == relations(
             provenance.cursor, "directory", "revision"
         ), synth_rev["msg"]
         # check timestamps
         for rd in synth_rev["R_D"]:
             assert get_timestamp(provenance.cursor, "directory", rd["dst"]) == [
                 rev_ts + rd["rel_ts"]
             ], synth_rev["msg"]
 
         # ... + a number of rows in the "content_in_dir" table
         # for content of the directory.
         # check for D-C entries
         rows["content_in_dir"] |= set(
             (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"]
         )
         assert rows["content_in_dir"] == relations(
             provenance.cursor, "content", "directory"
         ), synth_rev["msg"]
         # check timestamps
         for dc in synth_rev["D_C"]:
             assert get_timestamp(provenance.cursor, "content", dc["dst"]) == [
                 rev_ts + dc["rel_ts"]
             ], synth_rev["msg"]
 
         # check for location entries
         rows["location"] |= set(x["path"] for x in synth_rev["R_C"])
         rows["location"] |= set(x["path"] for x in synth_rev["D_C"])
         rows["location"] |= set(x["path"] for x in synth_rev["R_D"])
         assert rows["location"] == locations(provenance.cursor), synth_rev["msg"]
+
+
+@pytest.mark.parametrize(
+    "repo, lower, mindepth",
+    (
+        ("cmdbts2", True, 1),
+        ("cmdbts2", False, 1),
+        ("cmdbts2", True, 2),
+        ("cmdbts2", False, 2),
+        ("out-of-order", True, 1),
+    ),
+)
+def test_provenance_heuristics_content_find_all(
+    provenance, swh_storage, archive, repo, lower, mindepth
+):
+    # read data/README.md for more details on how these datasets are generated
+    data = load_repo_data(repo)
+    fill_storage(swh_storage, data)
+    revisions = [
+        RevisionEntry(
+            id=revision["id"],
+            date=ts2dt(revision["date"]),
+            root=revision["directory"],
+        )
+        for revision in data["revision"]
+    ]
+
+    # XXX adding all revisions at once should work just fine, but it does not...
+    # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
+    # ...so add revisions one at a time for now
+    for revision in revisions:
+        revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth)
+
+    syntheticfile = get_datafile(
+        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
+    )
+    expected_occurrences = {}
+    for synth_rev in synthetic_result(syntheticfile):
+        rev_id = synth_rev["sha1"].hex()
+        rev_ts = synth_rev["date"]
+
+        for rc in synth_rev["R_C"]:
+            expected_occurrences.setdefault(rc["dst"].hex(), []).append(
+                (rev_id, rev_ts, rc["path"])
+            )
+        for dc in synth_rev["D_C"]:
+            assert dc["prefix"] is not None  # to please mypy
+            expected_occurrences.setdefault(dc["dst"].hex(), []).append(
+                (rev_id, rev_ts, dc["prefix"] + "/" + dc["path"])
+            )
+
+    for content_id, results in expected_occurrences.items():
+        expected = [(content_id, *result) for result in results]
+        db_occurrences = [
+            (blob.hex(), rev.hex(), date.timestamp(), path.decode())
+            for blob, rev, date, path in provenance.content_find_all(
+                bytes.fromhex(content_id)
+            )
+        ]
+        assert len(db_occurrences) == len(expected)
+        assert set(db_occurrences) == set(expected)