diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -5,8 +5,11 @@ import glob from os import path +import re +from typing import Iterable, Iterator, List import pytest +from typing_extensions import TypedDict from swh.core.api.serializers import msgpack_loads from swh.core.db import BaseDb @@ -84,6 +87,10 @@ archive.conn.rollback() +def get_datafile(fname): + return path.join(path.dirname(__file__), "data", fname) + + @pytest.fixture def CMDBTS_data(): # imported git tree is https://github.com/grouss/CMDBTS rev 4c5551b496 @@ -145,9 +152,7 @@ # |- Paris/Purple/g 94ba40161084e8b80943accd9d24e1f9dd47189b # `- Paris/k * cb79b39935c9392fa5193d9f84a6c35dc9c22c75 data = {"revision": [], "directory": [], "content": []} - with open( - path.join(path.dirname(__file__), "data", "CMDBTS.msgpack"), "rb" - ) as fobj: + with open(get_datafile("CMDBTS.msgpack"), "rb") as fobj: for etype, value in msgpack_loads(fobj.read()): data[etype].append(value) return data @@ -181,3 +186,115 @@ Revision.from_dict(revision) for revision in CMDBTS_data["revision"] ) return swh_storage, CMDBTS_data + + +class SynthRelation(TypedDict): + path: str + src: bytes + dst: bytes + rel_ts: float + + +class SynthRevision(TypedDict): + sha1: bytes + date: float + msg: str + R_C: List[SynthRelation] + R_D: List[SynthRelation] + D_C: List[SynthRelation] + + +def synthetic_result(filename: str) -> Iterator[SynthRevision]: + """Generates dict representations of synthetic revisions found in the synthetic + file (from the data/ directory) given as argument of the generator. 
+ + Generated SynthRevision (typed dict) with the following elements: + + "sha1": (bytes) sha1 of the revision, + "date": (float) timestamp of the revision, + "msg": (str) commit message of the revision, + "R_C": (list) new R---C relations added by this revision + "R_D": (list) new R-D relations added by this revision + "D_C": (list) new D-C relations added by this revision + + Each relation above is a SynthRelation typed dict with: + + "path": (str) location + "src": (bytes) sha1 of the source of the relation + "dst": (bytes) sha1 of the destination of the relation + "rel_ts": (float) timestamp of the target of the relation + (related to the timestamp of the revision) + + """ + + with open(get_datafile(filename), "r") as fobj: + yield from _parse_synthetic_file(fobj) + + +def _parse_synthetic_file(fobj: Iterable[str]) -> Iterator[SynthRevision]: + """Read a 'synthetic' file and generate a dict representation of the synthetic + revision for each revision listed in the synthetic file. + """ + regs = [ + "(?P<revname>R[0-9]{4})?", + "(?P<reltype>[^| ]*)", + "(?P<path>[^|]*?)", + "(?P<type>[RDC]) (?P<sha1>[0-9a-z]{40})", + "(?P<ts>-?[0-9]+(.[0-9]+)?)", + ] + regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *$") + current_rev: List[dict] = [] + for m in (regex.match(line) for line in fobj): + if m: + d = m.groupdict() + if d["revname"]: + if current_rev: + yield _mk_synth_rev(current_rev) + current_rev.clear() + current_rev.append(d) + if current_rev: + yield _mk_synth_rev(current_rev) + + +def _mk_synth_rev(synth_rev) -> SynthRevision: + assert synth_rev[0]["type"] == "R" + rev = SynthRevision( + sha1=bytes.fromhex(synth_rev[0]["sha1"]), + date=float(synth_rev[0]["ts"]), + msg=synth_rev[0]["revname"], + R_C=[], + R_D=[], + D_C=[], + ) + for row in synth_rev[1:]: + if row["reltype"] == "R---C": + assert row["type"] == "C" + rev["R_C"].append( + SynthRelation( + path=row["path"], + src=rev["sha1"], + dst=bytes.fromhex(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + elif row["reltype"] == "R-D": 
assert row["type"] == "D" + rev["R_D"].append( + SynthRelation( + path=row["path"], + src=rev["sha1"], + dst=bytes.fromhex(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + elif row["reltype"] == "D-C": + assert row["type"] == "C" + rev["D_C"].append( + SynthRelation( + path=row["path"], + src=rev["R_D"][-1]["dst"], + dst=bytes.fromhex(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + return rev diff --git a/swh/provenance/tests/data/synthetic_noroot_lower.txt b/swh/provenance/tests/data/synthetic_noroot_lower.txt new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/data/synthetic_noroot_lower.txt @@ -0,0 +1,92 @@ +1610644094.0 9e36e095b79e36a3da104ce272989b39cd68aefd R0000 +R0000 | | | R 9e36e095b79e36a3da104ce272989b39cd68aefd | 1610644094.0 + | R---C | Red/Blue/Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | 0.0 + +1610644097.0 bfbfcc72ae7fc35d6941386c36280512e6b38440 R0001 +R0001 | | | R bfbfcc72ae7fc35d6941386c36280512e6b38440 | 1610644097.0 + | R---C | Red/Blue/Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -3.0 + | R---C | Red/Blue/Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | 0.0 + +1610644099.0 0a31c9d509783abfd08f9fdfcd3acae20f17dfd0 R0002 +R0002 | | | R 0a31c9d509783abfd08f9fdfcd3acae20f17dfd0 | 1610644099.0 + | R---C | Red/Blue/c | C a28fa70e725ebda781e772795ca080cd737b823c | 0.0 + | R-D | Red/Blue/Green/ | D 4b6387dc2c85d82f0e2375461b687dabb03aa97c | -2.0 + | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -5.0 + | D-C | + b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -2.0 + +1610644101.0 ca6ec564c69efd2e5c70fb05486fd3f794765a04 R0003 +R0003 | | | R ca6ec564c69efd2e5c70fb05486fd3f794765a04 | 1610644101.0 + | R-D | Red/ | D a85553c8942940668e613b94bd31367af3342add | -4.0 + | D-C | + Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 + | D-C | + Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -4.0 + | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 + +1610644103.0 
fc6e10b7d41b1d56a94091134e3683ce91e80d91 R0004 +R0004 | | | R fc6e10b7d41b1d56a94091134e3683ce91e80d91 | 1610644103.0 + | R-D | Red/Blue/ | D 735a0930abcc27cb388db466a508ea6a3f1e0e44 | -4.0 + | D-C | + Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -9.0 + | D-C | + Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -6.0 + | D-C | + c | C a28fa70e725ebda781e772795ca080cd737b823c | -4.0 + +1610644105.0 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17 R0005 +R0005 | | | R 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17 | 1610644105.0 + | R---C | Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | 0.0 + +1610644107.0 9a71f967ae1a125be9b6569cc4eccec0aecabb7c R0006 +R0006 | | | R 9a71f967ae1a125be9b6569cc4eccec0aecabb7c | 1610644107.0 + | R-D | Purple/Brown/Purple/ | D ca73d509e70701874164be821598db244240d379 | -2.0 + | D-C | + d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -2.0 + +1610644109.0 4fde4ea4494a630030a4bda99d03961d9add00c7 R0007 +R0007 | | | R 4fde4ea4494a630030a4bda99d03961d9add00c7 | 1610644109.0 + | R-D | Dark/ | D 602ed30f501574bf93b0503eb17ca9795da7549c | -4.0 + | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 + | D-C | + d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 + +1610644111.0 ba00e89d47dc820bb32c783af7123ffc6e58b56d R0008 +R0008 | | | R ba00e89d47dc820bb32c783af7123ffc6e58b56d | 1610644111.0 + | R-D | Dark/ | D 4bbafface78adfae9abb45c54256588c1f63172f | -6.0 + | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 + | D-C | + Brown/Purple/e | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 + | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -17.0 + +1610644113.0 55d4dc9471de6144f935daf3c38878155ca274d5 R0009 +R0009 | | | R 55d4dc9471de6144f935daf3c38878155ca274d5 | 1610644113.0 + | R---C | Dark/Brown/Purple/f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | 0.0 + | R---C | Dark/Brown/Purple/g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | 0.0 + | R---C | Dark/f | C 
94ba40161084e8b80943accd9d24e1f9dd47189b | 0.0 + +1610644116.0 a8939755d0be76cfea136e9e5ebce9bc51c49fef R0010 +R0010 | | | R a8939755d0be76cfea136e9e5ebce9bc51c49fef | 1610644116.0 + | R---C | Dark/h | C 5e8f9ceaee9dafae2e3210e254fdf170295f8b5b | 0.0 + | R-D | Dark/Brown/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -3.0 + | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -3.0 + | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -3.0 + +1610644118.0 ca1774a07b6e02c1caa7ae678924efa9259ee7c6 R0011 +R0011 | | | R ca1774a07b6e02c1caa7ae678924efa9259ee7c6 | 1610644118.0 + | R---C | Paris/i | C bbd54b961764094b13f10cef733e3725d0a834c3 | 0.0 + | R-D | Paris/Brown/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -5.0 + | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -5.0 + | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -5.0 + +1610644120.0 611fe71d75b6ea151b06e3845c09777acc783d82 R0012 +R0012 | | | R 611fe71d75b6ea151b06e3845c09777acc783d82 | 1610644120.0 + | R---C | Paris/j | C 7ce4fe9a22f589fa1656a752ea371b0ebc2106b1 | 0.0 + | R-D | Paris/Berlin/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -7.0 + | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -7.0 + | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -7.0 + +1610644122.0 4c5551b4969eb2160824494d40b8e1f6187fc01e R0013 +R0013 | | | R 4c5551b4969eb2160824494d40b8e1f6187fc01e | 1610644122.0 + | R---C | Paris/k | C cb79b39935c9392fa5193d9f84a6c35dc9c22c75 | 0.0 + | R-D | Paris/Berlin/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -9.0 + | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 + | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 + | R-D | Paris/Munich/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -9.0 + | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 + | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 + | R-D | Paris/Purple/ | D 
f86f65f0e58940f36c088cb1455da5bc224230bc | -9.0 + | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 + | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 + diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -9,6 +9,7 @@ from swh.provenance.origin import OriginEntry from swh.provenance.provenance import origin_add, revision_add from swh.provenance.revision import RevisionEntry +from swh.provenance.tests.conftest import synthetic_result def ts2dt(ts: dict) -> datetime.datetime: @@ -161,3 +162,71 @@ assert bytes(blob) == contentid assert bytes(rev).hex() == expected["rev"] assert int(date.timestamp()) == expected["date"] + + +def test_provenance_db(provenance, storage_and_CMDBTS, archive_pg): + storage, data = storage_and_CMDBTS + + revisions = {rev["id"]: rev for rev in data["revision"]} + + rows = { + "content": set(), + "content_in_dir": set(), + "content_early_in_rev": set(), + "directory": set(), + "directory_in_rev": set(), + "location": set(), + "revision": set(), + } + + def db_count(table): + provenance.cursor.execute(f"SELECT count(*) FROM {table}") + return provenance.cursor.fetchone()[0] + + for synth_rev in synthetic_result("synthetic_noroot_lower.txt"): + revision = revisions[synth_rev["sha1"]] + entry = RevisionEntry( + archive_pg, + id=revision["id"], + date=ts2dt(revision["date"]), + root=revision["directory"], + parents=revision["parents"], + ) + revision_add(provenance, archive_pg, entry) + + # each "entry" in the synth file is one new revision + rows["revision"].add(synth_rev["sha1"]) + assert len(rows["revision"]) == db_count("revision") + + # this revision might have added new content objects + rows["content"] |= set(x["dst"] for x in synth_rev["R_C"]) + rows["content"] |= set(x["dst"] for x in synth_rev["D_C"]) + assert len(rows["content"]) == db_count("content") + + # 
check for R-C (direct) entries + rows["content_early_in_rev"] |= set( + (x["src"], x["dst"], x["path"]) for x in synth_rev["R_C"] + ) + assert len(rows["content_early_in_rev"]) == db_count("content_early_in_rev") + + # check directories + rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"]) + assert len(rows["directory"]) == db_count("directory") + + # check for R-D entries + rows["directory_in_rev"] |= set( + (x["src"], x["dst"], x["path"]) for x in synth_rev["R_D"] + ) + assert len(rows["directory_in_rev"]) == db_count("directory_in_rev") + + # check for D-C entries + rows["content_in_dir"] |= set( + (x["src"], x["dst"], x["path"]) for x in synth_rev["D_C"] + ) + assert len(rows["content_in_dir"]) == db_count("content_in_dir") + + # check for location entries + rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) + rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) + assert len(rows["location"]) == db_count("location")