diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -239,7 +239,7 @@ regs = [ "(?PR[0-9]{4})?", "(?P[^| ]*)", - "(?P[^|]*?)", + "([+] )?(?P[^| +]*?)[/]?", "(?P[RDC]) (?P[0-9a-z]{40})", "(?P-?[0-9]+(.[0-9]+)?)", ] diff --git a/swh/provenance/tests/data/synthetic_noroot_lower.txt b/swh/provenance/tests/data/synthetic_lower_1.txt rename from swh/provenance/tests/data/synthetic_noroot_lower.txt rename to swh/provenance/tests/data/synthetic_lower_1.txt --- a/swh/provenance/tests/data/synthetic_noroot_lower.txt +++ b/swh/provenance/tests/data/synthetic_lower_1.txt @@ -89,4 +89,3 @@ | R-D | Paris/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -9.0 | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 - diff --git a/swh/provenance/tests/data/synthetic_noroot_lower.txt b/swh/provenance/tests/data/synthetic_lower_2.txt rename from swh/provenance/tests/data/synthetic_noroot_lower.txt rename to swh/provenance/tests/data/synthetic_lower_2.txt --- a/swh/provenance/tests/data/synthetic_noroot_lower.txt +++ b/swh/provenance/tests/data/synthetic_lower_2.txt @@ -16,10 +16,10 @@ 1610644101.0 ca6ec564c69efd2e5c70fb05486fd3f794765a04 R0003 R0003 | | | R ca6ec564c69efd2e5c70fb05486fd3f794765a04 | 1610644101.0 - | R-D | Red/ | D a85553c8942940668e613b94bd31367af3342add | -4.0 - | D-C | + Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 - | D-C | + Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -4.0 + | R---C | Red/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 + | R-D | Red/Green | D 4b6387dc2c85d82f0e2375461b687dabb03aa97c | -4.0 | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 + | D-C | + b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -4.0 1610644103.0 fc6e10b7d41b1d56a94091134e3683ce91e80d91 R0004 R0004 | | | R fc6e10b7d41b1d56a94091134e3683ce91e80d91 | 1610644103.0 @@ -39,16 +39,16 @@ 1610644109.0 4fde4ea4494a630030a4bda99d03961d9add00c7 R0007 R0007 | | | R 4fde4ea4494a630030a4bda99d03961d9add00c7 | 1610644109.0 - | R-D | Dark/ | D 602ed30f501574bf93b0503eb17ca9795da7549c | -4.0 - | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 + | R---C | Dark/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 + | R-D | Dark/Brown/Purple/ | D ca73d509e70701874164be821598db244240d379 | -4.0 | D-C | + d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 1610644111.0 ba00e89d47dc820bb32c783af7123ffc6e58b56d R0008 R0008 | | | R ba00e89d47dc820bb32c783af7123ffc6e58b56d | 1610644111.0 - | R-D | Dark/ | D 4bbafface78adfae9abb45c54256588c1f63172f | -6.0 - | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 - | D-C | + Brown/Purple/e | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 - | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -17.0 + | R---C | Dark/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -17.0 + | R-D | Dark/Brown/Purple/ | D b97c42f8e71723c78c947a7b2221893387c9d4df | -6.0 + | D-C | + d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 + | D-C | + e | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 1610644113.0 55d4dc9471de6144f935daf3c38878155ca274d5 R0009 R0009 | | | R 55d4dc9471de6144f935daf3c38878155ca274d5 | 1610644113.0 @@ -89,4 +89,3 @@ | R-D | Paris/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -9.0 | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 - diff --git a/swh/provenance/tests/data/synthetic_noroot_upper.txt b/swh/provenance/tests/data/synthetic_upper_1.txt rename from swh/provenance/tests/data/synthetic_noroot_upper.txt rename to swh/provenance/tests/data/synthetic_upper_1.txt diff --git a/swh/provenance/tests/data/synthetic_noroot_upper.txt b/swh/provenance/tests/data/synthetic_upper_2.txt rename from swh/provenance/tests/data/synthetic_noroot_upper.txt rename to swh/provenance/tests/data/synthetic_upper_2.txt --- a/swh/provenance/tests/data/synthetic_noroot_upper.txt +++ b/swh/provenance/tests/data/synthetic_upper_2.txt @@ -16,17 +16,17 @@ 1610644101.0 ca6ec564c69efd2e5c70fb05486fd3f794765a04 R0003 R0003 | | | R ca6ec564c69efd2e5c70fb05486fd3f794765a04 | 1610644101.0 - | R-D | Red/ | D a85553c8942940668e613b94bd31367af3342add | -4.0 - | D-C | + Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 - | D-C | + Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -4.0 + | R---C | Red/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 + | R-D | Red/Green | D 4b6387dc2c85d82f0e2375461b687dabb03aa97c | -4.0 | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -7.0 + | D-C | + b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -4.0 1610644103.0 fc6e10b7d41b1d56a94091134e3683ce91e80d91 R0004 R0004 | | | R fc6e10b7d41b1d56a94091134e3683ce91e80d91 | 1610644103.0 - | R-D | Red/ | D 87917478bc66dc4cd35f4a708e76c4409d436ae7 | -4.0 - | D-C | + Blue/Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -9.0 - | D-C | + Blue/Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -6.0 - | D-C | + Blue/c | C a28fa70e725ebda781e772795ca080cd737b823c | -4.0 + | R-D | Red/Blue/ | D 735a0930abcc27cb388db466a508ea6a3f1e0e44 | -4.0 + | D-C | + Green/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -9.0 + | D-C | + Green/b | C 9f6e04be05297905f1275d3f4e0bb0583458b2e8 | -6.0 + | D-C | + c | C a28fa70e725ebda781e772795ca080cd737b823c | -4.0 1610644105.0 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17 R0005 R0005 | | | R 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17 | 1610644105.0 @@ -34,21 +34,21 @@ 1610644107.0 9a71f967ae1a125be9b6569cc4eccec0aecabb7c R0006 R0006 | | | R 9a71f967ae1a125be9b6569cc4eccec0aecabb7c | 1610644107.0 - | R-D | Purple/ | D 98170783516a25f55b77f34a97390046e2747ab7 | -2.0 - | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -2.0 + | R-D | Purple/Brown/ | D 7ce3f063b92b184db82b5740d75d4712b0503ac4 | -2.0 + | D-C | + Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -2.0 1610644109.0 4fde4ea4494a630030a4bda99d03961d9add00c7 R0007 R0007 | | | R 4fde4ea4494a630030a4bda99d03961d9add00c7 | 1610644109.0 - | R-D | Dark/ | D 602ed30f501574bf93b0503eb17ca9795da7549c | -4.0 - | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 - | D-C | + d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 + | R---C | Dark/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 + | R-D | Dark/Brown/ | D 7ce3f063b92b184db82b5740d75d4712b0503ac4 | -4.0 + | D-C | + Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -4.0 1610644111.0 ba00e89d47dc820bb32c783af7123ffc6e58b56d R0008 R0008 | | | R ba00e89d47dc820bb32c783af7123ffc6e58b56d | 1610644111.0 - | R-D | Dark/ | D 4bbafface78adfae9abb45c54256588c1f63172f | -6.0 - | D-C | + Brown/Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 - | D-C | + Brown/Purple/e | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 - | D-C | + a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -17.0 + | R---C | Dark/a | C 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1 | -17.0 + | R-D | Dark/Brown/ | D af1ac471a925a423b712a5d19783cd30cf73bca3 | -6.0 + | D-C | + Purple/d | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 + | D-C | + Purple/e | C c0229d305adf3edf49f031269a70e3e87665fe88 | -6.0 1610644113.0 55d4dc9471de6144f935daf3c38878155ca274d5 R0009 R0009 | | | R 55d4dc9471de6144f935daf3c38878155ca274d5 | 1610644113.0 @@ -89,4 +89,3 @@ | R-D | Paris/Purple/ | D f86f65f0e58940f36c088cb1455da5bc224230bc | -9.0 | D-C | + f | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 | D-C | + g | C 94ba40161084e8b80943accd9d24e1f9dd47189b | -9.0 - diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -161,14 +161,64 @@ assert int(date.timestamp()) == expected["date"] +def sha1s(cur, table): + """return the 'sha1' column from the DB 'table' (as hex) + + 'cur' is a cursor to the provenance index DB. + """ + cur.execute(f"SELECT sha1 FROM {table}") + return set(sha1.hex() for (sha1,) in cur.fetchall()) + + +def locations(cur): + """return the 'path' column from the DB location table + + 'cur' is a cursor to the provenance index DB. + """ + cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location") + return set(x for (x,) in cur.fetchall()) + + +def relations(cur, src, dst): + """return the triplets ('sha1', 'sha1', 'path') from the DB + + for the relation between 'src' table and 'dst' table + (i.e. for C-R, C-D and D-R relations). + + 'cur' is a cursor to the provenance index DB. + """ + relation = { + ("content", "revision"): "content_early_in_rev", + ("content", "directory"): "content_in_dir", + ("directory", "revision"): "directory_in_rev", + }[(src, dst)] + + srccol = {"content": "blob", "directory": "dir"}[src] + dstcol = {"directory": "dir", "revision": "rev"}[dst] + + cur.execute( + f"SELECT encode(src.sha1::bytea, 'hex')," + f" encode(dst.sha1::bytea, 'hex')," + f" encode(location.path::bytea, 'escape') " + f"FROM {relation} as rel, " + f" {src} as src, {dst} as dst, location " + f"WHERE rel.{srccol}=src.id AND rel.{dstcol}=dst.id AND rel.loc=location.id" + ) + return set(cur.fetchall()) + + @pytest.mark.parametrize( "syntheticfile, args", ( - ("synthetic_noroot_lower.txt", {"lower": True, "mindepth": 1}), - ("synthetic_noroot_upper.txt", {"lower": False, "mindepth": 1}), + ("synthetic_lower_1.txt", {"lower": True, "mindepth": 1}), + ("synthetic_upper_1.txt", {"lower": False, "mindepth": 1}), + ("synthetic_lower_2.txt", {"lower": True, "mindepth": 2}), + ("synthetic_upper_2.txt", {"lower": False, "mindepth": 2}), ), ) -def test_provenance_db(provenance, storage_and_CMDBTS, archive, syntheticfile, args): +def test_provenance_heuristics( + provenance, storage_and_CMDBTS, archive, syntheticfile, args +): storage, data = storage_and_CMDBTS revisions = {rev["id"]: rev for rev in data["revision"]} @@ -183,10 +233,6 @@ "revision": set(), } - def db_count(table): - provenance.cursor.execute(f"SELECT count(*) FROM {table}") - return provenance.cursor.fetchone()[0] - for synth_rev in synthetic_result(syntheticfile): revision = revisions[synth_rev["sha1"]] entry = RevisionEntry( @@ -194,40 +240,49 @@ ) revision_add(provenance, archive, entry, **args) - # import pdb; pdb.set_trace() # each "entry" in the synth file is one new revision - rows["revision"].add(synth_rev["sha1"]) - assert len(rows["revision"]) == db_count("revision") + rows["revision"].add(synth_rev["sha1"].hex()) + assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ + "msg" + ] # this revision might have added new content objects - rows["content"] |= set(x["dst"] for x in synth_rev["R_C"]) - rows["content"] |= set(x["dst"] for x in synth_rev["D_C"]) - assert len(rows["content"]) == db_count("content") + rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) + rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) + assert rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] # check for R-C (direct) entries rows["content_early_in_rev"] |= set( - (x["src"], x["dst"], x["path"]) for x in synth_rev["R_C"] + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] ) - assert len(rows["content_early_in_rev"]) == db_count("content_early_in_rev") + assert rows["content_early_in_rev"] == relations( + provenance.cursor, "content", "revision" + ), synth_rev["msg"] # check directories - rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"]) - assert len(rows["directory"]) == db_count("directory") + rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) + assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ + "msg" + ] # check for R-D entries rows["directory_in_rev"] |= set( - (x["src"], x["dst"], x["path"]) for x in synth_rev["R_D"] + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] ) - assert len(rows["directory_in_rev"]) == db_count("directory_in_rev") + assert rows["directory_in_rev"] == relations( + provenance.cursor, "directory", "revision" + ), synth_rev["msg"] # check for D-C entries rows["content_in_dir"] |= set( - (x["src"], x["dst"], x["path"]) for x in synth_rev["D_C"] + (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] ) - assert len(rows["content_in_dir"]) == db_count("content_in_dir") + assert rows["content_in_dir"] == relations( + provenance.cursor, "content", "directory" + ), synth_rev["msg"] # check for location entries rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) - assert len(rows["location"]) == db_count("location") + assert rows["location"] == locations(provenance.cursor), synth_rev["msg"] diff --git a/swh/provenance/tests/test_provenance_db_storage.py b/swh/provenance/tests/test_provenance_db_storage.py --- a/swh/provenance/tests/test_provenance_db_storage.py +++ b/swh/provenance/tests/test_provenance_db_storage.py @@ -10,7 +10,7 @@ from .test_provenance_db import ( # noqa test_provenance_add_revision, test_provenance_content_find_first, - test_provenance_db, + test_provenance_heuristics, )