diff --git a/swh/provenance/tests/data/out-of-order.msgpack b/swh/provenance/tests/data/out-of-order.msgpack new file mode 100644 index 0000000..774dfa8 Binary files /dev/null and b/swh/provenance/tests/data/out-of-order.msgpack differ diff --git a/swh/provenance/tests/data/out-of-order_repo.yaml b/swh/provenance/tests/data/out-of-order_repo.yaml new file mode 100644 index 0000000..9d13f7a --- /dev/null +++ b/swh/provenance/tests/data/out-of-order_repo.yaml @@ -0,0 +1,35 @@ +- msg: R00 + date: 1000000000 + content: + A/B/C/a: "content a" +- msg: R01 + date: 1000000010 + content: + A/B/C/a: "content a" + A/B/C/b: "content b" +- msg: R02 + date: 1000000020 + content: + A/C/a: "content a" + A/C/b: "content b" +- msg: R03 + date: 1000000030 + content: + A/B/C/a: "content a" + A/B/C/b: "content b" +- msg: R04 + date: 1000000040 + content: + A/C/a: "content a" + A/C/b: "content b" +- msg: R05 + date: 1000000005 # /!\ we add an earlier version of the 'b' file + content: + A/B/C/a: "content a" + A/B/C/b: "content b" +- msg: R06 + date: 1000000050 + content: + A/B/C/a: "content a" + A/B/C/b: "content b" + A/B/c: "content c" diff --git a/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt b/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt new file mode 100644 index 0000000..87bab85 --- /dev/null +++ b/swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt @@ -0,0 +1,42 @@ +1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 +R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 + +1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 +R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 + | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 + +1000000020 1c533587277731236616cac0d44f3b46c1da0f8a R02 +R02 | | | R 1c533587277731236616cac0d44f3b46c1da0f8a | 1000000020 + 
| R-D | A/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -10 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -20 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -10 + +1000000030 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb R03 +R03 | | | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb | 1000000030 + | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -20 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -30 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -20 + +1000000040 0d66eadcc15e0d7f6cfd4289329a7749a1309982 R04 +R04 | | | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 | 1000000040 + | R-D | A/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -30 + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -40 + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -30 + +1000000005 1dfac0491892096948d6a02bf12a2fed4bf75743 R05 +R05 | | | R 1dfac0491892096948d6a02bf12a2fed4bf75743 | 1000000005 + | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -5 + | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 + +1000000050 53519b5a5e8cf12a4f81f82e489f95c1d04d5314 R06 +R06 | | | R 53519b5a5e8cf12a4f81f82e489f95c1d04d5314 | 1000000050 + | R---C | A/B/c | C fa08654474ae2ddc4f61ee3a43d017ba65a439c3 | 0 +# Note the ts below (-40) is NOT the same as the maxdate of its content (-45)! +# This is because the ts of the existing frontier (the R-D below) has not been updated by the +# "new" version of the b file (aka older ts) of R05. +# /!\ This is true only when ingesting revisions one at a time! 
# NOTE(review): this module was received as part of a whitespace-collapsed patch.
# The rows below are the tail of the previous file in that patch
# (swh/provenance/tests/data/synthetic_out-of-order_lower_1.txt) followed by the
# diff header of this module. They shared a physical line with the code and are
# kept verbatim here so that no patch content is lost; they are not Python.
#
#   + | R-D | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | -40
#   + | D-C | + a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -50
#   + | D-C | + b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | -45
#   diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
#   index 894a52b..84c3f82 100644
#   --- a/swh/provenance/tests/test_provenance_heuristics.py
#   +++ b/swh/provenance/tests/test_provenance_heuristics.py
#   @@ -1,185 +1,186 @@

# Copyright (C) 2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest

from swh.provenance.model import RevisionEntry
from swh.provenance.provenance import revision_add
from swh.provenance.tests.conftest import (
    fill_storage,
    get_datafile,
    load_repo_data,
    synthetic_result,
)
from swh.provenance.tests.test_provenance_db import ts2dt


def sha1s(cur, table):
    """Return the set of hex-encoded 'sha1' values stored in the DB 'table'.

    'cur' is a cursor to the provenance index DB.

    'table' is interpolated verbatim into the SQL text, so only pass trusted,
    hard-coded table names.
    """
    cur.execute(f"SELECT sha1 FROM {table}")
    return {sha1.hex() for (sha1,) in cur.fetchall()}


def locations(cur):
    """Return the set of 'path' values from the DB 'location' table.

    'cur' is a cursor to the provenance index DB.
    """
    cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location")
    return {path for (path,) in cur.fetchall()}


def relations(cur, src, dst):
    """Return the set of (src sha1, dst sha1, path) triplets linking 'src' to 'dst'.

    The supported (src, dst) pairs are ("content", "revision"),
    ("content", "directory") and ("directory", "revision"), i.e. the C-R, C-D
    and D-R relations. Both sha1 values are hex-encoded by the query.

    'cur' is a cursor to the provenance index DB.

    Raises KeyError if (src, dst) is not one of the supported pairs.
    """
    # name of the relation table joining the 'src' and 'dst' tables
    relation = {
        ("content", "revision"): "content_early_in_rev",
        ("content", "directory"): "content_in_dir",
        ("directory", "revision"): "directory_in_rev",
    }[(src, dst)]
    # names of the foreign key columns of the relation table
    srccol = {"content": "blob", "directory": "dir"}[src]
    dstcol = {"directory": "dir", "revision": "rev"}[dst]

    cur.execute(
        "SELECT encode(src.sha1::bytea, 'hex'),"
        " encode(dst.sha1::bytea, 'hex'),"
        " encode(location.path::bytea, 'escape') "
        f"FROM {relation} as rel, "
        f" {src} as src, {dst} as dst, location "
        f"WHERE rel.{srccol}=src.id AND rel.{dstcol}=dst.id AND rel.loc=location.id"
    )
    return set(cur.fetchall())


def get_timestamp(cur, table, sha1):
    """Return the list of timestamps stored in DB 'table' for the given 'sha1'.

    'sha1' may be given either as bytes or as a hex string. Each value of the
    'date' column of the matching rows is converted with its '.timestamp()'
    method, so the result is an empty list when no row matches.

    'cur' is a cursor to the provenance index DB.
    """
    if isinstance(sha1, str):
        sha1 = bytes.fromhex(sha1)
    cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
    return [date.timestamp() for (date,) in cur.fetchall()]


@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth):
    """Ingest the revisions of 'repo' one at a time and check the provenance DB.

    After each call to revision_add(), the content of every provenance table
    (and the timestamps of the touched rows) is compared with what the matching
    synthetic result file declares for the revisions ingested so far.
    """
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )

    revisions = {rev["id"]: rev for rev in data["revision"]}

    # expected content of each table, accumulated revision after revision
    rows = {
        "content": set(),
        "content_in_dir": set(),
        "content_early_in_rev": set(),
        "directory": set(),
        "directory_in_rev": set(),
        "location": set(),
        "revision": set(),
    }

    for synth_rev in synthetic_result(syntheticfile):
        # the revision message is used as failure message for every assertion
        msg = synth_rev["msg"]

        revision = revisions[synth_rev["sha1"]]
        entry = RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)

        # each "entry" in the synth file is one new revision
        rows["revision"].add(synth_rev["sha1"].hex())
        assert rows["revision"] == sha1s(provenance.cursor, "revision"), msg
        # check the timestamp of the revision
        rev_ts = synth_rev["date"]
        assert get_timestamp(
            provenance.cursor, "revision", synth_rev["sha1"].hex()
        ) == [rev_ts], msg

        # this revision might have added new content objects
        rows["content"] |= {x["dst"].hex() for x in synth_rev["R_C"]}
        rows["content"] |= {x["dst"].hex() for x in synth_rev["D_C"]}
        assert rows["content"] == sha1s(provenance.cursor, "content"), msg

        # check for R-C (direct) entries
        # these are added directly in the content_early_in_rev table
        rows["content_early_in_rev"] |= {
            (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"]
        }
        assert rows["content_early_in_rev"] == relations(
            provenance.cursor, "content", "revision"
        ), msg
        # check timestamps
        for rc in synth_rev["R_C"]:
            assert get_timestamp(provenance.cursor, "content", rc["dst"]) == [
                rev_ts + rc["rel_ts"]
            ], msg

        # check directories
        # each directory stored in the provenance index is an entry
        #      in the "directory" table...
        rows["directory"] |= {x["dst"].hex() for x in synth_rev["R_D"]}
        assert rows["directory"] == sha1s(provenance.cursor, "directory"), msg

        # ... + a number of rows in the "directory_in_rev" table...
        # check for R-D entries
        rows["directory_in_rev"] |= {
            (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"]
        }
        assert rows["directory_in_rev"] == relations(
            provenance.cursor, "directory", "revision"
        ), msg
        # check timestamps
        for rd in synth_rev["R_D"]:
            assert get_timestamp(provenance.cursor, "directory", rd["dst"]) == [
                rev_ts + rd["rel_ts"]
            ], msg

        # ... + a number of rows in the "content_in_dir" table
        #     for content of the directory.
        # check for D-C entries
        rows["content_in_dir"] |= {
            (x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"]
        }
        assert rows["content_in_dir"] == relations(
            provenance.cursor, "content", "directory"
        ), msg
        # check timestamps
        for dc in synth_rev["D_C"]:
            assert get_timestamp(provenance.cursor, "content", dc["dst"]) == [
                rev_ts + dc["rel_ts"]
            ], msg

        # check for location entries
        rows["location"] |= {x["path"] for x in synth_rev["R_C"]}
        rows["location"] |= {x["path"] for x in synth_rev["D_C"]}
        rows["location"] |= {x["path"] for x in synth_rev["R_D"]}
        assert rows["location"] == locations(provenance.cursor), msg