Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/tests/test_provenance_heuristics.py
- This file was added.
# Copyright (C) 2021 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import pytest | |||||
from swh.provenance.model import RevisionEntry | |||||
from swh.provenance.provenance import revision_add | |||||
from swh.provenance.tests.conftest import ( | |||||
fill_storage, | |||||
get_datafile, | |||||
load_repo_data, | |||||
synthetic_result, | |||||
) | |||||
from swh.provenance.tests.test_provenance_db import ts2dt | |||||
def sha1s(cur, table): | |||||
"""return the 'sha1' column from the DB 'table' (as hex) | |||||
'cur' is a cursor to the provenance index DB. | |||||
""" | |||||
cur.execute(f"SELECT sha1 FROM {table}") | |||||
return set(sha1.hex() for (sha1,) in cur.fetchall()) | |||||
def locations(cur): | |||||
"""return the 'path' column from the DB location table | |||||
'cur' is a cursor to the provenance index DB. | |||||
""" | |||||
cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location") | |||||
return set(x for (x,) in cur.fetchall()) | |||||
def relations(cur, src, dst): | |||||
"""return the triplets ('sha1', 'sha1', 'path') from the DB | |||||
for the relation between 'src' table and 'dst' table | |||||
(i.e. for C-R, C-D and D-R relations). | |||||
'cur' is a cursor to the provenance index DB. | |||||
""" | |||||
relation = { | |||||
("content", "revision"): "content_early_in_rev", | |||||
("content", "directory"): "content_in_dir", | |||||
("directory", "revision"): "directory_in_rev", | |||||
}[(src, dst)] | |||||
srccol = {"content": "blob", "directory": "dir"}[src] | |||||
dstcol = {"directory": "dir", "revision": "rev"}[dst] | |||||
cur.execute( | |||||
f"SELECT encode(src.sha1::bytea, 'hex')," | |||||
f" encode(dst.sha1::bytea, 'hex')," | |||||
f" encode(location.path::bytea, 'escape') " | |||||
f"FROM {relation} as rel, " | |||||
f" {src} as src, {dst} as dst, location " | |||||
f"WHERE rel.{srccol}=src.id AND rel.{dstcol}=dst.id AND rel.loc=location.id" | |||||
) | |||||
aeviso: is this SQL syntax actually joining the tables? | |||||
Done Inline ActionsShould do: fullset=# EXPLAIN SELECT encode(src.sha1::bytea, 'hex'),encode(dst.sha1::bytea, 'hex'),encode(location.path::bytea, 'escape') FROM content_early_in_rev as rel, content as src, revision as dst, location WHERE rel.blob=src.id AND rel.rev=dst.id AND rel.loc=location.id; Hash Join (cost=699125.84..9540426.01 rows=110655704 width=96) Hash Cond: (rel.loc = location.id) -> Hash Join (cost=438373.68..5934494.63 rows=110655704 width=50) Hash Cond: (rel.rev = dst.id) -> Hash Join (cost=160071.36..3586227.92 rows=110655704 width=37) Hash Cond: (rel.blob = src.id) -> Seq Scan on content_early_in_rev rel (cost=0.00..1811401.04 rows=110655704 width=24) -> Hash (cost=82185.16..82185.16 rows=4028016 width=29) -> Seq Scan on content src (cost=0.00..82185.16 rows=4028016 width=29) -> Hash (cost=135469.03..135469.03 rows=7386903 width=29) -> Seq Scan on revision dst (cost=0.00..135469.03 rows=7386903 width=29) -> Hash (cost=123266.96..123266.96 rows=5915296 width=59) -> Seq Scan on location (cost=0.00..123266.96 rows=5915296 width=59) JIT: Functions: 27 Options: Inlining true, Optimization true, Expressions true, Deforming true (16 lignes) fullset=# EXPLAIN SELECT encode(src.sha1::bytea, 'hex'), encode(dst.sha1::bytea, 'hex'), encode(location.path::bytea, 'escape') FROM content_early_in_rev as rel INNER JOIN content as src on (rel.blob=src.id) INNER JOIN revision as dst ON (rel.rev=dst.id) INNER JOIN location ON (rel.loc=location.id); Hash Join (cost=699125.84..9540426.01 rows=110655704 width=96) Hash Cond: (rel.loc = location.id) -> Hash Join (cost=438373.68..5934494.63 rows=110655704 width=50) Hash Cond: (rel.rev = dst.id) -> Hash Join (cost=160071.36..3586227.92 rows=110655704 width=37) Hash Cond: (rel.blob = src.id) -> Seq Scan on content_early_in_rev rel (cost=0.00..1811401.04 rows=110655704 width=24) -> Hash (cost=82185.16..82185.16 rows=4028016 width=29) -> Seq Scan on content src (cost=0.00..82185.16 rows=4028016 width=29) -> Hash (cost=135469.03..135469.03 rows=7386903 width=29) -> Seq Scan on revision dst (cost=0.00..135469.03 rows=7386903 width=29) -> Hash (cost=123266.96..123266.96 rows=5915296 width=59) -> Seq Scan on location (cost=0.00..123266.96 rows=5915296 width=59) JIT: Functions: 27 Options: Inlining true, Optimization true, Expressions true, Deforming true (16 lignes) douardda: Should do:
```
fullset=# EXPLAIN
SELECT encode(src.sha1::bytea, 'hex'),encode(dst.sha1::bytea… | |||||
Not Done Inline ActionsNote that it's an optimization of Postgres' query planner. More naive SQL engines would do a cartesian product and filter. vlorentz: Note that it's an optimization of Postgres' query planner. More naive SQL engines would do a… | |||||
Done Inline Actionsexact, but thanks g*d we only use postgres :-) douardda: exact, but thanks g*d we only use postgres :-) | |||||
Done Inline ActionsAlso, since it's used for tests only, we really don't care if the engine actually does the cartesian product here, but yeah, thx postgres anyway :-) douardda: Also, since it's used for tests only, we really don't care if the engine actually does the… | |||||
Not Done Inline Actionsshouldn't we use it for the first/all occurrence(s) query as well? I mean in the provenance backend aeviso: shouldn't we use it for the first/all occurrence(s) query as well? I mean in the provenance… | |||||
Done Inline Actionspossibly, I'll have a look douardda: possibly, I'll have a look | |||||
Done Inline Actionsactually the way it's now written in the provenance backend (using inner join) is better since it does not depend on the sql backend being smart. So let's stick to the way it is there. douardda: actually the way it's now written in the provenance backend (using inner join) is better since… | |||||
return set(cur.fetchall()) | |||||
def get_timestamp(cur, table, sha1): | |||||
"""return the date for the 'sha1' from the DB 'table' (as hex) | |||||
'cur' is a cursor to the provenance index DB. | |||||
""" | |||||
if isinstance(sha1, str): | |||||
sha1 = bytes.fromhex(sha1) | |||||
cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,)) | |||||
return [date.timestamp() for (date,) in cur.fetchall()] | |||||
@pytest.mark.parametrize( | |||||
"repo, lower, mindepth", | |||||
( | |||||
("cmdbts2", True, 1), | |||||
("cmdbts2", False, 1), | |||||
("cmdbts2", True, 2), | |||||
("cmdbts2", False, 2), | |||||
), | |||||
) | |||||
def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): | |||||
# read data/README.md for more details on how these datasets are generated | |||||
data = load_repo_data(repo) | |||||
fill_storage(swh_storage, data) | |||||
syntheticfile = get_datafile( | |||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | |||||
) | |||||
revisions = {rev["id"]: rev for rev in data["revision"]} | |||||
rows = { | |||||
"content": set(), | |||||
"content_in_dir": set(), | |||||
"content_early_in_rev": set(), | |||||
"directory": set(), | |||||
"directory_in_rev": set(), | |||||
"location": set(), | |||||
"revision": set(), | |||||
} | |||||
for synth_rev in synthetic_result(syntheticfile): | |||||
revision = revisions[synth_rev["sha1"]] | |||||
entry = RevisionEntry( | |||||
id=revision["id"], | |||||
date=ts2dt(revision["date"]), | |||||
root=revision["directory"], | |||||
) | |||||
revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) | |||||
# each "entry" in the synth file is one new revision | |||||
rows["revision"].add(synth_rev["sha1"].hex()) | |||||
assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ | |||||
"msg" | |||||
] | |||||
# check the timestamp of the revision | |||||
rev_ts = synth_rev["date"] | |||||
assert get_timestamp( | |||||
provenance.cursor, "revision", synth_rev["sha1"].hex() | |||||
) == [rev_ts], synth_rev["msg"] | |||||
# this revision might have added new content objects | |||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) | |||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) | |||||
assert rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] | |||||
# check for R-C (direct) entries | |||||
# these are added directly in the content_early_in_rev table | |||||
rows["content_early_in_rev"] |= set( | |||||
(x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] | |||||
) | |||||
assert rows["content_early_in_rev"] == relations( | |||||
provenance.cursor, "content", "revision" | |||||
), synth_rev["msg"] | |||||
# check timestamps | |||||
for rc in synth_rev["R_C"]: | |||||
assert get_timestamp(provenance.cursor, "content", rc["dst"]) == [ | |||||
rev_ts + rc["rel_ts"] | |||||
], synth_rev["msg"] | |||||
# check directories | |||||
# each directory stored in the provenance index is an entry | |||||
# in the "directory" table... | |||||
rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) | |||||
assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ | |||||
"msg" | |||||
] | |||||
# ... + a number of rows in the "directory_in_rev" table... | |||||
# check for R-D entries | |||||
rows["directory_in_rev"] |= set( | |||||
(x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] | |||||
) | |||||
assert rows["directory_in_rev"] == relations( | |||||
provenance.cursor, "directory", "revision" | |||||
), synth_rev["msg"] | |||||
# check timestamps | |||||
for rd in synth_rev["R_D"]: | |||||
assert get_timestamp(provenance.cursor, "directory", rd["dst"]) == [ | |||||
rev_ts + rd["rel_ts"] | |||||
], synth_rev["msg"] | |||||
# ... + a number of rows in the "content_in_dir" table | |||||
# for content of the directory. | |||||
# check for D-C entries | |||||
rows["content_in_dir"] |= set( | |||||
(x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] | |||||
) | |||||
assert rows["content_in_dir"] == relations( | |||||
provenance.cursor, "content", "directory" | |||||
), synth_rev["msg"] | |||||
# check timestamps | |||||
for dc in synth_rev["D_C"]: | |||||
assert get_timestamp(provenance.cursor, "content", dc["dst"]) == [ | |||||
rev_ts + dc["rel_ts"] | |||||
], synth_rev["msg"] | |||||
# check for location entries | |||||
rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) | |||||
rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) | |||||
rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) | |||||
assert rows["location"] == locations(provenance.cursor), synth_rev["msg"] | |||||
@pytest.mark.parametrize( | |||||
"syntheticfile, args", | |||||
( | |||||
("synthetic_lower_1.txt", {"lower": True, "mindepth": 1}), | |||||
("synthetic_upper_1.txt", {"lower": False, "mindepth": 1}), | |||||
("synthetic_lower_2.txt", {"lower": True, "mindepth": 2}), | |||||
("synthetic_upper_2.txt", {"lower": False, "mindepth": 2}), | |||||
), | |||||
) | |||||
def test_provenance_heuristics_CMDBTS( | |||||
provenance, storage_and_CMDBTS, archive, syntheticfile, args | |||||
): | |||||
storage, data = storage_and_CMDBTS | |||||
revisions = {rev["id"]: rev for rev in data["revision"]} | |||||
rows = { | |||||
"content": set(), | |||||
"content_in_dir": set(), | |||||
"content_early_in_rev": set(), | |||||
"directory": set(), | |||||
"directory_in_rev": set(), | |||||
"location": set(), | |||||
"revision": set(), | |||||
} | |||||
for synth_rev in synthetic_result(syntheticfile): | |||||
revision = revisions[synth_rev["sha1"]] | |||||
entry = RevisionEntry( | |||||
id=revision["id"], | |||||
date=ts2dt(revision["date"]), | |||||
root=revision["directory"], | |||||
) | |||||
revision_add(provenance, archive, [entry], **args) | |||||
# each "entry" in the synth file is one new revision | |||||
rows["revision"].add(synth_rev["sha1"].hex()) | |||||
assert rows["revision"] == sha1s(provenance.cursor, "revision"), synth_rev[ | |||||
"msg" | |||||
] | |||||
# this revision might have added new content objects | |||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) | |||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) | |||||
assert rows["content"] == sha1s(provenance.cursor, "content"), synth_rev["msg"] | |||||
# check for R-C (direct) entries | |||||
rows["content_early_in_rev"] |= set( | |||||
(x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_C"] | |||||
) | |||||
assert rows["content_early_in_rev"] == relations( | |||||
provenance.cursor, "content", "revision" | |||||
), synth_rev["msg"] | |||||
# check directories | |||||
rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) | |||||
assert rows["directory"] == sha1s(provenance.cursor, "directory"), synth_rev[ | |||||
"msg" | |||||
] | |||||
# check for R-D entries | |||||
rows["directory_in_rev"] |= set( | |||||
(x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["R_D"] | |||||
) | |||||
assert rows["directory_in_rev"] == relations( | |||||
provenance.cursor, "directory", "revision" | |||||
), synth_rev["msg"] | |||||
# check for D-C entries | |||||
rows["content_in_dir"] |= set( | |||||
(x["dst"].hex(), x["src"].hex(), x["path"]) for x in synth_rev["D_C"] | |||||
) | |||||
assert rows["content_in_dir"] == relations( | |||||
provenance.cursor, "content", "directory" | |||||
), synth_rev["msg"] | |||||
# check for location entries | |||||
rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) | |||||
rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) | |||||
rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) | |||||
assert rows["location"] == locations(provenance.cursor), synth_rev["msg"] |
is this SQL syntax actually joining the tables?