Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/tests/test_provenance_heuristics.py
Show All 19 Lines | |||||
def sha1s(cur, table):
    """Return the 'sha1' column of the DB 'table' as a set of hex strings.

    'cur' is a cursor to the provenance index DB. Rows are read by column
    name (``row["sha1"]``), so the cursor must yield mapping-like rows
    (presumably a dict-style cursor such as psycopg2's RealDictCursor --
    confirm against the fixture that creates it).

    'table' is interpolated verbatim into the SQL text (identifiers cannot be
    passed as bound parameters), so it must be a trusted table name.
    """
    cur.execute(f"SELECT sha1 FROM {table}")
    # each stored sha1 exposes .hex(); duplicates collapse in the set
    return {row["sha1"].hex() for row in cur.fetchall()}
def locations(cur):
    """Return the 'path' column of the DB location table as a set.

    'cur' is a cursor to the provenance index DB. The query aliases the
    escaped path as ``path`` and rows are read by that name, so the cursor
    must yield mapping-like rows (presumably a dict-style cursor -- confirm
    against the fixture that creates it).
    """
    cur.execute("SELECT encode(location.path::bytea, 'escape') AS path FROM location")
    return {row["path"] for row in cur.fetchall()}
def relations(cur, src, dst):
    """Return the triplets ('sha1', 'sha1', 'path') from the DB
    for the relation between 'src' table and 'dst' table
    (i.e. for C-R, C-D and D-R relations).

    'cur' is a cursor to the provenance index DB. Result columns are aliased
    (``flavor``, ``src``, ``dst``, ``path``) and read by name, so the cursor
    must yield mapping-like rows (presumably a dict-style cursor -- confirm
    against the fixture that creates it).

    'src' and 'dst' are interpolated verbatim into the SQL text as table and
    column names, so they must be trusted identifiers.

    When the DB flavor is not "with-path", the 'path' element of every
    triplet is the empty string.
    """
    relation = f"{src}_in_{dst}"
    cur.execute("SELECT swh_get_dbflavor() AS flavor")
    with_path = cur.fetchone()["flavor"] == "with-path"

    # note that the columns have the same name as the relations they refer to,
    # so we can write things like "rel.{dst}=src.id" in the query below
    if with_path:
        cur.execute(
            f"""
            SELECT encode(src.sha1::bytea, 'hex') AS src,
                   encode(dst.sha1::bytea, 'hex') AS dst,
                   encode(location.path::bytea, 'escape') AS path
            FROM {relation} as relation
            INNER JOIN {src} AS src ON (relation.{src} = src.id)
            INNER JOIN {dst} AS dst ON (relation.{dst} = dst.id)
            INNER JOIN location ON (relation.location = location.id)
            """
        )
    else:
        cur.execute(
            f"""
            SELECT encode(src.sha1::bytea, 'hex') AS src,
                   encode(dst.sha1::bytea, 'hex') AS dst,
                   '' AS path
            FROM {relation} as relation
            INNER JOIN {src} AS src ON (src.id = relation.{src})
            INNER JOIN {dst} AS dst ON (dst.id = relation.{dst})
            """
        )
    # build hashable tuples explicitly: mapping rows cannot be put in a set
    return {(row["src"], row["dst"], row["path"]) for row in cur.fetchall()}
def get_timestamp(cur, table, sha1):
    """Return the dates recorded for 'sha1' in the DB 'table', as timestamps.

    'cur' is a cursor to the provenance index DB; rows are read by column
    name (``row["date"]``), so it must yield mapping-like rows.

    'sha1' may be given as a str, in which case it is converted with
    hash_to_bytes() first; any other value is passed to the query as is.

    'table' is interpolated verbatim into the SQL text and must be a trusted
    table name; 'sha1' itself is passed as a bound query parameter.

    The result is a list with one ``date.timestamp()`` value per matching
    row (empty when no row matches).
    """
    if isinstance(sha1, str):
        sha1 = hash_to_bytes(sha1)
    cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
    return [row["date"].timestamp() for row in cur.fetchall()]
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"repo, lower, mindepth", | "repo, lower, mindepth", | ||||
( | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
▲ Show 20 Lines • Show All 151 Lines • ▼ Show 20 Lines | ): | ||||
) | ) | ||||
expected_occurrences = {} | expected_occurrences = {} | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
rev_id = synth_rev["sha1"].hex() | rev_id = synth_rev["sha1"].hex() | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
for rc in synth_rev["R_C"]: | for rc in synth_rev["R_C"]: | ||||
expected_occurrences.setdefault(rc["dst"].hex(), []).append( | expected_occurrences.setdefault(rc["dst"].hex(), []).append( | ||||
(rev_id, rev_ts, maybe_path(rc["path"])) | (rev_id, rev_ts, None, maybe_path(rc["path"])) | ||||
) | ) | ||||
for dc in synth_rev["D_C"]: | for dc in synth_rev["D_C"]: | ||||
assert dc["prefix"] is not None # to please mypy | assert dc["prefix"] is not None # to please mypy | ||||
expected_occurrences.setdefault(dc["dst"].hex(), []).append( | expected_occurrences.setdefault(dc["dst"].hex(), []).append( | ||||
(rev_id, rev_ts, maybe_path(dc["prefix"] + "/" + dc["path"])) | (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) | ||||
) | ) | ||||
for content_id, results in expected_occurrences.items(): | for content_id, results in expected_occurrences.items(): | ||||
expected = [(content_id, *result) for result in results] | expected = [(content_id, *result) for result in results] | ||||
db_occurrences = [ | db_occurrences = [ | ||||
(blob.hex(), rev.hex(), date.timestamp(), path.decode()) | ( | ||||
for blob, rev, date, path in provenance.content_find_all( | occur.content.hex(), | ||||
hash_to_bytes(content_id) | occur.revision.hex(), | ||||
occur.date.timestamp(), | |||||
occur.origin, | |||||
occur.path.decode(), | |||||
) | ) | ||||
for occur in provenance.content_find_all(hash_to_bytes(content_id)) | |||||
] | ] | ||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
# this is not true if the db stores no path, because a same content | # this is not true if the db stores no path, because a same content | ||||
# that appears several times in a given revision may be reported | # that appears several times in a given revision may be reported | ||||
# only once by content_find_all() | # only once by content_find_all() | ||||
assert len(db_occurrences) == len(expected) | assert len(db_occurrences) == len(expected) | ||||
assert set(db_occurrences) == set(expected) | assert set(db_occurrences) == set(expected) | ||||
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines | for synth_rev in synthetic_result(syntheticfile): | ||||
expected_first[sha1] = (rev_id, rev_ts, rc["path"]) | expected_first[sha1] = (rev_id, rev_ts, rc["path"]) | ||||
for dc in synth_rev["D_C"]: | for dc in synth_rev["D_C"]: | ||||
sha1 = rc["dst"].hex() | sha1 = rc["dst"].hex() | ||||
assert sha1 in expected_first | assert sha1 in expected_first | ||||
# nothing to do there, this content cannot be a "first seen file" | # nothing to do there, this content cannot be a "first seen file" | ||||
for content_id, (rev_id, ts, paths) in expected_first.items(): | for content_id, (rev_id, ts, paths) in expected_first.items(): | ||||
(r_sha1, r_rev_id, r_ts, r_path) = provenance.content_find_first( | occur = provenance.content_find_first(hash_to_bytes(content_id)) | ||||
hash_to_bytes(content_id) | assert occur.content.hex() == content_id | ||||
) | assert occur.revision.hex() == rev_id | ||||
assert r_sha1.hex() == content_id | assert occur.date.timestamp() == ts | ||||
assert r_rev_id.hex() == rev_id | assert occur.origin is None | ||||
assert r_ts.timestamp() == ts | |||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
assert r_path.decode() in paths | assert occur.path.decode() in paths |