Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/tests/test_provenance_heuristics.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from typing import Dict, List, Tuple | from datetime import datetime | ||||
from typing import Any, Dict, List, Optional, Set, Tuple | |||||
import psycopg2 | |||||
import pytest | import pytest | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.model import Sha1Git | |||||
from swh.provenance.archive import ArchiveInterface | |||||
from swh.provenance.model import RevisionEntry | from swh.provenance.model import RevisionEntry | ||||
from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase | |||||
from swh.provenance.provenance import ProvenanceInterface | |||||
from swh.provenance.revision import revision_add | from swh.provenance.revision import revision_add | ||||
from swh.provenance.tests.conftest import ( | from swh.provenance.tests.conftest import ( | ||||
fill_storage, | fill_storage, | ||||
get_datafile, | get_datafile, | ||||
load_repo_data, | load_repo_data, | ||||
synthetic_result, | synthetic_result, | ||||
) | ) | ||||
from swh.provenance.tests.test_provenance_db import ts2dt | from swh.provenance.tests.test_provenance_db import ts2dt | ||||
from swh.storage.postgresql.storage import Storage | |||||
def sha1s(cur: "psycopg2.extensions.cursor", table: str) -> Set[str]:
    """Return the set of 'sha1' values from the DB `table`, as hex strings.

    `cur` is a cursor to the provenance index DB.

    Note: the values are hex-encoded via ``bytes.hex()``, hence the return
    type is ``Set[str]`` (not raw ``Sha1Git`` bytes).
    """
    # `table` is a trusted, caller-supplied table name (test helper only),
    # so f-string interpolation into the query is acceptable here.
    cur.execute(f"SELECT sha1 FROM {table}")
    return {row["sha1"].hex() for row in cur.fetchall()}
def locations(cur: "psycopg2.extensions.cursor") -> Set[str]:
    """Return the 'path' column from the DB location table.

    `cur` is a cursor to the provenance index DB.

    Note: PostgreSQL's ``encode(..., 'escape')`` returns ``text``, which
    psycopg2 maps to Python ``str`` — so the result is ``Set[str]``, not
    ``Set[bytes]``.
    """
    cur.execute("SELECT encode(location.path::bytea, 'escape') AS path FROM location")
    return {row["path"] for row in cur.fetchall()}
def relations(cur, src, dst): | def relations( | ||||
cur: psycopg2.extensions.cursor, src: str, dst: str | |||||
) -> Set[Tuple[Sha1Git, Sha1Git, bytes]]: | |||||
"""return the triplets ('sha1', 'sha1', 'path') from the DB | """return the triplets ('sha1', 'sha1', 'path') from the DB | ||||
for the relation between 'src' table and 'dst' table | for the relation between 'src' table and 'dst' table | ||||
(i.e. for C-R, C-D and D-R relations). | (i.e. for C-R, C-D and D-R relations). | ||||
'cur' is a cursor to the provenance index DB. | 'cur' is a cursor to the provenance index DB. | ||||
""" | """ | ||||
relation = f"{src}_in_{dst}" | relation = f"{src}_in_{dst}" | ||||
Show All 23 Lines | else: | ||||
FROM {relation} as relation | FROM {relation} as relation | ||||
INNER JOIN {src} AS src ON (src.id = relation.{src}) | INNER JOIN {src} AS src ON (src.id = relation.{src}) | ||||
INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) | INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) | ||||
""" | """ | ||||
) | ) | ||||
return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall()) | return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall()) | ||||
def get_timestamp(
    cur: "psycopg2.extensions.cursor", table: str, sha1: "Union[Sha1Git, str]"
) -> List[float]:
    """Return the POSIX timestamps of the 'date' column for `sha1` in `table`.

    `cur` is a cursor to the provenance index DB. `sha1` may be given either
    as raw bytes or as a hex string (converted with `hash_to_bytes` for
    backward compatibility with existing callers).

    Note: ``datetime.timestamp()`` yields ``float``, so the return type is
    ``List[float]`` — not ``List[datetime]``.
    """
    if isinstance(sha1, str):
        # tolerate hex-string input; normalize to the bytes the DB stores
        sha1 = hash_to_bytes(sha1)
    cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
    return [row["date"].timestamp() for row in cur.fetchall()]
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"repo, lower, mindepth", | "repo, lower, mindepth", | ||||
( | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
("cmdbts2", False, 2), | ("cmdbts2", False, 2), | ||||
("out-of-order", True, 1), | ("out-of-order", True, 1), | ||||
), | ), | ||||
) | ) | ||||
def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): | def test_provenance_heuristics( | ||||
provenance: ProvenanceInterface, | |||||
swh_storage: Storage, | |||||
archive: ArchiveInterface, | |||||
repo: str, | |||||
lower: bool, | |||||
mindepth: int, | |||||
) -> None: | |||||
# read data/README.md for more details on how these datasets are generated | # read data/README.md for more details on how these datasets are generated | ||||
data = load_repo_data(repo) | data = load_repo_data(repo) | ||||
fill_storage(swh_storage, data) | fill_storage(swh_storage, data) | ||||
syntheticfile = get_datafile( | syntheticfile = get_datafile( | ||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | ||||
) | ) | ||||
revisions = {rev["id"]: rev for rev in data["revision"]} | revisions = {rev["id"]: rev for rev in data["revision"]} | ||||
rows = { | rows: Dict[str, Set[Any]] = { | ||||
"content": set(), | "content": set(), | ||||
"content_in_directory": set(), | "content_in_directory": set(), | ||||
"content_in_revision": set(), | "content_in_revision": set(), | ||||
"directory": set(), | "directory": set(), | ||||
"directory_in_revision": set(), | "directory_in_revision": set(), | ||||
"location": set(), | "location": set(), | ||||
"revision": set(), | "revision": set(), | ||||
} | } | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
cursor = provenance.storage.cursor | cursor = provenance.storage.cursor | ||||
def maybe_path(path: str) -> str: | def maybe_path(path: str) -> str: | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
return path | return path | ||||
return "" | return "" | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
revision = revisions[synth_rev["sha1"]] | revision = revisions[synth_rev["sha1"]] | ||||
entry = RevisionEntry( | entry = RevisionEntry( | ||||
id=revision["id"], | id=revision["id"], | ||||
date=ts2dt(revision["date"]), | date=ts2dt(revision["date"]), | ||||
root=revision["directory"], | root=revision["directory"], | ||||
) | ) | ||||
revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) | revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) | ||||
# each "entry" in the synth file is one new revision | # each "entry" in the synth file is one new revision | ||||
rows["revision"].add(synth_rev["sha1"].hex()) | rows["revision"].add(synth_rev["sha1"].hex()) | ||||
assert rows["revision"] == sha1s(cursor, "revision"), synth_rev["msg"] | assert rows["revision"] == sha1s(cursor, "revision"), synth_rev["msg"] | ||||
# check the timestamp of the revision | # check the timestamp of the revision | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
assert get_timestamp(cursor, "revision", synth_rev["sha1"].hex()) == [ | assert get_timestamp(cursor, "revision", synth_rev["sha1"]) == [ | ||||
rev_ts | rev_ts | ||||
], synth_rev["msg"] | ], synth_rev["msg"] | ||||
# this revision might have added new content objects | # this revision might have added new content objects | ||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) | rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) | ||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) | rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) | ||||
assert rows["content"] == sha1s(cursor, "content"), synth_rev["msg"] | assert rows["content"] == sha1s(cursor, "content"), synth_rev["msg"] | ||||
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
("cmdbts2", False, 2), | ("cmdbts2", False, 2), | ||||
("out-of-order", True, 1), | ("out-of-order", True, 1), | ||||
), | ), | ||||
) | ) | ||||
def test_provenance_heuristics_content_find_all( | def test_provenance_heuristics_content_find_all( | ||||
provenance, swh_storage, archive, repo, lower, mindepth | provenance: ProvenanceInterface, | ||||
): | swh_storage: Storage, | ||||
archive: ArchiveInterface, | |||||
repo: str, | |||||
lower: bool, | |||||
mindepth: int, | |||||
) -> None: | |||||
# read data/README.md for more details on how these datasets are generated | # read data/README.md for more details on how these datasets are generated | ||||
data = load_repo_data(repo) | data = load_repo_data(repo) | ||||
fill_storage(swh_storage, data) | fill_storage(swh_storage, data) | ||||
revisions = [ | revisions = [ | ||||
RevisionEntry( | RevisionEntry( | ||||
id=revision["id"], | id=revision["id"], | ||||
date=ts2dt(revision["date"]), | date=ts2dt(revision["date"]), | ||||
root=revision["directory"], | root=revision["directory"], | ||||
) | ) | ||||
for revision in data["revision"] | for revision in data["revision"] | ||||
] | ] | ||||
def maybe_path(path: str) -> str: | def maybe_path(path: str) -> str: | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
return path | return path | ||||
return "" | return "" | ||||
# XXX adding all revisions at once should be working just fine, but it does not... | # XXX adding all revisions at once should be working just fine, but it does not... | ||||
# revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | ||||
# ...so add revisions one at a time for now | # ...so add revisions one at a time for now | ||||
for revision in revisions: | for revision in revisions: | ||||
revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | ||||
syntheticfile = get_datafile( | syntheticfile = get_datafile( | ||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | ||||
) | ) | ||||
expected_occurrences = {} | expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {} | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
rev_id = synth_rev["sha1"].hex() | rev_id = synth_rev["sha1"].hex() | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
for rc in synth_rev["R_C"]: | for rc in synth_rev["R_C"]: | ||||
expected_occurrences.setdefault(rc["dst"].hex(), []).append( | expected_occurrences.setdefault(rc["dst"].hex(), []).append( | ||||
(rev_id, rev_ts, None, maybe_path(rc["path"])) | (rev_id, rev_ts, None, maybe_path(rc["path"])) | ||||
) | ) | ||||
for dc in synth_rev["D_C"]: | for dc in synth_rev["D_C"]: | ||||
assert dc["prefix"] is not None # to please mypy | assert dc["prefix"] is not None # to please mypy | ||||
expected_occurrences.setdefault(dc["dst"].hex(), []).append( | expected_occurrences.setdefault(dc["dst"].hex(), []).append( | ||||
(rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) | (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) | ||||
) | ) | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
for content_id, results in expected_occurrences.items(): | for content_id, results in expected_occurrences.items(): | ||||
expected = [(content_id, *result) for result in results] | expected = [(content_id, *result) for result in results] | ||||
db_occurrences = [ | db_occurrences = [ | ||||
( | ( | ||||
occur.content.hex(), | occur.content.hex(), | ||||
occur.revision.hex(), | occur.revision.hex(), | ||||
occur.date.timestamp(), | occur.date.timestamp(), | ||||
occur.origin, | occur.origin, | ||||
Show All 15 Lines | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
("cmdbts2", False, 2), | ("cmdbts2", False, 2), | ||||
("out-of-order", True, 1), | ("out-of-order", True, 1), | ||||
), | ), | ||||
) | ) | ||||
def test_provenance_heuristics_content_find_first( | def test_provenance_heuristics_content_find_first( | ||||
provenance, swh_storage, archive, repo, lower, mindepth | provenance: ProvenanceInterface, | ||||
): | swh_storage: Storage, | ||||
archive: ArchiveInterface, | |||||
repo: str, | |||||
lower: bool, | |||||
mindepth: int, | |||||
) -> None: | |||||
# read data/README.md for more details on how these datasets are generated | # read data/README.md for more details on how these datasets are generated | ||||
data = load_repo_data(repo) | data = load_repo_data(repo) | ||||
fill_storage(swh_storage, data) | fill_storage(swh_storage, data) | ||||
revisions = [ | revisions = [ | ||||
RevisionEntry( | RevisionEntry( | ||||
id=revision["id"], | id=revision["id"], | ||||
date=ts2dt(revision["date"]), | date=ts2dt(revision["date"]), | ||||
root=revision["directory"], | root=revision["directory"], | ||||
) | ) | ||||
for revision in data["revision"] | for revision in data["revision"] | ||||
] | ] | ||||
# XXX adding all revisions at once should be working just fine, but it does not... | # XXX adding all revisions at once should be working just fine, but it does not... | ||||
# revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | ||||
# ...so add revisions one at a time for now | # ...so add revisions one at a time for now | ||||
for revision in revisions: | for revision in revisions: | ||||
revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | ||||
syntheticfile = get_datafile( | syntheticfile = get_datafile( | ||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | ||||
) | ) | ||||
expected_first: Dict[str, Tuple[str, str, List[str]]] = {} | expected_first: Dict[str, Tuple[str, float, List[str]]] = {} | ||||
# dict of tuples (blob_id, rev_id, [path, ...]) the third element for path | # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path | ||||
# is a list because a content can be added at several places in a single | # is a list because a content can be added at several places in a single | ||||
# revision, in which case the result of content_find_first() is one of | # revision, in which case the result of content_find_first() is one of | ||||
# those path, but we have no guarantee which one it will return. | # those path, but we have no guarantee which one it will return. | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
rev_id = synth_rev["sha1"].hex() | rev_id = synth_rev["sha1"].hex() | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
for rc in synth_rev["R_C"]: | for rc in synth_rev["R_C"]: | ||||
sha1 = rc["dst"].hex() | sha1 = rc["dst"].hex() | ||||
if sha1 not in expected_first: | if sha1 not in expected_first: | ||||
assert rc["rel_ts"] == 0 | assert rc["rel_ts"] == 0 | ||||
expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) | expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) | ||||
else: | else: | ||||
if rev_ts == expected_first[sha1][1]: | if rev_ts == expected_first[sha1][1]: | ||||
expected_first[sha1][2].append(rc["path"]) | expected_first[sha1][2].append(rc["path"]) | ||||
elif rev_ts < expected_first[sha1][1]: | elif rev_ts < expected_first[sha1][1]: | ||||
expected_first[sha1] = (rev_id, rev_ts, rc["path"]) | expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) | ||||
for dc in synth_rev["D_C"]: | for dc in synth_rev["D_C"]: | ||||
sha1 = rc["dst"].hex() | sha1 = rc["dst"].hex() | ||||
assert sha1 in expected_first | assert sha1 in expected_first | ||||
# nothing to do there, this content cannot be a "first seen file" | # nothing to do there, this content cannot be a "first seen file" | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
for content_id, (rev_id, ts, paths) in expected_first.items(): | for content_id, (rev_id, ts, paths) in expected_first.items(): | ||||
occur = provenance.content_find_first(hash_to_bytes(content_id)) | occur = provenance.content_find_first(hash_to_bytes(content_id)) | ||||
assert occur is not None | |||||
assert occur.content.hex() == content_id | assert occur.content.hex() == content_id | ||||
assert occur.revision.hex() == rev_id | assert occur.revision.hex() == rev_id | ||||
assert occur.date.timestamp() == ts | assert occur.date.timestamp() == ts | ||||
assert occur.origin is None | assert occur.origin is None | ||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
assert occur.path.decode() in paths | assert occur.path.decode() in paths |