Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/tests/test_provenance_heuristics.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from typing import Dict, List, Tuple | from datetime import datetime | ||||
from typing import Any, Dict, List, Optional, Set, Tuple | |||||
import psycopg2 | |||||
import pytest | import pytest | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.model import Sha1Git | |||||
from swh.provenance.archive import ArchiveInterface | |||||
from swh.provenance.model import RevisionEntry | from swh.provenance.model import RevisionEntry | ||||
from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase | |||||
from swh.provenance.provenance import ProvenanceInterface | |||||
from swh.provenance.revision import revision_add | from swh.provenance.revision import revision_add | ||||
from swh.provenance.tests.conftest import ( | from swh.provenance.tests.conftest import ( | ||||
fill_storage, | fill_storage, | ||||
get_datafile, | get_datafile, | ||||
load_repo_data, | load_repo_data, | ||||
synthetic_result, | synthetic_result, | ||||
) | ) | ||||
from swh.provenance.tests.test_provenance_db import ts2dt | from swh.provenance.tests.test_provenance_db import ts2dt | ||||
from swh.storage.postgresql.storage import Storage | |||||
def sha1s(cur: "psycopg2.extensions.cursor", table: str) -> Set[str]:
    """Return the set of 'sha1' values from the DB `table`, as hex strings.

    `cur` is a cursor to the provenance index DB.

    Note: the values are hex-encoded via ``bytes.hex()``, hence the return
    type is ``Set[str]`` (not raw ``Sha1Git`` bytes).
    """
    # `table` is a trusted, caller-supplied table name (test helper only),
    # so f-string interpolation into the query is acceptable here.
    cur.execute(f"SELECT sha1 FROM {table}")
    return {row["sha1"].hex() for row in cur.fetchall()}
def locations(cur: "psycopg2.extensions.cursor") -> Set[str]:
    """Return the 'path' column from the DB location table.

    `cur` is a cursor to the provenance index DB.

    Note: PostgreSQL's ``encode(..., 'escape')`` returns ``text``, which
    psycopg2 maps to Python ``str`` — so the result is ``Set[str]``, not
    ``Set[bytes]``.
    """
    cur.execute("SELECT encode(location.path::bytea, 'escape') AS path FROM location")
    return {row["path"] for row in cur.fetchall()}
def relations(cur, src, dst): | def relations( | ||||
cur: psycopg2.extensions.cursor, src: str, dst: str | |||||
) -> Set[Tuple[Sha1Git, Sha1Git, bytes]]: | |||||
"""return the triplets ('sha1', 'sha1', 'path') from the DB | """return the triplets ('sha1', 'sha1', 'path') from the DB | ||||
for the relation between 'src' table and 'dst' table | for the relation between 'src' table and 'dst' table | ||||
(i.e. for C-R, C-D and D-R relations). | (i.e. for C-R, C-D and D-R relations). | ||||
'cur' is a cursor to the provenance index DB. | 'cur' is a cursor to the provenance index DB. | ||||
""" | """ | ||||
relation = f"{src}_in_{dst}" | relation = f"{src}_in_{dst}" | ||||
Show All 23 Lines | else: | ||||
FROM {relation} as relation | FROM {relation} as relation | ||||
INNER JOIN {src} AS src ON (src.id = relation.{src}) | INNER JOIN {src} AS src ON (src.id = relation.{src}) | ||||
INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) | INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) | ||||
""" | """ | ||||
) | ) | ||||
return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall()) | return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall()) | ||||
def get_timestamp(
    cur: "psycopg2.extensions.cursor", table: str, sha1: "Union[Sha1Git, str]"
) -> List[float]:
    """Return the POSIX timestamps of the 'date' column for `sha1` in `table`.

    `cur` is a cursor to the provenance index DB. `sha1` may be given either
    as raw bytes or as a hex string (converted with `hash_to_bytes` for
    backward compatibility with existing callers).

    Note: ``datetime.timestamp()`` yields ``float``, so the return type is
    ``List[float]`` — not ``List[datetime]``.
    """
    if isinstance(sha1, str):
        # tolerate hex-string input; normalize to the bytes the DB stores
        sha1 = hash_to_bytes(sha1)
    cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
    return [row["date"].timestamp() for row in cur.fetchall()]
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"repo, lower, mindepth", | "repo, lower, mindepth", | ||||
( | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
("cmdbts2", False, 2), | ("cmdbts2", False, 2), | ||||
("out-of-order", True, 1), | ("out-of-order", True, 1), | ||||
), | ), | ||||
) | ) | ||||
def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): | def test_provenance_heuristics( | ||||
provenance: ProvenanceInterface, | |||||
swh_storage: Storage, | |||||
archive: ArchiveInterface, | |||||
repo: str, | |||||
lower: bool, | |||||
mindepth: int, | |||||
) -> None: | |||||
# read data/README.md for more details on how these datasets are generated | # read data/README.md for more details on how these datasets are generated | ||||
data = load_repo_data(repo) | data = load_repo_data(repo) | ||||
fill_storage(swh_storage, data) | fill_storage(swh_storage, data) | ||||
syntheticfile = get_datafile( | syntheticfile = get_datafile( | ||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | ||||
) | ) | ||||
revisions = {rev["id"]: rev for rev in data["revision"]} | revisions = {rev["id"]: rev for rev in data["revision"]} | ||||
rows = { | rows: Dict[str, Set[Any]] = { | ||||
"content": set(), | "content": set(), | ||||
"content_in_directory": set(), | "content_in_directory": set(), | ||||
"content_in_revision": set(), | "content_in_revision": set(), | ||||
"directory": set(), | "directory": set(), | ||||
"directory_in_revision": set(), | "directory_in_revision": set(), | ||||
"location": set(), | "location": set(), | ||||
"revision": set(), | "revision": set(), | ||||
} | } | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
cursor = provenance.storage.cursor | cursor = provenance.storage.cursor | ||||
def maybe_path(path: str) -> str: | def maybe_path(path: str) -> str: | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
return path | return path | ||||
return "" | return "" | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
revision = revisions[synth_rev["sha1"]] | revision = revisions[synth_rev["sha1"]] | ||||
entry = RevisionEntry( | entry = RevisionEntry( | ||||
id=revision["id"], | id=revision["id"], | ||||
date=ts2dt(revision["date"]), | date=ts2dt(revision["date"]), | ||||
root=revision["directory"], | root=revision["directory"], | ||||
) | ) | ||||
revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) | revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) | ||||
# each "entry" in the synth file is one new revision | # each "entry" in the synth file is one new revision | ||||
rows["revision"].add(synth_rev["sha1"].hex()) | rows["revision"].add(synth_rev["sha1"].hex()) | ||||
assert rows["revision"] == sha1s(cursor, "revision"), synth_rev["msg"] | assert rows["revision"] == sha1s(cursor, "revision"), synth_rev["msg"] | ||||
# check the timestamp of the revision | # check the timestamp of the revision | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
assert get_timestamp(cursor, "revision", synth_rev["sha1"].hex()) == [ | assert get_timestamp(cursor, "revision", synth_rev["sha1"]) == [ | ||||
rev_ts | rev_ts | ||||
], synth_rev["msg"] | ], synth_rev["msg"] | ||||
# this revision might have added new content objects | # this revision might have added new content objects | ||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) | rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) | ||||
rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) | rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) | ||||
assert rows["content"] == sha1s(cursor, "content"), synth_rev["msg"] | assert rows["content"] == sha1s(cursor, "content"), synth_rev["msg"] | ||||
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
("cmdbts2", False, 2), | ("cmdbts2", False, 2), | ||||
("out-of-order", True, 1), | ("out-of-order", True, 1), | ||||
), | ), | ||||
) | ) | ||||
def test_provenance_heuristics_content_find_all( | def test_provenance_heuristics_content_find_all( | ||||
provenance, swh_storage, archive, repo, lower, mindepth | provenance: ProvenanceInterface, | ||||
): | swh_storage: Storage, | ||||
archive: ArchiveInterface, | |||||
repo: str, | |||||
lower: bool, | |||||
mindepth: int, | |||||
) -> None: | |||||
# read data/README.md for more details on how these datasets are generated | # read data/README.md for more details on how these datasets are generated | ||||
data = load_repo_data(repo) | data = load_repo_data(repo) | ||||
fill_storage(swh_storage, data) | fill_storage(swh_storage, data) | ||||
revisions = [ | revisions = [ | ||||
RevisionEntry( | RevisionEntry( | ||||
id=revision["id"], | id=revision["id"], | ||||
date=ts2dt(revision["date"]), | date=ts2dt(revision["date"]), | ||||
root=revision["directory"], | root=revision["directory"], | ||||
) | ) | ||||
for revision in data["revision"] | for revision in data["revision"] | ||||
] | ] | ||||
def maybe_path(path: str) -> str: | def maybe_path(path: str) -> str: | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
return path | return path | ||||
return "" | return "" | ||||
# XXX adding all revisions at once should be working just fine, but it does not... | # XXX adding all revisions at once should be working just fine, but it does not... | ||||
# revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | ||||
# ...so add revisions one at a time for now | # ...so add revisions one at a time for now | ||||
for revision in revisions: | for revision in revisions: | ||||
revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | ||||
syntheticfile = get_datafile( | syntheticfile = get_datafile( | ||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | ||||
) | ) | ||||
expected_occurrences = {} | expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {} | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
rev_id = synth_rev["sha1"].hex() | rev_id = synth_rev["sha1"].hex() | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
for rc in synth_rev["R_C"]: | for rc in synth_rev["R_C"]: | ||||
expected_occurrences.setdefault(rc["dst"].hex(), []).append( | expected_occurrences.setdefault(rc["dst"].hex(), []).append( | ||||
(rev_id, rev_ts, None, maybe_path(rc["path"])) | (rev_id, rev_ts, None, maybe_path(rc["path"])) | ||||
) | ) | ||||
for dc in synth_rev["D_C"]: | for dc in synth_rev["D_C"]: | ||||
assert dc["prefix"] is not None # to please mypy | assert dc["prefix"] is not None # to please mypy | ||||
expected_occurrences.setdefault(dc["dst"].hex(), []).append( | expected_occurrences.setdefault(dc["dst"].hex(), []).append( | ||||
(rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) | (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) | ||||
) | ) | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
for content_id, results in expected_occurrences.items(): | for content_id, results in expected_occurrences.items(): | ||||
expected = [(content_id, *result) for result in results] | expected = [(content_id, *result) for result in results] | ||||
db_occurrences = [ | db_occurrences = [ | ||||
( | ( | ||||
occur.content.hex(), | occur.content.hex(), | ||||
occur.revision.hex(), | occur.revision.hex(), | ||||
occur.date.timestamp(), | occur.date.timestamp(), | ||||
occur.origin, | occur.origin, | ||||
Show All 15 Lines | ( | ||||
("cmdbts2", True, 1), | ("cmdbts2", True, 1), | ||||
("cmdbts2", False, 1), | ("cmdbts2", False, 1), | ||||
("cmdbts2", True, 2), | ("cmdbts2", True, 2), | ||||
("cmdbts2", False, 2), | ("cmdbts2", False, 2), | ||||
("out-of-order", True, 1), | ("out-of-order", True, 1), | ||||
), | ), | ||||
) | ) | ||||
def test_provenance_heuristics_content_find_first( | def test_provenance_heuristics_content_find_first( | ||||
provenance, swh_storage, archive, repo, lower, mindepth | provenance: ProvenanceInterface, | ||||
): | swh_storage: Storage, | ||||
archive: ArchiveInterface, | |||||
repo: str, | |||||
lower: bool, | |||||
mindepth: int, | |||||
) -> None: | |||||
# read data/README.md for more details on how these datasets are generated | # read data/README.md for more details on how these datasets are generated | ||||
data = load_repo_data(repo) | data = load_repo_data(repo) | ||||
fill_storage(swh_storage, data) | fill_storage(swh_storage, data) | ||||
revisions = [ | revisions = [ | ||||
RevisionEntry( | RevisionEntry( | ||||
id=revision["id"], | id=revision["id"], | ||||
date=ts2dt(revision["date"]), | date=ts2dt(revision["date"]), | ||||
root=revision["directory"], | root=revision["directory"], | ||||
) | ) | ||||
for revision in data["revision"] | for revision in data["revision"] | ||||
] | ] | ||||
# XXX adding all revisions at once should be working just fine, but it does not... | # XXX adding all revisions at once should be working just fine, but it does not... | ||||
# revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) | ||||
# ...so add revisions one at a time for now | # ...so add revisions one at a time for now | ||||
for revision in revisions: | for revision in revisions: | ||||
revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) | ||||
syntheticfile = get_datafile( | syntheticfile = get_datafile( | ||||
f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" | ||||
) | ) | ||||
expected_first: Dict[str, Tuple[str, str, List[str]]] = {} | expected_first: Dict[str, Tuple[str, float, List[str]]] = {} | ||||
# dict of tuples (blob_id, rev_id, [path, ...]) the third element for path | # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path | ||||
# is a list because a content can be added at several places in a single | # is a list because a content can be added at several places in a single | ||||
# revision, in which case the result of content_find_first() is one of | # revision, in which case the result of content_find_first() is one of | ||||
# those path, but we have no guarantee which one it will return. | # those path, but we have no guarantee which one it will return. | ||||
for synth_rev in synthetic_result(syntheticfile): | for synth_rev in synthetic_result(syntheticfile): | ||||
rev_id = synth_rev["sha1"].hex() | rev_id = synth_rev["sha1"].hex() | ||||
rev_ts = synth_rev["date"] | rev_ts = synth_rev["date"] | ||||
for rc in synth_rev["R_C"]: | for rc in synth_rev["R_C"]: | ||||
sha1 = rc["dst"].hex() | sha1 = rc["dst"].hex() | ||||
if sha1 not in expected_first: | if sha1 not in expected_first: | ||||
assert rc["rel_ts"] == 0 | assert rc["rel_ts"] == 0 | ||||
expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) | expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) | ||||
else: | else: | ||||
if rev_ts == expected_first[sha1][1]: | if rev_ts == expected_first[sha1][1]: | ||||
expected_first[sha1][2].append(rc["path"]) | expected_first[sha1][2].append(rc["path"]) | ||||
elif rev_ts < expected_first[sha1][1]: | elif rev_ts < expected_first[sha1][1]: | ||||
expected_first[sha1] = (rev_id, rev_ts, rc["path"]) | expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) | ||||
for dc in synth_rev["D_C"]: | for dc in synth_rev["D_C"]: | ||||
sha1 = rc["dst"].hex() | sha1 = rc["dst"].hex() | ||||
assert sha1 in expected_first | assert sha1 in expected_first | ||||
# nothing to do there, this content cannot be a "first seen file" | # nothing to do there, this content cannot be a "first seen file" | ||||
assert isinstance(provenance.storage, ProvenanceDBBase) | |||||
for content_id, (rev_id, ts, paths) in expected_first.items(): | for content_id, (rev_id, ts, paths) in expected_first.items(): | ||||
occur = provenance.content_find_first(hash_to_bytes(content_id)) | occur = provenance.content_find_first(hash_to_bytes(content_id)) | ||||
assert occur is not None | |||||
assert occur.content.hex() == content_id | assert occur.content.hex() == content_id | ||||
assert occur.revision.hex() == rev_id | assert occur.revision.hex() == rev_id | ||||
assert occur.date.timestamp() == ts | assert occur.date.timestamp() == ts | ||||
assert occur.origin is None | assert occur.origin is None | ||||
if provenance.storage.with_path: | if provenance.storage.with_path: | ||||
assert occur.path.decode() in paths | assert occur.path.decode() in paths |