Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
index 8927f54..475d954 100644
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -1,316 +1,324 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Any, Dict, List, Optional, Set, Tuple
import pytest
from swh.model.hashutil import hash_to_bytes
from swh.provenance.archive import ArchiveInterface
from swh.provenance.model import RevisionEntry
from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase
from swh.provenance.provenance import EntityType, ProvenanceInterface, RelationType
from swh.provenance.revision import revision_add
from swh.provenance.tests.conftest import (
fill_storage,
get_datafile,
load_repo_data,
synthetic_result,
)
from swh.provenance.tests.test_provenance_db import ts2dt
from swh.storage.postgresql.storage import Storage
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
def test_provenance_heuristics(
    provenance: ProvenanceInterface,
    swh_storage: Storage,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
) -> None:
    """Replay each synthetic revision one at a time and check, after every
    step, that the provenance index contains exactly the entities, relations
    and timestamps described in the synthetic result file."""
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )

    revisions = {rev["id"]: rev for rev in data["revision"]}

    # expected rows, accumulated incrementally as revisions are processed
    expected: Dict[str, Set[Any]] = {
        table: set()
        for table in (
            "content",
            "content_in_directory",
            "content_in_revision",
            "directory",
            "directory_in_revision",
            "location",
            "revision",
        )
    }

    def maybe_path(path: str) -> Optional[bytes]:
        # only path-aware storage backends record locations
        assert isinstance(provenance.storage, ProvenanceDBBase)
        return path.encode("utf-8") if provenance.storage.with_path else None

    for synth_rev in synthetic_result(syntheticfile):
        revision = revisions[synth_rev["sha1"]]
        entry = RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)

        # each "entry" in the synth file is one new revision
        expected["revision"].add(synth_rev["sha1"])
        assert expected["revision"] == provenance.storage.entity_get_all(
            EntityType.REVISION
        ), synth_rev["msg"]
        # check the timestamp of the revision
        rev_ts = synth_rev["date"]
        rev_date, _ = provenance.storage.revision_get([synth_rev["sha1"]])[
            synth_rev["sha1"]
        ]
        assert rev_date is not None and rev_ts == rev_date.timestamp(), synth_rev["msg"]

        # this revision might have added new content objects
        expected["content"] |= {x["dst"] for x in synth_rev["R_C"]}
        expected["content"] |= {x["dst"] for x in synth_rev["D_C"]}
        assert expected["content"] == provenance.storage.entity_get_all(
            EntityType.CONTENT
        ), synth_rev["msg"]

        # check for R-C (direct) entries
        # these are added directly in the content_early_in_rev table
        expected["content_in_revision"] |= {
            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_C"]
        }
        assert expected["content_in_revision"] == provenance.storage.relation_get_all(
            RelationType.CNT_EARLY_IN_REV
        ), synth_rev["msg"]
        # check timestamps
        for rc in synth_rev["R_C"]:
            assert (
                rev_ts + rc["rel_ts"]
                == provenance.storage.content_get([rc["dst"]])[rc["dst"]].timestamp()
            ), synth_rev["msg"]

        # check directories
        # each directory stored in the provenance index is an entry
        # in the "directory" table...
        expected["directory"] |= {x["dst"] for x in synth_rev["R_D"]}
        assert expected["directory"] == provenance.storage.entity_get_all(
            EntityType.DIRECTORY
        ), synth_rev["msg"]

        # ... + a number of rows in the "directory_in_rev" table...
        # check for R-D entries
        expected["directory_in_revision"] |= {
            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_D"]
        }
        assert expected["directory_in_revision"] == provenance.storage.relation_get_all(
            RelationType.DIR_IN_REV
        ), synth_rev["msg"]
        # check timestamps
        for rd in synth_rev["R_D"]:
            assert (
                rev_ts + rd["rel_ts"]
                == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp()
            ), synth_rev["msg"]

        # ... + a number of rows in the "content_in_dir" table
        # for content of the directory.
        # check for D-C entries
        expected["content_in_directory"] |= {
            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["D_C"]
        }
        assert expected["content_in_directory"] == provenance.storage.relation_get_all(
            RelationType.CNT_IN_DIR
        ), synth_rev["msg"]
        # check timestamps
        for dc in synth_rev["D_C"]:
            assert (
                rev_ts + dc["rel_ts"]
                == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp()
            ), synth_rev["msg"]

        assert isinstance(provenance.storage, ProvenanceDBBase)
        if provenance.storage.with_path:
            # check for location entries
            for relation in ("R_C", "D_C", "R_D"):
                expected["location"] |= {x["path"] for x in synth_rev[relation]}
            assert expected["location"] == provenance.storage.location_get(), synth_rev[
                "msg"
            ]
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_all(
    provenance: ProvenanceInterface,
    swh_storage: Storage,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    batch: bool,
) -> None:
    """Check that content_find_all() reports every expected occurrence of
    every blob, whether revisions are indexed in a single batch or one at
    a time."""
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    revisions = [
        RevisionEntry(
            id=rev["id"],
            date=ts2dt(rev["date"]),
            root=rev["directory"],
        )
        for rev in data["revision"]
    ]

    def maybe_path(path: str) -> str:
        # a path-less backend reports an empty path for every occurrence
        assert isinstance(provenance.storage, ProvenanceDBBase)
        return path if provenance.storage.with_path else ""

    if batch:
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )

    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )

    # expected occurrences, indexed by blob id (hex)
    expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {}

    def record(blob: Any, revision_id: str, revision_ts: float, path: str) -> None:
        # register one expected (revision, date, origin, path) occurrence
        expected_occurrences.setdefault(blob.hex(), []).append(
            (revision_id, revision_ts, None, maybe_path(path))
        )

    for synth_rev in synthetic_result(syntheticfile):
        revision_id = synth_rev["sha1"].hex()
        revision_ts = synth_rev["date"]
        for rc in synth_rev["R_C"]:
            record(rc["dst"], revision_id, revision_ts, rc["path"])
        for dc in synth_rev["D_C"]:
            assert dc["prefix"] is not None  # to please mypy
            record(
                dc["dst"], revision_id, revision_ts, dc["prefix"] + "/" + dc["path"]
            )

    assert isinstance(provenance.storage, ProvenanceDBBase)
    for content_id, results in expected_occurrences.items():
        expected = [(content_id,) + result for result in results]
        db_occurrences = [
            (
                occur.content.hex(),
                occur.revision.hex(),
                occur.date.timestamp(),
                occur.origin,
                occur.path.decode(),
            )
            for occur in provenance.content_find_all(hash_to_bytes(content_id))
        ]
        if provenance.storage.with_path:
            # this is not true if the db stores no path, because a same content
            # that appears several times in a given revision may be reported
            # only once by content_find_all()
            assert len(db_occurrences) == len(expected)
        assert set(db_occurrences) == set(expected)
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_first(
    provenance: ProvenanceInterface,
    swh_storage: Storage,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    batch: bool,
) -> None:
    """Check that content_find_first() returns the earliest known occurrence
    of every blob, whether revisions are indexed in a single batch or one at
    a time."""
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    revisions = [
        RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        for revision in data["revision"]
    ]
    if batch:
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )

    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )
    expected_first: Dict[str, Tuple[str, float, List[str]]] = {}
    # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path
    # is a list because a content can be added at several places in a single
    # revision, in which case the result of content_find_first() is one of
    # those path, but we have no guarantee which one it will return.
    for synth_rev in synthetic_result(syntheticfile):
        rev_id = synth_rev["sha1"].hex()
        rev_ts = synth_rev["date"]
        for rc in synth_rev["R_C"]:
            sha1 = rc["dst"].hex()
            if sha1 not in expected_first:
                # first occurrence of the blob: it must be a direct hit
                assert rc["rel_ts"] == 0
                expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])
            else:
                if rev_ts == expected_first[sha1][1]:
                    # same timestamp: any of these paths may be reported
                    expected_first[sha1][2].append(rc["path"])
                elif rev_ts < expected_first[sha1][1]:
                    # strictly earlier revision: it supersedes the previous one
                    expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])

        for dc in synth_rev["D_C"]:
            # BUG FIX: the original computed sha1 from rc["dst"] — the leftover
            # loop variable of the R_C loop above — instead of dc["dst"], so
            # the D_C entries were never actually checked (and the lookup
            # would raise NameError on a revision with an empty R_C list).
            sha1 = dc["dst"].hex()
            assert sha1 in expected_first
            # nothing to do there, this content cannot be a "first seen file"

    assert isinstance(provenance.storage, ProvenanceDBBase)
    for content_id, (rev_id, ts, paths) in expected_first.items():
        occur = provenance.content_find_first(hash_to_bytes(content_id))
        assert occur is not None
        assert occur.content.hex() == content_id
        assert occur.revision.hex() == rev_id
        assert occur.date.timestamp() == ts
        assert occur.origin is None
        if provenance.storage.with_path:
            assert occur.path.decode() in paths

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:30 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3272837

Event Timeline