Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345770
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
View Options
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
index 8927f54..475d954 100644
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -1,316 +1,324 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Any, Dict, List, Optional, Set, Tuple
import pytest
from swh.model.hashutil import hash_to_bytes
from swh.provenance.archive import ArchiveInterface
from swh.provenance.model import RevisionEntry
from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase
from swh.provenance.provenance import EntityType, ProvenanceInterface, RelationType
from swh.provenance.revision import revision_add
from swh.provenance.tests.conftest import (
fill_storage,
get_datafile,
load_repo_data,
synthetic_result,
)
from swh.provenance.tests.test_provenance_db import ts2dt
from swh.storage.postgresql.storage import Storage
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
def test_provenance_heuristics(
    provenance: ProvenanceInterface,
    swh_storage: Storage,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
) -> None:
    """Replay every revision of *repo* through ``revision_add`` one at a time
    and check, after each revision, that the provenance storage holds exactly
    the entities, relations, locations and timestamps predicted by the
    synthetic result file for this (repo, lower, mindepth) combination.
    """
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )

    revisions = {rev["id"]: rev for rev in data["revision"]}

    # expected rows per storage table, accumulated as revisions are processed
    rows: Dict[str, Set[Any]] = {
        "content": set(),
        "content_in_directory": set(),
        "content_in_revision": set(),
        "directory": set(),
        "directory_in_revision": set(),
        "location": set(),
        "revision": set(),
    }

    def maybe_path(path: str) -> Optional[bytes]:
        # Paths are only kept by path-aware backends; otherwise the expected
        # relation rows carry None in the path position.
        assert isinstance(provenance.storage, ProvenanceDBBase)
        if provenance.storage.with_path:
            return path.encode("utf-8")
        return None

    for synth_rev in synthetic_result(syntheticfile):
        revision = revisions[synth_rev["sha1"]]
        entry = RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)

        # each "entry" in the synth file is one new revision
        rows["revision"].add(synth_rev["sha1"])
        assert rows["revision"] == provenance.storage.entity_get_all(
            EntityType.REVISION
        ), synth_rev["msg"]
        # check the timestamp of the revision
        rev_ts = synth_rev["date"]
        rev_date, _ = provenance.storage.revision_get([synth_rev["sha1"]])[
            synth_rev["sha1"]
        ]
        assert rev_date is not None and rev_ts == rev_date.timestamp(), synth_rev["msg"]

        # this revision might have added new content objects
        rows["content"] |= set(x["dst"] for x in synth_rev["R_C"])
        rows["content"] |= set(x["dst"] for x in synth_rev["D_C"])
        assert rows["content"] == provenance.storage.entity_get_all(
            EntityType.CONTENT
        ), synth_rev["msg"]

        # check for R-C (direct) entries
        # these are added directly in the content_early_in_rev table
        rows["content_in_revision"] |= set(
            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_C"]
        )
        assert rows["content_in_revision"] == provenance.storage.relation_get_all(
            RelationType.CNT_EARLY_IN_REV
        ), synth_rev["msg"]
        # check timestamps: rel_ts in the synth file is an offset from the
        # revision date, while stored dates are absolute
        for rc in synth_rev["R_C"]:
            assert (
                rev_ts + rc["rel_ts"]
                == provenance.storage.content_get([rc["dst"]])[rc["dst"]].timestamp()
            ), synth_rev["msg"]

        # check directories
        # each directory stored in the provenance index is an entry
        # in the "directory" table...
        rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"])
        assert rows["directory"] == provenance.storage.entity_get_all(
            EntityType.DIRECTORY
        ), synth_rev["msg"]

        # ... + a number of rows in the "directory_in_rev" table...
        # check for R-D entries
        rows["directory_in_revision"] |= set(
            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_D"]
        )
        assert rows["directory_in_revision"] == provenance.storage.relation_get_all(
            RelationType.DIR_IN_REV
        ), synth_rev["msg"]
        # check timestamps
        for rd in synth_rev["R_D"]:
            assert (
                rev_ts + rd["rel_ts"]
                == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp()
            ), synth_rev["msg"]

        # ... + a number of rows in the "content_in_dir" table
        # for content of the directory.
        # check for D-C entries
        rows["content_in_directory"] |= set(
            (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["D_C"]
        )
        assert rows["content_in_directory"] == provenance.storage.relation_get_all(
            RelationType.CNT_IN_DIR
        ), synth_rev["msg"]
        # check timestamps
        for dc in synth_rev["D_C"]:
            assert (
                rev_ts + dc["rel_ts"]
                == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp()
            ), synth_rev["msg"]

        assert isinstance(provenance.storage, ProvenanceDBBase)
        if provenance.storage.with_path:
            # check for location entries (only stored by path-aware backends)
            rows["location"] |= set(x["path"] for x in synth_rev["R_C"])
            rows["location"] |= set(x["path"] for x in synth_rev["D_C"])
            rows["location"] |= set(x["path"] for x in synth_rev["R_D"])
            assert rows["location"] == provenance.storage.location_get(), synth_rev[
                "msg"
            ]
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_all(
    provenance: ProvenanceInterface,
    swh_storage: Storage,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    batch: bool,
) -> None:
    """Check ``content_find_all`` against the synthetic result file.

    Revisions are indexed either all at once (``batch=True``) or one at a
    time, and the occurrences reported for every content blob must match the
    (revision, timestamp, origin, path) tuples predicted by the synth file.
    """
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    revisions = [
        RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        for revision in data["revision"]
    ]

    def maybe_path(path: str) -> str:
        # Paths are only kept by path-aware backends; otherwise occurrences
        # are expected to come back with an empty path.
        assert isinstance(provenance.storage, ProvenanceDBBase)
        if provenance.storage.with_path:
            return path
        return ""

    if batch:
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )

    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )
    # expected occurrences per content id: (rev_id, timestamp, origin, path)
    expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {}
    for synth_rev in synthetic_result(syntheticfile):
        rev_id = synth_rev["sha1"].hex()
        rev_ts = synth_rev["date"]

        # direct content-in-revision entries
        for rc in synth_rev["R_C"]:
            expected_occurrences.setdefault(rc["dst"].hex(), []).append(
                (rev_id, rev_ts, None, maybe_path(rc["path"]))
            )
        # content reached through a directory: path is prefix + blob path
        for dc in synth_rev["D_C"]:
            assert dc["prefix"] is not None  # to please mypy
            expected_occurrences.setdefault(dc["dst"].hex(), []).append(
                (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"]))
            )

    assert isinstance(provenance.storage, ProvenanceDBBase)
    for content_id, results in expected_occurrences.items():
        expected = [(content_id, *result) for result in results]
        db_occurrences = [
            (
                occur.content.hex(),
                occur.revision.hex(),
                occur.date.timestamp(),
                occur.origin,
                occur.path.decode(),
            )
            for occur in provenance.content_find_all(hash_to_bytes(content_id))
        ]
        if provenance.storage.with_path:
            # this is not true if the db stores no path, because a same content
            # that appears several times in a given revision may be reported
            # only once by content_find_all()
            assert len(db_occurrences) == len(expected)
        assert set(db_occurrences) == set(expected)
@pytest.mark.parametrize(
    "repo, lower, mindepth",
    (
        ("cmdbts2", True, 1),
        ("cmdbts2", False, 1),
        ("cmdbts2", True, 2),
        ("cmdbts2", False, 2),
        ("out-of-order", True, 1),
    ),
)
@pytest.mark.parametrize("batch", (True, False))
def test_provenance_heuristics_content_find_first(
    provenance: ProvenanceInterface,
    swh_storage: Storage,
    archive: ArchiveInterface,
    repo: str,
    lower: bool,
    mindepth: int,
    batch: bool,
) -> None:
    """Check ``content_find_first`` against the synthetic result file.

    Revisions are indexed either all at once (``batch=True``) or one at a
    time; for every blob, the earliest reported occurrence must match the
    earliest (rev_id, timestamp) predicted by the synth file.
    """
    # read data/README.md for more details on how these datasets are generated
    data = load_repo_data(repo)
    fill_storage(swh_storage, data)
    revisions = [
        RevisionEntry(
            id=revision["id"],
            date=ts2dt(revision["date"]),
            root=revision["directory"],
        )
        for revision in data["revision"]
    ]

    if batch:
        revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
    else:
        for revision in revisions:
            revision_add(
                provenance, archive, [revision], lower=lower, mindepth=mindepth
            )

    syntheticfile = get_datafile(
        f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
    )
    expected_first: Dict[str, Tuple[str, float, List[str]]] = {}
    # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path
    # is a list because a content can be added at several places in a single
    # revision, in which case the result of content_find_first() is one of
    # those path, but we have no guarantee which one it will return.
    for synth_rev in synthetic_result(syntheticfile):
        rev_id = synth_rev["sha1"].hex()
        rev_ts = synth_rev["date"]

        for rc in synth_rev["R_C"]:
            sha1 = rc["dst"].hex()
            if sha1 not in expected_first:
                # a blob's very first occurrence has a zero relative timestamp
                assert rc["rel_ts"] == 0
                expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])
            else:
                if rev_ts == expected_first[sha1][1]:
                    # same timestamp: any of these paths may be returned
                    expected_first[sha1][2].append(rc["path"])
                elif rev_ts < expected_first[sha1][1]:
                    expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])

        for dc in synth_rev["D_C"]:
            # BUG FIX: this read `rc["dst"]` — the loop variable leaked from
            # the R_C loop above — so it only re-checked the last R_C entry
            # instead of each D_C entry.
            sha1 = dc["dst"].hex()
            assert sha1 in expected_first
            # nothing to do there, this content cannot be a "first seen file"

    assert isinstance(provenance.storage, ProvenanceDBBase)
    for content_id, (rev_id, ts, paths) in expected_first.items():
        occur = provenance.content_find_first(hash_to_bytes(content_id))
        assert occur is not None
        assert occur.content.hex() == content_id
        assert occur.revision.hex() == rev_id
        assert occur.date.timestamp() == ts
        assert occur.origin is None
        if provenance.storage.with_path:
            assert occur.path.decode() in paths
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:30 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3272837
Attached To
rDPROV Provenance database
Event Timeline
Log In to Comment