Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/tests/test_provenance_storage.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import inspect | import inspect | ||||
import os | import os | ||||
from typing import Any, Dict, Iterable, Optional, Set | from typing import Any, Dict, Iterable, Optional, Set, Tuple | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
from swh.provenance.archive import ArchiveInterface | from swh.provenance.archive import ArchiveInterface | ||||
from swh.provenance.interface import ( | from swh.provenance.interface import ( | ||||
EntityType, | EntityType, | ||||
ProvenanceInterface, | ProvenanceInterface, | ||||
▲ Show 20 Lines • Show All 153 Lines • ▼ Show 20 Lines | ) -> None: | ||||
) | ) | ||||
def dircontent( | def dircontent( | ||||
data: Dict[str, Any], | data: Dict[str, Any], | ||||
ref: Sha1Git, | ref: Sha1Git, | ||||
dir: Dict[str, Any], | dir: Dict[str, Any], | ||||
prefix: bytes = b"", | prefix: bytes = b"", | ||||
) -> Iterable[RelationData]: | ) -> Iterable[Tuple[Sha1Git, RelationData]]: | ||||
content = { | content = { | ||||
RelationData(entry["target"], ref, os.path.join(prefix, entry["name"])) | ( | ||||
entry["target"], | |||||
RelationData(dst=ref, path=os.path.join(prefix, entry["name"])), | |||||
) | |||||
for entry in dir["entries"] | for entry in dir["entries"] | ||||
if entry["type"] == "file" | if entry["type"] == "file" | ||||
} | } | ||||
for entry in dir["entries"]: | for entry in dir["entries"]: | ||||
if entry["type"] == "dir": | if entry["type"] == "dir": | ||||
child = next( | child = next( | ||||
subdir | subdir | ||||
for subdir in data["directory"] | for subdir in data["directory"] | ||||
Show All 14 Lines | elif entity == EntityType.DIRECTORY: | ||||
return storage.directory_add({sha1: None for sha1 in ids}) | return storage.directory_add({sha1: None for sha1 in ids}) | ||||
else: # entity == EntityType.REVISION: | else: # entity == EntityType.REVISION: | ||||
return storage.revision_add( | return storage.revision_add( | ||||
{sha1: RevisionData(date=None, origin=None) for sha1 in ids} | {sha1: RevisionData(date=None, origin=None) for sha1 in ids} | ||||
) | ) | ||||
def relation_add_and_compare_result( | def relation_add_and_compare_result( | ||||
storage: ProvenanceStorageInterface, relation: RelationType, data: Set[RelationData] | storage: ProvenanceStorageInterface, | ||||
relation: RelationType, | |||||
data: Dict[Sha1Git, Set[RelationData]], | |||||
) -> None: | ) -> None: | ||||
# Sources, destinations and locations must be added in advance. | # Sources, destinations and locations must be added in advance. | ||||
src, *_, dst = relation.value.split("_") | src, *_, dst = relation.value.split("_") | ||||
srcs = {sha1 for sha1 in data} | |||||
if src != "origin": | if src != "origin": | ||||
assert entity_add(storage, EntityType(src), {entry.src for entry in data}) | assert entity_add(storage, EntityType(src), srcs) | ||||
dsts = {rel.dst for rels in data.values() for rel in rels} | |||||
if dst != "origin": | if dst != "origin": | ||||
assert entity_add(storage, EntityType(dst), {entry.dst for entry in data}) | assert entity_add(storage, EntityType(dst), dsts) | ||||
if storage.with_path(): | if storage.with_path(): | ||||
assert storage.location_add( | assert storage.location_add( | ||||
{entry.path for entry in data if entry.path is not None} | {rel.path for rels in data.values() for rel in rels if rel.path is not None} | ||||
) | ) | ||||
assert data | assert data | ||||
assert storage.relation_add(relation, data) | assert storage.relation_add(relation, data) | ||||
for row in data: | for src_sha1 in srcs: | ||||
assert relation_compare_result( | relation_compare_result( | ||||
storage.relation_get(relation, [row.src]), | storage.relation_get(relation, [src_sha1]), | ||||
{entry for entry in data if entry.src == row.src}, | {src_sha1: data[src_sha1]}, | ||||
storage.with_path(), | storage.with_path(), | ||||
) | ) | ||||
assert relation_compare_result( | for dst_sha1 in dsts: | ||||
storage.relation_get( | relation_compare_result( | ||||
relation, | storage.relation_get(relation, [dst_sha1], reverse=True), | ||||
[row.dst], | { | ||||
reverse=True, | src_sha1: { | ||||
), | RelationData(dst=dst_sha1, path=rel.path) | ||||
{entry for entry in data if entry.dst == row.dst}, | for rel in rels | ||||
if dst_sha1 == rel.dst | |||||
} | |||||
for src_sha1, rels in data.items() | |||||
if dst_sha1 in {rel.dst for rel in rels} | |||||
}, | |||||
storage.with_path(), | storage.with_path(), | ||||
) | ) | ||||
relation_compare_result( | |||||
assert relation_compare_result( | |||||
storage.relation_get_all(relation), data, storage.with_path() | storage.relation_get_all(relation), data, storage.with_path() | ||||
) | ) | ||||
def relation_compare_result( | def relation_compare_result( | ||||
computed: Set[RelationData], expected: Set[RelationData], with_path: bool | computed: Dict[Sha1Git, Set[RelationData]], | ||||
) -> bool: | expected: Dict[Sha1Git, Set[RelationData]], | ||||
return { | with_path: bool, | ||||
RelationData(row.src, row.dst, row.path if with_path else None) | ) -> None: | ||||
for row in expected | assert { | ||||
src_sha1: { | |||||
RelationData(dst=rel.dst, path=rel.path if with_path else None) | |||||
for rel in rels | |||||
} | |||||
for src_sha1, rels in expected.items() | |||||
} == computed | } == computed | ||||
def test_provenance_storage_relation( | def test_provenance_storage_relation( | ||||
provenance_storage: ProvenanceStorageInterface, | provenance_storage: ProvenanceStorageInterface, | ||||
) -> None: | ) -> None: | ||||
"""Tests relation methods for every `ProvenanceStorageInterface` implementation.""" | """Tests relation methods for every `ProvenanceStorageInterface` implementation.""" | ||||
# Read data/README.md for more details on how these datasets are generated. | # Read data/README.md for more details on how these datasets are generated. | ||||
data = load_repo_data("cmdbts2") | data = load_repo_data("cmdbts2") | ||||
# Test content-in-revision relation. | # Test content-in-revision relation. | ||||
# Create flat models of every root directory for the revisions in the dataset. | # Create flat models of every root directory for the revisions in the dataset. | ||||
cnt_in_rev: Set[RelationData] = set() | cnt_in_rev: Dict[Sha1Git, Set[RelationData]] = {} | ||||
for rev in data["revision"]: | for rev in data["revision"]: | ||||
root = next( | root = next( | ||||
subdir for subdir in data["directory"] if subdir["id"] == rev["directory"] | subdir for subdir in data["directory"] if subdir["id"] == rev["directory"] | ||||
) | ) | ||||
cnt_in_rev.update(dircontent(data, rev["id"], root)) | for cnt, rel in dircontent(data, rev["id"], root): | ||||
cnt_in_rev.setdefault(cnt, set()).add(rel) | |||||
relation_add_and_compare_result( | relation_add_and_compare_result( | ||||
provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev | provenance_storage, RelationType.CNT_EARLY_IN_REV, cnt_in_rev | ||||
) | ) | ||||
# Test content-in-directory relation. | # Test content-in-directory relation. | ||||
# Create flat models for every directory in the dataset. | # Create flat models for every directory in the dataset. | ||||
cnt_in_dir: Set[RelationData] = set() | cnt_in_dir: Dict[Sha1Git, Set[RelationData]] = {} | ||||
for dir in data["directory"]: | for dir in data["directory"]: | ||||
cnt_in_dir.update(dircontent(data, dir["id"], dir)) | for cnt, rel in dircontent(data, dir["id"], dir): | ||||
cnt_in_dir.setdefault(cnt, set()).add(rel) | |||||
relation_add_and_compare_result( | relation_add_and_compare_result( | ||||
provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir | provenance_storage, RelationType.CNT_IN_DIR, cnt_in_dir | ||||
) | ) | ||||
# Test directory-in-revision relation. | # Test directory-in-revision relation. | ||||
# Add root directories to their corresponding revision in the dataset. | # Add root directories to their corresponding revision in the dataset. | ||||
dir_in_rev = { | dir_in_rev: Dict[Sha1Git, Set[RelationData]] = {} | ||||
RelationData(rev["directory"], rev["id"], b".") for rev in data["revision"] | for rev in data["revision"]: | ||||
} | dir_in_rev.setdefault(rev["directory"], set()).add( | ||||
RelationData(dst=rev["id"], path=b".") | |||||
) | |||||
relation_add_and_compare_result( | relation_add_and_compare_result( | ||||
provenance_storage, RelationType.DIR_IN_REV, dir_in_rev | provenance_storage, RelationType.DIR_IN_REV, dir_in_rev | ||||
) | ) | ||||
# Test revision-in-origin relation. | # Test revision-in-origin relation. | ||||
# Add all revisions that are head of some snapshot branch to the corresponding | |||||
# origin. | |||||
rev_in_org = { | |||||
RelationData( | |||||
branch["target"], | |||||
hash_to_bytes(origin_identifier({"url": status["origin"]})), | |||||
None, | |||||
) | |||||
for status in data["origin_visit_status"] | |||||
if status["snapshot"] is not None | |||||
for snapshot in data["snapshot"] | |||||
if snapshot["id"] == status["snapshot"] | |||||
for _, branch in snapshot["branches"].items() | |||||
if branch["target_type"] == "revision" | |||||
} | |||||
# Origins must be inserted in advance (cannot be done by `entity_add` inside | # Origins must be inserted in advance (cannot be done by `entity_add` inside | ||||
# `relation_add_and_compare_result`). | # `relation_add_and_compare_result`). | ||||
orgs = { | orgs = { | ||||
hash_to_bytes(origin_identifier(origin)): origin["url"] | hash_to_bytes(origin_identifier(origin)): origin["url"] | ||||
for origin in data["origin"] | for origin in data["origin"] | ||||
} | } | ||||
assert provenance_storage.origin_add(orgs) | assert provenance_storage.origin_add(orgs) | ||||
# Add all revisions that are head of some snapshot branch to the corresponding | |||||
# origin. | |||||
rev_in_org: Dict[Sha1Git, Set[RelationData]] = {} | |||||
for status in data["origin_visit_status"]: | |||||
if status["snapshot"] is not None: | |||||
for snapshot in data["snapshot"]: | |||||
if snapshot["id"] == status["snapshot"]: | |||||
for branch in snapshot["branches"].values(): | |||||
if branch["target_type"] == "revision": | |||||
rev_in_org.setdefault(branch["target"], set()).add( | |||||
RelationData( | |||||
dst=hash_to_bytes( | |||||
origin_identifier({"url": status["origin"]}) | |||||
), | |||||
path=None, | |||||
) | |||||
) | |||||
relation_add_and_compare_result( | relation_add_and_compare_result( | ||||
provenance_storage, RelationType.REV_IN_ORG, rev_in_org | provenance_storage, RelationType.REV_IN_ORG, rev_in_org | ||||
) | ) | ||||
# Test revision-before-revision relation. | # Test revision-before-revision relation. | ||||
# For each revision in the data set add an entry for each parent to the relation. | # For each revision in the data set add an entry for each parent to the relation. | ||||
rev_before_rev = { | rev_before_rev: Dict[Sha1Git, Set[RelationData]] = {} | ||||
RelationData(parent, rev["id"], None) | for rev in data["revision"]: | ||||
for rev in data["revision"] | for parent in rev["parents"]: | ||||
for parent in rev["parents"] | rev_before_rev.setdefault(parent, set()).add( | ||||
} | RelationData(dst=rev["id"], path=None) | ||||
) | |||||
relation_add_and_compare_result( | relation_add_and_compare_result( | ||||
provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev | provenance_storage, RelationType.REV_BEFORE_REV, rev_before_rev | ||||
) | ) | ||||
def test_provenance_storage_find( | def test_provenance_storage_find( | ||||
archive: ArchiveInterface, | archive: ArchiveInterface, | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
▲ Show 20 Lines • Show All 122 Lines • Show Last 20 Lines |