diff --git a/swh/loader/tests/__init__.py b/swh/loader/tests/__init__.py index 8149f72..34ab54d 100644 --- a/swh/loader/tests/__init__.py +++ b/swh/loader/tests/__init__.py @@ -1,228 +1,243 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import subprocess from collections import defaultdict from pathlib import PosixPath from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from swh.model.model import OriginVisitStatus, Snapshot, TargetType from swh.model.hashutil import hash_to_bytes from swh.storage.interface import StorageInterface from swh.storage.algos.origin import origin_get_latest_visit_status def assert_last_visit_matches( storage, url: str, status: str, type: Optional[str] = None, snapshot: Optional[bytes] = None, ) -> OriginVisitStatus: """This retrieves the last visit and visit_status which are expected to exist. This also checks that the {visit|visit_status} have their respective properties correctly set. This returns the last visit_status for that given origin. Args: url: Origin url status: Check that the visit status has the given status type: Check that the returned visit has the given type snapshot: Check that the visit status points to the given snapshot Raises: AssertionError in case visit or visit status is not found, or any of the type, status and snapshot mismatch Returns: the visit status for further check during the remaining part of the test. 
""" visit_and_status = origin_get_latest_visit_status(storage, url) assert visit_and_status is not None, f"Origin {url} has no visits" visit, visit_status = visit_and_status if type: assert visit.type == type, f"Visit has type {visit.type} instead of {type}" assert ( visit_status.status == status ), f"Visit_status has status {visit_status.status} instead of {status}" if snapshot is not None: assert visit_status.snapshot is not None assert visit_status.snapshot == snapshot, ( f"Visit_status points to snapshot {visit_status.snapshot.hex()} " f"instead of {snapshot.hex()}" ) return visit_status def prepare_repository_from_archive( archive_path: str, filename: Optional[str] = None, tmp_path: Union[PosixPath, str] = "/tmp", ) -> str: """Given an existing archive_path, uncompress it. Returns a file repo url which can be used as origin url. This does not deal with the case where the archive passed along does not exist. """ if not isinstance(tmp_path, str): tmp_path = str(tmp_path) # uncompress folder/repositories/dump for the loader to ingest subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path]) # build the origin url (or some derivative form) _fname = filename if filename else os.path.basename(archive_path) repo_url = f"file://{tmp_path}/{_fname}" return repo_url def encode_target(target: Dict) -> Dict: """Test helper to ease readability in test """ if not target: return target target_type = target["target_type"] target_data = target["target"] if target_type == "alias" and isinstance(target_data, str): encoded_target = target_data.encode("utf-8") elif isinstance(target_data, str): encoded_target = hash_to_bytes(target_data) else: encoded_target = target_data return {"target": encoded_target, "target_type": target_type} class InconsistentAliasBranchError(AssertionError): """When an alias branch targets an inexistent branch.""" pass class InexistentObjectsError(AssertionError): """When a targeted branch reference does not exist in the storage""" pass def 
check_snapshot( snapshot: Union[Dict[str, Any], Snapshot], storage: StorageInterface, allowed_empty: Iterable[Tuple[TargetType, bytes]] = [], ): """Check that: - snapshot exists in the storage and match - each object reference up to the revision/release targets exists Args: snapshot: full snapshot to check for existence and consistency storage: storage to lookup information into allowed_empty: Iterable of branch we allow to be empty (some edge case loaders allows this case to happen, nixguix for example allows the branch evaluation" to target the nixpkgs git commit reference, which may not yet be resolvable at loading time) Returns: the snapshot stored in the storage for further test assertion if any is needed. """ if isinstance(snapshot, Snapshot): expected_snapshot = snapshot elif isinstance(snapshot, dict): # dict must be snapshot compliant snapshot_dict = {"id": hash_to_bytes(snapshot["id"])} branches = {} for branch, target in snapshot["branches"].items(): if isinstance(branch, str): branch = branch.encode("utf-8") branches[branch] = encode_target(target) snapshot_dict["branches"] = branches expected_snapshot = Snapshot.from_dict(snapshot_dict) else: raise AssertionError(f"variable 'snapshot' must be a snapshot: {snapshot!r}") snapshot_dict = storage.snapshot_get(expected_snapshot.id) if snapshot_dict is None: raise AssertionError(f"Snapshot {expected_snapshot.id.hex()} is not found") snapshot_dict.pop("next_branch") actual_snaphot = Snapshot.from_dict(snapshot_dict) assert isinstance(actual_snaphot, Snapshot) assert expected_snapshot == actual_snaphot - branches_by_target_type = defaultdict(list) + objects_by_target_type = defaultdict(list) object_to_branch = {} for branch, target in actual_snaphot.branches.items(): if (target.target_type, branch) in allowed_empty: # safe for those elements to not be checked for existence continue - branches_by_target_type[target.target_type].append(target.target) + 
objects_by_target_type[target.target_type].append(target.target) object_to_branch[target.target] = branch # check that alias references target something that exists, otherwise raise - aliases: List[bytes] = branches_by_target_type.get(TargetType.ALIAS, []) + aliases: List[bytes] = objects_by_target_type.get(TargetType.ALIAS, []) for alias in aliases: if alias not in actual_snaphot.branches: raise InconsistentAliasBranchError( f"Alias branch {alias.decode('utf-8')} " f"should be in {list(actual_snaphot.branches)}" ) - revs = branches_by_target_type.get(TargetType.REVISION) + revs = objects_by_target_type.get(TargetType.REVISION) if revs: revisions = list(storage.revision_get(revs)) not_found = [rev_id for rev_id, rev in zip(revs, revisions) if rev is None] if not_found: missing_objs = ", ".join( str((object_to_branch[rev], rev.hex())) for rev in not_found ) raise InexistentObjectsError( f"Branch/Revision(s) {missing_objs} should exist in storage" ) + # retrieve information from revision + for rev in revisions: + objects_by_target_type[TargetType.DIRECTORY].append(rev["directory"]) + object_to_branch[rev["directory"]] = rev["id"] - rels = branches_by_target_type.get(TargetType.RELEASE) + rels = objects_by_target_type.get(TargetType.RELEASE) if rels: - releases = list(storage.release_get(rels)) - not_found = [rel_id for rel_id, rel in zip(rels, releases) if rel is None] + not_found = list(storage.release_missing(rels)) if not_found: missing_objs = ", ".join( str((object_to_branch[rel], rel.hex())) for rel in not_found ) raise InexistentObjectsError( f"Branch/Release(s) {missing_objs} should exist in storage" ) + dirs = objects_by_target_type.get(TargetType.DIRECTORY) + if dirs: + not_found = list(storage.directory_missing(dirs)) + if not_found: + missing_objs = ", ".join( + str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found + ) + raise InexistentObjectsError( + f"Missing directories {missing_objs}: " + "(revision exists, directory target does not)" 
+ ) + # for retro compat, returned the dict, remove when clients are migrated return snapshot_dict def get_stats(storage) -> Dict: """Adaptation utils to unify the stats counters across storage implementation. """ storage.refresh_stat_counters() stats = storage.stat_counters() keys = [ "content", "directory", "origin", "origin_visit", "person", "release", "revision", "skipped_content", "snapshot", ] return {k: stats.get(k) for k in keys} diff --git a/swh/loader/tests/test_init.py b/swh/loader/tests/test_init.py index 4593a3e..80ed821 100644 --- a/swh/loader/tests/test_init.py +++ b/swh/loader/tests/test_init.py @@ -1,458 +1,490 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import pytest import os import subprocess from swh.model.from_disk import DentryPerms from swh.model.model import ( Content, Directory, DirectoryEntry, ObjectType, OriginVisit, OriginVisitStatus, Person, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) from swh.model.hashutil import hash_to_bytes from swh.loader.tests import ( assert_last_visit_matches, encode_target, check_snapshot, prepare_repository_from_archive, InconsistentAliasBranchError, InexistentObjectsError, ) hash_hex = "43e45d56f88993aae6a0198013efa80716fd8920" ORIGIN_VISIT = OriginVisit( origin="some-url", visit=1, date=datetime.datetime.now(tz=datetime.timezone.utc), type="archive", ) ORIGIN_VISIT_STATUS = OriginVisitStatus( origin="some-url", visit=1, date=datetime.datetime.now(tz=datetime.timezone.utc), status="full", snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), metadata=None, ) CONTENT = Content( data=b"42\n", length=3, sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), 
sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), sha256=hash_to_bytes( "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a" ), blake2s256=hash_to_bytes( "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d" ), status="visible", ) DIRECTORY = Directory( id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=tuple( [ DirectoryEntry( name=b"foo", type="file", target=CONTENT.sha1_git, perms=DentryPerms.content, ) ] ), ) REVISION = Revision( id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"nicolas@example.com", fullname=b"Nicolas Dandrimont ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset=120, negative_utc=False, ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"stefano@example.com", fullname=b"St\xc3fano Zacchiroli ", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset=0, negative_utc=True, ), parents=(), type=RevisionType.GIT, directory=DIRECTORY.id, metadata={ "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",}, "signed-off-by": "some-dude", }, extra_headers=( (b"gpgsig", b"test123"), (b"mergetag", b"foo\\bar"), (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), ), synthetic=True, ) RELEASE = Release( id=hash_to_bytes("3e9050196aa288264f2a9d279d6abab8b158448b"), name=b"v0.0.2", author=Person( name=b"tony", email=b"tony@ardumont.fr", fullname=b"tony ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1634336813, microseconds=0), offset=0, negative_utc=False, ), target=REVISION.id, target_type=ObjectType.REVISION, message=b"yet another synthetic release", synthetic=True, ) SNAPSHOT = Snapshot( id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"release/0.1.0": SnapshotBranch( target=RELEASE.id, target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch(target=REVISION.id, 
target_type=TargetType.REVISION,), b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"evaluation": SnapshotBranch( # branch dedicated to not exist in storage target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), }, ) @pytest.fixture def mock_storage(mocker): mock_storage = mocker.patch("swh.loader.tests.origin_get_latest_visit_status") mock_storage.return_value = ORIGIN_VISIT, ORIGIN_VISIT_STATUS return mock_storage def test_assert_last_visit_matches_raise(mock_storage, mocker): """Not finding origin visit_and_statu should raise """ # overwrite so we raise because we do not find the right visit mock_storage.return_value = None with pytest.raises(AssertionError, match="Origin url has no visits"): assert_last_visit_matches(mock_storage, "url", status="full") assert mock_storage.called is True def test_assert_last_visit_matches_wrong_status(mock_storage, mocker): """Wrong visit detected should raise AssertionError """ expected_status = "partial" assert ORIGIN_VISIT_STATUS.status != expected_status with pytest.raises(AssertionError, match="Visit_status has status"): assert_last_visit_matches(mock_storage, "url", status=expected_status) assert mock_storage.called is True def test_assert_last_visit_matches_wrong_type(mock_storage, mocker): """Wrong visit detected should raise AssertionError """ expected_type = "git" assert ORIGIN_VISIT.type != expected_type with pytest.raises(AssertionError, match="Visit has type"): assert_last_visit_matches( mock_storage, "url", status=ORIGIN_VISIT_STATUS.status, type=expected_type, # mismatched type will raise ) assert mock_storage.called is True def test_assert_last_visit_matches_wrong_snapshot(mock_storage, mocker): """Wrong visit detected should raise AssertionError """ expected_snapshot_id = hash_to_bytes("e92cc0710eb6cf9efd5b920a8453e1e07157b6cd") assert ORIGIN_VISIT_STATUS.snapshot != expected_snapshot_id with pytest.raises(AssertionError, match="Visit_status 
points to snapshot"): assert_last_visit_matches( mock_storage, "url", status=ORIGIN_VISIT_STATUS.status, snapshot=expected_snapshot_id, # mismatched snapshot will raise ) assert mock_storage.called is True def test_assert_last_visit_matches(mock_storage, mocker): """Correct visit detected should return the visit_status """ visit_type = ORIGIN_VISIT.type visit_status = ORIGIN_VISIT_STATUS.status visit_snapshot = ORIGIN_VISIT_STATUS.snapshot actual_visit_status = assert_last_visit_matches( mock_storage, "url", type=visit_type, status=visit_status, snapshot=visit_snapshot, ) assert actual_visit_status == ORIGIN_VISIT_STATUS assert mock_storage.called is True def test_prepare_repository_from_archive_failure(): # does not deal with inexistent archive so raise assert os.path.exists("unknown-archive") is False with pytest.raises(subprocess.CalledProcessError, match="exit status 2"): prepare_repository_from_archive("unknown-archive") def test_prepare_repository_from_archive(datadir, tmp_path): archive_name = "0805nexter-1.1.0" archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz") assert os.path.exists(archive_path) is True tmp_path = str(tmp_path) # deals with path string repo_url = prepare_repository_from_archive( archive_path, filename=archive_name, tmp_path=tmp_path ) expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name) assert repo_url == f"file://{expected_uncompressed_archive_path}" assert os.path.exists(expected_uncompressed_archive_path) def test_prepare_repository_from_archive_no_filename(datadir, tmp_path): archive_name = "0805nexter-1.1.0" archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz") assert os.path.exists(archive_path) is True # deals with path as posix path (for tmp_path) repo_url = prepare_repository_from_archive(archive_path, tmp_path=tmp_path) tmp_path = str(tmp_path) expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name) expected_repo_url = os.path.join(tmp_path, 
f"{archive_name}.tar.gz") assert repo_url == f"file://{expected_repo_url}" # passing along the filename does not influence the on-disk extraction # just the repo-url computation assert os.path.exists(expected_uncompressed_archive_path) def test_encode_target(): assert encode_target(None) is None for target_alias in ["something", b"something"]: target = { "target_type": "alias", "target": target_alias, } actual_alias_encode_target = encode_target(target) assert actual_alias_encode_target == { "target_type": "alias", "target": b"something", } for hash_ in [hash_hex, hash_to_bytes(hash_hex)]: target = {"target_type": "revision", "target": hash_} actual_encode_target = encode_target(target) assert actual_encode_target == { "target_type": "revision", "target": hash_to_bytes(hash_hex), } def test_check_snapshot(swh_storage): """Everything should be fine when snapshot is found and the snapshot reference up to the revision exist in the storage. """ # Create a consistent snapshot arborescence tree in storage found = False for entry in DIRECTORY.entries: if entry.target == CONTENT.sha1_git: found = True break assert found is True assert REVISION.directory == DIRECTORY.id assert RELEASE.target == REVISION.id for branch, target in SNAPSHOT.branches.items(): if branch == b"alias": assert target.target in SNAPSHOT.branches elif branch == b"evaluation": # this one does not exist and we are safelisting its check below continue else: assert target.target in [REVISION.id, RELEASE.id] swh_storage.content_add([CONTENT.to_dict()]) swh_storage.directory_add([DIRECTORY.to_dict()]) swh_storage.revision_add([REVISION.to_dict()]) swh_storage.release_add([RELEASE.to_dict()]) s = swh_storage.snapshot_add([SNAPSHOT.to_dict()]) assert s == { "snapshot:add": 1, } for snap in [SNAPSHOT, SNAPSHOT.to_dict()]: # all should be fine! check_snapshot( snap, swh_storage, allowed_empty=[(TargetType.REVISION, b"evaluation")] ) def test_check_snapshot_failures(swh_storage): """Failure scenarios: 0. 
snapshot parameter is not a snapshot 1. snapshot id is correct but branches mismatched 2. snapshot id is not correct, it's not found in the storage 3. snapshot reference an alias which does not exist 4. snapshot is found in storage, targeted revision does not exist - 5. snapshot is found in storage, targeted release does not exist + 5. snapshot is found in storage, targeted revision exists but the directory the + revision targets does not exist + 6. snapshot is found in storage, targeted release does not exist The following are not dealt with yet: - 6. snapshot is found in storage, targeted directory does not exist - 7. snapshot is found in storage, targeted content does not exist + 7. snapshot is found in storage, nested targeted directories do not exist + 8. snapshot is found in storage, nested targeted contents do not exist """ snap_id_hex = "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7" snapshot = Snapshot( id=hash_to_bytes(snap_id_hex), branches={ b"master": SnapshotBranch( target=hash_to_bytes(hash_hex), target_type=TargetType.REVISION, ), }, ) s = swh_storage.snapshot_add([snapshot]) assert s == { "snapshot:add": 1, } unexpected_snapshot = { "id": "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7", # id is correct "branches": { b"master": { "target": hash_to_bytes(hash_hex), # wrong branch "target_type": "release", } }, } # 0. not a Snapshot object, raise! with pytest.raises(AssertionError, match="variable 'snapshot' must be a snapshot"): check_snapshot(ORIGIN_VISIT, swh_storage) # 1. snapshot id is correct but branches mismatched for snap_id in [snap_id_hex, snapshot.id]: with pytest.raises(AssertionError, match="Differing attributes"): unexpected_snapshot["id"] = snap_id check_snapshot(unexpected_snapshot, swh_storage) # 2. 
snapshot id is not correct, it's not found in the storage wrong_snap_id_hex = "999666f535f882bc7f9a18fb16c9ad27fda7bab7" for snap_id in [wrong_snap_id_hex, hash_to_bytes(wrong_snap_id_hex)]: unexpected_snapshot["id"] = wrong_snap_id_hex with pytest.raises(AssertionError, match="is not found"): check_snapshot(unexpected_snapshot, swh_storage) # 3. snapshot references an inexistent alias snapshot0 = Snapshot( id=hash_to_bytes("123666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), }, ) swh_storage.snapshot_add([snapshot0]) with pytest.raises(InconsistentAliasBranchError, match="Alias branch HEAD"): check_snapshot(snapshot0, swh_storage) # 4. snapshot is found in storage, targeted revision does not exist + + rev_not_found = list(swh_storage.revision_missing([REVISION.id])) + assert len(rev_not_found) == 1 + snapshot1 = Snapshot( id=hash_to_bytes("456666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot1]) with pytest.raises(InexistentObjectsError, match="Branch/Revision"): check_snapshot(snapshot1, swh_storage) + # 5. 
snapshot is found in storage, targeted revision exists but the directory the + # revision targets does not exist + swh_storage.revision_add([REVISION.to_dict()]) + + dir_not_found = list(swh_storage.directory_missing([REVISION.directory])) + assert len(dir_not_found) == 1 + snapshot2 = Snapshot( + id=hash_to_bytes("987123f535f882bc7f9a18fb16c9ad27fda7bab7"), + branches={ + b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ), + }, + ) + + swh_storage.snapshot_add([snapshot2.to_dict()]) + with pytest.raises(InexistentObjectsError, match="Missing directories"): + check_snapshot(snapshot2, swh_storage) + + assert DIRECTORY.id == REVISION.directory + swh_storage.directory_add([DIRECTORY]) + + # 6. snapshot is found in storage, targeted release does not exist + + snapshot3 = Snapshot( id=hash_to_bytes("789666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), b"release/0.1.0": SnapshotBranch( target=RELEASE.id, target_type=TargetType.RELEASE, ), }, ) - swh_storage.snapshot_add([snapshot2]) + swh_storage.snapshot_add([snapshot3]) with pytest.raises(InexistentObjectsError, match="Branch/Release"): - check_snapshot(snapshot2, swh_storage) + check_snapshot(snapshot3, swh_storage)