diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
index 4c8bbdf..6a5e273 100644
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -1,561 +1,572 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import json
 import logging
 import pytest

 from json.decoder import JSONDecodeError
-from typing import Dict, Optional, Tuple
+from swh.storage.interface import StorageInterface
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
 from unittest.mock import patch

-from swh.model.model import Snapshot
+from swh.model.model import Snapshot, TargetType

 from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.nixguix.loader import (
     NixGuixLoader,
     retrieve_sources,
     clean_sources,
 )

 from swh.loader.package.utils import download
 from swh.model.hashutil import hash_to_bytes, hash_to_hex
 from swh.storage.exc import HashCollision

 from swh.loader.tests import (
     assert_last_visit_matches,
     get_stats,
-    check_snapshot,
+    check_snapshot as check_snapshot_full,
 )


 sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json"


+def check_snapshot(
+    snapshot: Union[Dict[str, Any], Snapshot],
+    storage: StorageInterface,
+    allowed_empty: Iterable[Tuple[TargetType, bytes]] = [
+        (TargetType.REVISION, b"evaluation")
+    ],
+):
+    return check_snapshot_full(snapshot, storage, allowed_empty)
+
+
 def test_retrieve_sources(swh_config, requests_mock_datadir):
     j = retrieve_sources(sources_url)
     assert "sources" in j.keys()
     assert len(j["sources"]) == 2


 def test_retrieve_non_existing(swh_config, requests_mock_datadir):
     with pytest.raises(ValueError):
         NixGuixLoader("https://non-existing-url")


 def test_retrieve_non_json(swh_config, requests_mock_datadir):
     with pytest.raises(JSONDecodeError):
         NixGuixLoader("https://example.com/file.txt")


 def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir):
     sources = {}
     with pytest.raises(ValueError, match="sources structure invalid, missing: .*"):
         clean_sources(sources)


 def test_clean_sources_invalid_version(swh_config, requests_mock_datadir):
     for version_ok in [1, "1"]:  # Check those versions are fine
         clean_sources({"version": version_ok, "sources": [], "revision": "my-revision"})

     for version_ko in [0, "0", 2, "2"]:  # Check version != 1 raises an error
         with pytest.raises(
             ValueError, match="sources structure version .* is not supported"
         ):
             clean_sources(
                 {"version": version_ko, "sources": [], "revision": "my-revision"}
             )


 def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir):
     sources = {
         "version": 1,
         "sources": [
             # Valid source
             {"type": "url", "urls": ["my-url"], "integrity": "my-integrity"},
             # integrity is missing
             {"type": "url", "urls": ["my-url"],},
             # urls is not a list
             {"type": "url", "urls": "my-url", "integrity": "my-integrity"},
             # type is not url
             {"type": "git", "urls": ["my-url"], "integrity": "my-integrity"},
             # missing fields which got double-checked nonetheless...
             {"integrity": "my-integrity"},
         ],
         "revision": "my-revision",
     }

     clean = clean_sources(sources)

     assert len(clean["sources"]) == 1


 def check_snapshot_revisions_ok(snapshot, storage):
     """Ensure the snapshot revisions are structurally as expected

     """
     revision_ids = []
     for name, branch in snapshot["branches"].items():
         if name == b"evaluation":
             continue  # skipping that particular branch
         if branch["target_type"] == "revision":
             revision_ids.append(branch["target"])

     revisions = storage.revision_get(revision_ids)
     for rev in revisions:
         metadata = rev["metadata"]
         raw = metadata["extrinsic"]["raw"]
         assert "url" in raw
         assert "integrity" in raw


 def test_loader_one_visit(swh_config, requests_mock_datadir):
     loader = NixGuixLoader(sources_url)
     res = loader.load()
     assert res["status"] == "eventful"

     stats = get_stats(loader.storage)
     assert {
         "content": 1,
         "directory": 3,
         "origin": 1,
         "origin_visit": 1,
         "person": 1,
         "release": 0,
         "revision": 1,
         "skipped_content": 0,
         "snapshot": 1,
     } == stats

     # The visit is partial because urls pointing to non tarball files
     # are not handled yet
     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )


 def test_uncompress_failure(swh_config, requests_mock_datadir):
     """Non tarball files are currently not supported, so the uncompress
     function fails on such files.

     However, even in this failure case (triggered here by the url
     https://example.com/file.txt), a snapshot and a visit have to be
     created (with a "partial" status, since not all files are archived).

     """
     loader = NixGuixLoader(sources_url)
     loader_status = loader.load()

     urls = [s["urls"][0] for s in loader.sources]
     assert "https://example.com/file.txt" in urls
     assert loader_status["status"] == "eventful"

     # The visit is partial because urls pointing to non tarball files
     # are not handled yet
     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )


 def test_loader_incremental(swh_config, requests_mock_datadir):
     """Ensure a second visit does not download artifacts already
     downloaded by the previous visit.

     """
     loader = NixGuixLoader(sources_url)
     load_status = loader.load()

     loader.load()
     expected_snapshot_id = "0c5881c74283793ebe9a09a105a9381e41380383"
     assert load_status == {"status": "eventful", "snapshot_id": expected_snapshot_id}
     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )

     expected_branches = {
         "evaluation": {
             "target": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7",
             "target_type": "revision",
         },
         "https://github.com/owner-1/repository-1/revision-1.tgz": {
             "target": "488ad4e7b8e2511258725063cf43a2b897c503b4",
             "target_type": "revision",
         },
     }
     expected_snapshot = {
         "id": expected_snapshot_id,
         "branches": expected_branches,
     }
     snapshot = check_snapshot(expected_snapshot, storage=loader.storage)

     check_snapshot_revisions_ok(snapshot, loader.storage)

     urls = [
         m.url
         for m in requests_mock_datadir.request_history
         if m.url == ("https://github.com/owner-1/repository-1/revision-1.tgz")
     ]
     # The artifact
     # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only
     # visited once
     assert len(urls) == 1


 def test_loader_two_visits(swh_config, requests_mock_datadir_visits):
     """Ensure there is only one origin, but two visits, two revisions
     and two snapshots created.

     The first visit creates a snapshot containing one tarball. The
     second visit creates a snapshot containing the same tarball and
     another tarball.

     """
     loader = NixGuixLoader(sources_url)
     load_status = loader.load()
     expected_snapshot_id = "0c5881c74283793ebe9a09a105a9381e41380383"
     assert load_status == {"status": "eventful", "snapshot_id": expected_snapshot_id}
     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )

     expected_branches = {
         "evaluation": {
             "target": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7",
             "target_type": "revision",
         },
         "https://github.com/owner-1/repository-1/revision-1.tgz": {
             "target": "488ad4e7b8e2511258725063cf43a2b897c503b4",
             "target_type": "revision",
         },
     }
     expected_snapshot = {
         "id": expected_snapshot_id,
         "branches": expected_branches,
     }
     snapshot = check_snapshot(expected_snapshot, storage=loader.storage)

     check_snapshot_revisions_ok(snapshot, loader.storage)

     stats = get_stats(loader.storage)
     assert {
         "content": 1,
         "directory": 3,
         "origin": 1,
         "origin_visit": 1,
         "person": 1,
         "release": 0,
         "revision": 1,
         "skipped_content": 0,
         "snapshot": 1,
     } == stats

     loader = NixGuixLoader(sources_url)
     load_status = loader.load()
     expected_snapshot_id = "b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97"
     assert load_status == {"status": "eventful", "snapshot_id": expected_snapshot_id}
     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )

     # This ensures visits are incremental. Indeed, if we request an url a
     # second time, because of the requests_mock_datadir_visits fixture, the
     # file has to end with `_visit1`.
     expected_branches = {
         "evaluation": {
             "target": "602140776b2ce6c9159bcf52ada73a297c063d5e",
             "target_type": "revision",
         },
         "https://github.com/owner-1/repository-1/revision-1.tgz": {
             "target": "488ad4e7b8e2511258725063cf43a2b897c503b4",
             "target_type": "revision",
         },
         "https://github.com/owner-2/repository-1/revision-1.tgz": {
             "target": "85e0bad74e33e390aaeb74f139853ae3863ee544",
             "target_type": "revision",
         },
     }
     expected_snapshot = {
         "id": expected_snapshot_id,
         "branches": expected_branches,
     }
     snapshot = check_snapshot(expected_snapshot, storage=loader.storage)

     check_snapshot_revisions_ok(snapshot, loader.storage)

     stats = get_stats(loader.storage)
     assert {
         "content": 2,
         "directory": 5,
         "origin": 1,
         "origin_visit": 2,
         "person": 1,
         "release": 0,
         "revision": 2,
         "skipped_content": 0,
         "snapshot": 2,
     } == stats


 def test_resolve_revision_from(swh_config, requests_mock_datadir):
     loader = NixGuixLoader(sources_url)

     known_artifacts = {
         "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}},
         "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}},
     }

     metadata = {"url": "url1", "integrity": "integrity1"}
     assert loader.resolve_revision_from(known_artifacts, metadata) == "id1"
     metadata = {"url": "url3", "integrity": "integrity3"}
     assert loader.resolve_revision_from(known_artifacts, metadata) == None  # noqa


 def test_evaluation_branch(swh_config, requests_mock_datadir):
     loader = NixGuixLoader(sources_url)
     res = loader.load()
     assert res["status"] == "eventful"

     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )

     expected_branches = {
         "https://github.com/owner-1/repository-1/revision-1.tgz": {
             "target": "488ad4e7b8e2511258725063cf43a2b897c503b4",
             "target_type": "revision",
         },
         "evaluation": {
             "target": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7",
             "target_type": "revision",
         },
     }
     expected_snapshot = {
         "id": "0c5881c74283793ebe9a09a105a9381e41380383",
         "branches": expected_branches,
     }
     snapshot = check_snapshot(expected_snapshot, storage=loader.storage)

     check_snapshot_revisions_ok(snapshot, loader.storage)


 def test_eoferror(swh_config, requests_mock_datadir):
     """Load a truncated archive, which is invalid and makes the uncompress
     function raise the EOFError exception. We then check that a snapshot is
     still created, meaning this error is properly handled.

     """
     sources = (
         "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json"  # noqa
     )
     loader = NixGuixLoader(sources)
     loader.load()

     expected_branches = {
         "evaluation": {
             "target": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7",
             "target_type": "revision",
         },
     }
     expected_snapshot = {
         "id": "4257fa2350168c6bfec726a06452ea27a2c0cb33",
         "branches": expected_branches,
     }
     snapshot = check_snapshot(expected_snapshot, storage=loader.storage)

     check_snapshot_revisions_ok(snapshot, loader.storage)


 def fake_download(
     url: str,
     dest: str,
     hashes: Dict = {},
     filename: Optional[str] = None,
     auth: Optional[Tuple[str, str]] = None,
 ) -> Tuple[str, Dict]:
     """Fake download which raises HashCollision (for the sake of test
     simplicity, let's accept that this makes sense).

     For test purposes only.

     """
     if url == "https://example.com/file.txt":
         # instead of failing because it's a file not dealt with by the nix guix
         # loader, make it raise a hash collision
         raise HashCollision("sha1", "f92d74e3874587aaf443d1db961d4e26dde13e9c", [])
     return download(url, dest, hashes, filename, auth)


 def test_raise_exception(swh_config, requests_mock_datadir, mocker):
     mock_download = mocker.patch("swh.loader.package.loader.download")
     mock_download.side_effect = fake_download

     loader = NixGuixLoader(sources_url)
     res = loader.load()

     expected_snapshot_id = "0c5881c74283793ebe9a09a105a9381e41380383"
     assert res == {
         "status": "eventful",
         "snapshot_id": expected_snapshot_id,
     }

     expected_branches = {
         "https://github.com/owner-1/repository-1/revision-1.tgz": {
             "target": "488ad4e7b8e2511258725063cf43a2b897c503b4",
             "target_type": "revision",
         },
         "evaluation": {
             "target": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7",
             "target_type": "revision",
         },
     }
     expected_snapshot = {
         "id": expected_snapshot_id,
         "branches": expected_branches,
     }
     snapshot = check_snapshot(expected_snapshot, storage=loader.storage)

     check_snapshot_revisions_ok(snapshot, loader.storage)

     assert len(mock_download.mock_calls) == 2

     # The visit is partial because some artifact downloads failed
     assert_last_visit_matches(
         loader.storage, sources_url, status="partial", type="nixguix"
     )


 def test_load_nixguix_one_common_artifact_from_other_loader(
     swh_config, datadir, requests_mock_datadir_visits, caplog
 ):
     """A misformatted revision should be caught and logged, then loading continues

     """
     caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader")

     # 1. first ingest with, for example, the archive loader
     gnu_url = "https://ftp.gnu.org/gnu/8sync/"
     release = "0.1.0"
     artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
     gnu_artifacts = [
         {
             "time": 944729610,
             "url": artifact_url,
             "length": 221837,
             "filename": f"8sync-{release}.tar.gz",
             "version": release,
         }
     ]
     archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts)
     actual_load_status = archive_loader.load()
     expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5"
     assert actual_load_status["status"] == "eventful"
     assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa

     assert_last_visit_matches(
         archive_loader.storage, gnu_url, status="full", type="tar"
     )

     gnu_snapshot = archive_loader.storage.snapshot_get(
         hash_to_bytes(expected_snapshot_id)
     )
     first_revision = gnu_snapshot["branches"][f"releases/{release}".encode("utf-8")]

     # 2. Then ingest with the nixguix loader, which lists the same artifact
     # within its sources.json

     # ensure test setup is ok
     data_sources = os.path.join(
         datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json"
     )
     all_sources = json.loads(open(data_sources).read())
     found = False
     for source in all_sources["sources"]:
         if source["urls"][0] == artifact_url:
             found = True

     assert (
         found is True
     ), f"test setup error: {artifact_url} must be in {data_sources}"

     # first visit with a snapshot, ok
     sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
     loader = NixGuixLoader(sources_url)
     actual_load_status2 = loader.load()
     assert actual_load_status2["status"] == "eventful"
     assert_last_visit_matches(
         loader.storage, sources_url, status="full", type="nixguix"
     )

     snapshot_id = actual_load_status2["snapshot_id"]
     snapshot = loader.storage.snapshot_get(hash_to_bytes(snapshot_id))
     snapshot.pop("next_branch")  # snapshot_get endpoint detail to drop

     # simulate a snapshot already seen with a revision with the wrong metadata
     # structure. This revision should be skipped, thus making the artifact be
     # ingested again.
     with patch(
         "swh.loader.package.loader.PackageLoader.last_snapshot"
     ) as last_snapshot:
         # mutate the snapshot to target a revision with the wrong metadata structure
         # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision
         old_revision = next(loader.storage.revision_get([first_revision["target"]]))
         # assert that revision is not in the right format
         assert old_revision["metadata"]["extrinsic"]["raw"].get("integrity", {}) == {}

         # mutate snapshot to create a clash
         snapshot["branches"][artifact_url.encode("utf-8")] = {
             "target_type": "revision",
             "target": old_revision["id"],
         }

         # modify snapshot to actually change the revision metadata structure, so we
         # simulate a revision written by somebody else (different structure)
         last_snapshot.return_value = Snapshot.from_dict(snapshot)

         loader = NixGuixLoader(sources_url)
         actual_load_status3 = loader.load()
         assert last_snapshot.called
         assert actual_load_status3["status"] == "eventful"

         assert_last_visit_matches(
             loader.storage, sources_url, status="full", type="nixguix"
         )

         new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965"
         assert actual_load_status3["snapshot_id"] == new_snapshot_id

         last_snapshot = loader.storage.snapshot_get(hash_to_bytes(new_snapshot_id))
         new_revision_branch = last_snapshot["branches"][artifact_url.encode("utf-8")]
         assert new_revision_branch["target_type"] == "revision"

         new_revision = next(
             loader.storage.revision_get([new_revision_branch["target"]])
         )

         # the new revision has the correct structure, so it got ingested alright by
         # the new run
         assert new_revision["metadata"]["extrinsic"]["raw"]["integrity"] is not None

         nb_detections = 0
         actual_detection: Dict
         for record in caplog.records:
             logtext = record.getMessage()
             if "Unexpected metadata revision structure detected:" in logtext:
                 nb_detections += 1
                 actual_detection = record.args["context"]

         assert actual_detection

         # as many calls as there are sources listed in the sources.json
         assert nb_detections == len(all_sources["sources"])

         assert actual_detection == {
             "revision": hash_to_hex(old_revision["id"]),
             "reason": "'integrity'",
             "known_artifact": old_revision["metadata"],
         }
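The module-level check_snapshot wrapper introduced at the top of this file only curries a default allowed_empty argument, which is why none of the call sites above needed to change. A minimal sketch of what each call expands to (check_nixguix_snapshot is a hypothetical name; expected_snapshot and storage stand in for the locals of the tests above):

from typing import Any, Dict

from swh.model.model import TargetType
from swh.storage.interface import StorageInterface

from swh.loader.tests import check_snapshot as check_snapshot_full


def check_nixguix_snapshot(
    expected_snapshot: Dict[str, Any], storage: StorageInterface
) -> Dict[str, Any]:
    # Equivalent to check_snapshot(expected_snapshot, storage=storage) under the
    # wrapper: the "evaluation" branch targets the nixpkgs git commit, which is
    # not loaded into the storage, so its existence check is skipped.
    return check_snapshot_full(
        expected_snapshot,
        storage,
        allowed_empty=[(TargetType.REVISION, b"evaluation")],
    )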
diff --git a/swh/loader/tests/__init__.py b/swh/loader/tests/__init__.py
index d88648e..8149f72 100644
--- a/swh/loader/tests/__init__.py
+++ b/swh/loader/tests/__init__.py
@@ -1,167 +1,228 @@
 # Copyright (C) 2018-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import subprocess

+from collections import defaultdict
 from pathlib import PosixPath
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

-from swh.model.model import OriginVisitStatus, Snapshot
+from swh.model.model import OriginVisitStatus, Snapshot, TargetType
 from swh.model.hashutil import hash_to_bytes
 from swh.storage.interface import StorageInterface
 from swh.storage.algos.origin import origin_get_latest_visit_status


 def assert_last_visit_matches(
     storage,
     url: str,
     status: str,
     type: Optional[str] = None,
     snapshot: Optional[bytes] = None,
 ) -> OriginVisitStatus:
     """This retrieves the last visit and visit_status, which are expected to exist.

     This also checks that the {visit|visit_status} have their respective properties
     correctly set.

     This returns the last visit_status for that given origin.

     Args:
         url: Origin url
         status: Check that the visit status has the given status
         type: Check that the returned visit has the given type
         snapshot: Check that the visit status points to the given snapshot

     Raises:
         AssertionError in case visit or visit status is not found, or any of the
         type, status and snapshot mismatch

     Returns:
         the visit status for further check during the remaining part of the test.

     """
     visit_and_status = origin_get_latest_visit_status(storage, url)
     assert visit_and_status is not None, f"Origin {url} has no visits"
     visit, visit_status = visit_and_status
     if type:
         assert visit.type == type, f"Visit has type {visit.type} instead of {type}"
     assert (
         visit_status.status == status
     ), f"Visit_status has status {visit_status.status} instead of {status}"
     if snapshot is not None:
         assert visit_status.snapshot is not None
         assert visit_status.snapshot == snapshot, (
             f"Visit_status points to snapshot {visit_status.snapshot.hex()} "
             f"instead of {snapshot.hex()}"
         )
     return visit_status


 def prepare_repository_from_archive(
     archive_path: str,
     filename: Optional[str] = None,
     tmp_path: Union[PosixPath, str] = "/tmp",
 ) -> str:
     """Given an existing archive_path, uncompress it.
     Returns a file repo url which can be used as origin url.

     This does not deal with the case where the archive passed along does not exist.

     """
     if not isinstance(tmp_path, str):
         tmp_path = str(tmp_path)
     # uncompress folder/repositories/dump for the loader to ingest
     subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path])
     # build the origin url (or some derivative form)
     _fname = filename if filename else os.path.basename(archive_path)
     repo_url = f"file://{tmp_path}/{_fname}"
     return repo_url


 def encode_target(target: Dict) -> Dict:
     """Test helper to ease readability in tests

     """
     if not target:
         return target
     target_type = target["target_type"]
     target_data = target["target"]
     if target_type == "alias" and isinstance(target_data, str):
         encoded_target = target_data.encode("utf-8")
     elif isinstance(target_data, str):
         encoded_target = hash_to_bytes(target_data)
     else:
         encoded_target = target_data

     return {"target": encoded_target, "target_type": target_type}


+class InconsistentAliasBranchError(AssertionError):
+    """When an alias branch targets an inexistent branch."""
+
+    pass
+
+
+class InexistentObjectsError(AssertionError):
+    """When a targeted branch reference does not exist in the storage"""
+
+    pass
+
+
 def check_snapshot(
-    snapshot: Union[Dict[str, Any], Snapshot], storage: StorageInterface
+    snapshot: Union[Dict[str, Any], Snapshot],
+    storage: StorageInterface,
+    allowed_empty: Iterable[Tuple[TargetType, bytes]] = [],
 ):
-    """Check for snapshot match.
-
-    The hashes can be both in hex or bytes, the necessary conversion will happen prior
-    to check.
+    """Check that:
+    - the snapshot exists in the storage and matches
+    - each object referenced, up to the revision/release targets, exists

     Args:
         snapshot: full snapshot to check for existence and consistency
         storage: storage to lookup information into
+        allowed_empty: Iterable of branches we allow to be empty (some edge case
+            loaders allow this to happen; nixguix for example allows the branch
+            "evaluation" to target the nixpkgs git commit reference, which may not
+            yet be resolvable at loading time)

     Returns:
         the snapshot stored in the storage for further test assertion if any is
         needed.

     """
     if isinstance(snapshot, Snapshot):
         expected_snapshot = snapshot
     elif isinstance(snapshot, dict):
         # dict must be snapshot compliant
         snapshot_dict = {"id": hash_to_bytes(snapshot["id"])}
         branches = {}
         for branch, target in snapshot["branches"].items():
             if isinstance(branch, str):
                 branch = branch.encode("utf-8")
             branches[branch] = encode_target(target)
         snapshot_dict["branches"] = branches
         expected_snapshot = Snapshot.from_dict(snapshot_dict)
     else:
         raise AssertionError(f"variable 'snapshot' must be a snapshot: {snapshot!r}")

-    snap = storage.snapshot_get(expected_snapshot.id)
-    if snap is None:
+    snapshot_dict = storage.snapshot_get(expected_snapshot.id)
+    if snapshot_dict is None:
         raise AssertionError(f"Snapshot {expected_snapshot.id.hex()} is not found")

-    assert snap["next_branch"] is None  # we don't deal with large snapshot in tests
-    snap.pop("next_branch")
-    actual_snap = Snapshot.from_dict(snap)
-
-    assert expected_snapshot == actual_snap
-
-    return snap  # for retro compat, returned the dict, remove when clients are migrated
+    snapshot_dict.pop("next_branch")
+    actual_snapshot = Snapshot.from_dict(snapshot_dict)
+    assert isinstance(actual_snapshot, Snapshot)
+
+    assert expected_snapshot == actual_snapshot
+
+    branches_by_target_type = defaultdict(list)
+    object_to_branch = {}
+    for branch, target in actual_snapshot.branches.items():
+        if (target.target_type, branch) in allowed_empty:
+            # safe for those elements to not be checked for existence
+            continue
+        branches_by_target_type[target.target_type].append(target.target)
+        object_to_branch[target.target] = branch
+
+    # check that alias references target something that exists, otherwise raise
+    aliases: List[bytes] = branches_by_target_type.get(TargetType.ALIAS, [])
+    for alias in aliases:
+        if alias not in actual_snapshot.branches:
+            raise InconsistentAliasBranchError(
+                f"Alias branch {alias.decode('utf-8')} "
+                f"should be in {list(actual_snapshot.branches)}"
+            )
+
+    revs = branches_by_target_type.get(TargetType.REVISION)
+    if revs:
+        revisions = list(storage.revision_get(revs))
+        not_found = [rev_id for rev_id, rev in zip(revs, revisions) if rev is None]
+        if not_found:
+            missing_objs = ", ".join(
+                str((object_to_branch[rev], rev.hex())) for rev in not_found
+            )
+            raise InexistentObjectsError(
+                f"Branch/Revision(s) {missing_objs} should exist in storage"
+            )
+
+    rels = branches_by_target_type.get(TargetType.RELEASE)
+    if rels:
+        releases = list(storage.release_get(rels))
+        not_found = [rel_id for rel_id, rel in zip(rels, releases) if rel is None]
+        if not_found:
+            missing_objs = ", ".join(
+                str((object_to_branch[rel], rel.hex())) for rel in not_found
+            )
+            raise InexistentObjectsError(
+                f"Branch/Release(s) {missing_objs} should exist in storage"
+            )
+
+    # for retro compat, return the dict; remove when clients are migrated
+    return snapshot_dict


 def get_stats(storage) -> Dict:
     """Adaptation utils to unify the stats counters across storage implementations.

     """
     storage.refresh_stat_counters()
     stats = storage.stat_counters()

     keys = [
         "content",
         "directory",
         "origin",
         "origin_visit",
         "person",
         "release",
         "revision",
         "skipped_content",
         "snapshot",
     ]
     return {k: stats.get(k) for k in keys}
diff --git a/swh/loader/tests/test_init.py b/swh/loader/tests/test_init.py
index cbd6693..caafdab 100644
--- a/swh/loader/tests/test_init.py
+++ b/swh/loader/tests/test_init.py
@@ -1,258 +1,455 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import pytest
 import os
 import subprocess

+from swh.model.from_disk import DentryPerms
 from swh.model.model import (
+    Content,
+    Directory,
+    DirectoryEntry,
+    ObjectType,
     OriginVisit,
     OriginVisitStatus,
+    Person,
+    Release,
+    Revision,
+    RevisionType,
     Snapshot,
     SnapshotBranch,
     TargetType,
+    Timestamp,
+    TimestampWithTimezone,
 )
 from swh.model.hashutil import hash_to_bytes

 from swh.loader.tests import (
     assert_last_visit_matches,
     encode_target,
     check_snapshot,
     prepare_repository_from_archive,
+    InconsistentAliasBranchError,
+    InexistentObjectsError,
 )


 hash_hex = "43e45d56f88993aae6a0198013efa80716fd8920"


 ORIGIN_VISIT = OriginVisit(
     origin="some-url",
     visit=1,
     date=datetime.datetime.now(tz=datetime.timezone.utc),
     type="archive",
 )


 ORIGIN_VISIT_STATUS = OriginVisitStatus(
     origin="some-url",
     visit=1,
     date=datetime.datetime.now(tz=datetime.timezone.utc),
     status="full",
     snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
     metadata=None,
 )


+CONTENT = Content(
+    data=b"42\n",
+    length=3,
+    sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
+    sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
+    sha256=hash_to_bytes(
+        "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a"
+    ),
+    blake2s256=hash_to_bytes(
+        "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
+    ),
+    status="visible",
+)
+
+
+DIRECTORY = Directory(
+    id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
+    entries=tuple(
+        [
+            DirectoryEntry(
+                name=b"foo",
+                type="file",
+                target=CONTENT.sha1_git,
+                perms=DentryPerms.content,
+            )
+        ]
+    ),
+)
+
+
+REVISION = Revision(
+    id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"),
+    message=b"hello",
+    author=Person(
+        name=b"Nicolas Dandrimont",
+        email=b"nicolas@example.com",
+        fullname=b"Nicolas Dandrimont <nicolas@example.com> ",
+    ),
+    date=TimestampWithTimezone(
+        timestamp=Timestamp(seconds=1234567890, microseconds=0),
+        offset=120,
+        negative_utc=False,
+    ),
+    committer=Person(
+        name=b"St\xc3fano Zacchiroli",
+        email=b"stefano@example.com",
+        fullname=b"St\xc3fano Zacchiroli <stefano@example.com>",
+    ),
+    committer_date=TimestampWithTimezone(
+        timestamp=Timestamp(seconds=1123456789, microseconds=0),
+        offset=0,
+        negative_utc=True,
+    ),
+    parents=(),
+    type=RevisionType.GIT,
+    directory=DIRECTORY.id,
+    metadata={
+        "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",},
+        "signed-off-by": "some-dude",
+    },
+    extra_headers=(
+        (b"gpgsig", b"test123"),
+        (b"mergetag", b"foo\\bar"),
+        (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
+    ),
+    synthetic=True,
+)
+
+
+RELEASE = Release(
+    id=hash_to_bytes("3e9050196aa288264f2a9d279d6abab8b158448b"),
+    name=b"v0.0.2",
+    author=Person(
+        name=b"tony", email=b"tony@ardumont.fr", fullname=b"tony <tony@ardumont.fr>",
+    ),
+    date=TimestampWithTimezone(
+        timestamp=Timestamp(seconds=1634336813, microseconds=0),
+        offset=0,
+        negative_utc=False,
+    ),
+    target=REVISION.id,
+    target_type=ObjectType.REVISION,
+    message=b"yet another synthetic release",
+    synthetic=True,
+)
+
+
+SNAPSHOT = Snapshot(
+    id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"),
+    branches={
+        b"release/0.1.0": SnapshotBranch(
+            target=RELEASE.id, target_type=TargetType.RELEASE,
+        ),
+        b"HEAD": SnapshotBranch(target=REVISION.id, target_type=TargetType.REVISION,),
+        b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,),
+        b"evaluation": SnapshotBranch(  # branch dedicated to not exist in storage
+            target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
+            target_type=TargetType.REVISION,
+        ),
+    },
+)
+
+
 @pytest.fixture
 def mock_storage(mocker):
     mock_storage = mocker.patch("swh.loader.tests.origin_get_latest_visit_status")
     mock_storage.return_value = ORIGIN_VISIT, ORIGIN_VISIT_STATUS
     return mock_storage


 def test_assert_last_visit_matches_raise(mock_storage, mocker):
     """Not finding the origin visit_and_status should raise

     """
     # overwrite so we raise because we do not find the right visit
     mock_storage.return_value = None

     with pytest.raises(AssertionError, match="Origin url has no visits"):
         assert_last_visit_matches(mock_storage, "url", status="full")

     assert mock_storage.called is True


 def test_assert_last_visit_matches_wrong_status(mock_storage, mocker):
     """Wrong visit detected should raise AssertionError

     """
     expected_status = "partial"
     assert ORIGIN_VISIT_STATUS.status != expected_status
     with pytest.raises(AssertionError, match="Visit_status has status"):
         assert_last_visit_matches(mock_storage, "url", status=expected_status)

     assert mock_storage.called is True


 def test_assert_last_visit_matches_wrong_type(mock_storage, mocker):
     """Wrong visit detected should raise AssertionError

     """
     expected_type = "git"
     assert ORIGIN_VISIT.type != expected_type
     with pytest.raises(AssertionError, match="Visit has type"):
         assert_last_visit_matches(
             mock_storage,
             "url",
             status=ORIGIN_VISIT_STATUS.status,
             type=expected_type,  # mismatched type will raise
         )

     assert mock_storage.called is True


 def test_assert_last_visit_matches_wrong_snapshot(mock_storage, mocker):
     """Wrong visit detected should raise AssertionError

     """
     expected_snapshot_id = hash_to_bytes("e92cc0710eb6cf9efd5b920a8453e1e07157b6cd")
     assert ORIGIN_VISIT_STATUS.snapshot != expected_snapshot_id

     with pytest.raises(AssertionError, match="Visit_status points to snapshot"):
         assert_last_visit_matches(
             mock_storage,
             "url",
             status=ORIGIN_VISIT_STATUS.status,
             snapshot=expected_snapshot_id,  # mismatched snapshot will raise
         )

     assert mock_storage.called is True


 def test_assert_last_visit_matches(mock_storage, mocker):
     """Correct visit detected should return the visit_status

     """
     visit_type = ORIGIN_VISIT.type
     visit_status = ORIGIN_VISIT_STATUS.status
     visit_snapshot = ORIGIN_VISIT_STATUS.snapshot

     actual_visit_status = assert_last_visit_matches(
         mock_storage,
         "url",
         type=visit_type,
         status=visit_status,
         snapshot=visit_snapshot,
     )

     assert actual_visit_status == ORIGIN_VISIT_STATUS
     assert mock_storage.called is True


 def test_prepare_repository_from_archive_failure():
     # does not deal with a nonexistent archive, so it raises
     assert os.path.exists("unknown-archive") is False
     with pytest.raises(subprocess.CalledProcessError, match="exit status 2"):
         prepare_repository_from_archive("unknown-archive")


 def test_prepare_repository_from_archive(datadir, tmp_path):
     archive_name = "0805nexter-1.1.0"
     archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz")
     assert os.path.exists(archive_path) is True

     tmp_path = str(tmp_path)  # deals with path as string
     repo_url = prepare_repository_from_archive(
         archive_path, filename=archive_name, tmp_path=tmp_path
     )
     expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name)
     assert repo_url == f"file://{expected_uncompressed_archive_path}"
     assert os.path.exists(expected_uncompressed_archive_path)


 def test_prepare_repository_from_archive_no_filename(datadir, tmp_path):
     archive_name = "0805nexter-1.1.0"
     archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz")
     assert os.path.exists(archive_path) is True

     # deals with path as posix path (for tmp_path)
     repo_url = prepare_repository_from_archive(archive_path, tmp_path=tmp_path)

     tmp_path = str(tmp_path)
     expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name)
     expected_repo_url = os.path.join(tmp_path, f"{archive_name}.tar.gz")
     assert repo_url == f"file://{expected_repo_url}"

     # passing along the filename does not influence the on-disk extraction,
     # just the repo-url computation
     assert os.path.exists(expected_uncompressed_archive_path)


 def test_encode_target():
     assert encode_target(None) is None

     for target_alias in ["something", b"something"]:
         target = {
             "target_type": "alias",
             "target": target_alias,
         }
         actual_alias_encode_target = encode_target(target)
         assert actual_alias_encode_target == {
             "target_type": "alias",
             "target": b"something",
         }

     for hash_ in [hash_hex, hash_to_bytes(hash_hex)]:
         target = {"target_type": "revision", "target": hash_}
         actual_encode_target = encode_target(target)
         assert actual_encode_target == {
             "target_type": "revision",
             "target": hash_to_bytes(hash_hex),
         }


 def test_check_snapshot(swh_storage):
-    """Check snapshot should not raise when everything is fine"""
-    snapshot = Snapshot(
-        id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"),
-        branches={
-            b"master": SnapshotBranch(
-                target=hash_to_bytes(hash_hex), target_type=TargetType.REVISION,
-            ),
-        },
-    )
+    """Everything should be fine when the snapshot is found and its references,
+    up to the revision/release targets, exist in the storage.

-    s = swh_storage.snapshot_add([snapshot])
+    """
+    # Create a consistent snapshot arborescence tree in storage
+    found = False
+    for entry in DIRECTORY.entries:
+        if entry.target == CONTENT.sha1_git:
+            found = True
+            break
+    assert found is True
+
+    assert REVISION.directory == DIRECTORY.id
+    assert RELEASE.target == REVISION.id
+
+    for branch, target in SNAPSHOT.branches.items():
+        if branch == b"alias":
+            assert target.target in SNAPSHOT.branches
+        elif branch == b"evaluation":
+            # this one does not exist and we are safelisting its check below
+            continue
+        else:
+            assert target.target in [REVISION.id, RELEASE.id]
+
+    swh_storage.content_add([CONTENT.to_dict()])
+    swh_storage.directory_add([DIRECTORY.to_dict()])
+    swh_storage.revision_add([REVISION.to_dict()])
+    swh_storage.release_add([RELEASE.to_dict()])
+    s = swh_storage.snapshot_add([SNAPSHOT.to_dict()])
     assert s == {
         "snapshot:add": 1,
     }

-    for snap in [snapshot, snapshot.to_dict()]:
-        check_snapshot(snap, swh_storage)
+    for snap in [SNAPSHOT, SNAPSHOT.to_dict()]:
+        # all should be fine!
+        check_snapshot(
+            snap, swh_storage, allowed_empty=[(TargetType.REVISION, b"evaluation")]
+        )
+
+
+def test_check_snapshot_failures(swh_storage):
+    """Failure scenarios:
+
+    0. snapshot parameter is not a snapshot
+    1. snapshot id is correct but branches mismatched
+    2. snapshot id is not correct, it's not found in the storage
+    3. snapshot references an alias which does not exist
+    4. snapshot is found in storage, targeted revision does not exist
+    5. snapshot is found in storage, targeted release does not exist

-def test_check_snapshot_failure(swh_storage):
-    """check_snapshot should raise if something goes wrong"""
+    The following are not dealt with yet:
+
+    6. snapshot is found in storage, targeted directory does not exist
+    7. snapshot is found in storage, targeted content does not exist
+
+    """
     snap_id_hex = "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"
     snapshot = Snapshot(
         id=hash_to_bytes(snap_id_hex),
         branches={
             b"master": SnapshotBranch(
                 target=hash_to_bytes(hash_hex), target_type=TargetType.REVISION,
             ),
         },
     )

     s = swh_storage.snapshot_add([snapshot])
     assert s == {
         "snapshot:add": 1,
     }

     unexpected_snapshot = {
         "id": "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7",  # id is correct
         "branches": {
             "master": {"target": hash_hex, "target_type": "release",}  # wrong branch
         },
     }

-    # id is correct, the branch is wrong, that should raise nonetheless
+    # 0. not a Snapshot object, raise!
+    with pytest.raises(AssertionError, match="variable 'snapshot' must be a snapshot"):
+        check_snapshot(ORIGIN_VISIT, swh_storage)
+
+    # 1. snapshot id is correct but branches mismatched
     for snap_id in [snap_id_hex, snapshot.id]:
         with pytest.raises(AssertionError, match="Differing attributes"):
             unexpected_snapshot["id"] = snap_id
             check_snapshot(unexpected_snapshot, swh_storage)

-    # snapshot id which does not exist
+    # 2. snapshot id is not correct, it's not found in the storage
     wrong_snap_id_hex = "999666f535f882bc7f9a18fb16c9ad27fda7bab7"
     for snap_id in [wrong_snap_id_hex, hash_to_bytes(wrong_snap_id_hex)]:
         unexpected_snapshot["id"] = wrong_snap_id_hex
         with pytest.raises(AssertionError, match="is not found"):
             check_snapshot(unexpected_snapshot, swh_storage)

-    # not a Snapshot object, raise!
-    with pytest.raises(AssertionError, match="variable 'snapshot' must be a snapshot"):
-        check_snapshot(ORIGIN_VISIT, swh_storage)
+    # 3. snapshot references an inexistent alias
+    snapshot0 = Snapshot(
+        id=hash_to_bytes("123666f535f882bc7f9a18fb16c9ad27fda7bab7"),
+        branches={
+            b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,),
+        },
+    )
+    swh_storage.snapshot_add([snapshot0])
+
+    with pytest.raises(InconsistentAliasBranchError, match="Alias branch HEAD"):
+        check_snapshot(snapshot0, swh_storage)
+
+    # 4. snapshot is found in storage, targeted revision does not exist
+    snapshot1 = Snapshot(
+        id=hash_to_bytes("456666f535f882bc7f9a18fb16c9ad27fda7bab7"),
+        branches={
+            b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,),
+            b"HEAD": SnapshotBranch(
+                target=REVISION.id, target_type=TargetType.REVISION,
+            ),
+        },
+    )
+
+    swh_storage.snapshot_add([snapshot1])
+
+    with pytest.raises(InexistentObjectsError, match="Branch/Revision"):
+        check_snapshot(snapshot1, swh_storage)
+
+    # 5. snapshot is found in storage, targeted release does not exist
+    swh_storage.revision_add([REVISION.to_dict()])
+    snapshot2 = Snapshot(
+        id=hash_to_bytes("789666f535f882bc7f9a18fb16c9ad27fda7bab7"),
+        branches={
+            b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,),
+            b"HEAD": SnapshotBranch(
+                target=REVISION.id, target_type=TargetType.REVISION,
+            ),
+            b"release/0.1.0": SnapshotBranch(
+                target=RELEASE.id, target_type=TargetType.RELEASE,
+            ),
+        },
+    )
+
+    swh_storage.snapshot_add([snapshot2])
+
+    with pytest.raises(InexistentObjectsError, match="Branch/Release"):
+        check_snapshot(snapshot2, swh_storage)
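Finally, a hedged sketch of the new allowed_empty knob from a caller's perspective, assuming the usual swh_storage pytest fixture and a made-up snapshot id (the test name is illustrative): a snapshot whose "evaluation" branch targets a revision absent from storage fails the default check, but passes once that branch is safelisted.

import pytest

from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType

from swh.loader.tests import InexistentObjectsError, check_snapshot


def test_allowed_empty_branch_sketch(swh_storage):
    snapshot = Snapshot(
        id=hash_to_bytes("aaaa66f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"evaluation": SnapshotBranch(
                # revision deliberately absent from the storage
                target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
                target_type=TargetType.REVISION,
            ),
        },
    )
    swh_storage.snapshot_add([snapshot.to_dict()])

    # Without the safelist, the dangling revision is reported...
    with pytest.raises(InexistentObjectsError, match="Branch/Revision"):
        check_snapshot(snapshot, swh_storage)

    # ...with it, the check passes and returns the stored snapshot dict.
    check_snapshot(
        snapshot, swh_storage, allowed_empty=[(TargetType.REVISION, b"evaluation")]
    )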