diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py index 3a4e6a2..06d61d5 100644 --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -1,672 +1,672 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from json.decoder import JSONDecodeError import logging import os from typing import Dict, Optional, Tuple from unittest.mock import patch import attr import pytest from swh.loader.package import __version__ from swh.loader.package.archive.loader import ArchiveLoader from swh.loader.package.nixguix.loader import ( NixGuixLoader, NixGuixPackageInfo, clean_sources, make_pattern_unsupported_file_extension, parse_sources, retrieve_sources, ) from swh.loader.package.utils import download from swh.loader.tests import assert_last_visit_matches from swh.loader.tests import check_snapshot as check_snapshot_full from swh.loader.tests import get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.identifiers import SWHID from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, Snapshot, SnapshotBranch, TargetType, ) from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.exc import HashCollision from swh.storage.interface import PagedResult, StorageInterface sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json" @pytest.fixture def raw_sources(datadir) -> bytes: with open( os.path.join( datadir, "https_nix-community.github.io", "nixpkgs-swh_sources.json" ), "rb", ) as f: return f.read() SNAPSHOT1 = Snapshot( id=hash_to_bytes("0c5881c74283793ebe9a09a105a9381e41380383"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("488ad4e7b8e2511258725063cf43a2b897c503b4"), target_type=TargetType.REVISION, ), }, ) def check_snapshot(snapshot: Snapshot, storage: StorageInterface): # The `evaluation` branch is allowed to be unresolvable. It's possible at current # nixguix visit time, it is not yet visited (the git loader is in charge of its # visit for now). For more details, check the # swh.loader.package.nixguix.NixGuixLoader.extra_branches docstring. check_snapshot_full( snapshot, storage, allowed_empty=[(TargetType.REVISION, b"evaluation")] ) assert isinstance(snapshot, Snapshot) # then ensure the snapshot revisions are structurally as expected revision_ids = [] for name, branch in snapshot.branches.items(): if name == b"evaluation": continue # skipping that particular branch (cf. previous comment) if branch.target_type == TargetType.REVISION: revision_ids.append(branch.target) revisions = storage.revision_get(revision_ids) for rev in revisions: assert rev is not None metadata = rev.metadata assert metadata is not None raw = metadata["extrinsic"]["raw"] assert "url" in raw assert "integrity" in raw def test_retrieve_sources(swh_config, requests_mock_datadir): j = parse_sources(retrieve_sources(sources_url)) assert "sources" in j.keys() assert len(j["sources"]) == 2 def test_retrieve_non_existing(swh_config, requests_mock_datadir): with pytest.raises(ValueError): NixGuixLoader("https://non-existing-url") def test_retrieve_non_json(swh_config, requests_mock_datadir): with pytest.raises(JSONDecodeError): NixGuixLoader("https://example.com/file.txt") def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir): sources = {} with pytest.raises(ValueError, match="sources structure invalid, missing: .*"): clean_sources(sources) def test_clean_sources_invalid_version(swh_config, requests_mock_datadir): for version_ok in [1, "1"]: # Check those versions are fine clean_sources({"version": version_ok, "sources": [], "revision": "my-revision"}) for version_ko in [0, "0", 2, "2"]: # Check version != 1 raise an error with pytest.raises( ValueError, match="sources structure version .* is not supported" ): clean_sources( {"version": version_ko, "sources": [], "revision": "my-revision"} ) def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir): valid_sources = [ # 1 valid source {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"}, ] sources = { "version": 1, "sources": valid_sources + [ # integrity is missing {"type": "url", "urls": ["my-url.tgz"],}, # urls is not a list {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"}, # type is not url {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"}, # missing fields which got double-checked nonetheless... {"integrity": "my-integrity"}, ], "revision": "my-revision", } clean = clean_sources(sources) assert len(clean["sources"]) == len(valid_sources) def test_make_pattern_unsupported_file_extension(): unsupported_extensions = ["el", "c", "txt"] supported_extensions = ["Z", "7z"] # for test actual_unsupported_pattern = make_pattern_unsupported_file_extension( unsupported_extensions ) for supported_ext in supported_extensions: assert supported_ext not in unsupported_extensions supported_filepath = f"anything.{supported_ext}" actual_match = actual_unsupported_pattern.match(supported_filepath) assert not actual_match for unsupported_ext in unsupported_extensions: unsupported_filepath = f"something.{unsupported_ext}" actual_match = actual_unsupported_pattern.match(unsupported_filepath) assert actual_match def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir): unsupported_file_extensions = [ "iso", "whl", "gem", "pom", "msi", "pod", "png", "rock", "ttf", "jar", "c", "el", "rpm", "diff", "patch", ] supported_sources = [ { "type": "url", "urls": [f"https://server.org/my-url.{ext}"], "integrity": "my-integrity", } for ext in [ "known-unknown-but-ok", # this is fine as well with the current approach "zip", "tar.gz", "tgz", "tar.bz2", "tbz", "tbz2", "tar.xz", "tar", "zip", "7z", "Z", ] ] unsupported_sources = [ { "type": "url", "urls": [f"https://server.org/my-url.{ext}"], "integrity": "my-integrity", } for ext in unsupported_file_extensions ] sources = { "version": 1, "sources": supported_sources + unsupported_sources, "revision": "my-revision", } clean = clean_sources(sources, unsupported_file_extensions) assert len(clean["sources"]) == len(supported_sources) def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources): loader = NixGuixLoader(sources_url) res = loader.load() assert res["status"] == "eventful" stats = get_stats(loader.storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats # The visit is partial because urls pointing to non tarball file # are not handled yet assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix" ) - (_, visit_status) = origin_get_latest_visit_status(loader.storage, sources_url) + visit_status = origin_get_latest_visit_status(loader.storage, sources_url) snapshot_swhid = SWHID( object_type="snapshot", object_id=hash_to_hex(visit_status.snapshot) ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url=sources_url, ) expected_metadata = [ RawExtrinsicMetadata( type=MetadataTargetType.SNAPSHOT, target=snapshot_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.nixguix.loader.NixGuixLoader", version=__version__, ), discovery_date=loader.visit_date, format="nixguix-sources-json", metadata=raw_sources, origin=sources_url, ) ] assert loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.SNAPSHOT, snapshot_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) def test_uncompress_failure(swh_config, requests_mock_datadir): """Non tarball files are currently not supported and the uncompress function fails on such kind of files. However, even in this case of failure (because of the url https://example.com/file.txt), a snapshot and a visit has to be created (with a status partial since all files are not archived). """ loader = NixGuixLoader(sources_url) loader_status = loader.load() urls = [s["urls"][0] for s in loader.sources] assert "https://example.com/file.txt" in urls assert loader_status["status"] == "eventful" # The visit is partial because urls pointing to non tarball files # are not handled yet assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix" ) def test_loader_incremental(swh_config, requests_mock_datadir): """Ensure a second visit do not download artifact already downloaded by the previous visit. """ loader = NixGuixLoader(sources_url) load_status = loader.load() loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=loader.storage) urls = [ m.url for m in requests_mock_datadir.request_history if m.url == ("https://github.com/owner-1/repository-1/revision-1.tgz") ] # The artifact # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only # visited one time assert len(urls) == 1 def test_loader_two_visits(swh_config, requests_mock_datadir_visits): """To ensure there is only one origin, but two visits, two revisions and two snapshots are created. The first visit creates a snapshot containing one tarball. The second visit creates a snapshot containing the same tarball and another tarball. """ loader = NixGuixLoader(sources_url) load_status = loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=loader.storage) stats = get_stats(loader.storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats loader = NixGuixLoader(sources_url) load_status = loader.load() expected_snapshot_id_hex = "b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97" expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) assert load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id_hex, } assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=expected_snapshot_id, ) # This ensures visits are incremental. Indeed, if we request a # second time an url, because of the requests_mock_datadir_visits # fixture, the file has to end with `_visit1`. expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("602140776b2ce6c9159bcf52ada73a297c063d5e"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("488ad4e7b8e2511258725063cf43a2b897c503b4"), target_type=TargetType.REVISION, ), b"https://github.com/owner-2/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("85e0bad74e33e390aaeb74f139853ae3863ee544"), target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) stats = get_stats(loader.storage) assert { "content": 2, "directory": 5, "origin": 1, "origin_visit": 2, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 2, } == stats def test_resolve_revision_from(swh_config, requests_mock_datadir, datadir): loader = NixGuixLoader(sources_url) known_artifacts = { "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}}, "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}}, } p_info = NixGuixPackageInfo.from_metadata( {"url": "url1", "integrity": "integrity1"} ) assert loader.resolve_revision_from(known_artifacts, p_info) == "id1" p_info = NixGuixPackageInfo.from_metadata( {"url": "url3", "integrity": "integrity3"} ) assert loader.resolve_revision_from(known_artifacts, p_info) == None # noqa def test_evaluation_branch(swh_config, requests_mock_datadir): loader = NixGuixLoader(sources_url) res = loader.load() assert res["status"] == "eventful" assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=loader.storage) def test_eoferror(swh_config, requests_mock_datadir): """Load a truncated archive which is invalid to make the uncompress function raising the exception EOFError. We then check if a snapshot is created, meaning this error is well managed. """ sources = ( "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json" # noqa ) loader = NixGuixLoader(sources) loader.load() expected_snapshot = Snapshot( id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) def fake_download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, ) -> Tuple[str, Dict]: """Fake download which raises HashCollision (for the sake of test simpliciy, let's accept that makes sense) For tests purpose only. """ if url == "https://example.com/file.txt": # instead of failing because it's a file not dealt with by the nix guix # loader, make it raise a hash collision raise HashCollision("sha1", "f92d74e3874587aaf443d1db961d4e26dde13e9c", []) return download(url, dest, hashes, filename, auth) def test_raise_exception(swh_config, requests_mock_datadir, mocker): mock_download = mocker.patch("swh.loader.package.loader.download") mock_download.side_effect = fake_download loader = NixGuixLoader(sources_url) res = loader.load() assert res == { "status": "eventful", "snapshot_id": SNAPSHOT1.id.hex(), } check_snapshot(SNAPSHOT1, storage=loader.storage) assert len(mock_download.mock_calls) == 2 # The visit is partial because some artifact downloads failed assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix" ) def test_load_nixguix_one_common_artifact_from_other_loader( swh_config, datadir, requests_mock_datadir_visits, caplog ): """Misformatted revision should be caught and logged, then loading continues """ caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader") # 1. first ingest with for example the archive loader gnu_url = "https://ftp.gnu.org/gnu/8sync/" release = "0.1.0" artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz" gnu_artifacts = [ { "time": 944729610, "url": artifact_url, "length": 221837, "filename": f"8sync-{release}.tar.gz", "version": release, } ] archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts) actual_load_status = archive_loader.load() expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5" assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] == expected_snapshot_id # noqa assert_last_visit_matches( archive_loader.storage, gnu_url, status="full", type="tar" ) gnu_snapshot: Snapshot = snapshot_get_all_branches( archive_loader.storage, hash_to_bytes(expected_snapshot_id) ) first_revision = gnu_snapshot.branches[f"releases/{release}".encode("utf-8")] # 2. Then ingest with the nixguix loader which lists the same artifact within its # sources.json # ensure test setup is ok data_sources = os.path.join( datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json" ) all_sources = json.loads(open(data_sources).read()) found = False for source in all_sources["sources"]: if source["urls"][0] == artifact_url: found = True assert ( found is True ), f"test setup error: {artifact_url} must be in {data_sources}" # first visit with a snapshot, ok sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json" loader = NixGuixLoader(sources_url) actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" assert_last_visit_matches( loader.storage, sources_url, status="full", type="nixguix" ) snapshot_id = actual_load_status2["snapshot_id"] snapshot = snapshot_get_all_branches(loader.storage, hash_to_bytes(snapshot_id)) assert snapshot # simulate a snapshot already seen with a revision with the wrong metadata structure # This revision should be skipped, thus making the artifact being ingested again. with patch( "swh.loader.package.loader.PackageLoader.last_snapshot" ) as last_snapshot: # mutate the snapshot to target a revision with the wrong metadata structure # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision old_revision = loader.storage.revision_get([first_revision.target])[0] # assert that revision is not in the right format assert old_revision.metadata["extrinsic"]["raw"].get("integrity", {}) == {} # mutate snapshot to create a clash snapshot = attr.evolve( snapshot, branches={ **snapshot.branches, artifact_url.encode("utf-8"): SnapshotBranch( target_type=TargetType.REVISION, target=hash_to_bytes(old_revision.id), ), }, ) # modify snapshot to actually change revision metadata structure so we simulate # a revision written by somebody else (structure different) last_snapshot.return_value = snapshot loader = NixGuixLoader(sources_url) actual_load_status3 = loader.load() assert last_snapshot.called assert actual_load_status3["status"] == "eventful" assert_last_visit_matches( loader.storage, sources_url, status="full", type="nixguix" ) new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965" assert actual_load_status3["snapshot_id"] == new_snapshot_id last_snapshot = snapshot_get_all_branches( loader.storage, hash_to_bytes(new_snapshot_id) ) new_revision_branch = last_snapshot.branches[artifact_url.encode("utf-8")] assert new_revision_branch.target_type == TargetType.REVISION new_revision = loader.storage.revision_get([new_revision_branch.target])[0] # the new revision has the correct structure, so it got ingested alright by the # new run assert new_revision.metadata["extrinsic"]["raw"]["integrity"] is not None nb_detections = 0 actual_detection: Dict for record in caplog.records: logtext = record.getMessage() if "Unexpected metadata revision structure detected:" in logtext: nb_detections += 1 actual_detection = record.args["context"] assert actual_detection # as many calls as there are sources listed in the sources.json assert nb_detections == len(all_sources["sources"]) assert actual_detection == { "revision": hash_to_hex(old_revision.id), "reason": "'integrity'", "known_artifact": old_revision.metadata, } diff --git a/swh/loader/tests/__init__.py b/swh/loader/tests/__init__.py index 79d87eb..a9601ff 100644 --- a/swh/loader/tests/__init__.py +++ b/swh/loader/tests/__init__.py @@ -1,260 +1,261 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import os from pathlib import PosixPath import subprocess from typing import Dict, Iterable, List, Optional, Tuple, Union from swh.model.hashutil import hash_to_bytes from swh.model.model import OriginVisitStatus, Snapshot, TargetType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.interface import StorageInterface def assert_last_visit_matches( storage, url: str, status: str, type: Optional[str] = None, snapshot: Optional[bytes] = None, ) -> OriginVisitStatus: """This retrieves the last visit and visit_status which are expected to exist. This also checks that the {visit|visit_status} have their respective properties correctly set. This returns the last visit_status for that given origin. Args: url: Origin url status: Check that the visit status has the given status type: Check that the returned visit has the given type snapshot: Check that the visit status points to the given snapshot Raises: AssertionError in case visit or visit status is not found, or any of the type, status and snapshot mismatch Returns: the visit status for further check during the remaining part of the test. """ - visit_and_status = origin_get_latest_visit_status(storage, url) - assert visit_and_status is not None, f"Origin {url} has no visits" - visit, visit_status = visit_and_status + visit_status = origin_get_latest_visit_status(storage, url) + assert visit_status is not None, f"Origin {url} has no visits" if type: - assert visit.type == type, f"Visit has type {visit.type} instead of {type}" + assert ( + visit_status.type == type + ), f"Visit has type {visit_status.type} instead of {type}" assert ( visit_status.status == status ), f"Visit_status has status {visit_status.status} instead of {status}" if snapshot is not None: assert visit_status.snapshot is not None assert visit_status.snapshot == snapshot, ( f"Visit_status points to snapshot {visit_status.snapshot.hex()} " f"instead of {snapshot.hex()}" ) return visit_status def prepare_repository_from_archive( archive_path: str, filename: Optional[str] = None, tmp_path: Union[PosixPath, str] = "/tmp", ) -> str: """Given an existing archive_path, uncompress it. Returns a file repo url which can be used as origin url. This does not deal with the case where the archive passed along does not exist. """ if not isinstance(tmp_path, str): tmp_path = str(tmp_path) # uncompress folder/repositories/dump for the loader to ingest subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path]) # build the origin url (or some derivative form) _fname = filename if filename else os.path.basename(archive_path) repo_url = f"file://{tmp_path}/{_fname}" return repo_url def encode_target(target: Dict) -> Dict: """Test helper to ease readability in test """ if not target: return target target_type = target["target_type"] target_data = target["target"] if target_type == "alias" and isinstance(target_data, str): encoded_target = target_data.encode("utf-8") elif isinstance(target_data, str): encoded_target = hash_to_bytes(target_data) else: encoded_target = target_data return {"target": encoded_target, "target_type": target_type} class InconsistentAliasBranchError(AssertionError): """When an alias branch targets an inexistent branch.""" pass class InexistentObjectsError(AssertionError): """When a targeted branch reference does not exist in the storage""" pass def check_snapshot( snapshot: Snapshot, storage: StorageInterface, allowed_empty: Iterable[Tuple[TargetType, bytes]] = [], ) -> Snapshot: """Check that: - snapshot exists in the storage and match - each object reference up to the revision/release targets exists Args: snapshot: full snapshot to check for existence and consistency storage: storage to lookup information into allowed_empty: Iterable of branch we allow to be empty (some edge case loaders allows this case to happen, nixguix for example allows the branch evaluation" to target the nixpkgs git commit reference, which may not yet be resolvable at loading time) Returns: the snapshot stored in the storage for further test assertion if any is needed. """ if not isinstance(snapshot, Snapshot): raise AssertionError(f"variable 'snapshot' must be a snapshot: {snapshot!r}") expected_snapshot = snapshot_get_all_branches(storage, snapshot.id) if expected_snapshot is None: raise AssertionError(f"Snapshot {snapshot.id.hex()} is not found") assert snapshot == expected_snapshot objects_by_target_type = defaultdict(list) object_to_branch = {} for branch, target in expected_snapshot.branches.items(): if (target.target_type, branch) in allowed_empty: # safe for those elements to not be checked for existence continue objects_by_target_type[target.target_type].append(target.target) object_to_branch[target.target] = branch # check that alias references target something that exists, otherwise raise aliases: List[bytes] = objects_by_target_type.get(TargetType.ALIAS, []) for alias in aliases: if alias not in expected_snapshot.branches: raise InconsistentAliasBranchError( f"Alias branch {alias.decode('utf-8')} " f"should be in {list(expected_snapshot.branches)}" ) revs = objects_by_target_type.get(TargetType.REVISION) if revs: revisions = storage.revision_get(revs) not_found = [rev_id for rev_id, rev in zip(revs, revisions) if rev is None] if not_found: missing_objs = ", ".join( str((object_to_branch[rev], rev.hex())) for rev in not_found ) raise InexistentObjectsError( f"Branch/Revision(s) {missing_objs} should exist in storage" ) # retrieve information from revision for revision in revisions: assert revision is not None objects_by_target_type[TargetType.DIRECTORY].append(revision.directory) object_to_branch[revision.directory] = revision.id rels = objects_by_target_type.get(TargetType.RELEASE) if rels: not_found = list(storage.release_missing(rels)) if not_found: missing_objs = ", ".join( str((object_to_branch[rel], rel.hex())) for rel in not_found ) raise InexistentObjectsError( f"Branch/Release(s) {missing_objs} should exist in storage" ) # first level dirs exist? dirs = objects_by_target_type.get(TargetType.DIRECTORY) if dirs: not_found = list(storage.directory_missing(dirs)) if not_found: missing_objs = ", ".join( str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found ) raise InexistentObjectsError( f"Missing directories {missing_objs}: " "(revision exists, directory target does not)" ) for dir_ in dirs: # retrieve new objects to check for existence paths = storage.directory_ls(dir_, recursive=True) for path in paths: if path["type"] == "dir": target_type = TargetType.DIRECTORY else: target_type = TargetType.CONTENT target = path["target"] objects_by_target_type[target_type].append(target) object_to_branch[target] = dir_ # check nested directories dirs = objects_by_target_type.get(TargetType.DIRECTORY) if dirs: not_found = list(storage.directory_missing(dirs)) if not_found: missing_objs = ", ".join( str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found ) raise InexistentObjectsError( f"Missing directories {missing_objs}: " "(revision exists, directory target does not)" ) # check contents directories cnts = objects_by_target_type.get(TargetType.CONTENT) if cnts: not_found = list(storage.content_missing_per_sha1_git(cnts)) if not_found: missing_objs = ", ".join( str((object_to_branch[cnt].hex(), cnt.hex())) for cnt in not_found ) raise InexistentObjectsError(f"Missing contents {missing_objs}") return snapshot def get_stats(storage) -> Dict: """Adaptation utils to unify the stats counters across storage implementation. """ storage.refresh_stat_counters() stats = storage.stat_counters() keys = [ "content", "directory", "origin", "origin_visit", "release", "revision", "skipped_content", "snapshot", ] return {k: stats.get(k) for k in keys} diff --git a/swh/loader/tests/test_init.py b/swh/loader/tests/test_init.py index ce920c2..0f83780 100644 --- a/swh/loader/tests/test_init.py +++ b/swh/loader/tests/test_init.py @@ -1,506 +1,507 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import subprocess import attr import pytest from swh.loader.tests import ( InconsistentAliasBranchError, InexistentObjectsError, assert_last_visit_matches, check_snapshot, encode_target, prepare_repository_from_archive, ) from swh.model.from_disk import DentryPerms from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Content, Directory, DirectoryEntry, ObjectType, OriginVisit, OriginVisitStatus, Person, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) hash_hex = "43e45d56f88993aae6a0198013efa80716fd8920" ORIGIN_VISIT = OriginVisit( origin="some-url", visit=1, date=datetime.datetime.now(tz=datetime.timezone.utc), type="archive", ) ORIGIN_VISIT_STATUS = OriginVisitStatus( origin="some-url", visit=1, + type="archive", date=datetime.datetime.now(tz=datetime.timezone.utc), status="full", snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), metadata=None, ) CONTENT = Content( data=b"42\n", length=3, sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), sha256=hash_to_bytes( "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a" ), blake2s256=hash_to_bytes( "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d" ), status="visible", ) DIRECTORY = Directory( id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=tuple( [ DirectoryEntry( name=b"foo", type="file", target=CONTENT.sha1_git, perms=DentryPerms.content, ) ] ), ) REVISION = Revision( id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"nicolas@example.com", fullname=b"Nicolas Dandrimont ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset=120, negative_utc=False, ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"stefano@example.com", fullname=b"St\xc3fano Zacchiroli ", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset=0, negative_utc=True, ), parents=(), type=RevisionType.GIT, directory=DIRECTORY.id, metadata={ "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",}, "signed-off-by": "some-dude", }, extra_headers=( (b"gpgsig", b"test123"), (b"mergetag", b"foo\\bar"), (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), ), synthetic=True, ) RELEASE = Release( id=hash_to_bytes("3e9050196aa288264f2a9d279d6abab8b158448b"), name=b"v0.0.2", author=Person( name=b"tony", email=b"tony@ardumont.fr", fullname=b"tony ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1634336813, microseconds=0), offset=0, negative_utc=False, ), target=REVISION.id, target_type=ObjectType.REVISION, message=b"yet another synthetic release", synthetic=True, ) SNAPSHOT = Snapshot( id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"release/0.1.0": SnapshotBranch( target=RELEASE.id, target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch(target=REVISION.id, target_type=TargetType.REVISION,), b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"evaluation": SnapshotBranch( # branch dedicated to not exist in storage target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), }, ) @pytest.fixture def mock_storage(mocker): mock_storage = mocker.patch("swh.loader.tests.origin_get_latest_visit_status") - mock_storage.return_value = ORIGIN_VISIT, ORIGIN_VISIT_STATUS + mock_storage.return_value = ORIGIN_VISIT_STATUS return mock_storage def test_assert_last_visit_matches_raise(mock_storage, mocker): """Not finding origin visit_and_statu should raise """ # overwrite so we raise because we do not find the right visit mock_storage.return_value = None with pytest.raises(AssertionError, match="Origin url has no visits"): assert_last_visit_matches(mock_storage, "url", status="full") assert mock_storage.called is True def test_assert_last_visit_matches_wrong_status(mock_storage, mocker): """Wrong visit detected should raise AssertionError """ expected_status = "partial" assert ORIGIN_VISIT_STATUS.status != expected_status with pytest.raises(AssertionError, match="Visit_status has status"): assert_last_visit_matches(mock_storage, "url", status=expected_status) assert mock_storage.called is True def test_assert_last_visit_matches_wrong_type(mock_storage, mocker): """Wrong visit detected should raise AssertionError """ expected_type = "git" assert ORIGIN_VISIT.type != expected_type with pytest.raises(AssertionError, match="Visit has type"): assert_last_visit_matches( mock_storage, "url", status=ORIGIN_VISIT_STATUS.status, type=expected_type, # mismatched type will raise ) assert mock_storage.called is True def test_assert_last_visit_matches_wrong_snapshot(mock_storage, mocker): """Wrong visit detected should raise AssertionError """ expected_snapshot_id = hash_to_bytes("e92cc0710eb6cf9efd5b920a8453e1e07157b6cd") assert ORIGIN_VISIT_STATUS.snapshot != expected_snapshot_id with pytest.raises(AssertionError, match="Visit_status points to snapshot"): assert_last_visit_matches( mock_storage, "url", status=ORIGIN_VISIT_STATUS.status, snapshot=expected_snapshot_id, # mismatched snapshot will raise ) assert mock_storage.called is True def test_assert_last_visit_matches(mock_storage, mocker): """Correct visit detected should return the visit_status """ visit_type = ORIGIN_VISIT.type visit_status = ORIGIN_VISIT_STATUS.status visit_snapshot = ORIGIN_VISIT_STATUS.snapshot actual_visit_status = assert_last_visit_matches( mock_storage, "url", type=visit_type, status=visit_status, snapshot=visit_snapshot, ) assert actual_visit_status == ORIGIN_VISIT_STATUS assert mock_storage.called is True def test_prepare_repository_from_archive_failure(): # does not deal with inexistent archive so raise assert os.path.exists("unknown-archive") is False with pytest.raises(subprocess.CalledProcessError, match="exit status 2"): prepare_repository_from_archive("unknown-archive") def test_prepare_repository_from_archive(datadir, tmp_path): archive_name = "0805nexter-1.1.0" archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz") assert os.path.exists(archive_path) is True tmp_path = str(tmp_path) # deals with path string repo_url = prepare_repository_from_archive( archive_path, filename=archive_name, tmp_path=tmp_path ) expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name) assert repo_url == f"file://{expected_uncompressed_archive_path}" assert os.path.exists(expected_uncompressed_archive_path) def test_prepare_repository_from_archive_no_filename(datadir, tmp_path): archive_name = "0805nexter-1.1.0" archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz") assert os.path.exists(archive_path) is True # deals with path as posix path (for tmp_path) repo_url = prepare_repository_from_archive(archive_path, tmp_path=tmp_path) tmp_path = str(tmp_path) expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name) expected_repo_url = os.path.join(tmp_path, f"{archive_name}.tar.gz") assert repo_url == f"file://{expected_repo_url}" # passing along the filename does not influence the on-disk extraction # just the repo-url computation assert os.path.exists(expected_uncompressed_archive_path) def test_encode_target(): assert encode_target(None) is None for target_alias in ["something", b"something"]: target = { "target_type": "alias", "target": target_alias, } actual_alias_encode_target = encode_target(target) assert actual_alias_encode_target == { "target_type": "alias", "target": b"something", } for hash_ in [hash_hex, hash_to_bytes(hash_hex)]: target = {"target_type": "revision", "target": hash_} actual_encode_target = encode_target(target) assert actual_encode_target == { "target_type": "revision", "target": hash_to_bytes(hash_hex), } def test_check_snapshot(swh_storage): """Everything should be fine when snapshot is found and the snapshot reference up to the revision exist in the storage. """ # Create a consistent snapshot arborescence tree in storage found = False for entry in DIRECTORY.entries: if entry.target == CONTENT.sha1_git: found = True break assert found is True assert REVISION.directory == DIRECTORY.id assert RELEASE.target == REVISION.id for branch, target in SNAPSHOT.branches.items(): if branch == b"alias": assert target.target in SNAPSHOT.branches elif branch == b"evaluation": # this one does not exist and we are safelisting its check below continue else: assert target.target in [REVISION.id, RELEASE.id] swh_storage.content_add([CONTENT]) swh_storage.directory_add([DIRECTORY]) swh_storage.revision_add([REVISION]) swh_storage.release_add([RELEASE]) s = swh_storage.snapshot_add([SNAPSHOT]) assert s == { "snapshot:add": 1, } # all should be fine! check_snapshot( SNAPSHOT, swh_storage, allowed_empty=[(TargetType.REVISION, b"evaluation")] ) def test_check_snapshot_failures(swh_storage): """Failure scenarios: 0. snapshot parameter is not a snapshot 1. snapshot id is correct but branches mismatched 2. snapshot id is not correct, it's not found in the storage 3. snapshot reference an alias which does not exist 4. snapshot is found in storage, targeted revision does not exist 5. snapshot is found in storage, targeted revision exists but the directory the revision targets does not exist 6. snapshot is found in storage, target revision exists, targeted directory by the revision exist. Content targeted by the directory does not exist. 7. snapshot is found in storage, targeted release does not exist """ snap_id_hex = "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7" snapshot = Snapshot( id=hash_to_bytes(snap_id_hex), branches={ b"master": SnapshotBranch( target=hash_to_bytes(hash_hex), target_type=TargetType.REVISION, ), }, ) s = swh_storage.snapshot_add([snapshot]) assert s == { "snapshot:add": 1, } unexpected_snapshot = Snapshot( branches={ b"tip": SnapshotBranch( # wrong branch target=hash_to_bytes(hash_hex), target_type=TargetType.RELEASE ) }, ) # 0. not a Snapshot object, raise! with pytest.raises(AssertionError, match="variable 'snapshot' must be a snapshot"): check_snapshot(ORIGIN_VISIT, swh_storage) # 1. snapshot id is correct but branches mismatched with pytest.raises(AssertionError): # sadly debian build raises only assertion check_snapshot(attr.evolve(unexpected_snapshot, id=snapshot.id), swh_storage) # 2. snapshot id is not correct, it's not found in the storage wrong_snap_id = hash_to_bytes("999666f535f882bc7f9a18fb16c9ad27fda7bab7") with pytest.raises(AssertionError, match="is not found"): check_snapshot(attr.evolve(unexpected_snapshot, id=wrong_snap_id), swh_storage) # 3. snapshot references an inexistent alias snapshot0 = Snapshot( id=hash_to_bytes("123666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), }, ) swh_storage.snapshot_add([snapshot0]) with pytest.raises(InconsistentAliasBranchError, match="Alias branch HEAD"): check_snapshot(snapshot0, swh_storage) # 4. snapshot is found in storage, targeted revision does not exist rev_not_found = list(swh_storage.revision_missing([REVISION.id])) assert len(rev_not_found) == 1 snapshot1 = Snapshot( id=hash_to_bytes("456666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot1]) with pytest.raises(InexistentObjectsError, match="Branch/Revision"): check_snapshot(snapshot1, swh_storage) # 5. snapshot is found in storage, targeted revision exists but the directory the # revision targets does not exist swh_storage.revision_add([REVISION]) dir_not_found = list(swh_storage.directory_missing([REVISION.directory])) assert len(dir_not_found) == 1 snapshot2 = Snapshot( id=hash_to_bytes("987123f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot2]) with pytest.raises(InexistentObjectsError, match="Missing directories"): check_snapshot(snapshot2, swh_storage) assert DIRECTORY.id == REVISION.directory swh_storage.directory_add([DIRECTORY]) # 6. snapshot is found in storage, target revision exists, targeted directory by the # revision exist. Content targeted by the directory does not exist. assert DIRECTORY.entries[0].target == CONTENT.sha1_git not_found = list(swh_storage.content_missing_per_sha1_git([CONTENT.sha1_git])) assert len(not_found) == 1 swh_storage.directory_add([DIRECTORY]) snapshot3 = Snapshot( id=hash_to_bytes("091456f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot3]) with pytest.raises(InexistentObjectsError, match="Missing content(s)"): check_snapshot(snapshot3, swh_storage) # 7. snapshot is found in storage, targeted release does not exist # release targets the revisions which exists assert RELEASE.target == REVISION.id snapshot4 = Snapshot( id=hash_to_bytes("789666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch(target=b"HEAD", target_type=TargetType.ALIAS,), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), b"release/0.1.0": SnapshotBranch( target=RELEASE.id, target_type=TargetType.RELEASE, ), }, ) swh_storage.snapshot_add([snapshot4]) with pytest.raises(InexistentObjectsError, match="Branch/Release"): check_snapshot(snapshot4, swh_storage)