diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py index fed8fd8..621d5d0 100644 --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -1,332 +1,333 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import logging import requests from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union import attr from swh.model.hashutil import hash_to_hex, hash_to_bytes from swh.model.model import ( Person, Revision, RevisionType, TimestampWithTimezone, Sha1Git, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, ) from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import cached_method, download +from swh.storage.algos.snapshot import snapshot_get_all_branches logger = logging.getLogger(__name__) @attr.s class DepositPackageInfo(BasePackageInfo): filename = attr.ib(type=str) # instead of Optional[str] raw_info = attr.ib(type=Dict[str, Any]) author_date = attr.ib(type=datetime.datetime) """codemeta:dateCreated if any, deposit completed_date otherwise""" commit_date = attr.ib(type=datetime.datetime) """codemeta:datePublished if any, deposit completed_date otherwise""" client = attr.ib(type=str) id = attr.ib(type=int) """Internal ID of the deposit in the deposit DB""" collection = attr.ib(type=str) """The collection in the deposit; see SWORD specification.""" author = attr.ib(type=Person) committer = attr.ib(type=Person) revision_parents = attr.ib(type=Tuple[Sha1Git, ...]) """Revisions created from previous deposits, that will be used as parents of the revision created for this deposit.""" @classmethod def from_metadata( cls, metadata: Dict[str, Any], url: str, filename: str ) -> "DepositPackageInfo": # Note: # `date` and `committer_date` are always transmitted by the deposit read api # which computes itself the values. The loader needs to use those to create the # revision. raw_metadata_from_origin = json.dumps( metadata["origin_metadata"]["metadata"] ).encode() metadata = metadata.copy() # FIXME: this removes information from 'raw' metadata depo = metadata.pop("deposit") return cls( url=url, filename=filename, author_date=depo["author_date"], commit_date=depo["committer_date"], client=depo["client"], id=depo["id"], collection=depo["collection"], author=parse_author(depo["author"]), committer=parse_author(depo["committer"]), revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), raw_info=metadata, revision_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="sword-v2-atom-codemeta-v2-in-json", metadata=raw_metadata_from_origin, ), ], ) class DepositLoader(PackageLoader[DepositPackageInfo]): """Load pypi origin's artifact releases into swh archive. """ visit_type = "deposit" def __init__(self, url: str, deposit_id: str): """Constructor Args: url: Origin url to associate the artifacts/metadata to deposit_id: Deposit identity """ super().__init__(url=url) config_deposit = self.config["deposit"] self.deposit_id = deposit_id self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot # branch return ["HEAD"] def get_metadata_authority(self) -> MetadataAuthority: provider = self.metadata()["origin_metadata"]["provider"] assert provider["provider_type"] == "deposit_client" return MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider["provider_url"], metadata={ "name": provider["provider_name"], **(provider["metadata"] or {}), }, ) def get_metadata_fetcher(self) -> MetadataFetcher: tool = self.metadata()["origin_metadata"]["tool"] return MetadataFetcher( name=tool["name"], version=tool["version"], metadata=tool["configuration"], ) def get_package_info( self, version: str ) -> Iterator[Tuple[str, DepositPackageInfo]]: p_info = DepositPackageInfo.from_metadata( self.metadata(), url=self.url, filename="archive.zip", ) yield "HEAD", p_info def download_package( self, p_info: DepositPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Override to allow use of the dedicated deposit client """ return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)] def build_revision( self, p_info: DepositPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: message = ( f"{p_info.client}: Deposit {p_info.id} in collection {p_info.collection}" ).encode("utf-8") return Revision( type=RevisionType.TAR, message=message, author=p_info.author, date=TimestampWithTimezone.from_dict(p_info.author_date), committer=p_info.committer, committer_date=TimestampWithTimezone.from_dict(p_info.commit_date), parents=p_info.revision_parents, directory=directory, synthetic=True, metadata={ "extrinsic": { "provider": self.client.metadata_url(self.deposit_id), "when": self.visit_date.isoformat(), "raw": p_info.raw_info, }, }, ) def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: origin_metadata = self.metadata()["origin_metadata"] return [ RawExtrinsicMetadataCore( format="sword-v2-atom-codemeta-v2-in-json", metadata=json.dumps(origin_metadata["metadata"]).encode(), ) ] @cached_method def metadata(self): """Returns metadata from the deposit server""" return self.client.metadata_get(self.deposit_id) def load(self) -> Dict: # First making sure the deposit is known prior to trigger a loading try: self.metadata() except ValueError: logger.error(f"Unknown deposit {self.deposit_id}, ignoring") return {"status": "failed"} # Then usual loading r = super().load() success = r["status"] != "failed" # Update deposit status try: if not success: self.client.status_update(self.deposit_id, status="failed") return r snapshot_id = hash_to_bytes(r["snapshot_id"]) - snapshot = self.storage.snapshot_get(snapshot_id) + snapshot = snapshot_get_all_branches(self.storage, snapshot_id) if not snapshot: return r - branches = snapshot["branches"] + branches = snapshot.branches logger.debug("branches: %s", branches) if not branches: return r - rev_id = branches[b"HEAD"]["target"] + rev_id = branches[b"HEAD"].target revisions = list(self.storage.revision_get([rev_id])) if not revisions: return r revision = revisions[0] if not revision: return r # Retrieve the revision identifier dir_id = revision["directory"] # update the deposit's status to success with its # revision-id and directory-id self.client.status_update( self.deposit_id, status="done", revision_id=hash_to_hex(rev_id), directory_id=hash_to_hex(dir_id), snapshot_id=r["snapshot_id"], origin_url=self.url, ) except Exception: logger.exception("Problem when trying to update the deposit's status") return {"status": "failed"} return r def parse_author(author) -> Person: """See prior fixme """ return Person( fullname=author["fullname"].encode("utf-8"), name=author["name"].encode("utf-8"), email=author["email"].encode("utf-8"), ) class ApiClient: """Private Deposit Api client """ def __init__(self, url, auth: Optional[Mapping[str, str]]): self.base_url = url.rstrip("/") self.auth = None if not auth else (auth["username"], auth["password"]) def do(self, method: str, url: str, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in get/post/put Returns: The request's execution output """ method_fn = getattr(requests, method) if self.auth: kwargs["auth"] = self.auth return method_fn(url, *args, **kwargs) def archive_get( self, deposit_id: Union[int, str], tmpdir: str, filename: str ) -> Tuple[str, Dict]: """Retrieve deposit's archive artifact locally """ url = f"{self.base_url}/{deposit_id}/raw/" return download(url, dest=tmpdir, filename=filename, auth=self.auth) def metadata_url(self, deposit_id: Union[int, str]) -> str: return f"{self.base_url}/{deposit_id}/meta/" def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]: """Retrieve deposit's metadata artifact as json """ url = self.metadata_url(deposit_id) r = self.do("get", url) if r.ok: return r.json() msg = f"Problem when retrieving deposit metadata at {url}" logger.error(msg) raise ValueError(msg) def status_update( self, deposit_id: Union[int, str], status: str, revision_id: Optional[str] = None, directory_id: Optional[str] = None, snapshot_id: Optional[str] = None, origin_url: Optional[str] = None, ): """Update deposit's information including status, and persistent identifiers result of the loading. """ url = f"{self.base_url}/{deposit_id}/update/" payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if snapshot_id: payload["snapshot_id"] = snapshot_id if origin_url: payload["origin_url"] = origin_url self.do("put", url, json=payload) diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py index 9fb9d29..7a9e9a7 100644 --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -1,673 +1,681 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import json from json.decoder import JSONDecodeError import logging from typing import Dict, Optional, Tuple from unittest.mock import patch +import attr import pytest from swh.loader.package.archive.loader import ArchiveLoader from swh.loader.package.nixguix.loader import ( NixGuixPackageInfo, NixGuixLoader, parse_sources, retrieve_sources, clean_sources, make_pattern_unsupported_file_extension, ) from swh.loader.package.utils import download from swh.model.identifiers import SWHID from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, Snapshot, SnapshotBranch, TargetType, ) from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.storage.exc import HashCollision from swh.storage.algos.origin import origin_get_latest_visit_status +from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.interface import PagedResult, StorageInterface from swh.loader.package import __version__ from swh.loader.tests import ( assert_last_visit_matches, get_stats, check_snapshot as check_snapshot_full, ) sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json" @pytest.fixture def raw_sources(datadir) -> bytes: with open( os.path.join( datadir, "https_nix-community.github.io", "nixpkgs-swh_sources.json" ), "rb", ) as f: return f.read() SNAPSHOT1 = Snapshot( id=hash_to_bytes("0c5881c74283793ebe9a09a105a9381e41380383"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("488ad4e7b8e2511258725063cf43a2b897c503b4"), target_type=TargetType.REVISION, ), }, ) def check_snapshot(snapshot: Snapshot, storage: StorageInterface): # The `evaluation` branch is allowed to be unresolvable. It's possible at current # nixguix visit time, it is not yet visited (the git loader is in charge of its # visit for now). For more details, check the # swh.loader.package.nixguix.NixGuixLoader.extra_branches docstring. check_snapshot_full( snapshot, storage, allowed_empty=[(TargetType.REVISION, b"evaluation")] ) assert isinstance(snapshot, Snapshot) # then ensure the snapshot revisions are structurally as expected revision_ids = [] for name, branch in snapshot.branches.items(): if name == b"evaluation": continue # skipping that particular branch (cf. previous comment) if branch.target_type == TargetType.REVISION: revision_ids.append(branch.target) revisions = storage.revision_get(revision_ids) for rev in revisions: assert rev is not None metadata = rev["metadata"] raw = metadata["extrinsic"]["raw"] assert "url" in raw assert "integrity" in raw def test_retrieve_sources(swh_config, requests_mock_datadir): j = parse_sources(retrieve_sources(sources_url)) assert "sources" in j.keys() assert len(j["sources"]) == 2 def test_retrieve_non_existing(swh_config, requests_mock_datadir): with pytest.raises(ValueError): NixGuixLoader("https://non-existing-url") def test_retrieve_non_json(swh_config, requests_mock_datadir): with pytest.raises(JSONDecodeError): NixGuixLoader("https://example.com/file.txt") def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir): sources = {} with pytest.raises(ValueError, match="sources structure invalid, missing: .*"): clean_sources(sources) def test_clean_sources_invalid_version(swh_config, requests_mock_datadir): for version_ok in [1, "1"]: # Check those versions are fine clean_sources({"version": version_ok, "sources": [], "revision": "my-revision"}) for version_ko in [0, "0", 2, "2"]: # Check version != 1 raise an error with pytest.raises( ValueError, match="sources structure version .* is not supported" ): clean_sources( {"version": version_ko, "sources": [], "revision": "my-revision"} ) def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir): valid_sources = [ # 1 valid source {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"}, ] sources = { "version": 1, "sources": valid_sources + [ # integrity is missing {"type": "url", "urls": ["my-url.tgz"],}, # urls is not a list {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"}, # type is not url {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"}, # missing fields which got double-checked nonetheless... {"integrity": "my-integrity"}, ], "revision": "my-revision", } clean = clean_sources(sources) assert len(clean["sources"]) == len(valid_sources) def test_make_pattern_unsupported_file_extension(): unsupported_extensions = ["el", "c", "txt"] supported_extensions = ["Z", "7z"] # for test actual_unsupported_pattern = make_pattern_unsupported_file_extension( unsupported_extensions ) for supported_ext in supported_extensions: assert supported_ext not in unsupported_extensions supported_filepath = f"anything.{supported_ext}" actual_match = actual_unsupported_pattern.match(supported_filepath) assert not actual_match for unsupported_ext in unsupported_extensions: unsupported_filepath = f"something.{unsupported_ext}" actual_match = actual_unsupported_pattern.match(unsupported_filepath) assert actual_match def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir): unsupported_file_extensions = [ "iso", "whl", "gem", "pom", "msi", "pod", "png", "rock", "ttf", "jar", "c", "el", "rpm", "diff", "patch", ] supported_sources = [ { "type": "url", "urls": [f"https://server.org/my-url.{ext}"], "integrity": "my-integrity", } for ext in [ "known-unknown-but-ok", # this is fine as well with the current approach "zip", "tar.gz", "tgz", "tar.bz2", "tbz", "tbz2", "tar.xz", "tar", "zip", "7z", "Z", ] ] unsupported_sources = [ { "type": "url", "urls": [f"https://server.org/my-url.{ext}"], "integrity": "my-integrity", } for ext in unsupported_file_extensions ] sources = { "version": 1, "sources": supported_sources + unsupported_sources, "revision": "my-revision", } clean = clean_sources(sources, unsupported_file_extensions) assert len(clean["sources"]) == len(supported_sources) def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources): loader = NixGuixLoader(sources_url) res = loader.load() assert res["status"] == "eventful" stats = get_stats(loader.storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats # The visit is partial because urls pointing to non tarball file # are not handled yet assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix" ) (_, visit_status) = origin_get_latest_visit_status(loader.storage, sources_url) snapshot_swhid = SWHID( object_type="snapshot", object_id=hash_to_hex(visit_status.snapshot) ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url=sources_url, ) expected_metadata = [ RawExtrinsicMetadata( type=MetadataTargetType.SNAPSHOT, id=snapshot_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.nixguix.loader.NixGuixLoader", version=__version__, ), discovery_date=loader.visit_date, format="nixguix-sources-json", metadata=raw_sources, origin=sources_url, ) ] assert loader.storage.raw_extrinsic_metadata_get( type=MetadataTargetType.SNAPSHOT, id=snapshot_swhid, authority=metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) def test_uncompress_failure(swh_config, requests_mock_datadir): """Non tarball files are currently not supported and the uncompress function fails on such kind of files. However, even in this case of failure (because of the url https://example.com/file.txt), a snapshot and a visit has to be created (with a status partial since all files are not archived). """ loader = NixGuixLoader(sources_url) loader_status = loader.load() urls = [s["urls"][0] for s in loader.sources] assert "https://example.com/file.txt" in urls assert loader_status["status"] == "eventful" # The visit is partial because urls pointing to non tarball files # are not handled yet assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix" ) def test_loader_incremental(swh_config, requests_mock_datadir): """Ensure a second visit do not download artifact already downloaded by the previous visit. """ loader = NixGuixLoader(sources_url) load_status = loader.load() loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=loader.storage) urls = [ m.url for m in requests_mock_datadir.request_history if m.url == ("https://github.com/owner-1/repository-1/revision-1.tgz") ] # The artifact # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only # visited one time assert len(urls) == 1 def test_loader_two_visits(swh_config, requests_mock_datadir_visits): """To ensure there is only one origin, but two visits, two revisions and two snapshots are created. The first visit creates a snapshot containing one tarball. The second visit creates a snapshot containing the same tarball and another tarball. """ loader = NixGuixLoader(sources_url) load_status = loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=loader.storage) stats = get_stats(loader.storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats loader = NixGuixLoader(sources_url) load_status = loader.load() expected_snapshot_id_hex = "b0bfa75cbd0cc90aac3b9e95fb0f59c731176d97" expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) assert load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id_hex, } assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=expected_snapshot_id, ) # This ensures visits are incremental. Indeed, if we request a # second time an url, because of the requests_mock_datadir_visits # fixture, the file has to end with `_visit1`. expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("602140776b2ce6c9159bcf52ada73a297c063d5e"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("488ad4e7b8e2511258725063cf43a2b897c503b4"), target_type=TargetType.REVISION, ), b"https://github.com/owner-2/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("85e0bad74e33e390aaeb74f139853ae3863ee544"), target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) stats = get_stats(loader.storage) assert { "content": 2, "directory": 5, "origin": 1, "origin_visit": 2, "person": 1, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 2, } == stats def test_resolve_revision_from(swh_config, requests_mock_datadir, datadir): loader = NixGuixLoader(sources_url) known_artifacts = { "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}}, "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}}, } p_info = NixGuixPackageInfo.from_metadata( {"url": "url1", "integrity": "integrity1"} ) assert loader.resolve_revision_from(known_artifacts, p_info) == "id1" p_info = NixGuixPackageInfo.from_metadata( {"url": "url3", "integrity": "integrity3"} ) assert loader.resolve_revision_from(known_artifacts, p_info) == None # noqa def test_evaluation_branch(swh_config, requests_mock_datadir): loader = NixGuixLoader(sources_url) res = loader.load() assert res["status"] == "eventful" assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=loader.storage) def test_eoferror(swh_config, requests_mock_datadir): """Load a truncated archive which is invalid to make the uncompress function raising the exception EOFError. We then check if a snapshot is created, meaning this error is well managed. """ sources = ( "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json" # noqa ) loader = NixGuixLoader(sources) loader.load() expected_snapshot = Snapshot( id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) def fake_download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, ) -> Tuple[str, Dict]: """Fake download which raises HashCollision (for the sake of test simpliciy, let's accept that makes sense) For tests purpose only. """ if url == "https://example.com/file.txt": # instead of failing because it's a file not dealt with by the nix guix # loader, make it raise a hash collision raise HashCollision("sha1", "f92d74e3874587aaf443d1db961d4e26dde13e9c", []) return download(url, dest, hashes, filename, auth) def test_raise_exception(swh_config, requests_mock_datadir, mocker): mock_download = mocker.patch("swh.loader.package.loader.download") mock_download.side_effect = fake_download loader = NixGuixLoader(sources_url) res = loader.load() assert res == { "status": "eventful", "snapshot_id": SNAPSHOT1.id.hex(), } check_snapshot(SNAPSHOT1, storage=loader.storage) assert len(mock_download.mock_calls) == 2 # The visit is partial because some artifact downloads failed assert_last_visit_matches( loader.storage, sources_url, status="partial", type="nixguix" ) def test_load_nixguix_one_common_artifact_from_other_loader( swh_config, datadir, requests_mock_datadir_visits, caplog ): """Misformatted revision should be caught and logged, then loading continues """ caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader") # 1. first ingest with for example the archive loader gnu_url = "https://ftp.gnu.org/gnu/8sync/" release = "0.1.0" artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz" gnu_artifacts = [ { "time": 944729610, "url": artifact_url, "length": 221837, "filename": f"8sync-{release}.tar.gz", "version": release, } ] archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts) actual_load_status = archive_loader.load() expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5" assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] == expected_snapshot_id # noqa assert_last_visit_matches( archive_loader.storage, gnu_url, status="full", type="tar" ) - gnu_snapshot = archive_loader.storage.snapshot_get( - hash_to_bytes(expected_snapshot_id) + gnu_snapshot: Snapshot = snapshot_get_all_branches( + archive_loader.storage, hash_to_bytes(expected_snapshot_id) ) - first_revision = gnu_snapshot["branches"][f"releases/{release}".encode("utf-8")] + first_revision = gnu_snapshot.branches[f"releases/{release}".encode("utf-8")] # 2. Then ingest with the nixguix loader which lists the same artifact within its # sources.json # ensure test setup is ok data_sources = os.path.join( datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json" ) all_sources = json.loads(open(data_sources).read()) found = False for source in all_sources["sources"]: if source["urls"][0] == artifact_url: found = True assert ( found is True ), f"test setup error: {artifact_url} must be in {data_sources}" # first visit with a snapshot, ok sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json" loader = NixGuixLoader(sources_url) actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" assert_last_visit_matches( loader.storage, sources_url, status="full", type="nixguix" ) snapshot_id = actual_load_status2["snapshot_id"] - snapshot = loader.storage.snapshot_get(hash_to_bytes(snapshot_id)) - snapshot.pop("next_branch") # snapshot_get endpoint detail to drop + snapshot = snapshot_get_all_branches(loader.storage, hash_to_bytes(snapshot_id)) + assert snapshot # simulate a snapshot already seen with a revision with the wrong metadata structure # This revision should be skipped, thus making the artifact being ingested again. with patch( "swh.loader.package.loader.PackageLoader.last_snapshot" ) as last_snapshot: # mutate the snapshot to target a revision with the wrong metadata structure # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision - old_revision = next(loader.storage.revision_get([first_revision["target"]])) + old_revision = next(loader.storage.revision_get([first_revision.target])) # assert that revision is not in the right format assert old_revision["metadata"]["extrinsic"]["raw"].get("integrity", {}) == {} # mutate snapshot to create a clash - snapshot["branches"][artifact_url.encode("utf-8")] = { - "target_type": "revision", - "target": hash_to_bytes(old_revision["id"]), - } + snapshot = attr.evolve( + snapshot, + branches={ + **snapshot.branches, + artifact_url.encode("utf-8"): SnapshotBranch( + target_type=TargetType.REVISION, + target=hash_to_bytes(old_revision["id"]), + ), + }, + ) # modify snapshot to actually change revision metadata structure so we simulate # a revision written by somebody else (structure different) - last_snapshot.return_value = Snapshot.from_dict(snapshot) + last_snapshot.return_value = snapshot loader = NixGuixLoader(sources_url) actual_load_status3 = loader.load() assert last_snapshot.called assert actual_load_status3["status"] == "eventful" assert_last_visit_matches( loader.storage, sources_url, status="full", type="nixguix" ) new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965" assert actual_load_status3["snapshot_id"] == new_snapshot_id - last_snapshot = loader.storage.snapshot_get(hash_to_bytes(new_snapshot_id)) - new_revision_branch = last_snapshot["branches"][artifact_url.encode("utf-8")] - assert new_revision_branch["target_type"] == "revision" - - new_revision = next( - loader.storage.revision_get([new_revision_branch["target"]]) + last_snapshot = snapshot_get_all_branches( + loader.storage, hash_to_bytes(new_snapshot_id) ) + new_revision_branch = last_snapshot.branches[artifact_url.encode("utf-8")] + assert new_revision_branch.target_type == TargetType.REVISION + + new_revision = next(loader.storage.revision_get([new_revision_branch.target])) # the new revision has the correct structure, so it got ingested alright by the # new run assert new_revision["metadata"]["extrinsic"]["raw"]["integrity"] is not None nb_detections = 0 actual_detection: Dict for record in caplog.records: logtext = record.getMessage() if "Unexpected metadata revision structure detected:" in logtext: nb_detections += 1 actual_detection = record.args["context"] assert actual_detection # as many calls as there are sources listed in the sources.json assert nb_detections == len(all_sources["sources"]) assert actual_detection == { "revision": hash_to_hex(old_revision["id"]), "reason": "'integrity'", "known_artifact": old_revision["metadata"], }