diff --git a/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1
@@ -0,0 +1 @@
+gnu_8sync_8sync-0.1.0.tar.gz
\ No newline at end of file
diff --git a/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2
@@ -0,0 +1 @@
+gnu_8sync_8sync-0.1.0.tar.gz
\ No newline at end of file
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -97,11 +97,30 @@
     def resolve_revision_from(
         self, known_artifacts: Dict, artifact_metadata: Dict
    ) -> Optional[bytes]:
-
+        logger.debug("#### resolve_revision_from")
         for rev_id, known_artifact in known_artifacts.items():
-            known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
-            if artifact_metadata["integrity"] == known_integrity:
-                return rev_id
+            logger.debug("#### rev_id: %s", hashutil.hash_to_hex(rev_id))
+            logger.debug("#### known_artifact: %s", known_artifact)
+            try:
+                known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
+            except KeyError as e:
+                logger.warning(
+                    "Divergent metadata revision structure detected: %(context)s",
+                    {
+                        "context": {
+                            "revision": hashutil.hash_to_hex(rev_id),
+                            "reason": str(e),
+                            "known_artifact": known_artifact,
+                        }
+                    },
+                )
+                # metadata field for the revision is not as expected by the
+                # nixguix loader. We consider this not the right revision and
+                # continue checking the other revisions
+                continue
+            else:
+                if artifact_metadata["integrity"] == known_integrity:
+                    return rev_id
         return None
 
     def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
diff --git a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org
@@ -0,0 +1 @@
+../../../archive/tests/data/https_ftp.gnu.org
\ No newline at end of file
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    }
+  ],
+  "version": 1,
+  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "url",
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz" ],
+      "integrity": "sha256-4wn2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    }
+  ],
+  "version": 1,
+  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -3,12 +3,19 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import os
+import json
+import logging
+
 import pytest
 
+from json.decoder import JSONDecodeError
 from typing import Dict, Optional, Tuple
 
-from json.decoder import JSONDecodeError
+from unittest.mock import patch
 
+from swh.model.model import Snapshot
+from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.nixguix.loader import (
     NixGuixLoader,
     retrieve_sources,
@@ -17,6 +24,7 @@
 from swh.loader.package.tests.common import get_stats, check_snapshot
 from swh.loader.package.utils import download
 
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
 from swh.storage.exc import HashCollision
 
 sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json"
@@ -391,3 +399,122 @@
     # The visit is partial because some hash collision were detected
     assert origin_visit["status"] == "partial"
     assert origin_visit["type"] == "nixguix"
+
+
+def test_load_nixguix_one_common_artifact_from_other_loader(
+    swh_config, datadir, requests_mock_datadir_visits, caplog
+):
+    """Misformatted revisions should be caught and logged; loading then continues
+
+    """
+    caplog.set_level(logging.WARNING, "swh.loader.package.nixguix.loader")
+
+    # 1. first ingest with, for example, the archive loader
+    gnu_url = "https://ftp.gnu.org/gnu/8sync/"
+    release = "0.1.0"
+    artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
+    gnu_artifacts = [
+        {
+            "time": 944729610,
+            "url": artifact_url,
+            "length": 221837,
+            "filename": f"8sync-{release}.tar.gz",
+            "version": release,
+        }
+    ]
+    archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts)
+    actual_load_status = archive_loader.load()
+    expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5"
+    assert actual_load_status["status"] == "eventful"
+    assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa
+
+    gnu_snapshot = archive_loader.storage.snapshot_get(
+        hash_to_bytes(expected_snapshot_id)
+    )
+
+    first_revision = gnu_snapshot["branches"][f"releases/{release}".encode("utf-8")]
+
+    # 2. Then ingest with the nixguix loader, which lists the same artifact within
+    # its sources.json
+
+    # ensure the test setup is ok
+    data_sources = os.path.join(
+        datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json"
+    )
+    all_sources = json.loads(open(data_sources).read())
+    found = False
+    for source in all_sources["sources"]:
+        if source["urls"][0] == artifact_url:
+            found = True
+    assert (
+        found is True
+    ), f"test setup error: {artifact_url} must be in {data_sources}"
+
+    # first visit with a snapshot, ok
+    sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
+    loader = NixGuixLoader(sources_url)
+    actual_load_status2 = loader.load()
+    assert actual_load_status2["status"] == "eventful"
+
+    snapshot_id = actual_load_status2["snapshot_id"]
+    snapshot = loader.storage.snapshot_get(hash_to_bytes(snapshot_id))
+    snapshot.pop("next_branch")  # snapshot_get endpoint detail to drop
+
+    # simulate a snapshot already seen with a revision with the wrong metadata
+    # structure; that revision should be skipped, so the artifact gets ingested again
+    with patch(
+        "swh.loader.package.loader.PackageLoader.last_snapshot"
+    ) as last_snapshot:
+        # mutate the snapshot to target a revision with the wrong metadata structure
+        # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision
+        old_revision = next(loader.storage.revision_get([first_revision["target"]]))
+        # assert that revision is not in the right format
+        assert old_revision["metadata"]["extrinsic"]["raw"].get("integrity", {}) == {}
+
+        # mutate snapshot to create a clash
+        snapshot["branches"][artifact_url.encode("utf-8")] = {
+            "target_type": "revision",
+            "target": old_revision["id"],
+        }
+
+        # modify snapshot to actually change the revision metadata structure so we
+        # simulate a revision written by somebody else (different structure)
+        last_snapshot.return_value = Snapshot.from_dict(snapshot)
+
+        loader = NixGuixLoader(sources_url)
+        actual_load_status3 = loader.load()
+        assert last_snapshot.called
+        assert actual_load_status3["status"] == "eventful"
+
+        new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965"
+        assert actual_load_status3["snapshot_id"] == new_snapshot_id
+
+        last_snapshot = loader.storage.snapshot_get(hash_to_bytes(new_snapshot_id))
+        new_revision_branch = last_snapshot["branches"][artifact_url.encode("utf-8")]
+        assert new_revision_branch["target_type"] == "revision"
+
+        new_revision = next(
+            loader.storage.revision_get([new_revision_branch["target"]])
+        )
+
+        # the new revision has the correct structure, so it was ingested correctly
+        # by the new run
+        assert new_revision["metadata"]["extrinsic"]["raw"]["integrity"] is not None
+
+        nb_detections = 0
+        actual_detection: Dict
+        for record in caplog.records:
+            logtext = record.getMessage()
+            if "Divergent metadata revision structure detected:" in logtext:
+                nb_detections += 1
+                actual_detection = record.args["context"]
+
+        assert actual_detection
+        # as many detections as there are sources listed in the sources.json
+        assert nb_detections == len(all_sources["sources"])
+
+        assert actual_detection == {
+            "revision": hash_to_hex(old_revision["id"]),
+            "reason": "'integrity'",
+            "known_artifact": old_revision["metadata"],
+        }
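
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of the case the new try/except in resolve_revision_from guards against. The nixguix loader looks up metadata["extrinsic"]["raw"]["integrity"] on known revisions, but a revision previously written for the same artifact by another loader (the archive loader in the new test) has no such key, so the lookup must not crash. Only the presence or absence of the "integrity" key is taken from the patch (the test asserts old_revision["metadata"]["extrinsic"]["raw"].get("integrity", {}) == {}); the other dict keys are illustrative assumptions, not the loaders' exact output.

# Illustrative sketch only: key names other than "extrinsic"/"raw"/"integrity"
# are assumptions for the example, not the loaders' exact metadata.
from typing import Any, Dict, Optional

# Shape the nixguix loader writes itself: "integrity" is present.
nixguix_metadata: Dict[str, Any] = {
    "extrinsic": {
        "raw": {
            "urls": ["https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz"],
            "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=",
        }
    }
}

# Hypothetical shape for the same artifact written by another loader
# (e.g. the archive loader): no "integrity" key under extrinsic/raw.
other_loader_metadata: Dict[str, Any] = {
    "extrinsic": {
        "raw": {
            "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz",
            "version": "0.1.0",
        }
    }
}


def known_integrity(revision_metadata: Dict[str, Any]) -> Optional[str]:
    """Mimic the patched lookup: return the integrity value when the metadata
    has the structure the nixguix loader expects, None for divergent shapes."""
    try:
        return revision_metadata["extrinsic"]["raw"]["integrity"]
    except KeyError:
        # divergent structure: the caller skips this revision instead of crashing
        return None


assert known_integrity(nixguix_metadata) is not None
assert known_integrity(other_loader_metadata) is None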