diff --git a/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1
@@ -0,0 +1 @@
+gnu_8sync_8sync-0.1.0.tar.gz
\ No newline at end of file
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -97,10 +97,21 @@
     def resolve_revision_from(
         self, known_artifacts: Dict, artifact_metadata: Dict
     ) -> Optional[bytes]:
         for rev_id, known_artifact in known_artifacts.items():
-            known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
-            if artifact_metadata["integrity"] == known_integrity:
-                return rev_id
+            try:
+                known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
+            except KeyError as e:
+                logger.warning(
+                    "Divergent metadata revision structure detected, skipping. "
+                    f"Reason: {e}"
+                )
+                # metadata field for the revision is not as expected by the loader
+                # nixguix. We consider this not the right revision and continue checking
+                # the other revisions
+                continue
+            else:
+                if artifact_metadata["integrity"] == known_integrity:
+                    return rev_id
         return None
 
     def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
diff --git a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org
@@ -0,0 +1 @@
+../../../archive/tests/data/https_ftp.gnu.org
\ No newline at end of file
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1
--- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1
@@ -2,18 +2,13 @@
   "sources": [
     {
       "type": "url",
-      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
       "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
     },
     {
       "type": "url",
-      "urls": [ "https://github.com/owner-2/repository-1/revision-1.tgz" ],
-      "integrity": "sha256-+vRlzTcnhMlynJGGMuAgMnUGdjpSqGabhcQ/SlRplAE="
-    },
-    {
-      "type": "url",
-      "urls": [ "https://example.com/file.txt" ],
-      "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
     }
   ],
   "version": 1,
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    }
+  ],
+  "version": 1,
+  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1
@@ -0,0 +1,16 @@
+{
+  "sources": [
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    }
+  ],
+  "version": 1,
+  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -3,12 +3,17 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import os
+import json
 import pytest
 
+from json.decoder import JSONDecodeError
 from typing import Dict, Optional, Tuple
 
-from json.decoder import JSONDecodeError
+from unittest.mock import patch
 
+from swh.model.model import Snapshot
+from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.nixguix.loader import (
     NixGuixLoader,
     retrieve_sources,
@@ -17,6 +22,7 @@
 
 from swh.loader.package.tests.common import get_stats, check_snapshot
 from swh.loader.package.utils import download
+from swh.model.hashutil import hash_to_bytes
 from swh.storage.exc import HashCollision
 
 sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json"
@@ -391,3 +397,98 @@
     # The visit is partial because some hash collision were detected
     assert origin_visit["status"] == "partial"
     assert origin_visit["type"] == "nixguix"
+
+
+def test_load_nixguix_one_common_artifact_from_other_loader(
+    swh_config, datadir, requests_mock_datadir_visits
+):
+    """Revisions written by another loader are skipped; the artifact is ingested again
+
+    """
+    # first ingest with another loader (here, the archive loader)
+    gnu_url = "https://ftp.gnu.org/gnu/8sync/"
+    release = "0.1.0"
+    artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
+    gnu_artifacts = [
+        {
+            "time": 944729610,
+            "url": artifact_url,
+            "length": 221837,
+            "filename": f"8sync-{release}.tar.gz",
+            "version": release,
+        }
+    ]
+    archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts)
+    actual_load_status = archive_loader.load()
+    expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5"
+    assert actual_load_status["status"] == "eventful"
+    assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa
+
+    gnu_snapshot = archive_loader.storage.snapshot_get(
+        hash_to_bytes(expected_snapshot_id)
+    )
+
+    first_revision = gnu_snapshot["branches"][f"releases/{release}".encode("utf-8")]
+
+    # ensure setup is ok
+    data_sources = os.path.join(
+        datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json"
+    )
+
+    with open(data_sources) as f:
+        all_sources = json.load(f)
+    found = any(
+        source["urls"][0] == artifact_url for source in all_sources["sources"]
+    )
+    assert found, f"test setup error: {artifact_url} must be in {data_sources}"
+
+    # Then ingest with the nixguix loader which lists the same artifact within its
+    # sources.json
+    # first visit with a snapshot, ok
+    sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
+    loader = NixGuixLoader(sources_url)
+    actual_load_status2 = loader.load()
+    assert actual_load_status2["status"] == "eventful"
+
+    snapshot_id = actual_load_status2["snapshot_id"]
+
+    snapshot = loader.storage.snapshot_get(hash_to_bytes(snapshot_id))
+
+    # simulate a snapshot already seen with a revision with the wrong metadata structure
+    # This revision should be skipped, so the artifact is ingested again.
+    with patch(
+        "swh.loader.package.loader.PackageLoader.last_snapshot"
+    ) as last_snapshot:
+        # mutate the snapshot to target a revision with the wrong metadata structure
+        snapshot["branches"][artifact_url.encode("utf-8")] = first_revision
+        old_revision = next(loader.storage.revision_get([first_revision["target"]]))
+        # assert that revision is not in the right format
+        assert (
+            old_revision["metadata"]
+            .get("extrinsic", {})
+            .get("raw", {})
+            .get("integrity", {})
+            == {}
+        )
+
+        # modify snapshot to actually change revision metadata structure
+        # so we simulate a revision written by somebody else
+        snapshot.pop("next_branch")  # snapshot_get endpoint detail to drop
+        last_snapshot.return_value = Snapshot.from_dict(snapshot)
+
+        loader = NixGuixLoader(sources_url)
+        actual_load_status3 = loader.load()
+        # assert actual_load_status3["status"] == "eventful"
+
+        new_snapshot_id = "ea50a9c3d721a125c39150a9eb0003c82ee2fd23"
+        assert actual_load_status3["snapshot_id"] == new_snapshot_id
+
+        new_snapshot = loader.storage.snapshot_get(hash_to_bytes(new_snapshot_id))
+        new_revision_branch = new_snapshot["branches"][artifact_url.encode("utf-8")]
+        assert new_revision_branch["target_type"] == "revision"
+
+        new_revision = next(
+            loader.storage.revision_get([new_revision_branch["target"]])
+        )
+        # the new revision has the correct structure
+        assert new_revision["metadata"]["extrinsic"]["raw"]["integrity"] is not None