diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -750,9 +750,32 @@
             )
             try:
                 # FIXME: Ensure no "nar" computations is required for file
-                assert self.checksums_computation == "standard"
                 with tempfile.TemporaryDirectory() as tmpdir:
-                    file_path, _ = download(url, dest=tmpdir, hashes=self.checksums)
+                    file_path, _ = download(
+                        url, dest=tmpdir, hashes=self.standard_hashes
+                    )
+                    if self.checksums_computation == "nar":
+                        # hashes are not "standard", so an extra nar check is needed
+                        self.log.debug("Content to check nar hashes: %s", file_path)
+                        actual_checksums = nix_hashes(
+                            Path(file_path), self.checksums.keys()
+                        ).hexdigest()
+
+                        if actual_checksums != self.checksums:
+                            errors.append(
+                                ValueError(
+                                    f"Checksum mismatched on <{url}>: "
+                                    f"{actual_checksums} != {self.checksums}"
+                                )
+                            )
+                            self.log.debug(
+                                "Mismatched checksums <%s>: continue on next mirror "
+                                "url if any",
+                                url,
+                            )
+                            continue
+
+                    # FIXME: Content.from_dict should work?
                     with open(file_path, "rb") as file:
                         self.content = Content.from_data(file.read())
             except ValueError as e:
diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py
--- a/swh/loader/core/tests/conftest.py
+++ b/swh/loader/core/tests/conftest.py
@@ -4,11 +4,15 @@
 # See top-level LICENSE file for more information
 
 from os import path
+from pathlib import Path
 import shutil
+import tempfile
 from typing import Dict, List, Union
 
 import pytest
 
+from swh.core.tarball import uncompress
+from swh.loader.core.utils import nix_hashes
 from swh.model.hashutil import MultiHash
 
 nix_store_missing = shutil.which("nix-store") is None
@@ -20,6 +24,14 @@
     return path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz")
 
 
+@pytest.fixture
+def content_path(datadir):
+    """Return filepath fetched by ContentLoader test runs."""
+    return path.join(
+        datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
+    )
+
+
 def compute_hashes(
     filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
 ) -> Dict[str, str]:
@@ -28,6 +40,30 @@
     return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest()
 
 
+def compute_nar_hashes(
+    filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
+) -> Dict[str, str]:
+    """Compute nar checksums dict out of a filepath (tarball or plain file).
+
+    If it's a tarball, this uncompresses it in a temporary directory to compute the nix
+    hashes.
+
+    """
+    path = Path(filepath)
+    checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if "".join(path.suffixes) == ".tar.gz":  # tarball
+            directory_path = Path(tmpdir)
+            directory_path.mkdir(parents=True, exist_ok=True)
+            uncompress(str(path), dest=str(directory_path))
+            path_on_disk = next(directory_path.iterdir())
+        else:
+            path_on_disk = path
+
+        hashes = nix_hashes(path_on_disk, hash_names=checksum_algos).hexdigest()
+    return hashes
+
+
 @pytest.fixture
 def tarball_with_std_hashes(tarball_path):
     return (
@@ -38,8 +74,16 @@
 
 @pytest.fixture
 def tarball_with_nar_hashes(tarball_path):
-    # FIXME: compute it instead of hard-coding it
-    return (
-        tarball_path,
-        {"sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"},
+    nar_hashes = compute_nar_hashes(tarball_path, "sha256")
+    # Ensure it matches the hash initially computed via the cli
+    assert (
+        nar_hashes["sha256"]
+        == "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"
     )
+    return (tarball_path, nar_hashes)
+
+
+@pytest.fixture
+def content_with_nar_hashes(content_path):
+    nar_hashes = compute_nar_hashes(content_path, "sha256")
+    return (content_path, nar_hashes)
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -6,7 +6,6 @@
 import datetime
 import hashlib
 import logging
-import os
 import time
 from unittest.mock import MagicMock, call
 
@@ -34,7 +33,7 @@
 )
 import swh.storage.exc
 
-from .conftest import compute_hashes, nix_store_missing
+from .conftest import compute_hashes, compute_nar_hashes, nix_store_missing
 
 ORIGIN = Origin(url="some-url")
 PARENT_ORIGIN = Origin(url="base-origin-url")
@@ -517,14 +516,6 @@
 CONTENT_URL = f"{CONTENT_MIRROR}/project/asdf/archives/asdf-3.3.5.lisp"
 
 
-@pytest.fixture
-def content_path(datadir):
-    """Return filepath fetched by ContentLoader test runs."""
-    return os.path.join(
-        datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
-    )
-
-
 def test_content_loader_missing_field(swh_storage):
     """It should raise if the ContentLoader is missing checksums field"""
     origin = Origin(CONTENT_URL)
@@ -616,13 +607,22 @@
     assert result == {"status": "eventful"}
 
 
-def test_content_loader_ok_simple(swh_storage, requests_mock_datadir, content_path):
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
+def test_content_loader_ok_simple(
+    swh_storage, requests_mock_datadir, content_path, checksums_computation
+):
     """It should be an eventful visit on a new file, then uneventful"""
+    compute_hashes_fn = (
+        compute_nar_hashes if checksums_computation == "nar" else compute_hashes
+    )
+
     origin = Origin(CONTENT_URL)
     loader = ContentLoader(
         swh_storage,
         origin.url,
-        checksums=compute_hashes(content_path, ["sha1", "sha256", "sha512"]),
+        checksums=compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"]),
+        checksums_computation=checksums_computation,
     )
     result = loader.load()
 
@@ -638,9 +638,16 @@
     assert result2 == {"status": "uneventful"}
 
 
-def test_content_loader_hash_mismatch(swh_storage, requests_mock_datadir, content_path):
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
+def test_content_loader_hash_mismatch(
+    swh_storage, requests_mock_datadir, content_path, checksums_computation
+):
     """It should be an eventful visit on a new file, then uneventful"""
-    checksums = compute_hashes(content_path, ["sha1", "sha256", "sha512"])
+    compute_hashes_fn = (
+        compute_nar_hashes if checksums_computation == "nar" else compute_hashes
+    )
+    checksums = compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"])
     erratic_checksums = {
         algo: chksum.replace("a", "e")  # alter checksums to fail integrity check
         for algo, chksum in checksums.items()
@@ -650,6 +657,7 @@
         swh_storage,
         origin.url,
         checksums=erratic_checksums,
+        checksums_computation=checksums_computation,
     )
     result = loader.load()
 
@@ -717,43 +725,16 @@
     )
 
 
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
 def test_directory_loader_hash_mismatch(
-    caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes
+    caplog, swh_storage, requests_mock_datadir, tarball_path, checksums_computation
 ):
     """It should not ingest tarball with mismatched checksum"""
-    tarball_path, checksums = tarball_with_std_hashes
-
-    origin = Origin(DIRECTORY_URL)
-    erratic_checksums = {
-        algo: chksum.replace("a", "e")  # alter checksums to fail integrity check
-        for algo, chksum in checksums.items()
-    }
-
-    loader = DirectoryLoader(
-        swh_storage,
-        origin.url,
-        checksums=erratic_checksums,  # making the integrity check fail
-    )
-    result = loader.load()
-
-    assert result == {"status": "failed"}
-
-    _check_load_failure(
-        caplog,
-        loader,
-        ValueError,
-        "mismatched",
-        status="failed",
-        origin=origin,
+    compute_hashes_fn = (
+        compute_nar_hashes if checksums_computation == "nar" else compute_hashes
     )
-
-
-@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
-def test_directory_loader_hash_mismatch_nar(
-    caplog, swh_storage, requests_mock_datadir, tarball_with_nar_hashes
-):
-    """It should not ingest tarball with mismatched checksum"""
-    tarball_path, checksums = tarball_with_nar_hashes
+    checksums = compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"])
 
     origin = Origin(DIRECTORY_URL)
     erratic_checksums = {
@@ -803,44 +784,22 @@
     assert result == {"status": "eventful"}
 
 
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
 def test_directory_loader_ok_simple(
-    swh_storage, requests_mock_datadir, tarball_with_std_hashes
+    swh_storage, requests_mock_datadir, tarball_path, checksums_computation
 ):
     """It should be an eventful visit on a new tarball, then uneventful"""
     origin = Origin(DIRECTORY_URL)
-    tarball_path, checksums = tarball_with_std_hashes
-    loader = DirectoryLoader(
-        swh_storage,
-        origin.url,
-        checksums=checksums,
+    compute_hashes_fn = (
+        compute_nar_hashes if checksums_computation == "nar" else compute_hashes
     )
-    result = loader.load()
-
-    assert result == {"status": "eventful"}
-
-    visit_status = assert_last_visit_matches(
-        swh_storage, origin.url, status="full", type="directory"
-    )
-    assert visit_status.snapshot is not None
-
-    result2 = loader.load()
-
-    assert result2 == {"status": "uneventful"}
-
-
-@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
-def test_directory_loader_ok_with_nar(
-    swh_storage, requests_mock_datadir, tarball_with_nar_hashes
-):
-    """It should be an eventful visit on a tarball with nar hashes, then uneventful"""
-    tarball_path, nar_checksums = tarball_with_nar_hashes
-    origin = Origin(DIRECTORY_URL)
     loader = DirectoryLoader(
         swh_storage,
         origin.url,
-        checksums=nar_checksums,
-        checksums_computation="nar",
+        checksums=compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"]),
+        checksums_computation=checksums_computation,
     )
     result = loader.load()
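
Note (not part of the patch): a minimal sketch of how the nar-enabled loaders above are
expected to be driven. It assumes a configured swh_storage instance, a local copy of the
artifact, and nix-store being installed (as the skipif markers require); it reuses the
compute_nar_hashes test helper purely for illustration, whereas real callers would
receive the expected nar checksums from their metadata source rather than recompute them.

    from swh.loader.core.loader import DirectoryLoader
    from swh.loader.core.tests.conftest import compute_nar_hashes

    def ingest_with_nar_check(swh_storage, url: str, local_copy: str) -> dict:
        # Expected nar hash computed from a local copy of the tarball; the loader
        # re-downloads <url> and verifies the nar hash of what it fetched against it.
        checksums = compute_nar_hashes(local_copy, "sha256")
        loader = DirectoryLoader(
            swh_storage,
            url,
            checksums=checksums,
            checksums_computation="nar",
        )
        return loader.load()  # {"status": "eventful"} on a successful first visit

ContentLoader accepts the same checksums / checksums_computation keyword arguments for a
plain file, as exercised by the parametrized test_content_loader_ok_simple above.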