diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -750,9 +750,32 @@ ) try: # FIXME: Ensure no "nar" computations is required for file - assert self.checksums_computation == "standard" with tempfile.TemporaryDirectory() as tmpdir: - file_path, _ = download(url, dest=tmpdir, hashes=self.checksums) + file_path, _ = download( + url, dest=tmpdir, hashes=self.standard_hashes + ) + if self.checksums_computation == "nar": + # hashes are not "standard", so we need an extra check to happen + self.log.debug("Content to check nar hashes: %s", file_path) + actual_checksums = nix_hashes( + Path(file_path), self.checksums.keys() + ).hexdigest() + + if actual_checksums != self.checksums: + errors.append( + ValueError( + f"Checksum mismatched on <{url}>: " + f"{actual_checksums} != {self.checksums}" + ) + ) + self.log.debug( + "Mismatched checksums <%s>: continue on next mirror " + "url if any", + url, + ) + continue + + # FIXME: Content.from_dict should work? with open(file_path, "rb") as file: self.content = Content.from_data(file.read()) except ValueError as e: diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py --- a/swh/loader/core/tests/conftest.py +++ b/swh/loader/core/tests/conftest.py @@ -5,10 +5,11 @@ from os import path import shutil -from typing import Dict, List, Union +from typing import Dict, List import pytest +from swh.loader.core.utils import compute_nar_hashes from swh.model.hashutil import MultiHash nix_store_missing = shutil.which("nix-store") is None @@ -20,12 +21,17 @@ return path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz") -def compute_hashes( - filepath: str, cksum_algos: Union[str, List[str]] = "sha256" -) -> Dict[str, str]: +@pytest.fixture +def content_path(datadir): + """Return filepath fetched by ContentLoader test runs.""" + return path.join( + datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp" + ) + + +def compute_hashes(filepath: str, hash_names: List[str] = ["sha256"]) -> Dict[str, str]: """Compute checksums dict out of a filepath""" - checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos) - return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest() + return MultiHash.from_path(filepath, hash_names=hash_names).hexdigest() @pytest.fixture @@ -38,8 +44,21 @@ @pytest.fixture def tarball_with_nar_hashes(tarball_path): - # FIXME: compute it instead of hard-coding it - return ( - tarball_path, - {"sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"}, + nar_hashes = compute_nar_hashes(tarball_path, ["sha256"]) + # Ensure it's the same hash as the initial one computed from the cli + assert ( + nar_hashes["sha256"] + == "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4" + ) + return (tarball_path, nar_hashes) + + +@pytest.fixture +def content_with_nar_hashes(content_path): + nar_hashes = compute_nar_hashes(content_path, ["sha256"], is_tarball=False) + # Ensure it's the same hash as the initial one computed from the cli + assert ( + nar_hashes["sha256"] + == "0b555a4d13e530460425d1dc20332294f151067fb64a7e49c7de501f05b0a41a" ) + return (content_path, nar_hashes) diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -4,9 +4,9 @@ # See top-level LICENSE file for more information import datetime +from functools import partial import hashlib import logging -import os import time from unittest.mock import MagicMock, call @@ -34,7 +34,7 @@ ) import swh.storage.exc -from .conftest import compute_hashes, nix_store_missing +from .conftest import compute_hashes, compute_nar_hashes, nix_store_missing ORIGIN = Origin(url="some-url") PARENT_ORIGIN = Origin(url="base-origin-url") @@ -517,14 +517,6 @@ CONTENT_URL = f"{CONTENT_MIRROR}/project/asdf/archives/asdf-3.3.5.lisp" -@pytest.fixture -def content_path(datadir): - """Return filepath fetched by ContentLoader test runs.""" - return os.path.join( - datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp" - ) - - def test_content_loader_missing_field(swh_storage): """It should raise if the ContentLoader is missing checksums field""" origin = Origin(CONTENT_URL) @@ -609,20 +601,34 @@ swh_storage, dead_origin.url, fallback_urls=[fallback_url_ok, fallback_url_ko], - checksums=compute_hashes(content_path, checksum_algo), + checksums=compute_hashes(content_path, [checksum_algo]), ) result = loader.load() assert result == {"status": "eventful"} -def test_content_loader_ok_simple(swh_storage, requests_mock_datadir, content_path): +compute_content_nar_hashes = partial(compute_nar_hashes, is_tarball=False) + + +@pytest.mark.skipif( + nix_store_missing, reason="requires nix-store binary from nix binaries" +) +@pytest.mark.parametrize("checksums_computation", ["standard", "nar"]) +def test_content_loader_ok_simple( + swh_storage, requests_mock_datadir, content_path, checksums_computation +): """It should be an eventful visit on a new file, then uneventful""" + compute_hashes_fn = ( + compute_content_nar_hashes if checksums_computation == "nar" else compute_hashes + ) + origin = Origin(CONTENT_URL) loader = ContentLoader( swh_storage, origin.url, - checksums=compute_hashes(content_path, ["sha1", "sha256", "sha512"]), + checksums=compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"]), + checksums_computation=checksums_computation, ) result = loader.load() @@ -638,9 +644,18 @@ assert result2 == {"status": "uneventful"} -def test_content_loader_hash_mismatch(swh_storage, requests_mock_datadir, content_path): +@pytest.mark.skipif( + nix_store_missing, reason="requires nix-store binary from nix binaries" +) +@pytest.mark.parametrize("checksums_computation", ["standard", "nar"]) +def test_content_loader_hash_mismatch( + swh_storage, requests_mock_datadir, content_path, checksums_computation +): """It should be an eventful visit on a new file, then uneventful""" - checksums = compute_hashes(content_path, ["sha1", "sha256", "sha512"]) + compute_hashes_fn = ( + compute_content_nar_hashes if checksums_computation == "nar" else compute_hashes + ) + checksums = compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"]) erratic_checksums = { algo: chksum.replace("a", "e") # alter checksums to fail integrity check for algo, chksum in checksums.items() @@ -650,6 +665,7 @@ swh_storage, origin.url, checksums=erratic_checksums, + checksums_computation=checksums_computation, ) result = loader.load() @@ -717,43 +733,18 @@ ) +@pytest.mark.skipif( + nix_store_missing, reason="requires nix-store binary from nix binaries" +) +@pytest.mark.parametrize("checksums_computation", ["standard", "nar"]) def test_directory_loader_hash_mismatch( - caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes + caplog, swh_storage, requests_mock_datadir, tarball_path, checksums_computation ): """It should not ingest tarball with mismatched checksum""" - tarball_path, checksums = tarball_with_std_hashes - - origin = Origin(DIRECTORY_URL) - erratic_checksums = { - algo: chksum.replace("a", "e") # alter checksums to fail integrity check - for algo, chksum in checksums.items() - } - - loader = DirectoryLoader( - swh_storage, - origin.url, - checksums=erratic_checksums, # making the integrity check fail - ) - result = loader.load() - - assert result == {"status": "failed"} - - _check_load_failure( - caplog, - loader, - ValueError, - "mismatched", - status="failed", - origin=origin, + compute_hashes_fn = ( + compute_nar_hashes if checksums_computation == "nar" else compute_hashes ) - - -@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)") -def test_directory_loader_hash_mismatch_nar( - caplog, swh_storage, requests_mock_datadir, tarball_with_nar_hashes -): - """It should not ingest tarball with mismatched checksum""" - tarball_path, checksums = tarball_with_nar_hashes + checksums = compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"]) origin = Origin(DIRECTORY_URL) erratic_checksums = { @@ -765,7 +756,7 @@ swh_storage, origin.url, checksums=erratic_checksums, # making the integrity check fail - checksums_computation="nar", + checksums_computation=checksums_computation, ) result = loader.load() @@ -803,44 +794,24 @@ assert result == {"status": "eventful"} +@pytest.mark.skipif( + nix_store_missing, reason="requires nix-store binary from nix binaries" +) +@pytest.mark.parametrize("checksums_computation", ["standard", "nar"]) def test_directory_loader_ok_simple( - swh_storage, requests_mock_datadir, tarball_with_std_hashes + swh_storage, requests_mock_datadir, tarball_path, checksums_computation ): """It should be an eventful visit on a new tarball, then uneventful""" origin = Origin(DIRECTORY_URL) - tarball_path, checksums = tarball_with_std_hashes - loader = DirectoryLoader( - swh_storage, - origin.url, - checksums=checksums, + compute_hashes_fn = ( + compute_nar_hashes if checksums_computation == "nar" else compute_hashes ) - result = loader.load() - - assert result == {"status": "eventful"} - - visit_status = assert_last_visit_matches( - swh_storage, origin.url, status="full", type="directory" - ) - assert visit_status.snapshot is not None - - result2 = loader.load() - - assert result2 == {"status": "uneventful"} - - -@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)") -def test_directory_loader_ok_with_nar( - swh_storage, requests_mock_datadir, tarball_with_nar_hashes -): - """It should be an eventful visit on a tarball with nar hashes, then uneventful""" - tarball_path, nar_checksums = tarball_with_nar_hashes - origin = Origin(DIRECTORY_URL) loader = DirectoryLoader( swh_storage, origin.url, - checksums=nar_checksums, - checksums_computation="nar", + checksums=compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"]), + checksums_computation=checksums_computation, ) result = loader.load() diff --git a/swh/loader/core/tests/test_utils.py b/swh/loader/core/tests/test_utils.py --- a/swh/loader/core/tests/test_utils.py +++ b/swh/loader/core/tests/test_utils.py @@ -19,6 +19,7 @@ CloneTimeout, clean_dangling_folders, clone_with_timeout, + compute_nar_hashes, nix_hashes, parse_visit_date, ) @@ -216,3 +217,23 @@ actual_multihash = nix_hashes(directory, nar_checksums.keys()) assert actual_multihash.hexdigest() == nar_checksums + + +@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)") +def test_compute_nar_hashes_tarball(tarball_with_nar_hashes): + tarball_path, nar_checksums = tarball_with_nar_hashes + + actual_checksums = compute_nar_hashes(tarball_path, nar_checksums.keys()) + + assert actual_checksums == nar_checksums + + +@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)") +def test_compute_nar_hashes_file(content_with_nar_hashes): + content_path, nar_checksums = content_with_nar_hashes + + actual_checksums = compute_nar_hashes( + content_path, nar_checksums.keys(), is_tarball=False + ) + + assert actual_checksums == nar_checksums diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py --- a/swh/loader/core/utils.py +++ b/swh/loader/core/utils.py @@ -11,14 +11,16 @@ import shutil import signal from subprocess import PIPE, Popen +import tempfile import time import traceback -from typing import Callable, Iterable, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Union from billiard import Process, Queue # type: ignore from dateutil.parser import parse import psutil +from swh.core.tarball import uncompress from swh.loader.exception import MissingOptionalDependency from swh.model.hashutil import MultiHash @@ -152,3 +154,35 @@ multi_hash.update(chunk) return multi_hash + + +def compute_nar_hashes( + filepath: Path, + hash_names: List[str] = ["sha256"], + is_tarball=True, +) -> Dict[str, str]: + """Compute nar checksums dict out of a filepath (tarball or plain file). + + If it's a tarball, this uncompresses the tarball in a temporary directory to compute + the nix hashes (and then cleans it up). + + Args: + filepath: The tarball (if is_tarball is True) or a filepath + hash_names: The list of checksums to compute + is_tarball: Whether filepath represents a tarball or not + + Returns: + The dict of checksums values whose keys are present in hash_names. + + """ + with tempfile.TemporaryDirectory() as tmpdir: + if is_tarball: + directory_path = Path(tmpdir) + directory_path.mkdir(parents=True, exist_ok=True) + uncompress(str(filepath), dest=str(directory_path)) + path_on_disk = next(directory_path.iterdir()) + else: + path_on_disk = filepath + + hashes = nix_hashes(path_on_disk, hash_names).hexdigest() + return hashes