diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -19,7 +19,8 @@
 from swh.core.statsd import Statsd
 from swh.core.tarball import uncompress
 from swh.loader.core.metadata_fetchers import CredentialsType, get_fetchers_for_lister
-from swh.loader.exception import NotFound
+from swh.loader.core.utils import nix_store_check
+from swh.loader.exception import NotFound, UnsupportedChecksumComputation
 from swh.loader.package.utils import download
 from swh.model import from_disk
 from swh.model.model import (
@@ -670,14 +671,26 @@
         storage: StorageInterface,
         url: str,
         checksums: Dict[str, str],
+        checksums_computation: str = "standard",
         fallback_urls: List[str] = None,
         **kwargs,
     ):
         super().__init__(storage, url, **kwargs)
         self.snapshot: Optional[Snapshot] = None
         self.checksums = checksums
+        self.checksums_computations = checksums_computation
+        if self.checksums_computations not in ("nar", "standard"):
+            raise UnsupportedChecksumComputation(
+                # Interpolate here: exceptions do not %-format their arguments
+                f"Unsupported checksums computations: {checksums_computation}"
+            )
+
         fallback_urls_ = fallback_urls or []
         self.mirror_urls: List[str] = [self.origin.url, *fallback_urls_]
+        # Ensure content received matched the "standard" checksums received
+        self.standard_hashes = (
+            self.checksums if self.checksums_computations == "standard" else {}
+        )
 
     def prepare(self) -> None:
         self.last_snapshot = snapshot_get_latest(self.storage, self.origin.url)
@@ -727,7 +740,15 @@
             )
             try:
                 with tempfile.TemporaryDirectory() as tmpdir:
-                    file_path, _ = download(url, dest=tmpdir, hashes=self.checksums)
+                    # the following includes the hash computation check
+                    file_path, _ = download(
+                        url, dest=tmpdir, hashes=self.standard_hashes
+                    )
+                    if self.checksums_computations == "nar":
+                        # hashes are not "standard", so we need an extra check to happen
+                        # on the file itself
+                        nix_store_check(file_path, self.checksums)
+
                     with open(file_path, "rb") as file:
                         self.content = Content.from_data(file.read())
             except HTTPError as http_error:
@@ -811,11 +832,11 @@
             )
             with tempfile.TemporaryDirectory() as tmpdir:
                 try:
+                    # Ensure content received matched the "standard" checksums received
                     tarball_path, extrinsic_metadata = download(
                         url,
                         tmpdir,
-                        # Ensure content received matched the checksums received
-                        hashes=self.checksums,
+                        hashes=self.standard_hashes,
                         extra_request_headers={"Accept-Encoding": "identity"},
                     )
                 except ValueError as e:
@@ -832,9 +853,17 @@
 
                 directory_path = os.path.join(tmpdir, "src")
                 os.makedirs(directory_path, exist_ok=True)
                 uncompress(tarball_path, dest=directory_path)
-                self.log.debug("uncompressed path to directory: %s", directory_path)
 
+                if self.checksums_computations == "nar":
+                    # hashes are not "standard", so we need an extra check to happen
+                    # on the uncompressed tarball
+                    dir_to_check = os.path.join(
+                        directory_path, os.listdir(directory_path)[0]
+                    )
+                    self.log.debug("Directory to check nar hashes: %s", dir_to_check)
+                    nix_store_check(dir_to_check, self.checksums)
+
                 self.directory = from_disk.Directory.from_disk(
                     path=directory_path.encode("utf-8"),
                     max_content_length=self.max_content_size,
diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/core/tests/conftest.py
@@ -0,0 +1,8 @@
+# Copyright (C) 2018-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import shutil
+
+nix_store_missing = shutil.which("nix-store") is None
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -35,6 +35,8 @@
 )
 import swh.storage.exc
 
+from .conftest import nix_store_missing
+
 ORIGIN = Origin(url="some-url")
 PARENT_ORIGIN = Origin(url="base-origin-url")
 
@@ -767,3 +769,29 @@
     result2 = loader.load()
 
     assert result2 == {"status": "uneventful"}
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_directory_loader_ok_with_nar(swh_storage, requests_mock_datadir, tarball_path):
+    """It should be an eventful visit on a tarball with nar hashes, then uneventful"""
+    origin = Origin(DIRECTORY_URL)
+    loader = DirectoryLoader(
+        swh_storage,
+        origin.url,
+        checksums={
+            "sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"
+        },
+        checksums_computation="nar",
+    )
+    result = loader.load()
+
+    assert result == {"status": "eventful"}
+
+    visit_status = assert_last_visit_matches(
+        swh_storage, origin.url, status="full", type="directory"
+    )
+    assert visit_status.snapshot is not None
+
+    result2 = loader.load()
+
+    assert result2 == {"status": "uneventful"}
diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py
--- a/swh/loader/core/utils.py
+++ b/swh/loader/core/utils.py
@@ -9,14 +9,18 @@
 import os
 import shutil
 import signal
+from subprocess import PIPE, Popen
 import time
 import traceback
-from typing import Callable, Optional, Union
+from typing import Callable, Dict, Optional, Union
 
 from billiard import Process, Queue  # type: ignore
 from dateutil.parser import parse
 import psutil
 
+from swh.loader.exception import MissingOptionalDependency
+from swh.model.hashutil import MultiHash
+
 
 def clean_dangling_folders(dirpath: str, pattern_check: str, log=None) -> None:
     """Clean up potential dangling temporary working folder rooted at `dirpath`. Those
@@ -125,3 +129,50 @@
         return parse(visit_date)
 
     raise ValueError(f"invalid visit date {visit_date!r}")
+
+
+def nix_store_check(filepath: str, checksums: Dict[str, str]) -> None:
+    """Check that the ``nix-store --dump`` output of ``filepath`` matches ``checksums``.
+
+    The path is serialized with ``nix-store --dump`` and the resulting stream is
+    hashed with the algorithms named by the keys of ``checksums``.
+
+    Args:
+        filepath: Path to the file or directory to serialize and hash
+        checksums: Expected hex digests, indexed by hash algorithm name
+
+    Raises:
+        MissingOptionalDependency: if the ``nix-store`` executable is not found
+        ValueError: if ``nix-store`` exits with an error or if the computed
+            hashes differ from ``checksums``
+
+    """
+    h = MultiHash(hash_names=checksums.keys())
+    command = ["nix-store", "--dump", filepath]
+
+    try:
+        proc = Popen(command, stdout=PIPE)
+    except FileNotFoundError as e:
+        raise MissingOptionalDependency("nix-store") from e
+
+    with proc:
+        stdout = proc.stdout
+        assert stdout is not None  # guaranteed by stdout=PIPE, narrows the type
+        # The dump is binary: read fixed-size blocks instead of iterating "lines",
+        # which could be arbitrarily large when the data holds no newline byte.
+        for chunk in iter(lambda: stdout.read(65536), b""):
+            h.update(chunk)
+
+    # Leaving the ``with`` block waits for the process, so returncode is set
+    if proc.returncode != 0:
+        raise ValueError(
+            f"nix-store --dump failed on {filepath} (exit code {proc.returncode})"
+        )
+
+    actual_hashes = h.hexdigest()
+    # Explicit raise rather than assert: asserts are stripped when running with -O
+    if actual_hashes != checksums:
+        raise ValueError(
+            f"Checksum mismatch on {filepath}: "
+            f"expected {checksums}, got {actual_hashes}"
+        )
diff --git a/swh/loader/exception.py b/swh/loader/exception.py
--- a/swh/loader/exception.py
+++ b/swh/loader/exception.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,3 +11,15 @@
     """
 
     pass
+
+
+class MissingOptionalDependency(ValueError):
+    """An exception raised when an optional runtime dependency is missing."""
+
+    pass
+
+
+class UnsupportedChecksumComputation(ValueError):
+    """An exception raised when the requested checksums computation is unsupported."""
+
+    pass