diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -7,6 +7,7 @@
 import hashlib
 import logging
 import os
+from pathlib import Path
 import tempfile
 import time
 from typing import Any, ContextManager, Dict, Iterable, List, Optional, Union
@@ -19,7 +20,8 @@
 from swh.core.statsd import Statsd
 from swh.core.tarball import uncompress
 from swh.loader.core.metadata_fetchers import CredentialsType, get_fetchers_for_lister
-from swh.loader.exception import NotFound
+from swh.loader.core.utils import nix_hashes
+from swh.loader.exception import NotFound, UnsupportedChecksumComputation
 from swh.loader.package.utils import download
 from swh.model import from_disk
 from swh.model.model import (
@@ -655,7 +657,12 @@
     """Common class for :class:`ContentLoader` and :class:`Directoryloader`.
 
     The "checksums" field is a dictionary of hex hashes on the object retrieved (content
-    or directory).
+    or directory). When "checksums_computation" is "standard", the checksums are
+    computed on the remote file itself, as the unix CLI tools would ("sha1sum",
+    "sha256sum", ...). When "checksums_computation" is "nar", the computation is
+    delegated to the `nix-store --dump` command: the checksums are computed on the
+    uncompressed content of the retrieved artifact. Any other "checksums_computation"
+    value raises UnsupportedChecksumComputation.
 
     The multiple "fallback" urls received are mirror urls only used to fetch the object
     if the main origin is no longer available. Those are not stored.
@@ -670,14 +677,29 @@
         storage: StorageInterface,
         url: str,
         checksums: Dict[str, str],
+        checksums_computation: str = "standard",
         fallback_urls: List[str] = None,
         **kwargs,
     ):
         super().__init__(storage, url, **kwargs)
         self.snapshot: Optional[Snapshot] = None
         self.checksums = checksums
+        self.checksums_computation = checksums_computation
+        if self.checksums_computation not in ("nar", "standard"):
+            raise UnsupportedChecksumComputation(
+                "Unsupported checksums computation: %s"
+                % self.checksums_computation
+            )
+
         fallback_urls_ = fallback_urls or []
         self.mirror_urls: List[str] = [self.origin.url, *fallback_urls_]
+        # standard_hashes holds the checksums verified at download time: the full
+        # "checksums" dict when checksums_computation is "standard", an empty dict
+        # otherwise ("nar" checksums are verified after uncompressing instead)
+        self.standard_hashes = (
+            self.checksums if self.checksums_computation == "standard" else {}
+        )
+        self.log.debug("Loader checksums computation: %s", self.checksums_computation)
 
     def prepare(self) -> None:
         self.last_snapshot = snapshot_get_latest(self.storage, self.origin.url)
@@ -726,6 +748,8 @@
                 url_.path,
             )
             try:
+                # FIXME: Ensure no "nar" computation is requested for content files
+                assert self.checksums_computation == "standard"
                 with tempfile.TemporaryDirectory() as tmpdir:
                     file_path, _ = download(url, dest=tmpdir, hashes=self.checksums)
                     with open(file_path, "rb") as file:
@@ -814,29 +838,41 @@
                     tarball_path, extrinsic_metadata = download(
                         url,
                         tmpdir,
-                        # Ensure content received matched the checksums received
-                        hashes=self.checksums,
+                        hashes=self.standard_hashes,
                         extra_request_headers={"Accept-Encoding": "identity"},
                     )
-                except ValueError as e:
-                    # Checksum mismatch
-                    self.log.debug("Error: %s", e)
+                except ValueError:
+                    # Checksum mismatch: try the next mirror url, if any
+                    self.log.debug(
+                        "Mismatched checksums <%s>: continue on next mirror url if any",
+                        url,
+                    )
                     continue
                 except HTTPError as http_error:
                     if http_error.response.status_code == 404:
                         self.log.debug(
-                            "Not found '%s', continue on next mirror url if any", url
+                            "Not found <%s>: continue on next mirror url if any", url
                         )
                         continue
 
-                directory_path = os.path.join(tmpdir, "src")
-                os.makedirs(directory_path, exist_ok=True)
-                uncompress(tarball_path, dest=directory_path)
-
+                directory_path = Path(tmpdir) / "src"
+                directory_path.mkdir(parents=True, exist_ok=True)
+                uncompress(tarball_path, dest=str(directory_path))
                 self.log.debug("uncompressed path to directory: %s", directory_path)
 
+                if self.checksums_computation == "nar":
+                    # Hashes are not the "standard" ones, so an extra check happens
+                    # on the uncompressed tarball
+                    dir_to_check = next(directory_path.iterdir())
+                    self.log.debug("Directory to check nar hashes: %s", dir_to_check)
+                    actual_checksums = nix_hashes(
+                        dir_to_check, self.checksums.keys()
+                    ).hexdigest()
+
+                    assert actual_checksums == self.checksums
+
                 self.directory = from_disk.Directory.from_disk(
-                    path=directory_path.encode("utf-8"),
+                    path=bytes(directory_path),
                     max_content_length=self.max_content_size,
                 )
                 # Compute the merkle dag from the top-level directory
diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/core/tests/conftest.py
@@ -0,0 +1,45 @@
+# Copyright (C) 2018-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from os import path
+import shutil
+from typing import Dict, List, Union
+
+import pytest
+
+from swh.model.hashutil import MultiHash
+
+nix_store_missing = shutil.which("nix-store") is None
+
+
+@pytest.fixture
+def tarball_path(datadir):
+    """Return tarball filepath fetched by DirectoryLoader test runs."""
+    return path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz")
+
+
+def compute_hashes(
+    filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
+) -> Dict[str, str]:
+    """Compute checksums dict out of a filepath"""
+    checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos)
+    return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest()
+
+
+@pytest.fixture
+def tarball_with_std_hashes(tarball_path):
+    return (
+        tarball_path,
+        compute_hashes(tarball_path, ["sha1", "sha256", "sha512"]),
+    )
+
+
+@pytest.fixture
+def tarball_with_nar_hashes(tarball_path):
+    # FIXME: compute it instead of hard-coding it
+    return (
+        tarball_path,
+        {"sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"},
+    )
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -8,7 +8,6 @@
 import logging
 import os
 import time
-from typing import Dict, List, Union
 from unittest.mock import MagicMock, call
 
 import pytest
@@ -22,9 +21,9 @@
     DVCSLoader,
 )
 from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol
-from swh.loader.exception import NotFound
+from swh.loader.exception import NotFound, UnsupportedChecksumComputation
 from swh.loader.tests import assert_last_visit_matches
-from swh.model.hashutil import MultiHash, hash_to_bytes
+from swh.model.hashutil import hash_to_bytes
 from swh.model.model import (
     MetadataAuthority,
     MetadataAuthorityType,
@@ -35,6 +34,8 @@
 )
 import swh.storage.exc
 
+from .conftest import compute_hashes, nix_store_missing
+
 ORIGIN = Origin(url="some-url")
 PARENT_ORIGIN = Origin(url="base-origin-url")
 
@@ -524,14 +525,6 @@
 )
 
 
-def compute_hashes(
-    filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
-) -> Dict[str, str]:
-    """Compute checksums dict out of a filepath"""
-    checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos)
-    return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest()
-
-
 def test_content_loader_missing_field(swh_storage):
     """It should raise if the ContentLoader is missing checksums field"""
     origin = Origin(CONTENT_URL)
@@ -539,6 +532,18 @@
     ContentLoader(swh_storage, origin.url)
 
 
+@pytest.mark.parametrize("loader_class", [ContentLoader, DirectoryLoader])
+def test_node_loader_unsupported_checksums_computation(swh_storage, loader_class):
+    """It should raise when an unsupported checksums_computation is requested"""
+    with pytest.raises(UnsupportedChecksumComputation):
+        loader_class(
+            swh_storage,
+            CONTENT_URL,
+            checksums={"sha256": "irrelevant-for-that-test"},
+            checksums_computation="unsupported",
+        )
+
+
 def test_content_loader_404(caplog, swh_storage, requests_mock_datadir, content_path):
     """It should not ingest origin when there is no file to be found (no mirror url)"""
     unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp")
@@ -637,12 +642,6 @@
 DIRECTORY_URL = f"{DIRECTORY_MIRROR}/archives/dummy-hello.tar.gz"
 
 
-@pytest.fixture
-def tarball_path(datadir):
-    """Return tarball filepath fetched by DirectoryLoader test runs."""
-    return os.path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz")
-
-
 def test_directory_loader_missing_field(swh_storage):
     """It should raise if the DirectoryLoader is missing checksums field"""
     origin = Origin(DIRECTORY_URL)
@@ -699,13 +698,15 @@
 
 
 def test_directory_loader_404_with_integrity_check_failure(
-    caplog, swh_storage, requests_mock_datadir, tarball_path
+    caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes
 ):
     """It should not ingest tarball with mismatched checksum"""
+    tarball_path, checksums = tarball_with_std_hashes
+
     origin = Origin(DIRECTORY_URL)
     erratic_checksums = {
         algo: chksum.replace("a", "e")  # alter checksums to fail integrity check
-        for algo, chksum in compute_hashes(tarball_path).items()
+        for algo, chksum in checksums.items()
     }
 
     loader = DirectoryLoader(
@@ -729,9 +730,11 @@
 
 @pytest.mark.parametrize("checksum_algo", ["sha1", "sha256", "sha512"])
 def test_directory_loader_ok_with_fallback(
-    caplog, swh_storage, requests_mock_datadir, tarball_path, checksum_algo
+    caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes, checksum_algo
 ):
     """It should be an eventful visit even when ingesting through mirror url"""
+    tarball_path, checksums = tarball_with_std_hashes
+
     dead_origin = Origin(f"{DIRECTORY_MIRROR}/dead-origin-url")
     fallback_url_ok = DIRECTORY_URL
     fallback_url_ko = f"{DIRECTORY_MIRROR}/archives/unknown2.tgz"
@@ -740,20 +743,51 @@
         swh_storage,
         dead_origin.url,
         fallback_urls=[fallback_url_ok, fallback_url_ko],
-        checksums=compute_hashes(tarball_path, checksum_algo),
+        checksums={checksum_algo: checksums[checksum_algo]},
     )
     result = loader.load()
 
     assert result == {"status": "eventful"}
 
 
-def test_directory_loader_ok_simple(swh_storage, requests_mock_datadir, tarball_path):
+def test_directory_loader_ok_simple(
+    swh_storage, requests_mock_datadir, tarball_with_std_hashes
+):
     """It should be an eventful visit on a new tarball, then uneventful"""
     origin = Origin(DIRECTORY_URL)
+    tarball_path, checksums = tarball_with_std_hashes
+    loader = DirectoryLoader(
+        swh_storage,
+        origin.url,
+        checksums=checksums,
+    )
+    result = loader.load()
+
+    assert result == {"status": "eventful"}
+
+    visit_status = assert_last_visit_matches(
+        swh_storage, origin.url, status="full", type="directory"
+    )
+    assert visit_status.snapshot is not None
+
+    result2 = loader.load()
+
+    assert result2 == {"status": "uneventful"}
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_directory_loader_ok_with_nar(
+    swh_storage, requests_mock_datadir, tarball_with_nar_hashes
+):
+    """It should be an eventful visit on a tarball with nar hashes, then uneventful"""
+    tarball_path, nar_checksums = tarball_with_nar_hashes
+    origin = Origin(DIRECTORY_URL)
+
     loader = DirectoryLoader(
         swh_storage,
         origin.url,
-        checksums=compute_hashes(tarball_path, ["sha1", "sha256", "sha512"]),
+        checksums=nar_checksums,
+        checksums_computation="nar",
     )
     result = loader.load()
 
diff --git a/swh/loader/core/tests/test_utils.py b/swh/loader/core/tests/test_utils.py
--- a/swh/loader/core/tests/test_utils.py
+++ b/swh/loader/core/tests/test_utils.py
@@ -1,23 +1,30 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime
 import os
+from pathlib import Path
 import signal
+import tempfile
 from time import sleep
 from unittest.mock import patch
 
 import pytest
 
+from swh.core.tarball import uncompress
 from swh.loader.core.utils import (
     CloneFailure,
     CloneTimeout,
     clean_dangling_folders,
     clone_with_timeout,
+    nix_hashes,
     parse_visit_date,
 )
+from swh.loader.exception import MissingOptionalDependency
+
+from .conftest import nix_store_missing
 
 
 def prepare_arborescence_from(tmpdir, folder_names):
@@ -185,3 +192,27 @@
 def test_utils_parse_visit_date_fails():
     with pytest.raises(ValueError, match="invalid"):
         parse_visit_date(10)  # not a string nor a date
+
+
+@patch(
+    "swh.loader.core.utils.shutil.which",
+    return_value=None,
+)
+def test_nix_hashes_missing_nix_store(mock_which):
+    with pytest.raises(MissingOptionalDependency, match="nix-store"):
+        nix_hashes("some-irrelevant-filepath", ["sha1"])
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_nix_hashes_compute(tarball_with_nar_hashes):
+    tarball_path, nar_checksums = tarball_with_nar_hashes
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        directory_path = Path(tmpdir) / "src"
+        directory_path.mkdir(parents=True, exist_ok=True)
+        uncompress(tarball_path, dest=str(directory_path))
+        directory = next(directory_path.iterdir())
+
+        actual_multihash = nix_hashes(directory, nar_checksums.keys())
+
+        assert actual_multihash.hexdigest() == nar_checksums
diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py
--- a/swh/loader/core/utils.py
+++ b/swh/loader/core/utils.py
@@ -7,16 +7,21 @@
 from datetime import datetime, timezone
 import io
 import os
+from pathlib import Path
 import shutil
 import signal
+from subprocess import PIPE, Popen
 import time
 import traceback
-from typing import Callable, Optional, Union
+from typing import Callable, Iterable, Optional, Union
 
 from billiard import Process, Queue  # type: ignore
 from dateutil.parser import parse
 import psutil
 
+from swh.loader.exception import MissingOptionalDependency
+from swh.model.hashutil import MultiHash
+
 
 def clean_dangling_folders(dirpath: str, pattern_check: str, log=None) -> None:
"""Clean up potential dangling temporary working folder rooted at `dirpath`. Those @@ -125,3 +130,25 @@ return parse(visit_date) raise ValueError(f"invalid visit date {visit_date!r}") + + +def nix_hashes(filepath: Path, hash_names: Iterable[str]) -> MultiHash: + """Compute nix-store hashes on filepath. + + Raises: + FileNotFoundError in case the nix-store command is not available on the system. + + """ + NIX_STORE = shutil.which("nix-store") + if NIX_STORE is None: + raise MissingOptionalDependency("nix-store") + + multi_hash = MultiHash(hash_names=hash_names) + + command = [NIX_STORE, "--dump", str(filepath)] + with Popen(command, stdout=PIPE) as proc: + assert proc.stdout is not None + for chunk in proc.stdout: + multi_hash.update(chunk) + + return multi_hash diff --git a/swh/loader/exception.py b/swh/loader/exception.py --- a/swh/loader/exception.py +++ b/swh/loader/exception.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,3 +11,15 @@ """ pass + + +class MissingOptionalDependency(ValueError): + """An exception raised when an optional runtime dependency is missing.""" + + pass + + +class UnsupportedChecksumComputation(ValueError): + """An exception raised when loader cannot compute such checksums.""" + + pass