Page Menu · Home · Software Heritage

D8636.id.diff
No OneTemporary

D8636.id.diff

diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -750,9 +750,31 @@
)
try:
# FIXME: Ensure no "nar" computations is required for file
- assert self.checksums_computation == "standard"
with tempfile.TemporaryDirectory() as tmpdir:
- file_path, _ = download(url, dest=tmpdir, hashes=self.checksums)
+ file_path, _ = download(
+ url, dest=tmpdir, hashes=self.standard_hashes
+ )
+ if self.checksums_computation == "nar":
+                    # hashes are not "standard", so an extra verification pass is needed
+ self.log.debug("Content to check nar hashes: %s", file_path)
+ actual_checksums = nix_hashes(
+ Path(file_path), self.checksums.keys()
+ ).hexdigest()
+
+ if actual_checksums != self.checksums:
+ errors.append(
+ ValueError(
+ f"Checksum mismatched on <{url}>: "
+ f"{actual_checksums} != {self.checksums}"
+ )
+ )
+ self.log.debug(
+ "Mismatched checksums <%s>: continue on next mirror "
+ "url if any",
+ url,
+ )
+ continue
+
with open(file_path, "rb") as file:
self.content = Content.from_data(file.read())
except ValueError as e:
diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py
--- a/swh/loader/core/tests/conftest.py
+++ b/swh/loader/core/tests/conftest.py
@@ -5,10 +5,11 @@
from os import path
import shutil
-from typing import Dict, List, Union
+from typing import Dict, List
import pytest
+from swh.loader.core.utils import compute_nar_hashes
from swh.model.hashutil import MultiHash
nix_store_missing = shutil.which("nix-store") is None
@@ -20,12 +21,17 @@
return path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz")
-def compute_hashes(
- filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
-) -> Dict[str, str]:
+@pytest.fixture
+def content_path(datadir):
+ """Return filepath fetched by ContentLoader test runs."""
+ return path.join(
+ datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
+ )
+
+
+def compute_hashes(filepath: str, hash_names: List[str] = ["sha256"]) -> Dict[str, str]:
"""Compute checksums dict out of a filepath"""
- checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos)
- return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest()
+ return MultiHash.from_path(filepath, hash_names=hash_names).hexdigest()
@pytest.fixture
@@ -38,8 +44,21 @@
@pytest.fixture
def tarball_with_nar_hashes(tarball_path):
- # FIXME: compute it instead of hard-coding it
- return (
- tarball_path,
- {"sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"},
+ nar_hashes = compute_nar_hashes(tarball_path, ["sha256"])
+    # Ensure it matches the hash initially computed via the CLI
+ assert (
+ nar_hashes["sha256"]
+ == "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"
+ )
+ return (tarball_path, nar_hashes)
+
+
+@pytest.fixture
+def content_with_nar_hashes(content_path):
+ nar_hashes = compute_nar_hashes(content_path, ["sha256"], is_tarball=False)
+    # Ensure it matches the hash initially computed via the CLI
+ assert (
+ nar_hashes["sha256"]
+ == "0b555a4d13e530460425d1dc20332294f151067fb64a7e49c7de501f05b0a41a"
)
+ return (content_path, nar_hashes)
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -4,9 +4,9 @@
# See top-level LICENSE file for more information
import datetime
+from functools import partial
import hashlib
import logging
-import os
import time
from unittest.mock import MagicMock, call
@@ -34,7 +34,7 @@
)
import swh.storage.exc
-from .conftest import compute_hashes, nix_store_missing
+from .conftest import compute_hashes, compute_nar_hashes, nix_store_missing
ORIGIN = Origin(url="some-url")
PARENT_ORIGIN = Origin(url="base-origin-url")
@@ -517,14 +517,6 @@
CONTENT_URL = f"{CONTENT_MIRROR}/project/asdf/archives/asdf-3.3.5.lisp"
-@pytest.fixture
-def content_path(datadir):
- """Return filepath fetched by ContentLoader test runs."""
- return os.path.join(
- datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
- )
-
-
def test_content_loader_missing_field(swh_storage):
"""It should raise if the ContentLoader is missing checksums field"""
origin = Origin(CONTENT_URL)
@@ -609,20 +601,34 @@
swh_storage,
dead_origin.url,
fallback_urls=[fallback_url_ok, fallback_url_ko],
- checksums=compute_hashes(content_path, checksum_algo),
+ checksums=compute_hashes(content_path, [checksum_algo]),
)
result = loader.load()
assert result == {"status": "eventful"}
-def test_content_loader_ok_simple(swh_storage, requests_mock_datadir, content_path):
+compute_content_nar_hashes = partial(compute_nar_hashes, is_tarball=False)
+
+
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
+def test_content_loader_ok_simple(
+ swh_storage, requests_mock_datadir, content_path, checksums_computation
+):
"""It should be an eventful visit on a new file, then uneventful"""
+ compute_hashes_fn = (
+ compute_content_nar_hashes if checksums_computation == "nar" else compute_hashes
+ )
+
origin = Origin(CONTENT_URL)
loader = ContentLoader(
swh_storage,
origin.url,
- checksums=compute_hashes(content_path, ["sha1", "sha256", "sha512"]),
+ checksums=compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"]),
+ checksums_computation=checksums_computation,
)
result = loader.load()
@@ -638,9 +644,18 @@
assert result2 == {"status": "uneventful"}
-def test_content_loader_hash_mismatch(swh_storage, requests_mock_datadir, content_path):
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
+def test_content_loader_hash_mismatch(
+ swh_storage, requests_mock_datadir, content_path, checksums_computation
+):
"""It should be an eventful visit on a new file, then uneventful"""
- checksums = compute_hashes(content_path, ["sha1", "sha256", "sha512"])
+ compute_hashes_fn = (
+ compute_content_nar_hashes if checksums_computation == "nar" else compute_hashes
+ )
+ checksums = compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"])
erratic_checksums = {
algo: chksum.replace("a", "e") # alter checksums to fail integrity check
for algo, chksum in checksums.items()
@@ -650,6 +665,7 @@
swh_storage,
origin.url,
checksums=erratic_checksums,
+ checksums_computation=checksums_computation,
)
result = loader.load()
@@ -717,43 +733,18 @@
)
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
def test_directory_loader_hash_mismatch(
- caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes
+ caplog, swh_storage, requests_mock_datadir, tarball_path, checksums_computation
):
"""It should not ingest tarball with mismatched checksum"""
- tarball_path, checksums = tarball_with_std_hashes
-
- origin = Origin(DIRECTORY_URL)
- erratic_checksums = {
- algo: chksum.replace("a", "e") # alter checksums to fail integrity check
- for algo, chksum in checksums.items()
- }
-
- loader = DirectoryLoader(
- swh_storage,
- origin.url,
- checksums=erratic_checksums, # making the integrity check fail
- )
- result = loader.load()
-
- assert result == {"status": "failed"}
-
- _check_load_failure(
- caplog,
- loader,
- ValueError,
- "mismatched",
- status="failed",
- origin=origin,
+ compute_hashes_fn = (
+ compute_nar_hashes if checksums_computation == "nar" else compute_hashes
)
-
-
-@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
-def test_directory_loader_hash_mismatch_nar(
- caplog, swh_storage, requests_mock_datadir, tarball_with_nar_hashes
-):
- """It should not ingest tarball with mismatched checksum"""
- tarball_path, checksums = tarball_with_nar_hashes
+ checksums = compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"])
origin = Origin(DIRECTORY_URL)
erratic_checksums = {
@@ -765,7 +756,7 @@
swh_storage,
origin.url,
checksums=erratic_checksums, # making the integrity check fail
- checksums_computation="nar",
+ checksums_computation=checksums_computation,
)
result = loader.load()
@@ -803,44 +794,24 @@
assert result == {"status": "eventful"}
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
def test_directory_loader_ok_simple(
- swh_storage, requests_mock_datadir, tarball_with_std_hashes
+ swh_storage, requests_mock_datadir, tarball_path, checksums_computation
):
"""It should be an eventful visit on a new tarball, then uneventful"""
origin = Origin(DIRECTORY_URL)
- tarball_path, checksums = tarball_with_std_hashes
- loader = DirectoryLoader(
- swh_storage,
- origin.url,
- checksums=checksums,
+ compute_hashes_fn = (
+ compute_nar_hashes if checksums_computation == "nar" else compute_hashes
)
- result = loader.load()
-
- assert result == {"status": "eventful"}
-
- visit_status = assert_last_visit_matches(
- swh_storage, origin.url, status="full", type="directory"
- )
- assert visit_status.snapshot is not None
-
- result2 = loader.load()
-
- assert result2 == {"status": "uneventful"}
-
-
-@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
-def test_directory_loader_ok_with_nar(
- swh_storage, requests_mock_datadir, tarball_with_nar_hashes
-):
- """It should be an eventful visit on a tarball with nar hashes, then uneventful"""
- tarball_path, nar_checksums = tarball_with_nar_hashes
- origin = Origin(DIRECTORY_URL)
loader = DirectoryLoader(
swh_storage,
origin.url,
- checksums=nar_checksums,
- checksums_computation="nar",
+ checksums=compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"]),
+ checksums_computation=checksums_computation,
)
result = loader.load()
diff --git a/swh/loader/core/tests/test_utils.py b/swh/loader/core/tests/test_utils.py
--- a/swh/loader/core/tests/test_utils.py
+++ b/swh/loader/core/tests/test_utils.py
@@ -19,6 +19,7 @@
CloneTimeout,
clean_dangling_folders,
clone_with_timeout,
+ compute_nar_hashes,
nix_hashes,
parse_visit_date,
)
@@ -216,3 +217,23 @@
actual_multihash = nix_hashes(directory, nar_checksums.keys())
assert actual_multihash.hexdigest() == nar_checksums
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_compute_nar_hashes_tarball(tarball_with_nar_hashes):
+ tarball_path, nar_checksums = tarball_with_nar_hashes
+
+ actual_checksums = compute_nar_hashes(tarball_path, nar_checksums.keys())
+
+ assert actual_checksums == nar_checksums
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_compute_nar_hashes_file(content_with_nar_hashes):
+ content_path, nar_checksums = content_with_nar_hashes
+
+ actual_checksums = compute_nar_hashes(
+ content_path, nar_checksums.keys(), is_tarball=False
+ )
+
+ assert actual_checksums == nar_checksums
diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py
--- a/swh/loader/core/utils.py
+++ b/swh/loader/core/utils.py
@@ -11,14 +11,16 @@
import shutil
import signal
from subprocess import PIPE, Popen
+import tempfile
import time
import traceback
-from typing import Callable, Iterable, Optional, Union
+from typing import Callable, Dict, Iterable, List, Optional, Union
from billiard import Process, Queue # type: ignore
from dateutil.parser import parse
import psutil
+from swh.core.tarball import uncompress
from swh.loader.exception import MissingOptionalDependency
from swh.model.hashutil import MultiHash
@@ -152,3 +154,35 @@
multi_hash.update(chunk)
return multi_hash
+
+
+def compute_nar_hashes(
+ filepath: Path,
+ hash_names: List[str] = ["sha256"],
+ is_tarball=True,
+) -> Dict[str, str]:
+ """Compute nar checksums dict out of a filepath (tarball or plain file).
+
+ If it's a tarball, this uncompresses the tarball in a temporary directory to compute
+ the nix hashes (and then cleans it up).
+
+ Args:
+ filepath: The tarball (if is_tarball is True) or a filepath
+ hash_names: The list of checksums to compute
+ is_tarball: Whether filepath represents a tarball or not
+
+ Returns:
+ The dict of checksums values whose keys are present in hash_names.
+
+ """
+ with tempfile.TemporaryDirectory() as tmpdir:
+ if is_tarball:
+ directory_path = Path(tmpdir)
+ directory_path.mkdir(parents=True, exist_ok=True)
+ uncompress(str(filepath), dest=str(directory_path))
+ path_on_disk = next(directory_path.iterdir())
+ else:
+ path_on_disk = filepath
+
+ hashes = nix_hashes(path_on_disk, hash_names).hexdigest()
+ return hashes

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 11:30 AM (12 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223973

Event Timeline