Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066473
D8636.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D8636.id.diff
View Options
diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -750,9 +750,31 @@
)
try:
# FIXME: Ensure no "nar" computations is required for file
- assert self.checksums_computation == "standard"
with tempfile.TemporaryDirectory() as tmpdir:
- file_path, _ = download(url, dest=tmpdir, hashes=self.checksums)
+ file_path, _ = download(
+ url, dest=tmpdir, hashes=self.standard_hashes
+ )
+ if self.checksums_computation == "nar":
+ # hashes are not "standard", so an extra nar-specific check is needed
+ self.log.debug("Content to check nar hashes: %s", file_path)
+ actual_checksums = nix_hashes(
+ Path(file_path), self.checksums.keys()
+ ).hexdigest()
+
+ if actual_checksums != self.checksums:
+ errors.append(
+ ValueError(
+ f"Checksum mismatched on <{url}>: "
+ f"{actual_checksums} != {self.checksums}"
+ )
+ )
+ self.log.debug(
+ "Mismatched checksums <%s>: continue on next mirror "
+ "url if any",
+ url,
+ )
+ continue
+
with open(file_path, "rb") as file:
self.content = Content.from_data(file.read())
except ValueError as e:
diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py
--- a/swh/loader/core/tests/conftest.py
+++ b/swh/loader/core/tests/conftest.py
@@ -5,10 +5,11 @@
from os import path
import shutil
-from typing import Dict, List, Union
+from typing import Dict, List
import pytest
+from swh.loader.core.utils import compute_nar_hashes
from swh.model.hashutil import MultiHash
nix_store_missing = shutil.which("nix-store") is None
@@ -20,12 +21,17 @@
return path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz")
-def compute_hashes(
- filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
-) -> Dict[str, str]:
+@pytest.fixture
+def content_path(datadir):
+ """Return filepath fetched by ContentLoader test runs."""
+ return path.join(
+ datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
+ )
+
+
+def compute_hashes(filepath: str, hash_names: List[str] = ["sha256"]) -> Dict[str, str]:
"""Compute checksums dict out of a filepath"""
- checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos)
- return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest()
+ return MultiHash.from_path(filepath, hash_names=hash_names).hexdigest()
@pytest.fixture
@@ -38,8 +44,21 @@
@pytest.fixture
def tarball_with_nar_hashes(tarball_path):
- # FIXME: compute it instead of hard-coding it
- return (
- tarball_path,
- {"sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"},
+ nar_hashes = compute_nar_hashes(tarball_path, ["sha256"])
+ # Ensure it's the same hash as the initial one computed from the cli
+ assert (
+ nar_hashes["sha256"]
+ == "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"
+ )
+ return (tarball_path, nar_hashes)
+
+
+@pytest.fixture
+def content_with_nar_hashes(content_path):
+ nar_hashes = compute_nar_hashes(content_path, ["sha256"], is_tarball=False)
+ # Ensure it's the same hash as the initial one computed from the cli
+ assert (
+ nar_hashes["sha256"]
+ == "0b555a4d13e530460425d1dc20332294f151067fb64a7e49c7de501f05b0a41a"
)
+ return (content_path, nar_hashes)
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -4,9 +4,9 @@
# See top-level LICENSE file for more information
import datetime
+from functools import partial
import hashlib
import logging
-import os
import time
from unittest.mock import MagicMock, call
@@ -34,7 +34,7 @@
)
import swh.storage.exc
-from .conftest import compute_hashes, nix_store_missing
+from .conftest import compute_hashes, compute_nar_hashes, nix_store_missing
ORIGIN = Origin(url="some-url")
PARENT_ORIGIN = Origin(url="base-origin-url")
@@ -517,14 +517,6 @@
CONTENT_URL = f"{CONTENT_MIRROR}/project/asdf/archives/asdf-3.3.5.lisp"
-@pytest.fixture
-def content_path(datadir):
- """Return filepath fetched by ContentLoader test runs."""
- return os.path.join(
- datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
- )
-
-
def test_content_loader_missing_field(swh_storage):
"""It should raise if the ContentLoader is missing checksums field"""
origin = Origin(CONTENT_URL)
@@ -609,20 +601,34 @@
swh_storage,
dead_origin.url,
fallback_urls=[fallback_url_ok, fallback_url_ko],
- checksums=compute_hashes(content_path, checksum_algo),
+ checksums=compute_hashes(content_path, [checksum_algo]),
)
result = loader.load()
assert result == {"status": "eventful"}
-def test_content_loader_ok_simple(swh_storage, requests_mock_datadir, content_path):
+compute_content_nar_hashes = partial(compute_nar_hashes, is_tarball=False)
+
+
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
+def test_content_loader_ok_simple(
+ swh_storage, requests_mock_datadir, content_path, checksums_computation
+):
"""It should be an eventful visit on a new file, then uneventful"""
+ compute_hashes_fn = (
+ compute_content_nar_hashes if checksums_computation == "nar" else compute_hashes
+ )
+
origin = Origin(CONTENT_URL)
loader = ContentLoader(
swh_storage,
origin.url,
- checksums=compute_hashes(content_path, ["sha1", "sha256", "sha512"]),
+ checksums=compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"]),
+ checksums_computation=checksums_computation,
)
result = loader.load()
@@ -638,9 +644,18 @@
assert result2 == {"status": "uneventful"}
-def test_content_loader_hash_mismatch(swh_storage, requests_mock_datadir, content_path):
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
+def test_content_loader_hash_mismatch(
+ swh_storage, requests_mock_datadir, content_path, checksums_computation
+):
"""It should be an eventful visit on a new file, then uneventful"""
- checksums = compute_hashes(content_path, ["sha1", "sha256", "sha512"])
+ compute_hashes_fn = (
+ compute_content_nar_hashes if checksums_computation == "nar" else compute_hashes
+ )
+ checksums = compute_hashes_fn(content_path, ["sha1", "sha256", "sha512"])
erratic_checksums = {
algo: chksum.replace("a", "e") # alter checksums to fail integrity check
for algo, chksum in checksums.items()
@@ -650,6 +665,7 @@
swh_storage,
origin.url,
checksums=erratic_checksums,
+ checksums_computation=checksums_computation,
)
result = loader.load()
@@ -717,43 +733,18 @@
)
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
def test_directory_loader_hash_mismatch(
- caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes
+ caplog, swh_storage, requests_mock_datadir, tarball_path, checksums_computation
):
"""It should not ingest tarball with mismatched checksum"""
- tarball_path, checksums = tarball_with_std_hashes
-
- origin = Origin(DIRECTORY_URL)
- erratic_checksums = {
- algo: chksum.replace("a", "e") # alter checksums to fail integrity check
- for algo, chksum in checksums.items()
- }
-
- loader = DirectoryLoader(
- swh_storage,
- origin.url,
- checksums=erratic_checksums, # making the integrity check fail
- )
- result = loader.load()
-
- assert result == {"status": "failed"}
-
- _check_load_failure(
- caplog,
- loader,
- ValueError,
- "mismatched",
- status="failed",
- origin=origin,
+ compute_hashes_fn = (
+ compute_nar_hashes if checksums_computation == "nar" else compute_hashes
)
-
-
-@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
-def test_directory_loader_hash_mismatch_nar(
- caplog, swh_storage, requests_mock_datadir, tarball_with_nar_hashes
-):
- """It should not ingest tarball with mismatched checksum"""
- tarball_path, checksums = tarball_with_nar_hashes
+ checksums = compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"])
origin = Origin(DIRECTORY_URL)
erratic_checksums = {
@@ -765,7 +756,7 @@
swh_storage,
origin.url,
checksums=erratic_checksums, # making the integrity check fail
- checksums_computation="nar",
+ checksums_computation=checksums_computation,
)
result = loader.load()
@@ -803,44 +794,24 @@
assert result == {"status": "eventful"}
+@pytest.mark.skipif(
+ nix_store_missing, reason="requires nix-store binary from nix binaries"
+)
+@pytest.mark.parametrize("checksums_computation", ["standard", "nar"])
def test_directory_loader_ok_simple(
- swh_storage, requests_mock_datadir, tarball_with_std_hashes
+ swh_storage, requests_mock_datadir, tarball_path, checksums_computation
):
"""It should be an eventful visit on a new tarball, then uneventful"""
origin = Origin(DIRECTORY_URL)
- tarball_path, checksums = tarball_with_std_hashes
- loader = DirectoryLoader(
- swh_storage,
- origin.url,
- checksums=checksums,
+ compute_hashes_fn = (
+ compute_nar_hashes if checksums_computation == "nar" else compute_hashes
)
- result = loader.load()
-
- assert result == {"status": "eventful"}
-
- visit_status = assert_last_visit_matches(
- swh_storage, origin.url, status="full", type="directory"
- )
- assert visit_status.snapshot is not None
-
- result2 = loader.load()
-
- assert result2 == {"status": "uneventful"}
-
-
-@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
-def test_directory_loader_ok_with_nar(
- swh_storage, requests_mock_datadir, tarball_with_nar_hashes
-):
- """It should be an eventful visit on a tarball with nar hashes, then uneventful"""
- tarball_path, nar_checksums = tarball_with_nar_hashes
- origin = Origin(DIRECTORY_URL)
loader = DirectoryLoader(
swh_storage,
origin.url,
- checksums=nar_checksums,
- checksums_computation="nar",
+ checksums=compute_hashes_fn(tarball_path, ["sha1", "sha256", "sha512"]),
+ checksums_computation=checksums_computation,
)
result = loader.load()
diff --git a/swh/loader/core/tests/test_utils.py b/swh/loader/core/tests/test_utils.py
--- a/swh/loader/core/tests/test_utils.py
+++ b/swh/loader/core/tests/test_utils.py
@@ -19,6 +19,7 @@
CloneTimeout,
clean_dangling_folders,
clone_with_timeout,
+ compute_nar_hashes,
nix_hashes,
parse_visit_date,
)
@@ -216,3 +217,23 @@
actual_multihash = nix_hashes(directory, nar_checksums.keys())
assert actual_multihash.hexdigest() == nar_checksums
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_compute_nar_hashes_tarball(tarball_with_nar_hashes):
+ tarball_path, nar_checksums = tarball_with_nar_hashes
+
+ actual_checksums = compute_nar_hashes(tarball_path, nar_checksums.keys())
+
+ assert actual_checksums == nar_checksums
+
+
+@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)")
+def test_compute_nar_hashes_file(content_with_nar_hashes):
+ content_path, nar_checksums = content_with_nar_hashes
+
+ actual_checksums = compute_nar_hashes(
+ content_path, nar_checksums.keys(), is_tarball=False
+ )
+
+ assert actual_checksums == nar_checksums
diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py
--- a/swh/loader/core/utils.py
+++ b/swh/loader/core/utils.py
@@ -11,14 +11,16 @@
import shutil
import signal
from subprocess import PIPE, Popen
+import tempfile
import time
import traceback
-from typing import Callable, Iterable, Optional, Union
+from typing import Callable, Dict, Iterable, List, Optional, Union
from billiard import Process, Queue # type: ignore
from dateutil.parser import parse
import psutil
+from swh.core.tarball import uncompress
from swh.loader.exception import MissingOptionalDependency
from swh.model.hashutil import MultiHash
@@ -152,3 +154,35 @@
multi_hash.update(chunk)
return multi_hash
+
+
+def compute_nar_hashes(
+ filepath: Path,
+ hash_names: List[str] = ["sha256"],
+ is_tarball=True,
+) -> Dict[str, str]:
+ """Compute nar checksums dict out of a filepath (tarball or plain file).
+
+ If it's a tarball, it is first uncompressed into a temporary directory; the nix
+ hashes are computed on the first extracted entry, then the directory is removed.
+
+ Args:
+ filepath: Path to the tarball (if is_tarball is True) or to a plain file
+ hash_names: The list of checksums to compute
+ is_tarball: Whether filepath represents a tarball or not
+
+ Returns:
+ A dict mapping each hash name in hash_names to its hexadecimal checksum.
+
+ """
+ with tempfile.TemporaryDirectory() as tmpdir:
+ if is_tarball:
+ directory_path = Path(tmpdir)
+ directory_path.mkdir(parents=True, exist_ok=True)
+ uncompress(str(filepath), dest=str(directory_path))
+ path_on_disk = next(directory_path.iterdir())
+ else:
+ path_on_disk = filepath
+
+ hashes = nix_hashes(path_on_disk, hash_names).hexdigest()
+ return hashes
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 11:30 AM (12 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223973
Attached To
D8636: ContentLoader: Allow nar computation checks
Event Timeline
Log In to Comment