diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 2.12
-swh.model >= 4.4.0
+swh.model >= 6.5.1
 swh.objstorage >= 0.2.2
 swh.scheduler >= 0.4.0
 swh.storage >= 0.29.0
diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -3,7 +3,6 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import base64
 import datetime
 import hashlib
 import logging
@@ -23,6 +22,7 @@
 from swh.loader.exception import NotFound
 from swh.loader.package.utils import download, get_url_body
 from swh.model import from_disk
+from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes
 from swh.model.model import (
     BaseContent,
     Content,
@@ -653,27 +653,29 @@
 class NodeLoader(BaseLoader):
-    """Common class for Content and Directory loaders.
+    """Common class for :class:`ContentLoader` and :class:`DirectoryLoader`.

-    The "integrity" field is a normalized information about the checksum used and the
-    corresponding base64 hash encoded value of the object retrieved (content or
-    directory).
+    The "checksums" field is a dictionary of hex-encoded hashes of the object
+    retrieved (content or directory).

-    The multiple "fallback" urls received are mirror urls so no need to keep those. We
-    only use them to fetch the actual object if the main origin is no longer available.
+    The multiple "fallback" urls received are mirror urls, only used to fetch the
+    object if the main origin is no longer available. They are not stored.
+
+    Ingestion is considered eventful on the first load. Subsequent loads of the
+    same object should end up as uneventful visits (matching snapshot).

     """

     def __init__(
-        self, *args, integrity: str, fallback_urls: List[str] = None, **kwargs
+        self,
+        *args,
+        checksums: Dict[str, str],
+        fallback_urls: Optional[List[str]] = None,
+        **kwargs,
     ):
         super().__init__(*args, **kwargs)
         self.snapshot: Optional[Snapshot] = None
-        # Determine the content checksum stored in the integrity field
-        # hash-<base64-encoded value>
-        # https://w3c.github.io/webappsec-subresource-integrity/#grammardef-hash-algo
-        self.checksum_algo, checksum_value_b64 = integrity.split("-")
-        self.expected_checksum: bytes = base64.decodebytes(checksum_value_b64.encode())
+        self.checksums = checksums
         fallback_urls_ = fallback_urls or []
         self.mirror_urls: List[str] = [self.origin.url, *fallback_urls_]
@@ -726,21 +728,43 @@
             )
             try:
                 data = get_url_body(url)
-                self.content = Content.from_data(data)
-
-                # Ensure content received matched the integrity field received
-                actual_checksum = self.content.get_hash(self.checksum_algo)
-                if actual_checksum == self.expected_checksum:
-                    # match, we have found our content to ingest, exit loop
-                    break
-                # otherwise continue
             except NotFound:
+                self.log.debug("Not found %s, continue on next mirror url if any", url)
                 continue

-        if not self.content:
-            raise NotFound(f"Unknown origin {self.origin.url}.")
+            checksum_algos = set(self.checksums)
+            content_d = MultiHash.from_data(
+                data, hash_names=DEFAULT_ALGORITHMS | checksum_algos
+            ).digest()
+
+            # We now need to determine whether we found a content matching the
+            # provided checksums. As an implementation detail, we must drop the
+            # checksum algorithms unsupported by the Content model (e.g. sha512)
+            found: bool = False
+            for checksum_algo in checksum_algos:
+                if checksum_algo not in DEFAULT_ALGORITHMS:
+                    # Drop the unsupported checksum algo from the Content model
+                    actual_checksum = content_d.pop(checksum_algo)
+                else:
+                    actual_checksum = content_d[checksum_algo]
+
+                # and check whether the fetched content matches the expected checksum
+                found = actual_checksum == hash_to_bytes(self.checksums[checksum_algo])
+                if not found:
+                    break  # mismatched checksum, try the next mirror url

-        return False  # no more data to fetch
+            if found:
+                # We have a match, we have our content to ingest
+                content_d["data"] = data
+                content_d["length"] = len(data)
+                self.content = Content.from_dict(content_d)
+                # we are done, no more data to fetch
+                return False
+
+        # If we reach this point, we did not find any proper content; consider
+        # the origin not found
+        raise NotFound(f"Unknown origin {self.origin.url}.")

     def process_data(self) -> bool:
         """Build the snapshot out of the Content retrieved."""

@@ -799,7 +821,6 @@
         Raises NotFound if no tarball is found

         """
-        expected_checksum_hashhex = self.expected_checksum.decode("utf-8")
         for url in self.mirror_urls:
             url_ = urlparse(url)
             self.log.debug(
@@ -814,8 +835,8 @@
                 tarball_path, extrinsic_metadata = download(
                     url,
                     tmpdir,
-                    # Ensure content received matched the integrity field received
-                    hashes={self.checksum_algo: expected_checksum_hashhex},
+                    # Ensure the content received matches the expected checksums
+                    hashes=self.checksums,
                     extra_request_headers={"Accept-Encoding": "identity"},
                 )
             except ValueError as e:
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -3,11 +3,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import base64
 import datetime
 import hashlib
 import logging
+import os
 import time
+from typing import Dict, List, Union
 from unittest.mock import MagicMock, call

 import pytest
@@ -23,7 +24,7 @@
 from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol
 from swh.loader.exception import NotFound
 from swh.loader.tests import assert_last_visit_matches
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import MultiHash, hash_to_bytes
 from swh.model.model import (
     MetadataAuthority,
     MetadataAuthorityType,
@@ -513,22 +514,38 @@
 CONTENT_MIRROR = "https://common-lisp.net"
 CONTENT_URL = f"{CONTENT_MIRROR}/project/asdf/archives/asdf-3.3.5.lisp"
-CONTENT_SHA256 = b"77bfa7d03eab048f68da87d630a6436640abfe7d5543202e24c553d5ff32e0a2"
-CONTENT_INTEGRITY = f"sha256-{base64.encodebytes(CONTENT_SHA256).decode()}"
+
+
+@pytest.fixture
+def content_path(datadir):
+    """Return the filepath fetched by ContentLoader test runs."""
+    return os.path.join(
+        datadir, "https_common-lisp.net", "project_asdf_archives_asdf-3.3.5.lisp"
+    )
+
+
+def compute_hashes(
+    filepath: str, cksum_algos: Union[str, List[str]] = "sha256"
+) -> Dict[str, str]:
+    """Compute a checksums dict for the file at filepath."""
+    checksum_algos = {cksum_algos} if isinstance(cksum_algos, str) else set(cksum_algos)
+    return MultiHash.from_path(filepath, hash_names=checksum_algos).hexdigest()


 def test_content_loader_missing_field(swh_storage):
+    """It should raise if the ContentLoader is missing the checksums field"""
     origin = Origin(CONTENT_URL)
     with pytest.raises(TypeError, match="missing"):
         ContentLoader(swh_storage, origin.url)


-def test_content_loader_404(caplog, swh_storage, requests_mock_datadir):
+def test_content_loader_404(caplog, swh_storage, requests_mock_datadir, content_path):
+    """It should not ingest the origin when no file can be found (no mirror url)"""
     unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp")
     loader = ContentLoader(
         swh_storage,
         unknown_origin.url,
-        integrity=CONTENT_INTEGRITY,
+        checksums=compute_hashes(content_path),
     )
     result = loader.load()
@@ -544,14 +561,17 @@
     )


-def test_content_loader_404_with_fallback(caplog, swh_storage, requests_mock_datadir):
+def test_content_loader_404_with_fallback(
+    caplog, swh_storage, requests_mock_datadir, content_path
+):
+    """It should not ingest the origin when no file can be found"""
     unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp")
     fallback_url_ko = f"{CONTENT_MIRROR}/project/asdf/archives/unknown2.lisp"
     loader = ContentLoader(
         swh_storage,
         unknown_origin.url,
         fallback_urls=[fallback_url_ko],
-        integrity=CONTENT_INTEGRITY,
+        checksums=compute_hashes(content_path),
     )
     result = loader.load()
@@ -567,7 +587,15 @@
     )


-def test_content_loader_ok_with_fallback(caplog, swh_storage, requests_mock_datadir):
+@pytest.mark.parametrize("checksum_algo", ["sha1", "sha256", "sha512"])
+def test_content_loader_ok_with_fallback(
+    checksum_algo,
+    caplog,
+    swh_storage,
+    requests_mock_datadir,
+    content_path,
+):
+    """It should be an eventful visit even when ingesting through a mirror url"""
     dead_origin = Origin(f"{CONTENT_MIRROR}/dead-origin-url")
     fallback_url_ok = CONTENT_URL
     fallback_url_ko = f"{CONTENT_MIRROR}/project/asdf/archives/unknown2.lisp"
@@ -576,19 +604,20 @@
         swh_storage,
         dead_origin.url,
         fallback_urls=[fallback_url_ok, fallback_url_ko],
-        integrity=CONTENT_INTEGRITY,
+        checksums=compute_hashes(content_path, checksum_algo),
     )
     result = loader.load()

     assert result == {"status": "eventful"}


-def test_content_loader_ok_simple(swh_storage, requests_mock_datadir):
+def test_content_loader_ok_simple(swh_storage, requests_mock_datadir, content_path):
+    """It should be an eventful visit on a new file, then uneventful"""
     origin = Origin(CONTENT_URL)
     loader = ContentLoader(
         swh_storage,
         origin.url,
-        integrity=CONTENT_INTEGRITY,
+        checksums=compute_hashes(content_path, ["sha1", "sha256", "sha512"]),
     )
     result = loader.load()
@@ -606,22 +635,28 @@

 DIRECTORY_MIRROR = "https://example.org"
 DIRECTORY_URL = f"{DIRECTORY_MIRROR}/archives/dummy-hello.tar.gz"
-DIRECTORY_SHA256 = b"7608099df00abcf23ba84f36ce63ad4c8802c3b7d254ddec657488eaf5ffb8d2"
-DIRECTORY_INTEGRITY = f"sha256-{base64.encodebytes(DIRECTORY_SHA256).decode()}"
+
+
+@pytest.fixture
+def tarball_path(datadir):
+    """Return the tarball filepath fetched by DirectoryLoader test runs."""
+    return os.path.join(datadir, "https_example.org", "archives_dummy-hello.tar.gz")


 def test_directory_loader_missing_field(swh_storage):
+    """It should raise if the DirectoryLoader is missing the checksums field"""
     origin = Origin(DIRECTORY_URL)
     with pytest.raises(TypeError, match="missing"):
         DirectoryLoader(swh_storage, origin.url)


-def test_directory_loader_404(caplog, swh_storage, requests_mock_datadir):
+def test_directory_loader_404(caplog, swh_storage, requests_mock_datadir, tarball_path):
+    """It should not ingest the origin when no tarball can be found (no mirrors)"""
     unknown_origin = Origin(f"{DIRECTORY_MIRROR}/archives/unknown.tar.gz")
     loader = DirectoryLoader(
         swh_storage,
         unknown_origin.url,
-        integrity=DIRECTORY_INTEGRITY,
+        checksums=compute_hashes(tarball_path),
     )
     result = loader.load()
@@ -637,14 +672,17 @@
     )

-def test_directory_loader_404_with_fallback(caplog, swh_storage, requests_mock_datadir):
+def test_directory_loader_404_with_fallback(
+    caplog, swh_storage, requests_mock_datadir, tarball_path
+):
+    """It should not ingest the origin when no tarball can be found"""
     unknown_origin = Origin(f"{DIRECTORY_MIRROR}/archives/unknown.tbz2")
     fallback_url_ko = f"{DIRECTORY_MIRROR}/archives/elsewhere-unknown2.tbz2"
     loader = DirectoryLoader(
         swh_storage,
         unknown_origin.url,
         fallback_urls=[fallback_url_ko],
-        integrity=DIRECTORY_INTEGRITY,
+        checksums=compute_hashes(tarball_path),
     )
     result = loader.load()
@@ -661,18 +699,19 @@

 def test_directory_loader_404_with_integrity_check_failure(
-    caplog, swh_storage, requests_mock_datadir
+    caplog, swh_storage, requests_mock_datadir, tarball_path
 ):
-    """Tarball with mismatched integrity is not ingested."""
+    """It should not ingest a tarball with mismatched checksums"""
     origin = Origin(DIRECTORY_URL)
-
-    directory_hash = DIRECTORY_SHA256.decode().replace("e", "a").encode()
-    directory_integrity = f"sha256-{base64.encodebytes(directory_hash).decode()}"
+    erratic_checksums = {
+        algo: chksum.replace("a", "e")  # alter checksums to fail the integrity check
+        for algo, chksum in compute_hashes(tarball_path).items()
+    }

     loader = DirectoryLoader(
         swh_storage,
         origin.url,
-        integrity=directory_integrity,  # making the integrity check fail
+        checksums=erratic_checksums,  # making the integrity check fail
     )
     result = loader.load()
@@ -688,7 +727,11 @@
     )


-def test_directory_loader_ok_with_fallback(caplog, swh_storage, requests_mock_datadir):
+@pytest.mark.parametrize("checksum_algo", ["sha1", "sha256", "sha512"])
+def test_directory_loader_ok_with_fallback(
+    caplog, swh_storage, requests_mock_datadir, tarball_path, checksum_algo
+):
+    """It should be an eventful visit even when ingesting through a mirror url"""
     dead_origin = Origin(f"{DIRECTORY_MIRROR}/dead-origin-url")
     fallback_url_ok = DIRECTORY_URL
     fallback_url_ko = f"{DIRECTORY_MIRROR}/archives/unknown2.tgz"
@@ -697,19 +740,20 @@
         swh_storage,
         dead_origin.url,
         fallback_urls=[fallback_url_ok, fallback_url_ko],
-        integrity=DIRECTORY_INTEGRITY,
+        checksums=compute_hashes(tarball_path, checksum_algo),
     )
     result = loader.load()

     assert result == {"status": "eventful"}


-def test_directory_loader_ok_simple(swh_storage, requests_mock_datadir):
+def test_directory_loader_ok_simple(swh_storage, requests_mock_datadir, tarball_path):
+    """It should be an eventful visit on a new tarball, then uneventful"""
     origin = Origin(DIRECTORY_URL)
     loader = DirectoryLoader(
         swh_storage,
         origin.url,
-        integrity=DIRECTORY_INTEGRITY,
+        checksums=compute_hashes(tarball_path, ["sha1", "sha256", "sha512"]),
     )
     result = loader.load()
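
Note for reviewers: a minimal usage sketch of the reworked "checksums" API, assuming the final state of this patch. The mirror url and the local file path are illustrative only; MultiHash, ContentLoader and get_storage are the entry points this diff already uses or touches.

    from swh.loader.core.loader import ContentLoader
    from swh.model.hashutil import MultiHash
    from swh.storage import get_storage

    # Hex-encoded checksums of the file we expect to fetch, computed out of band.
    # Any algorithm supported by MultiHash may be supplied (e.g. sha512), not only
    # the ones stored on the Content model.
    checksums = MultiHash.from_path(
        "asdf-3.3.5.lisp",  # illustrative local copy of the expected file
        hash_names={"sha256", "sha512"},
    ).hexdigest()

    loader = ContentLoader(
        get_storage(cls="memory"),
        "https://common-lisp.net/project/asdf/archives/asdf-3.3.5.lisp",
        fallback_urls=["https://mirror.example.org/asdf-3.3.5.lisp"],  # illustrative
        checksums=checksums,
    )
    result = loader.load()  # {"status": "eventful"} on first ingestion, then uneventful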