diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 2.12 -swh.model >= 4.4.0 +swh.model >= 6.5.1 swh.objstorage >= 0.2.2 swh.scheduler >= 0.4.0 swh.storage >= 0.29.0 diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -23,6 +23,7 @@ from swh.loader.exception import NotFound from swh.loader.package.utils import download, get_url_body from swh.model import from_disk +from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash from swh.model.model import ( BaseContent, Content, @@ -726,21 +727,32 @@ ) try: data = get_url_body(url) - self.content = Content.from_data(data) - - # Ensure content received matched the integrity field received - actual_checksum = self.content.get_hash(self.checksum_algo) - if actual_checksum == self.expected_checksum: - # match, we have found our content to ingest, exit loop - break # otherwise continue except NotFound: + self.log.debug("Not found %s, continue on next mirror url if any", url) continue - if not self.content: - raise NotFound(f"Unknown origin {self.origin.url}.") - - return False # no more data to fetch + content_d = MultiHash.from_data( + data, hash_names=DEFAULT_ALGORITHMS | {self.checksum_algo} + ).digest() + if self.checksum_algo not in DEFAULT_ALGORITHMS: + # We must drop it as it's not supported from the model + actual_checksum = content_d.pop(self.checksum_algo) + else: + actual_checksum = content_d[self.checksum_algo] + + # Ensure content received matched the integrity field received + if actual_checksum == self.expected_checksum: + # We have a match, we have our content to ingest + content_d["data"] = data + content_d["length"] = len(data) + self.content = Content.from_dict(content_d) + # we are done, no more data to fetch + return False + + # If we reach this point, we did not find any proper content, consider the + # origin not found + raise 
"""Compute an integrity field: <checksum-algo>-<base64-encoded-checksum>."""
Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp") + content_integrity = compute_integrity("sha256", content_path) loader = ContentLoader( swh_storage, unknown_origin.url, - integrity=CONTENT_INTEGRITY, + integrity=content_integrity, ) result = loader.load() @@ -544,14 +563,17 @@ ) -def test_content_loader_404_with_fallback(caplog, swh_storage, requests_mock_datadir): +def test_content_loader_404_with_fallback( + caplog, swh_storage, requests_mock_datadir, content_path +): unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp") fallback_url_ko = f"{CONTENT_MIRROR}/project/asdf/archives/unknown2.lisp" + content_integrity = compute_integrity("sha256", content_path) loader = ContentLoader( swh_storage, unknown_origin.url, fallback_urls=[fallback_url_ko], - integrity=CONTENT_INTEGRITY, + integrity=content_integrity, ) result = loader.load() @@ -567,28 +589,40 @@ ) -def test_content_loader_ok_with_fallback(caplog, swh_storage, requests_mock_datadir): +@pytest.mark.parametrize("checksum_algo", ["sha256", "sha512"]) +def test_content_loader_ok_with_fallback( + checksum_algo, + caplog, + swh_storage, + requests_mock_datadir, + content_path, +): dead_origin = Origin(f"{CONTENT_MIRROR}/dead-origin-url") fallback_url_ok = CONTENT_URL fallback_url_ko = f"{CONTENT_MIRROR}/project/asdf/archives/unknown2.lisp" + content_integrity = compute_integrity(checksum_algo, content_path) loader = ContentLoader( swh_storage, dead_origin.url, fallback_urls=[fallback_url_ok, fallback_url_ko], - integrity=CONTENT_INTEGRITY, + integrity=content_integrity, ) result = loader.load() assert result == {"status": "eventful"} -def test_content_loader_ok_simple(swh_storage, requests_mock_datadir): +@pytest.mark.parametrize("checksum_algo", ["sha256", "sha512"]) +def test_content_loader_ok_simple( + swh_storage, requests_mock_datadir, content_path, checksum_algo +): origin = Origin(CONTENT_URL) + content_integrity = compute_integrity(checksum_algo, content_path) 
loader = ContentLoader( swh_storage, origin.url, - integrity=CONTENT_INTEGRITY, + integrity=content_integrity, ) result = loader.load()