diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -738,6 +738,7 @@ def fetch_data(self) -> bool: """Retrieve the content file as a Content Object""" + errors = [] for url in self.mirror_urls: url_ = urlparse(url) self.log.debug( @@ -754,6 +755,13 @@ file_path, _ = download(url, dest=tmpdir, hashes=self.checksums) with open(file_path, "rb") as file: self.content = Content.from_data(file.read()) + except ValueError as e: + errors.append(e) + self.log.debug( + "Mismatched checksums <%s>: continue on next mirror url if any", + url, + ) + continue except HTTPError as http_error: if http_error.response.status_code == 404: self.log.debug( @@ -763,6 +771,9 @@ else: return False # no more data to fetch + if errors: + raise errors[0] + # If we reach this point, we did not find any proper content, consider the # origin not found raise NotFound(f"Unknown origin {self.origin.url}.") @@ -824,6 +835,7 @@ Raises NotFound if no tarball is found """ + errors = [] for url in self.mirror_urls: url_ = urlparse(url) self.log.debug( @@ -841,8 +853,8 @@ hashes=self.standard_hashes, extra_request_headers={"Accept-Encoding": "identity"}, ) - except ValueError: - # Checksum mismatch can happen, so we + except ValueError as e: + errors.append(e) self.log.debug( "Mismatched checksums <%s>: continue on next mirror url if any", url, @@ -869,7 +881,18 @@ dir_to_check, self.checksums.keys() ).hexdigest() - assert actual_checksums == self.checksums + if actual_checksums != self.checksums: + errors.append( + ValueError( + f"Checksum mismatched on <{url}>: " + f"{actual_checksums} != {self.checksums}" + ) + ) + self.log.debug( + "Mismatched checksums <%s>: continue on next mirror url if any", + url, + ) + continue self.directory = from_disk.Directory.from_disk( path=bytes(directory_path), @@ -883,6 +906,9 @@ if self.directory is not None: return False # no more data to fetch + if errors: + raise errors[0] + # if we reach here, we did not find any proper tarball, so consider the origin # not found raise NotFound(f"Unknown origin {self.origin.url}.") diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -638,6 +638,26 @@ assert result2 == {"status": "uneventful"} +def test_content_loader_hash_mismatch(swh_storage, requests_mock_datadir, content_path): + """It should be an eventful visit on a new file, then uneventful""" + checksums = compute_hashes(content_path, ["sha1", "sha256", "sha512"]) + erratic_checksums = { + algo: chksum.replace("a", "e") # alter checksums to fail integrity check + for algo, chksum in checksums.items() + } + origin = Origin(CONTENT_URL) + loader = ContentLoader( + swh_storage, + origin.url, + checksums=erratic_checksums, + ) + result = loader.load() + + assert result == {"status": "failed"} + + assert_last_visit_matches(swh_storage, origin.url, status="failed", type="content") + + DIRECTORY_MIRROR = "https://example.org" DIRECTORY_URL = f"{DIRECTORY_MIRROR}/archives/dummy-hello.tar.gz" @@ -697,7 +717,7 @@ ) -def test_directory_loader_404_with_integrity_check_failure( +def test_directory_loader_hash_mismatch( caplog, swh_storage, requests_mock_datadir, tarball_with_std_hashes ): """It should not ingest tarball with mismatched checksum""" @@ -716,14 +736,47 @@ ) result = loader.load() - assert result == {"status": "uneventful"} + assert result == {"status": "failed"} _check_load_failure( caplog, loader, - NotFound, - "Unknown origin", - status="not_found", + ValueError, + "mismatched", + status="failed", + origin=origin, + ) + + +@pytest.mark.skipif(nix_store_missing, reason="requires nix-bin installed (bullseye)") +def test_directory_loader_hash_mismatch_nar( + caplog, swh_storage, requests_mock_datadir, tarball_with_nar_hashes +): + """It should not ingest tarball with mismatched checksum""" + tarball_path, checksums = tarball_with_nar_hashes + + origin = Origin(DIRECTORY_URL) + erratic_checksums = { + algo: chksum.replace("a", "e") # alter checksums to fail integrity check + for algo, chksum in checksums.items() + } + + loader = DirectoryLoader( + swh_storage, + origin.url, + checksums=erratic_checksums, # making the integrity check fail + checksums_computation="nar", + ) + result = loader.load() + + assert result == {"status": "failed"} + + _check_load_failure( + caplog, + loader, + ValueError, + "mismatched", + status="failed", origin=origin, )