diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -1,96 +1,148 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 import os
 import pytest
 
 from swh.loader.package.utils import download, api_info
 
 
 @pytest.mark.fs
 def test_download_fail_to_download(tmp_path, requests_mock):
     url = 'https://pypi.org/pypi/arrow/json'
     status_code = 404
     requests_mock.get(url, status_code=status_code)
 
     with pytest.raises(ValueError) as e:
         download(url, tmp_path)
 
     assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         url, status_code)
 
 
 @pytest.mark.fs
 def test_download_fail_length_mismatch(tmp_path, requests_mock):
     """Mismatch length after download should raise
 
     """
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     wrong_size = len(data) - 3
     requests_mock.get(url, text=data, headers={
         'content-length': str(wrong_size)  # wrong size!
     })
 
     with pytest.raises(ValueError) as e:
         download(url, dest=str(tmp_path))
 
     assert e.value.args[0] == "Error when checking size: %s != %s" % (
         wrong_size, len(data)
     )
 
 
 @pytest.mark.fs
 def test_download_ok(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })
 
     actual_filepath, actual_hashes = download(url, dest=str(tmp_path))
 
     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'
     assert (actual_hashes['sha256'] ==
             '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')
 
 
+@pytest.mark.fs
+def test_download_ok_with_hashes(tmp_path, requests_mock):
+    """Download with matching expected hashes should pass"""
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(len(data))
+    })
+
+    # good hashes for such file
+    good = {
+        'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
+        'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
+    }
+
+    actual_filepath, actual_hashes = download(url, dest=str(tmp_path),
+                                              hashes=good)
+
+    actual_filename = os.path.basename(actual_filepath)
+    assert actual_filename == filename
+    assert actual_hashes['length'] == len(data)
+    assert actual_hashes['sha1'] == good['sha1']
+    assert actual_hashes['sha256'] == good['sha256']
+
+
 @pytest.mark.fs
 def test_download_fail_hashes_mismatch(tmp_path, requests_mock):
     """Mismatch hash after download should raise
 
     """
-    pass
+    filename = 'requests-0.0.1.tar.gz'
+    url = 'https://pypi.org/pypi/requests/%s' % filename
+    data = 'this is something'
+    requests_mock.get(url, text=data, headers={
+        'content-length': str(len(data))
+    })
+
+    # good hashes for such file
+    good = {
+        'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
+        'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
+    }
+
+    for hash_algo, good_hash in good.items():
+        wrong_hash = good_hash.replace('1', '0')
+        expected_hashes = good.copy()
+        expected_hashes[hash_algo] = wrong_hash  # set the wrong hash
+
+        with pytest.raises(ValueError) as e:
+            download(url, dest=str(tmp_path), hashes=expected_hashes)
+
+        assert (
+            e.value.args[0] == "Failure when fetching %s. "
+            "Checksum mismatched: %s != %s" % (
+                url, wrong_hash, good_hash
+            )
+        )
 
 
 def test_api_info_failure(requests_mock):
     """Failure to fetch info/release information should raise"""
     url = 'https://pypi.org/pypi/requests/json'
     status_code = 400
     requests_mock.get(url, status_code=status_code)
 
     with pytest.raises(ValueError) as e0:
         api_info(url)
 
     assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         url, status_code
     )
 
 
 def test_api_info(requests_mock):
     """Fetching json info from pypi project should be ok"""
     url = 'https://pypi.org/pypi/requests/json'
     requests_mock.get(url, text='{"version": "0.0.1"}')
     actual_info = api_info(url)
     assert actual_info == {
         'version': '0.0.1',
     }
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -1,80 +1,92 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import requests
 
-from typing import Dict, Tuple
+from typing import Dict, Optional, Tuple
 
 from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.loader.package import DEFAULT_PARAMS
 
 
 def api_info(url: str) -> Dict:
     """Basic api client to retrieve information on project. This deals with
        fetching json metadata about pypi projects.
 
     Args:
         url (str): The api url (e.g PyPI, npm, etc...)
 
     Raises:
         ValueError in case of query failures (for some reasons: 404, ...)
 
     Returns:
         The associated response's information dict
 
     """
     response = requests.get(url, **DEFAULT_PARAMS)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))
     return response.json()
 
 
-def download(url: str, dest: str) -> Tuple[str, Dict]:
-    """Download a remote tarball from url, uncompresses and computes swh hashes
-       on it.
+def download(url: str, dest: str,
+             hashes: Optional[Dict[str, str]] = None) -> Tuple[str, Dict]:
+    """Download a remote artifact from url, write it under dest and compute
+       swh hashes on it.
 
     Args:
-        url: Artifact uri to fetch, uncompress and hash
+        url: Artifact uri to fetch and hash
         dest: Directory to write the archive to
 
+        hashes: Optional dict of expected hashes (key is the hash algo) for
+            the artifact to download (those hashes are expected to be hex
+            string). No checksum is checked when None or empty.
+
     Raises:
-        ValueError in case of any error when fetching/computing
+        ValueError in case of any error when fetching/computing (length,
+        checksums mismatched...)
+        KeyError if an algo of hashes is missing from the computed hashes
 
     Returns:
         Tuple of local (filepath, hashes of filepath)
 
     """
     response = requests.get(url, **DEFAULT_PARAMS, stream=True)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))
     length = int(response.headers['content-length'])
 
     filepath = os.path.join(dest, os.path.basename(url))
 
     h = MultiHash(length=length)
     with open(filepath, 'wb') as f:
         for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
             h.update(chunk)
             f.write(chunk)
 
     actual_length = os.path.getsize(filepath)
     if length != actual_length:
         raise ValueError('Error when checking size: %s != %s' % (
             length, actual_length))
 
-    # hashes = h.hexdigest()
-    # actual_digest = hashes['sha256']
-    # if actual_digest != artifact['sha256']:
-    #     raise ValueError(
-    #         '%s %s: Checksum mismatched: %s != %s' % (
-    #             project, version, artifact['sha256'], actual_digest))
+    # Compute the digests once: they serve both the check and the result
+    actual_hashes = h.hexdigest()
+
+    # Also check the expected hashes if provided
+    for algo_hash, expected_digest in (hashes or {}).items():
+        actual_digest = actual_hashes[algo_hash]
+        if actual_digest != expected_digest:
+            raise ValueError(
+                'Failure when fetching %s. '
+                'Checksum mismatched: %s != %s' % (
+                    url, expected_digest, actual_digest))
 
     return filepath, {
         'length': length,
-        **h.hexdigest()
+        **actual_hashes
     }