diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py --- a/swh/loader/package/debian/loader.py +++ b/swh/loader/package/debian/loader.py @@ -259,7 +259,8 @@ extrinsic_hashes = {'sha256': fileinfo['sha256']} logger.debug('extrinsic_hashes(%s): %s', filename, extrinsic_hashes) filepath, hashes = download(uri, dest=tmpdir, filename=filename, - hashes=extrinsic_hashes) + hashes=extrinsic_hashes, + bypass_content_length_header=True) all_hashes[filename] = hashes logger.debug('all_hashes: %s', all_hashes) diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -72,16 +72,35 @@ @pytest.mark.fs -def test_download_headers(tmp_path, requests_mock): +def test_download_ok_no_header(tmp_path, requests_mock): + """Download without issue should provide filename and hashes""" + filename = 'requests-0.0.1.tar.gz' + url = 'https://pypi.org/pypi/requests/%s' % filename + data = 'this is something' + requests_mock.get(url, text=data) # no header information + + actual_filepath, actual_hashes = download(url, dest=str(tmp_path)) + + actual_filename = os.path.basename(actual_filepath) + assert actual_filename == filename + assert actual_hashes['length'] == len(data) + assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2' # noqa + assert (actual_hashes['checksums']['sha256'] == + '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5') + + +@pytest.mark.fs +def test_download_bypass_headers(tmp_path, requests_mock): """Check that we send proper headers when downloading files""" filename = 'requests-0.0.1.tar.gz' url = 'https://pypi.org/pypi/requests/%s' % filename data = 'this is something' requests_mock.get(url, text=data, headers={ - 'content-length': str(len(data)) + 'content-length': 'something-wrong', # inject bad stuff not read }) - actual_filepath, actual_hashes = download(url, dest=str(tmp_path)) + actual_filepath, actual_hashes = download( + url, dest=str(tmp_path), bypass_content_length_header=True) assert len(requests_mock.request_history) == 1 req = requests_mock.request_history[0] diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -40,7 +40,8 @@ def download(url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, - auth: Optional[Tuple[str, str]] = None) -> Tuple[str, Dict]: + auth: Optional[Tuple[str, str]] = None, + bypass_content_length_header: bool = False) -> Tuple[str, Dict]: """Download a remote tarball from url, uncompresses and computes swh hashes on it. @@ -51,6 +52,8 @@ to download (those hashes are expected to be hex string) auth: Optional tuple of login/password (for http authentication service, e.g. deposit) + bypass_content_length_header: If true, bypass content length header to + compute it provided, some use case have erroneous data) Raises: ValueError in case of any error when fetching/computing (length, @@ -64,13 +67,16 @@ if auth is not None: params['auth'] = auth response = requests.get(url, **params, stream=True) - logger.debug('headers: %s', response.headers) + headers = response.headers + logger.debug('headers: %s', headers) if response.status_code != 200: raise ValueError("Fail to query '%s'. Reason: %s" % ( url, response.status_code)) - _length = response.headers.get('content-length') - # some server do not provide the content-length header... - length = int(_length) if _length is not None else len(response.content) + # some server do not send header or sometimes send incorrect ones + if bypass_content_length_header or not headers.get('content-length'): + length = len(response.content) + else: + length = int(headers['content-length']) filename = filename if filename else os.path.basename(url) logger.debug('filename: %s', filename) @@ -85,8 +91,8 @@ actual_length = os.path.getsize(filepath) if length != actual_length: - raise ValueError('Error when checking size: %s != %s' % ( - length, actual_length)) + raise ValueError( + f'Error when checking size: {length} != {actual_length}') # Also check the expected hashes if provided if hashes: