diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -154,6 +154,17 @@
     _check_download_ok(url, dest=str(tmp_path))
 
 
+def test_download_extracting_filename_from_url(tmp_path, requests_mock):
+    """Extracting filename from url must sanitize the filename first"""
+    url = "https://example.org/project/requests-0.0.1.tar.gz?a=b&c=d&foo=bar"
+
+    requests_mock.get(
+        url, status_code=200, text=_data, headers={"content-length": str(len(_data))}
+    )
+
+    _check_download_ok(url, dest=str(tmp_path))
+
+
 @pytest.mark.fs
 @pytest.mark.parametrize(
     "filename", [f'"{_filename}"', _filename, '"filename with spaces.tar.gz"']
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -10,7 +10,7 @@
 import os
 import re
 from typing import Callable, Dict, Optional, Tuple, TypeVar
-from urllib.parse import unquote
+from urllib.parse import unquote, urlsplit
 from urllib.request import urlopen
 
 import requests
@@ -118,7 +118,8 @@
             )
         response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE)
 
-    filename = filename if filename else os.path.basename(url)
+    filename = filename if filename else os.path.basename(urlsplit(url).path)
+    logger.debug("filename: %s", filename)
     filepath = os.path.join(dest, filename)
     logger.debug("filepath: %s", filepath)
 