diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -22,7 +22,7 @@ from pathlib import Path import random from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import urlparse +from urllib.parse import parse_qsl, urlparse import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError @@ -43,7 +43,7 @@ class ArtifactNatureMistyped(ValueError): - """Raised when a remote artifact's neither a tarball nor a file. + """Raised when a remote artifact is neither a tarball nor a file. Error of this type are' probably a misconfiguration in the manifest generation that badly typed a vcs repository. @@ -53,6 +53,16 @@ pass +class ArtifactWithoutExtension(ValueError): + """Raised when an artifact nature cannot be determined by its name. + + This exception is solely for internal use of the :meth:`is_tarball` method. + + """ + + pass + + class ChecksumsComputation(Enum): """The possible artifact types listed out of the manifest.""" @@ -140,20 +150,36 @@ """Determine out of an extension whether url is a tarball. Raises: - IndexError in case no extension is available + ArtifactWithoutExtension in case no extension is available """ urlparsed = urlparse(url) if urlparsed.scheme not in ("http", "https", "ftp"): raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") - return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS + + errors = [] + query_params = dict(parse_qsl(urlparsed.query)) + for path in [query_params.get(key) for key in ["f", "file", "url", "name"]] + [ + urlparsed.path + ]: + if not path: + continue + try: + file_ = Path(path).suffixes[-1] + break + except IndexError as e: + errors.append(ArtifactWithoutExtension(e)) + + if errors: + raise errors[-1] + return file_.lstrip(".") in TARBALL_EXTENSIONS index = random.randrange(len(urls)) url = urls[index] try: return _is_tarball(url), urls[0] - except IndexError: + except ArtifactWithoutExtension: if request is None: raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" @@ -181,7 +207,7 @@ # FIXME: location is also returned as it's considered the true origin, # true enough? return _is_tarball(location), location - except IndexError: + except ArtifactWithoutExtension: logger.warning( "Still cannot detect extension through location <%s>...", url, diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -38,13 +38,25 @@ + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], ) def test_is_tarball_simple(tarballs): - """Simple check on tarball should discriminate betwenn tarball and file""" + """Simple check on tarball should discriminate between tarball and file""" urls = [f"https://example.org/{tarball}" for tarball in tarballs] is_tar, origin = is_tarball(urls) assert is_tar is True assert origin == urls[0] +@pytest.mark.parametrize( + "query_param", + ["file", "f", "url", "name"], +) +def test_is_tarball_not_so_simple(query_param): + """More involved check on tarball should discriminate between tarball and file""" + url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz" + is_tar, origin = is_tarball([url]) + assert is_tar is True + assert origin == url + + @pytest.mark.parametrize( "files", [