diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -22,7 +22,7 @@ from pathlib import Path import random from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import urlparse +from urllib.parse import parse_qsl, urlparse import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError @@ -146,7 +146,23 @@ urlparsed = urlparse(url) if urlparsed.scheme not in ("http", "https", "ftp"): raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") - return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS + + errors = [] + query_params = dict(parse_qsl(urlparsed.query)) + for path in [query_params.get(key) for key in ["f", "file", "url"]] + [ + urlparsed.path + ]: + if not path: + continue + try: + file_ = Path(path).suffixes[-1] + break + except IndexError as e: + errors.append(e) + + if errors: + raise errors[-1] + return file_.lstrip(".") in TARBALL_EXTENSIONS index = random.randrange(len(urls)) url = urls[index] diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -38,7 +38,24 @@ + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], ) def test_is_tarball_simple(tarballs): - """Simple check on tarball should discriminate betwenn tarball and file""" + """Simple check on tarball should discriminate between tarball and file""" + urls = [f"https://example.org/{tarball}" for tarball in tarballs] + is_tar, origin = is_tarball(urls) + assert is_tar is True + assert origin == urls[0] + + +@pytest.mark.parametrize( + "tarballs", + [ + "download.php?file=one.tar.gz&foo=bar", + "count.php?f=one.gzip&foo=bar", + "artifact.php?url=one.zip&foo=bar", + "files?name=one.tbz&foo=bar", + ], +) +def test_is_tarball_not_so_simple(tarballs): + """More involved check on tarball should discriminate between tarball and file""" urls = [f"https://example.org/{tarball}" for tarball in tarballs] is_tar, origin = is_tarball(urls) assert is_tar is True