diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -22,6 +22,7 @@ import logging from pathlib import Path import random +import re from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import parse_qsl, urlparse @@ -136,20 +137,36 @@ POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys()) +PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+") + + def url_endswith( urlparsed, extensions: List[str], raise_when_no_extension: bool = True ) -> bool: - """Determine whether urlparsed ends with one of the extensions. + """Determine whether urlparsed ends with one of the extensions passed as parameter. + + This also account for the edge case of a filename with only a version as name (so no + extension in the end.) Raises: - ArtifactWithoutExtension in case no extension is available and raise_when_no_extension - is True (the default) + ArtifactWithoutExtension in case no extension is available and + raise_when_no_extension is True (the default) """ paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)] if raise_when_no_extension and not any(path.suffix != "" for path in paths): raise ArtifactWithoutExtension - return any(path.suffix.endswith(tuple(extensions)) for path in paths) + match = any(path.suffix.endswith(tuple(extensions)) for path in paths) + if match: + return match + # Some false negative can happen (e.g. https:///path/0.1.5)), so make sure + # to catch those + name = Path(urlparsed.path).name + if not PATTERN_VERSION.match(name): + return match + if raise_when_no_extension: + raise ArtifactWithoutExtension + return False def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json --- a/swh/lister/nixguix/tests/data/sources-success.json +++ b/swh/lister/nixguix/tests/data/sources-success.json @@ -265,6 +265,13 @@ "https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst" ], "integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18=" + }, + { + "type": "url", + "urls": [ + "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5" + ], + "integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM=" } ], "version": "1", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -52,7 +52,13 @@ [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS] + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS] + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE] - + [("two?file=something.el", False), ("foo?two=two&three=three", False)], + + [ + ("two?file=something.el", False), + ("foo?two=two&three=three", False), + ("v1.2.3", False), # with raise_when_no_extension is False + ("2048-game-20151026.1233", False), + ("v2048-game-20151026.1233", False), + ], ) def test_url_endswith(name, expected_result): """It should detect whether url or query params of the urls ends with extensions""" @@ -67,9 +73,12 @@ ) -def test_url_endswith_raise(): +@pytest.mark.parametrize( + "name", ["foo?two=two&three=three", "tar.gz/0.1.5", "tar.gz/v10.3.1"] +) +def test_url_endswith_raise(name): """It should raise when the tested url has no extension""" - urlparsed = urlparse("https://example.org/foo?two=two&three=three") + urlparsed = urlparse(f"https://example.org/{name}") with pytest.raises(ArtifactWithoutExtension): url_endswith(urlparsed, ["unimportant"]) @@ -225,6 +234,12 @@ "Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate" }, ) + requests_mock.head( + "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5", + headers={ + "Content-Type": "application/x-gzip", + }, + ) expected_visit_types = defaultdict(int) # origin upstream is added as origin @@ -248,7 +263,7 @@ expected_visit_types["content"] += 1 elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless expected_visit_types["svn"] += 1 - elif "crates.io" in url: + elif "crates.io" in url or "codeload.github.com" in url: expected_visit_types["directory"] += 1 else: # tarball artifacts expected_visit_types["directory"] += 1