Page MenuHomeSoftware Heritage

D8773.diff
No OneTemporary

D8773.diff

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -22,6 +22,7 @@
import logging
from pathlib import Path
import random
+import re
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import parse_qsl, urlparse
@@ -136,20 +137,36 @@
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
+PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
+
+
def url_endswith(
urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
- """Determine whether urlparsed ends with one of the extensions.
+ """Determine whether urlparsed ends with one of the extensions passed as parameter.
+
+ This also accounts for the edge case of a filename with only a version as name (so
+ no extension at the end).
Raises:
- ArtifactWithoutExtension in case no extension is available and raise_when_no_extension
- is True (the default)
+ ArtifactWithoutExtension in case no extension is available and
+ raise_when_no_extension is True (the default)
"""
paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
- return any(path.suffix.endswith(tuple(extensions)) for path in paths)
+ match = any(path.suffix.endswith(tuple(extensions)) for path in paths)
+ if match:
+ return match
+ # Some false negatives can happen (e.g. https://<netloc>/path/0.1.5), so make sure
+ # to catch those
+ name = Path(urlparsed.path).name
+ if not PATTERN_VERSION.match(name):
+ return match
+ if raise_when_no_extension:
+ raise ArtifactWithoutExtension
+ return False
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json
--- a/swh/lister/nixguix/tests/data/sources-success.json
+++ b/swh/lister/nixguix/tests/data/sources-success.json
@@ -265,6 +265,13 @@
"https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst"
],
"integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5"
+ ],
+ "integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM="
}
],
"version": "1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -52,7 +52,13 @@
[(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
+ [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
+ [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
- + [("two?file=something.el", False), ("foo?two=two&three=three", False)],
+ + [
+ ("two?file=something.el", False),
+ ("foo?two=two&three=three", False),
+ ("v1.2.3", False), # when raise_when_no_extension is False
+ ("2048-game-20151026.1233", False),
+ ("v2048-game-20151026.1233", False),
+ ],
)
def test_url_endswith(name, expected_result):
"""It should detect whether url or query params of the urls ends with extensions"""
@@ -67,9 +73,12 @@
)
-def test_url_endswith_raise():
+@pytest.mark.parametrize(
+ "name", ["foo?two=two&three=three", "tar.gz/0.1.5", "tar.gz/v10.3.1"]
+)
+def test_url_endswith_raise(name):
"""It should raise when the tested url has no extension"""
- urlparsed = urlparse("https://example.org/foo?two=two&three=three")
+ urlparsed = urlparse(f"https://example.org/{name}")
with pytest.raises(ArtifactWithoutExtension):
url_endswith(urlparsed, ["unimportant"])
@@ -225,6 +234,12 @@
"Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate"
},
)
+ requests_mock.head(
+ "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5",
+ headers={
+ "Content-Type": "application/x-gzip",
+ },
+ )
expected_visit_types = defaultdict(int)
# origin upstream is added as origin
@@ -248,7 +263,7 @@
expected_visit_types["content"] += 1
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1
- elif "crates.io" in url:
+ elif "crates.io" in url or "codeload.github.com" in url:
expected_visit_types["directory"] += 1
else: # tarball artifacts
expected_visit_types["directory"] += 1

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 9:46 AM (18 h, 32 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216411

Event Timeline