Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123669
D8763.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
D8763.diff
View Options
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -37,6 +37,21 @@
logger = logging.getLogger(__name__)
+# By default, ignore binary files and archives containing binaries
+DEFAULT_EXTENSIONS_TO_IGNORE = [
+ "AppImage",
+ "bin",
+ "exe",
+ "iso",
+ "linux64",
+ "msi",
+ "png",
+ "dic",
+ "deb",
+ "rpm",
+]
+
+
class ArtifactNatureUndetected(ValueError):
"""Raised when a remote artifact's nature (tarball, file) cannot be detected."""
@@ -55,11 +70,7 @@
class ArtifactWithoutExtension(ValueError):
- """Raised when an artifact nature cannot be determined by its name.
-
- This exception is solely for internal use of the :meth:`is_tarball` method.
-
- """
+ """Raised when an artifact nature cannot be determined by its name."""
pass
@@ -125,6 +136,22 @@
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
+def url_endswith(
+ urlparsed, extensions: List[str], raise_when_no_extension: bool = True
+) -> bool:
+ """Determine whether urlparsed ends with one of the extensions.
+
+ Raises:
+ ArtifactWithoutExtension in case no extension is available and raise_when_no_extension
+ is True (the default)
+
+ """
+ paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
+ if raise_when_no_extension and not any(path.suffix != "" for path in paths):
+ raise ArtifactWithoutExtension
+ return any(path.suffix.endswith(tuple(extensions)) for path in paths)
+
+
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
"""Determine whether a list of files actually are tarballs or simple files.
@@ -157,13 +184,7 @@
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
-
- paths = [
- Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
- ]
- if not any(path.suffix != "" for path in paths):
- raise ArtifactWithoutExtension
- return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths)
+ return url_endswith(urlparsed, TARBALL_EXTENSIONS)
index = random.randrange(len(urls))
url = urls[index]
@@ -247,6 +268,10 @@
it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
response header, and then checks the extension again.
+ Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
+ default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
+ This can be used to drop further binary files detected in the wild.
+
"""
LISTER_NAME = "nixguix"
@@ -260,6 +285,7 @@
credentials: Optional[CredentialsType] = None,
# canonicalize urls, can be turned off during docker runs
canonicalize: bool = True,
+ extensions_to_ignore: List[str] = [],
**kwargs: Any,
):
super().__init__(
@@ -271,6 +297,7 @@
# either full fqdn NixOS/nixpkgs or guix repository urls
# maybe add an assert on those specific urls?
self.origin_upstream = origin_upstream
+ self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
self.session = requests.Session()
# for testing purposes, we may want to skip this step (e.g. docker run and rate
@@ -435,13 +462,34 @@
# 'critical' information about how to recompute the hash (e.g. fs
# layout, executable bit, ...)
logger.warning(
- "Skipping artifact <%s>: 'file' artifact of type <%s> is "
+ "Skipping artifact <%s>: 'file' artifact of type <%s> is"
" missing information to properly check its integrity",
artifact,
artifact_type,
)
continue
+ # At this point plenty of heuristics happened and we should have found
+ # the right origin and its nature.
+
+ # Let's check and filter it out if it is to be ignored (if possible).
+ # Some origin urls may not have an extension at this point (e.g.
+ # http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
+ if url_endswith(
+ urlparse(origin),
+ self.extensions_to_ignore,
+ raise_when_no_extension=False,
+ ):
+ logger.warning(
+ "Skipping artifact <%s>: 'file' artifact of type <%s> is"
+ " ignored due to lister configuration. It should ignore"
+ " origins with extension [%s]",
+ origin,
+ artifact_type,
+ ",".join(self.extensions_to_ignore),
+ )
+ continue
+
logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
yield ArtifactType.ARTIFACT, Artifact(
origin=origin,
diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json
--- a/swh/lister/nixguix/tests/data/sources-failure.json
+++ b/swh/lister/nixguix/tests/data/sources-failure.json
@@ -57,6 +57,123 @@
"type": "url",
"urls": [ "https://code.9front.org/hg/plan9front" ],
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+ },
+ {
+ "outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe"
+ ],
+ "integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage"
+ ],
+ "integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "http://gorilla.dp100.com/downloads/gorilla1537_64.bin"
+ ],
+ "integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso"
+ ],
+ "integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64"
+ ],
+ "integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi"
+ ],
+ "integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png"
+ ],
+ "integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "http://xuxen.eus/static/hunspell/eu_ES.dic"
+ ],
+ "integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm"
+ ],
+ "integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb"
+ ],
+ "integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://elpa.gnu.org/packages/zones.foobar"
+ ],
+ "integrity": "sha256-YRZc7dI3DjUzoSIp4fIshUyhMXIQ/fPKaKnjeYVa4WI="
}
],
"version":"1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -8,6 +8,7 @@
import logging
from pathlib import Path
from typing import Dict, List
+from urllib.parse import urlparse
import pytest
import requests
@@ -15,11 +16,14 @@
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
+ DEFAULT_EXTENSIONS_TO_IGNORE,
POSSIBLE_TARBALL_MIMETYPES,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
+ ArtifactWithoutExtension,
NixGuixLister,
is_tarball,
+ url_endswith,
)
from swh.lister.pattern import ListerStats
@@ -43,6 +47,33 @@
return json.loads(datapath.read_text()) if datapath.exists else []
+@pytest.mark.parametrize(
+ "name,expected_result",
+ [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
+ + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
+ + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
+ + [("two?file=something.el", False), ("foo?two=two&three=three", False)],
+)
+def test_url_endswith(name, expected_result):
+ """It should detect whether the url or its query params end with extensions"""
+ urlparsed = urlparse(f"https://example.org/{name}")
+ assert (
+ url_endswith(
+ urlparsed,
+ TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
+ raise_when_no_extension=False,
+ )
+ is expected_result
+ )
+
+
+def test_url_endswith_raise():
+ """It should raise when the tested url has no extension"""
+ urlparsed = urlparse("https://example.org/foo?two=two&three=three")
+ with pytest.raises(ArtifactWithoutExtension):
+ url_endswith(urlparsed, ["unimportant"])
+
+
@pytest.mark.parametrize(
"tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
@@ -254,10 +285,15 @@
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
- """NixGuixLister should ignore unsupported or incomplete origins"""
+ """NixGuixLister should ignore unsupported, incomplete or to-be-ignored origins"""
url = SOURCES["nixpkgs"]["manifest"]
origin_upstream = SOURCES["nixpkgs"]["repo"]
- lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+ lister = NixGuixLister(
+ swh_scheduler,
+ url=url,
+ origin_upstream=origin_upstream,
+ extensions_to_ignore=["foobar"],
+ )
response = page_response(datadir, "failure")
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 19 2024, 5:30 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216431
Attached To
D8763: nixguix: Allow lister to ignore specific extensions
Event Timeline
Log In to Comment