Page MenuHomeSoftware Heritage

D8763.id31602.diff
No OneTemporary

D8763.id31602.diff

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -37,6 +37,21 @@
logger = logging.getLogger(__name__)
+# By default, ignore binary files and archives containing binaries
+DEFAULT_EXTENSIONS_TO_IGNORE = [
+ "AppImage",
+ "bin",
+ "exe",
+ "iso",
+ "linux64",
+ "msi",
+ "png",
+ "dic",
+ "deb",
+ "rpm",
+]
+
+
class ArtifactNatureUndetected(ValueError):
"""Raised when a remote artifact's nature (tarball, file) cannot be detected."""
@@ -55,11 +70,7 @@
class ArtifactWithoutExtension(ValueError):
- """Raised when an artifact nature cannot be determined by its name.
-
- This exception is solely for internal use of the :meth:`is_tarball` method.
-
- """
+ """Raised when an artifact nature cannot be determined by its name."""
pass
@@ -125,6 +136,22 @@
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
+def url_endswith(
+ urlparsed, extensions: List[str], raise_when_no_extension: bool = True
+) -> bool:
+ """Determine whether urlparsed ends with one of the extensions.
+
+ Raises:
+ ArtifactWithoutExtension in case no extension is available and raise_when_no_extension
+ is True (the default)
+
+ """
+ paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
+ if raise_when_no_extension and not any(path.suffix != "" for path in paths):
+ raise ArtifactWithoutExtension
+ return any(path.suffix.endswith(tuple(extensions)) for path in paths)
+
+
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
"""Determine whether a list of files actually are tarballs or simple files.
@@ -157,13 +184,7 @@
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
-
- paths = [
- Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
- ]
- if not any(path.suffix != "" for path in paths):
- raise ArtifactWithoutExtension
- return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths)
+ return url_endswith(urlparsed, TARBALL_EXTENSIONS)
index = random.randrange(len(urls))
url = urls[index]
@@ -247,6 +268,10 @@
it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
response header, and then checks the extension again.
+ Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
+ default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
+ This can be used to drop further binary files detected in the wild.
+
"""
LISTER_NAME = "nixguix"
@@ -260,6 +285,7 @@
credentials: Optional[CredentialsType] = None,
# canonicalize urls, can be turned off during docker runs
canonicalize: bool = True,
+ extensions_to_ignore: List[str] = [],
**kwargs: Any,
):
super().__init__(
@@ -271,6 +297,7 @@
# either full fqdn NixOS/nixpkgs or guix repository urls
# maybe add an assert on those specific urls?
self.origin_upstream = origin_upstream
+ self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
self.session = requests.Session()
# for testing purposes, we may want to skip this step (e.g. docker run and rate
@@ -435,13 +462,34 @@
# 'critical' information about how to recompute the hash (e.g. fs
# layout, executable bit, ...)
logger.warning(
- "Skipping artifact <%s>: 'file' artifact of type <%s> is "
+ "Skipping artifact <%s>: 'file' artifact of type <%s> is"
" missing information to properly check its integrity",
artifact,
artifact_type,
)
continue
+ # At this point plenty of heuristics happened and we should have found
+ # the right origin and its nature.
+
+ # Let's check and filter it out if it is to be ignored (if possible).
+ # Some origin urls may not have an extension at this point (e.g.
+ # http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
+ if url_endswith(
+ urlparse(origin),
+ self.extensions_to_ignore,
+ raise_when_no_extension=False,
+ ):
+ logger.warning(
+ "Skipping artifact <%s>: 'file' artifact of type <%s> is"
+ " ignored due to lister configuration. It should ignore"
+ " origins with extension [%s]",
+ origin,
+ artifact_type,
+ ",".join(self.extensions_to_ignore),
+ )
+ continue
+
logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
yield ArtifactType.ARTIFACT, Artifact(
origin=origin,
diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json
--- a/swh/lister/nixguix/tests/data/sources-failure.json
+++ b/swh/lister/nixguix/tests/data/sources-failure.json
@@ -57,6 +57,123 @@
"type": "url",
"urls": [ "https://code.9front.org/hg/plan9front" ],
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+ },
+ {
+ "outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe"
+ ],
+ "integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage"
+ ],
+ "integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "http://gorilla.dp100.com/downloads/gorilla1537_64.bin"
+ ],
+ "integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso"
+ ],
+ "integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64"
+ ],
+ "integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi"
+ ],
+ "integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png"
+ ],
+ "integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "http://xuxen.eus/static/hunspell/eu_ES.dic"
+ ],
+ "integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm"
+ ],
+ "integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9",
+ "outputHashAlgo": "sha256",
+ "outputHashMode": "flat",
+ "type": "url",
+ "urls": [
+ "https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb"
+ ],
+ "integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=",
+ "inferredFetcher": "unclassified"
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://elpa.gnu.org/packages/zones.foobar"
+ ],
+ "integrity": "sha256-YRZc7dI3DjUzoSIp4fIshUyhMXIQ/fPKaKnjeYVa4WI="
}
],
"version":"1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -8,6 +8,7 @@
import logging
from pathlib import Path
from typing import Dict, List
+from urllib.parse import urlparse
import pytest
import requests
@@ -15,11 +16,14 @@
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
+ DEFAULT_EXTENSIONS_TO_IGNORE,
POSSIBLE_TARBALL_MIMETYPES,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
+ ArtifactWithoutExtension,
NixGuixLister,
is_tarball,
+ url_endswith,
)
from swh.lister.pattern import ListerStats
@@ -43,6 +47,33 @@
return json.loads(datapath.read_text()) if datapath.exists else []
+@pytest.mark.parametrize(
+ "name,expected_result",
+ [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
+ + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
+ + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
+ + [("two?file=something.el", False), ("foo?two=two&three=three", False)],
+)
+def test_url_endswith(name, expected_result):
+ """It should detect whether the url path or query params end with extensions"""
+ urlparsed = urlparse(f"https://example.org/{name}")
+ assert (
+ url_endswith(
+ urlparsed,
+ TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
+ raise_when_no_extension=False,
+ )
+ is expected_result
+ )
+
+
+def test_url_endswith_raise():
+ """It should raise when the tested url has no extension"""
+ urlparsed = urlparse("https://example.org/foo?two=two&three=three")
+ with pytest.raises(ArtifactWithoutExtension):
+ url_endswith(urlparsed, ["unimportant"])
+
+
@pytest.mark.parametrize(
"tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
@@ -254,10 +285,15 @@
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
- """NixGuixLister should ignore unsupported or incomplete origins"""
+ """NixGuixLister should skip unsupported, incomplete or to-be-ignored origins"""
url = SOURCES["nixpkgs"]["manifest"]
origin_upstream = SOURCES["nixpkgs"]["repo"]
- lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+ lister = NixGuixLister(
+ swh_scheduler,
+ url=url,
+ origin_upstream=origin_upstream,
+ extensions_to_ignore=["foobar"],
+ )
response = page_response(datadir, "failure")

File Metadata

Mime Type
text/plain
Expires
Dec 20 2024, 12:31 AM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216431

Event Timeline