diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -37,6 +37,21 @@
 logger = logging.getLogger(__name__)
 
 
+# By default, ignore binary files and archives containing binaries
+DEFAULT_EXTENSIONS_TO_IGNORE = [
+    "AppImage",
+    "bin",
+    "exe",
+    "iso",
+    "linux64",
+    "msi",
+    "png",
+    "dic",
+    "deb",
+    "rpm",
+]
+
+
 class ArtifactNatureUndetected(ValueError):
     """Raised when a remote artifact's nature (tarball, file) cannot be detected."""
 
@@ -55,11 +70,7 @@
 
 
 class ArtifactWithoutExtension(ValueError):
-    """Raised when an artifact nature cannot be determined by its name.
-
-    This exception is solely for internal use of the :meth:`is_tarball` method.
-
-    """
+    """Raised when an artifact nature cannot be determined by its name."""
 
     pass
 
@@ -125,6 +136,22 @@
 POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
 
 
+def url_endswith(
+    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
+) -> bool:
+    """Determine whether urlparsed ends with one of the extensions.
+
+    Raises:
+        ArtifactWithoutExtension in case no extension is available and
+        raise_when_no_extension is True (the default)
+
+    """
+    paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
+    if raise_when_no_extension and not any(path.suffix != "" for path in paths):
+        raise ArtifactWithoutExtension
+    return any(path.suffix.endswith(tuple(extensions)) for path in paths)
+
+
 def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
     """Determine whether a list of files actually are tarballs or simple files.
 
@@ -157,13 +184,7 @@
         urlparsed = urlparse(url)
         if urlparsed.scheme not in ("http", "https", "ftp"):
             raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
-
-        paths = [
-            Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
-        ]
-        if not any(path.suffix != "" for path in paths):
-            raise ArtifactWithoutExtension
-        return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths)
+        return url_endswith(urlparsed, TARBALL_EXTENSIONS)
 
     index = random.randrange(len(urls))
     url = urls[index]
@@ -247,6 +268,10 @@
     it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
     response header, and then checks the extension again.
 
+    Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
+    default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
+    This can be used to drop further binary files detected in the wild.
+
     """
 
     LISTER_NAME = "nixguix"
@@ -260,6 +285,7 @@
         credentials: Optional[CredentialsType] = None,
         # canonicalize urls, can be turned off during docker runs
         canonicalize: bool = True,
+        extensions_to_ignore: List[str] = [],
         **kwargs: Any,
     ):
         super().__init__(
@@ -271,6 +297,7 @@
         # either full fqdn NixOS/nixpkgs or guix repository urls
         # maybe add an assert on those specific urls?
         self.origin_upstream = origin_upstream
+        self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
 
         self.session = requests.Session()
         # for testing purposes, we may want to skip this step (e.g. docker run and rate
@@ -435,13 +462,34 @@
                 # 'critical' information about how to recompute the hash (e.g. fs
                 # layout, executable bit, ...)
                 logger.warning(
-                    "Skipping artifact <%s>: 'file' artifact of type <%s> is "
+                    "Skipping artifact <%s>: 'file' artifact of type <%s> is"
                     " missing information to properly check its integrity",
                     artifact,
                     artifact_type,
                 )
                 continue
 
+            # At this point, plenty of heuristics have been applied and we should
+            # have found the right origin and its nature.
+
+            # Let's check and filter it out if it is to be ignored (if possible).
+            # Some origin urls may not have an extension at this point (e.g.
+            # http://git.marmaro.de/?p=mmh;a=snp;h=;sf=tgz), let them through.
+            if url_endswith(
+                urlparse(origin),
+                self.extensions_to_ignore,
+                raise_when_no_extension=False,
+            ):
+                logger.warning(
+                    "Skipping artifact <%s>: 'file' artifact of type <%s> is"
+                    " ignored due to the lister configuration; it matches one of"
+                    " the ignored extensions [%s]",
+                    origin,
+                    artifact_type,
+                    ",".join(self.extensions_to_ignore),
+                )
+                continue
+
             logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
             yield ArtifactType.ARTIFACT, Artifact(
                 origin=origin,
diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json
--- a/swh/lister/nixguix/tests/data/sources-failure.json
+++ b/swh/lister/nixguix/tests/data/sources-failure.json
@@ -57,6 +57,123 @@
       "type": "url",
       "urls": [ "https://code.9front.org/hg/plan9front" ],
       "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+    },
+    {
+      "outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe"
+      ],
+      "integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage"
+      ],
+      "integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "http://gorilla.dp100.com/downloads/gorilla1537_64.bin"
+      ],
+      "integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso"
+      ],
+      "integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64"
+      ],
+      "integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi"
+      ],
+      "integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
"inferredFetcher": "unclassified" + }, + { + "outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png" + ], + "integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "http://xuxen.eus/static/hunspell/eu_ES.dic" + ], + "integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm" + ], + "integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb" + ], + "integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=", + "inferredFetcher": "unclassified" + }, + { + "type": "url", + "urls": [ + "https://elpa.gnu.org/packages/zones.foobar" + ], + "integrity": "sha256-YRZc7dI3DjUzoSIp4fIshUyhMXIQ/fPKaKnjeYVa4WI=" } ], "version":"1", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -8,6 +8,7 @@ import logging from pathlib import Path from typing import Dict, List +from urllib.parse import urlparse import pytest import requests @@ -15,11 +16,14 @@ from swh.lister import TARBALL_EXTENSIONS from swh.lister.nixguix.lister import ( + DEFAULT_EXTENSIONS_TO_IGNORE, POSSIBLE_TARBALL_MIMETYPES, ArtifactNatureMistyped, ArtifactNatureUndetected, + ArtifactWithoutExtension, NixGuixLister, is_tarball, + url_endswith, ) from swh.lister.pattern import ListerStats @@ -43,6 +47,33 @@ return json.loads(datapath.read_text()) if datapath.exists else [] +@pytest.mark.parametrize( + "name,expected_result", + [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS] + + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS] + + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE] + + [("two?file=something.el", False), ("foo?two=two&three=three", False)], +) +def test_url_endswith(name, expected_result): + """It should detect whether url or query params of the urls ends with extensions""" + urlparsed = urlparse(f"https://example.org/{name}") + assert ( + url_endswith( + urlparsed, + TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE, + raise_when_no_extension=False, + ) + is expected_result + ) + + +def test_url_endswith_raise(): + """It should raise when the tested url has no extension""" + urlparsed = urlparse("https://example.org/foo?two=two&three=three") + with pytest.raises(ArtifactWithoutExtension): + url_endswith(urlparsed, ["unimportant"]) + + @pytest.mark.parametrize( "tarballs", [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] @@ -254,10 
-    """NixGuixLister should ignore unsupported or incomplete origins"""
+    """NixGuixLister should ignore unsupported, incomplete, or ignored origins"""
     url = SOURCES["nixpkgs"]["manifest"]
     origin_upstream = SOURCES["nixpkgs"]["repo"]
 
-    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+    lister = NixGuixLister(
+        swh_scheduler,
+        url=url,
+        origin_upstream=origin_upstream,
+        extensions_to_ignore=["foobar"],
+    )
 
     response = page_response(datadir, "failure")
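
Usage sketch (illustrative, not part of the patch itself): assuming the patched module
is importable, this shows how the `url_endswith` helper and `DEFAULT_EXTENSIONS_TO_IGNORE`
from the diff above behave. The example.org URLs are made up; the marmaro.de URL is the
extension-less case quoted in the lister comment above.

    from urllib.parse import urlparse

    from swh.lister.nixguix.lister import DEFAULT_EXTENSIONS_TO_IGNORE, url_endswith

    # A binary artifact such as an '.exe' matches the default ignore list.
    assert url_endswith(
        urlparse("https://example.org/releases/ckan.exe"),
        DEFAULT_EXTENSIONS_TO_IGNORE,
    )

    # An extension carried in a query parameter is detected as well.
    assert url_endswith(
        urlparse("https://example.org/download?file=installer.AppImage"),
        DEFAULT_EXTENSIONS_TO_IGNORE,
    )

    # An extension-less origin is let through (and does not raise) when
    # raise_when_no_extension=False, which is how the lister calls it.
    assert not url_endswith(
        urlparse("http://git.marmaro.de/?p=mmh;a=snp;h=;sf=tgz"),
        DEFAULT_EXTENSIONS_TO_IGNORE,
        raise_when_no_extension=False,
    )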