diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -247,6 +247,10 @@ it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location` response header, and then checks the extension again. + Optionally if the `extensions_to_ignore` parameter is provided, it filters out those + extensions from the listing. This can be used to drop binary files (e.g. bin, exe, + iso, msi, png, ...). This should be set in the lister configuration file. + """ LISTER_NAME = "nixguix" @@ -260,6 +264,7 @@ credentials: Optional[CredentialsType] = None, # canonicalize urls, can be turned off during docker runs canonicalize: bool = True, + extensions_to_ignore: List[str] = [], **kwargs: Any, ): super().__init__( @@ -271,6 +276,7 @@ # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? self.origin_upstream = origin_upstream + self.extensions_to_ignore = extensions_to_ignore self.session = requests.Session() # for testing purposes, we may want to skip this step (e.g. docker run and rate @@ -377,6 +383,8 @@ # We'll deal with outputHash as integrity field integrity = outputHash + # FIXME: filter out noisy extensions + try: is_tar, origin = is_tarball(urls, self.session) except ArtifactNatureMistyped: @@ -435,13 +443,39 @@ # 'critical' information about how to recompute the hash (e.g. fs # layout, executable bit, ...) logger.warning( - "Skipping artifact <%s>: 'file' artifact of type <%s> is " + "Skipping artifact <%s>: 'file' artifact of type <%s> is" " missing information to properly check its integrity", artifact, artifact_type, ) continue + # At this point plenty of heuristics happened and we should have found + # the right origin and its nature. 
+ + # Let's check and filter it out if it is to be ignored + if self.extensions_to_ignore: + urlparsed = urlparse(origin) + paths = [ + Path(p) + for (_, p) in [("_", urlparsed.path)] + + parse_qsl(urlparsed.query) + ] + to_ignore = any( + path.suffix.endswith(tuple(self.extensions_to_ignore)) + for path in paths + ) + if to_ignore: + logger.warning( + "Skipping artifact <%s>: 'file' artifact of type <%s> is" + " ignored due to lister configuration. It should ignore" + " origins with extension [%s]", + origin, + artifact_type, + ",".join(self.extensions_to_ignore), + ) + continue + logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) yield ArtifactType.ARTIFACT, Artifact( origin=origin, diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json --- a/swh/lister/nixguix/tests/data/sources-failure.json +++ b/swh/lister/nixguix/tests/data/sources-failure.json @@ -57,6 +57,116 @@ "type": "url", "urls": [ "https://code.9front.org/hg/plan9front" ], "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" + }, + { + "outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe" + ], + "integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage" + ], + "integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + 
"http://gorilla.dp100.com/downloads/gorilla1537_64.bin" + ], + "integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso" + ], + "integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64" + ], + "integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi" + ], + "integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png" + ], + "integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "http://xuxen.eus/static/hunspell/eu_ES.dic" + ], + "integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=", + "inferredFetcher": 
"unclassified" + }, + { + "outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm" + ], + "integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb" + ], + "integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=", + "inferredFetcher": "unclassified" } ], "version":"1", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -254,10 +254,28 @@ def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): - """NixGuixLister should ignore unsupported or incomplete origins""" + """NixGuixLister should ignore unsupported or incomplete or to ignore origins""" url = SOURCES["nixpkgs"]["manifest"] origin_upstream = SOURCES["nixpkgs"]["repo"] - lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) + extensions_to_ignore = [ + "AppImage", # specific archive format(.so, ...) + "bin", # specific archive format (dll, ...) + "exe", # specific archive format (dll, ...) + "iso", + "linux64", + "msi", # specific archive format (dll, ...) + "png", + "dic", + # Drop? + "deb", # specific archive format + "rpm", # specific archive format + ] + lister = NixGuixLister( + swh_scheduler, + url=url, + origin_upstream=origin_upstream, + extensions_to_ignore=extensions_to_ignore, + ) response = page_response(datadir, "failure")