diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py index eaa5efd..28e81ef 100644 --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -1,84 +1,91 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import pkg_resources logger = logging.getLogger(__name__) try: __version__ = pkg_resources.get_distribution("swh.lister").version except pkg_resources.DistributionNotFound: __version__ = "devel" USER_AGENT_TEMPLATE = ( f"Software Heritage %s lister v{__version__}" " (+https://www.softwareheritage.org/contact)" ) LISTERS = { entry_point.name.split(".", 1)[1]: entry_point for entry_point in pkg_resources.iter_entry_points("swh.workers") if entry_point.name.split(".", 1)[0] == "lister" } SUPPORTED_LISTERS = list(LISTERS) TARBALL_EXTENSIONS = [ "crate", "gem", "jar", + "love", # zip "zip", "tar", "gz", "tgz", "tbz", "bz2", "bzip2", "lzma", "lz", "txz", "xz", "z", "Z", "7z", + "oxt", # zip + "pak", # zip + "war", # zip + "whl", # zip + "vsix", # zip + "VSIXPackage", # zip "zst", ] """Tarball recognition pattern""" def get_lister(lister_name, db_url=None, **conf): """Instantiate a lister given its name. Args: lister_name (str): Lister's name db_url (str): Optional database url; when set, conf["lister"] is replaced by a "local" lister configuration pointing at it conf (dict): Configuration dict (lister db cnx, policy, priority...) 
Returns: The instantiated lister: built with from_config(**conf) when the registered class is a pattern.Lister subclass, with override_config=conf otherwise (old-style lister) Raises: ValueError: when lister_name is not one of the registered listers """ if lister_name not in LISTERS: raise ValueError( "Invalid lister %s: only supported listers are %s" % (lister_name, SUPPORTED_LISTERS) ) if db_url: conf["lister"] = {"cls": "local", "args": {"db": db_url}} registry_entry = LISTERS[lister_name].load()() lister_cls = registry_entry["lister"] from swh.lister import pattern if issubclass(lister_cls, pattern.Lister): return lister_cls.from_config(**conf) else: # Old-style lister return lister_cls(override_config=conf) diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json index bb1943c..61764ce 100644 --- a/swh/lister/nixguix/tests/data/sources-success.json +++ b/swh/lister/nixguix/tests/data/sources-success.json @@ -1,107 +1,184 @@ { "sources": [ { "type": "url", "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" }, { "type": "url", "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ], "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" }, { "type": "url", "urls": [ "https://example.com/file.txt" ], "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" }, { "type": "url", "urls": [ "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz" ], "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0=" }, { "type": "url", "urls": [ "ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz" ], "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ=" }, { "type": "url", "urls": [ "www.roudoudou.com/export/cpc/rasm/rasm_v0117_src.zip" ], "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" }, { "type": "url", "outputHashMode": "flat", "urls": [ "http://downloads.sourceforge.net/project/nmon/lmon16n.c", "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c", 
"http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c" ], "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" }, { "outputHash": "0s7p9swjqjsqddylmgid6cv263ggq7pmb734z4k84yfcrgb6kg4g", "outputHashAlgo": "sha256", "outputHashMode": "recursive", "type": "url", "urls": [ "https://github.com/kandu/trie/archive/1.0.0.tar.gz" ], "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=", "inferredFetcher": "fetchzip" }, { "type": "url", "urls": [ "https://github.com/trie/trie.git" ], "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=" }, { "type": "git", "git_url": "https://example.org/pali/0xffff", "git_ref": "0.9" }, { "type": "hg", "hg_url": "https://example.org/vityok/cl-string-match", "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97" }, { "type": "svn", "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2", "svn_revision": 39057 }, { "outputHash": "sha256-LxVcYj2WKHbhNu5x/DFkxQPOYrVkNvwiE/qcODq52Lc=", "outputHashAlgo": null, "outputHashMode": "recursive", "type": "url", "urls": [ "https://github.com/julian-klode/triehash/archive/debian/0.3-3.tar.gz" ], "inferredFetcher": "fetchzip" }, { "type": "url", "urls": [ "http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz" ], "integrity": "sha256-G/7oY5qdCSJ59VlwHtIbvMdT6+mriXhMqQIHNx65J+E=" }, { "type": "url", "urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"], "integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk=" + }, + { + "outputHash": "0w2qkrrkzfy4h4jld18apypmbi8a8r89y2l11axlv808i2rg68fk", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://github.com/josefnpat/vapor/releases/download/0.2.3/vapor_dbf509f.love" + ], + "integrity": "sha256-0yHzsogIoE27CoEKn1BGCsVVr78KhUYlgcS7P3OeWHA=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "0rf06axz1hxssg942w2g66avak30jy6rfdwxynhriqv3vrf17bja", + "outputHashAlgo": "sha256", + 
"outputHashMode": "flat", + "type": "url", + "urls": [ + "http://mirrors.jenkins.io/war-stable/2.303.1/jenkins.war" + ], + "integrity": "sha256-Sq4TXN5j45ih9Z03l42XYEy1lTFPcEHS07rD8LsywGU=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "1filqm050ixy53kdv81bd4n80vjvfapnmzizy7jg8a6pilv17gfc", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://files.pythonhosted.org/packages/py2.py3/g/geojson/geojson-2.5.0-py2.py3-none-any.whl" + ], + "integrity": "sha256-zL0TNo3XKPTk8T/+aq9yW26ALGkroN3mKL5HUEDFNLo=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "sha256:0i1cw0nfg24b0sg2yc3q7315ng5vc5245nvh0l1cndkn2c9z4978", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://stavekontrolden.dk/dictionaries/da_DK/da_DK-2.5.189.oxt" + ], + "integrity": "sha256-6CTyExN2NssCBXDbQkRhuzxbwjh4MC+eBouI5yzgLEQ=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "0y2HN4WGYUUXBfqp8Xb4oaA0hbLZmE3kDUXMBAOjvPQ=", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://github.com/microsoft/vscode-python/releases/download/2021.5.829140558/ms-python-release.vsix" + ], + "integrity": "sha256-0y2HN4WGYUUXBfqp8Xb4oaA0hbLZmE3kDUXMBAOjvPQ=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "08dfl5h1k6s542qw5qx2czm1wb37ck9w2vpjz44kp2az352nmksb", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://zxh404.gallery.vsassets.io/_apis/public/gallery/publisher/zxh404/extension/vscode-proto3/0.5.4/assetbyname/Microsoft.VisualStudio.Services.VSIXPackage" + ], + "integrity": "sha256-S89qRRlfiTsJ+fJuwdNkZywe6mei48KxIEWbGWChriE=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "0kaz8j85wjjnf18z0lz69xr1z8makg30jn2dzdyicd1asrj0q1jm", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + 
"https://github.com/yvt/openspades/releases/download/v0.1.1b/NotoFonts.pak" + ], + "integrity": "sha256-VQYMZNYqNBZ9+01YCcabqqIfck/mU/BRcFZKXpBEX00=", + "inferredFetcher": "unclassified" } ], "version": "1", "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" } diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py index cadb65e..7d162cb 100644 --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -1,309 +1,309 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import json import logging from pathlib import Path from typing import Dict, List import pytest import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError from swh.lister import TARBALL_EXTENSIONS from swh.lister.nixguix.lister import ( POSSIBLE_TARBALL_MIMETYPES, ArtifactNatureMistyped, ArtifactNatureUndetected, NixGuixLister, is_tarball, ) from swh.lister.pattern import ListerStats logger = logging.getLogger(__name__) SOURCES = { "guix": { "repo": "https://git.savannah.gnu.org/cgit/guix.git/", "manifest": "https://guix.gnu.org/sources.json", }, "nixpkgs": { "repo": "https://github.com/NixOS/nixpkgs", "manifest": "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json", }, } def page_response(datadir, instance: str = "success") -> List[Dict]: """Return list of repositories (out of test dataset)""" datapath = Path(datadir, f"sources-{instance}.json") return json.loads(datapath.read_text()) if datapath.exists else [] @pytest.mark.parametrize( "tarballs", [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], ) def test_is_tarball_simple(tarballs): """Simple check on tarball should discriminate 
between tarball and file""" urls = [f"https://example.org/{tarball}" for tarball in tarballs] is_tar, origin = is_tarball(urls) assert is_tar is True assert origin == urls[0] @pytest.mark.parametrize( "query_param", ["file", "f", "url", "name", "anykeyreally"], ) def test_is_tarball_not_so_simple(query_param): """More involved check on tarball should discriminate between tarball and file""" url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz" is_tar, origin = is_tarball([url]) assert is_tar is True assert origin == url @pytest.mark.parametrize( "files", [ ["abc.lisp"], ["one.abc", "two.bcd"], ["abc.c", "other.c"], ["one.scm?foo=bar", "two.scm?foo=bar"], ["config.nix", "flakes.nix"], ], ) def test_is_tarball_simple_not_tarball(files): """Simple check on tarball should discriminate between tarball and file""" urls = [f"http://example.org/{file}" for file in files] is_tar, origin = is_tarball(urls) assert is_tar is False assert origin == urls[0] def test_is_tarball_complex_with_no_result(requests_mock): """Complex tarball detection without proper information should fail.""" # No extension, this won't detect immediately the nature of the url url = "https://example.org/crates/package/download" urls = [url] with pytest.raises(ArtifactNatureUndetected): is_tarball(urls) # no request parameter, this cannot fallback, raises with pytest.raises(ArtifactNatureUndetected): requests_mock.head( url, status_code=404, # not found so cannot detect anything ) is_tarball(urls, requests) with pytest.raises(ArtifactNatureUndetected): requests_mock.head( url, headers={} ) # response ok without headers, cannot detect anything is_tarball(urls, requests) with pytest.raises(ArtifactNatureUndetected): fallback_url = "https://example.org/mirror/crates/package/download" requests_mock.head( url, headers={"location": fallback_url} # still no extension, cannot detect ) is_tarball(urls, requests) with pytest.raises(ArtifactNatureMistyped): 
is_tarball(["foo://example.org/unsupported-scheme"]) with pytest.raises(ArtifactNatureMistyped): fallback_url = "foo://example.org/unsupported-scheme" requests_mock.head( url, headers={"location": fallback_url} # still no extension, cannot detect ) is_tarball(urls, requests) @pytest.mark.parametrize( "fallback_url, expected_result", [ ("https://example.org/mirror/crates/package/download.tar.gz", True), ("https://example.org/mirror/package/download.lisp", False), ], ) def test_is_tarball_complex_with_location_result( requests_mock, fallback_url, expected_result ): """Complex tarball detection with information should detect artifact nature""" # No extension, this won't detect immediately the nature of the url url = "https://example.org/crates/package/download" urls = [url] # One scenario where the url renders a location with a proper extension requests_mock.head(url, headers={"location": fallback_url}) is_tar, origin = is_tarball(urls, requests) assert is_tar == expected_result if is_tar: assert origin == fallback_url @pytest.mark.parametrize( "content_type, expected_result", [("application/json", False), ("application/something", False)] + [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES], ) def test_is_tarball_complex_with_content_type_result( requests_mock, content_type, expected_result ): """Complex tarball detection with information should detect artifact nature""" # No extension, this won't detect immediately the nature of the url url = "https://example.org/crates/package/download" urls = [url] # One scenario where the url renders a location with a proper extension requests_mock.head(url, headers={"Content-Type": content_type}) is_tar, origin = is_tarball(urls, requests) assert is_tar == expected_result if is_tar: assert origin == url def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): """NixGuixLister should list all origins per visit type""" url = SOURCES["guix"]["manifest"] origin_upstream = SOURCES["guix"]["repo"] lister = 
NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) response = page_response(datadir, "success") requests_mock.get( url, [{"json": response}], ) requests_mock.get( "https://api.github.com/repos/trie/trie", [{"json": {"html_url": "https://github.com/trie/trie.git"}}], ) requests_mock.head( "http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz", headers={"Content-Type": "application/gzip; charset=ISO-8859-1"}, ) expected_visit_types = defaultdict(int) # origin upstream is added as origin expected_nb_origins = 1 expected_visit_types["git"] += 1 for artifact in response["sources"]: # Each artifact is considered an origin (even "url" artifacts with mirror urls) expected_nb_origins += 1 artifact_type = artifact["type"] if artifact_type in [ "git", "svn", "hg", ]: expected_visit_types[artifact_type] += 1 elif artifact_type == "url": url = artifact["urls"][0] if url.endswith(".git"): expected_visit_types["git"] += 1 elif url.endswith(".c") or url.endswith(".txt"): expected_visit_types["content"] += 1 elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless expected_visit_types["svn"] += 1 - else: + else: # tarball artifacts expected_visit_types["directory"] += 1 assert set(expected_visit_types.keys()) == { "content", "git", "svn", "hg", "directory", } listed_result = lister.run() # 1 page read is 1 origin nb_pages = expected_nb_origins assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins mapping_visit_types = defaultdict(int) for listed_origin in scheduler_origins: assert listed_origin.visit_type in expected_visit_types # no last update is listed on those manifests assert listed_origin.last_update is None mapping_visit_types[listed_origin.visit_type] += 1 assert dict(mapping_visit_types) == expected_visit_types def 
test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): """NixGuixLister should ignore unsupported or incomplete origins""" url = SOURCES["nixpkgs"]["manifest"] origin_upstream = SOURCES["nixpkgs"]["repo"] lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) response = page_response(datadir, "failure") requests_mock.get( url, [{"json": response}], ) # Amongst artifacts, this url does not allow to determine its nature (tarball, file) # It's ending up doing a http head query which ends up being 404, so it's skipped. requests_mock.head( "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped", status_code=404, ) # Invalid schema for that origin (and no extension), so skip origin # from its name requests_mock.head( "ftp://ftp.ourproject.org/file-with-no-extension", exc=InvalidSchema, ) # Cannot communicate with an expired cert, so skip origin requests_mock.head( "https://code.9front.org/hg/plan9front", exc=SSLError, ) # Cannot connect to the site, so skip origin requests_mock.head( "https://git-tails.immerda.ch/onioncircuits", exc=ConnectionError, ) listed_result = lister.run() # only the origin upstream is listed, every other entries are unsupported or incomplete assert listed_result == ListerStats(pages=1, origins=1) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].visit_type == "git" def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock): url = SOURCES["nixpkgs"]["manifest"] origin_upstream = SOURCES["nixpkgs"]["repo"] lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) requests_mock.get( url, status_code=404, ) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0