diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json index 61764ce..0e5ccb0 100644 --- a/swh/lister/nixguix/tests/data/sources-success.json +++ b/swh/lister/nixguix/tests/data/sources-success.json @@ -1,184 +1,272 @@ { "sources": [ { "type": "url", "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" }, { "type": "url", - "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ], + "urls": [ "https://github.com/owner-3/repository-1/revision-1.tar" ], "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" }, { "type": "url", "urls": [ "https://example.com/file.txt" ], "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" }, { "type": "url", "urls": [ "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz" ], "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0=" }, { "type": "url", "urls": [ "ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz" ], "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ=" }, { "type": "url", "urls": [ "www.roudoudou.com/export/cpc/rasm/rasm_v0117_src.zip" ], "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" }, { "type": "url", "outputHashMode": "flat", "urls": [ "http://downloads.sourceforge.net/project/nmon/lmon16n.c", "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c", "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c" ], "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" }, { "outputHash": "0s7p9swjqjsqddylmgid6cv263ggq7pmb734z4k84yfcrgb6kg4g", "outputHashAlgo": "sha256", "outputHashMode": "recursive", "type": "url", "urls": [ - "https://github.com/kandu/trie/archive/1.0.0.tar.gz" + "https://github.com/kandu/trie/archive/1.0.0.txz" ], "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=", "inferredFetcher": "fetchzip" }, { "type": "url", "urls": [ "https://github.com/trie/trie.git" ], "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=" }, { "type": "git", "git_url": "https://example.org/pali/0xffff", "git_ref": "0.9" }, { "type": "hg", "hg_url": "https://example.org/vityok/cl-string-match", "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97" }, { "type": "svn", "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2", "svn_revision": 39057 }, { "outputHash": "sha256-LxVcYj2WKHbhNu5x/DFkxQPOYrVkNvwiE/qcODq52Lc=", "outputHashAlgo": null, "outputHashMode": "recursive", "type": "url", "urls": [ - "https://github.com/julian-klode/triehash/archive/debian/0.3-3.tar.gz" + "https://github.com/julian-klode/triehash/archive/debian/0.3-3.tbz" ], "inferredFetcher": "fetchzip" }, { "type": "url", "urls": [ "http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz" ], "integrity": "sha256-G/7oY5qdCSJ59VlwHtIbvMdT6+mriXhMqQIHNx65J+E=" }, { "type": "url", "urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"], "integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk=" }, { "outputHash": "0w2qkrrkzfy4h4jld18apypmbi8a8r89y2l11axlv808i2rg68fk", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "https://github.com/josefnpat/vapor/releases/download/0.2.3/vapor_dbf509f.love" ], "integrity": "sha256-0yHzsogIoE27CoEKn1BGCsVVr78KhUYlgcS7P3OeWHA=", "inferredFetcher": "unclassified" }, { "outputHash": "0rf06axz1hxssg942w2g66avak30jy6rfdwxynhriqv3vrf17bja", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "http://mirrors.jenkins.io/war-stable/2.303.1/jenkins.war" ], "integrity": "sha256-Sq4TXN5j45ih9Z03l42XYEy1lTFPcEHS07rD8LsywGU=", "inferredFetcher": "unclassified" }, { "outputHash": "1filqm050ixy53kdv81bd4n80vjvfapnmzizy7jg8a6pilv17gfc", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "https://files.pythonhosted.org/packages/py2.py3/g/geojson/geojson-2.5.0-py2.py3-none-any.whl" ], "integrity": "sha256-zL0TNo3XKPTk8T/+aq9yW26ALGkroN3mKL5HUEDFNLo=", "inferredFetcher": "unclassified" }, { "outputHash": "sha256:0i1cw0nfg24b0sg2yc3q7315ng5vc5245nvh0l1cndkn2c9z4978", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "https://stavekontrolden.dk/dictionaries/da_DK/da_DK-2.5.189.oxt" ], "integrity": "sha256-6CTyExN2NssCBXDbQkRhuzxbwjh4MC+eBouI5yzgLEQ=", "inferredFetcher": "unclassified" }, { "outputHash": "0y2HN4WGYUUXBfqp8Xb4oaA0hbLZmE3kDUXMBAOjvPQ=", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "https://github.com/microsoft/vscode-python/releases/download/2021.5.829140558/ms-python-release.vsix" ], "integrity": "sha256-0y2HN4WGYUUXBfqp8Xb4oaA0hbLZmE3kDUXMBAOjvPQ=", "inferredFetcher": "unclassified" }, { "outputHash": "08dfl5h1k6s542qw5qx2czm1wb37ck9w2vpjz44kp2az352nmksb", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "https://zxh404.gallery.vsassets.io/_apis/public/gallery/publisher/zxh404/extension/vscode-proto3/0.5.4/assetbyname/Microsoft.VisualStudio.Services.VSIXPackage" ], "integrity": "sha256-S89qRRlfiTsJ+fJuwdNkZywe6mei48KxIEWbGWChriE=", "inferredFetcher": "unclassified" }, { "outputHash": "0kaz8j85wjjnf18z0lz69xr1z8makg30jn2dzdyicd1asrj0q1jm", "outputHashAlgo": "sha256", "outputHashMode": "flat", "type": "url", "urls": [ "https://github.com/yvt/openspades/releases/download/v0.1.1b/NotoFonts.pak" ], "integrity": "sha256-VQYMZNYqNBZ9+01YCcabqqIfck/mU/BRcFZKXpBEX00=", "inferredFetcher": "unclassified" + }, + { + "type": "url", + "urls": [ + "https://crates.io/api/v1/crates/syntect/4.6.0/download" + ], + "integrity": "sha256-iyCBW76A7gvgbmlXRQqEEYX89pD+AXjxTXegXOLKoDE=" + }, + { + "outputHash": "0x5l2pn4x92734k6i2wcjbn2klmwgkiqaajvxadh35k74dgnyh18", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://rubygems.org/gems/wdm-0.1.1.gem" + ], + "integrity": "sha256-KEBvXyNnlgGb6lsqheN8vNIp7JKMi2gmGUekTuwVtHQ=", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "2al10188nwrdmi9zk3bid4ijjfsa8ymh6m9hin5jsja7hx7anbvs3i2y7kall56h4qn7j1rj73f8499x3i2k6x53kszmksvd2a1pkd4", + "outputHashAlgo": "sha512", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://repo1.maven.org/maven2/org/codehaus/plexus/plexus-compiler-manager/2.4/plexus-compiler-manager-2.4.jar" + ], + "integrity": "sha512-pM0blGhbz/r1HKWbKeLoKRHkxpE5yGMxgaZQqubxIg69l1Wnw6OklsVGmKqB1SOlnZSRtLjG/CnWlrlFKIBAlQ==", + "inferredFetcher": "unclassified" + }, + { + "outputHash": "19mnq9a1yr16srqs8n6hddahr4f9d2gbpmld62pvlw1ps7nfrp9w", + "outputHashAlgo": "sha256", + "outputHashMode": "recursive", + "type": "url", + "urls": [ + "https://bitbucket.org/zandoye/charinfo_width/get/1.1.0.tar.bz2" + ], + "integrity": "sha256-PN3s7NE3cLqvMI3Wu55oyZEMVWvQWKRx1iZkH1TCtqY=", + "inferredFetcher": "fetchzip" + }, + { + "type": "url", + "urls": [ + "https://ftpmirror.gnu.org/gnu/texinfo/texinfo-4.13a.tar.lzma", + "ftp://ftp.cs.tu-berlin.de/pub/gnu/texinfo/texinfo-4.13a.tar.lzma" + ], + "integrity": "sha256-bSiwzq6GbjU2FC/FUuejvJ+EyDAxGcJXMbJHju9kyeU=" + }, + { + "type": "url", + "urls": [ + "https://download.savannah.gnu.org/releases/zutils/zutils-1.10.tar.lz", + "https://nongnu.freemirror.org/nongnu/zutils/zutils-1.10.tar.lz" + ], + "integrity": "sha256-DdRBOCktV1dkgDcZW2lFw99wsxYiG0KFUgrTjy6usZU=" + }, + { + "type": "url", + "urls": [ + "http://www.rle.mit.edu/cpg/codes/fasthenry-3.0-12Nov96.tar.z" + ], + "integrity": "sha256-8V9YKMP4A50xYvmFlzh5sbQv6L39hD+znfAD0rzvBqg=" + }, + { + "type": "url", + "urls": [ + "http://ftp.x.org/contrib/utilities/unclutter-8.tar.Z" + ], + "integrity": "sha256-uFWnjURlqy+GKH6srGOnPxUEsIUihAqjdxh3bn7JGSo=" + }, + { + "outputHash": "sha256-Y40oLjddunrd7ZF1JbCcgjSCn8jFTubq69jhAVxInXw=", + "outputHashAlgo": "sha256", + "outputHashMode": "flat", + "type": "url", + "urls": [ + "https://github.com/vk-cli/vk/releases/download/0.7.6/vk-0.7.6-64-bin.7z" + ], + "integrity": "sha256-Y40oLjddunrd7ZF1JbCcgjSCn8jFTubq69jhAVxInXw=", + "inferredFetcher": "unclassified" + }, + { + "type": "url", + "urls": [ + "https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst" + ], + "integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18=" } ], "version": "1", "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" } diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py index 7d162cb..d19b9c5 100644 --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -1,309 +1,317 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import json import logging from pathlib import Path from typing import Dict, List import pytest import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError from swh.lister import TARBALL_EXTENSIONS from swh.lister.nixguix.lister import ( POSSIBLE_TARBALL_MIMETYPES, ArtifactNatureMistyped, ArtifactNatureUndetected, NixGuixLister, is_tarball, ) from swh.lister.pattern import ListerStats logger = logging.getLogger(__name__) SOURCES = { "guix": { "repo": "https://git.savannah.gnu.org/cgit/guix.git/", "manifest": "https://guix.gnu.org/sources.json", }, "nixpkgs": { "repo": "https://github.com/NixOS/nixpkgs", "manifest": "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json", }, } def page_response(datadir, instance: str = "success") -> List[Dict]: """Return list of repositories (out of test dataset)""" datapath = Path(datadir, f"sources-{instance}.json") return json.loads(datapath.read_text()) if datapath.exists else [] @pytest.mark.parametrize( "tarballs", [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], ) def test_is_tarball_simple(tarballs): """Simple check on tarball should discriminate between tarball and file""" urls = [f"https://example.org/{tarball}" for tarball in tarballs] is_tar, origin = is_tarball(urls) assert is_tar is True assert origin == urls[0] @pytest.mark.parametrize( "query_param", ["file", "f", "url", "name", "anykeyreally"], ) def test_is_tarball_not_so_simple(query_param): """More involved check on tarball should discriminate between tarball and file""" url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz" is_tar, origin = is_tarball([url]) assert is_tar is True assert origin == url @pytest.mark.parametrize( "files", [ ["abc.lisp"], ["one.abc", "two.bcd"], ["abc.c", "other.c"], ["one.scm?foo=bar", "two.scm?foo=bar"], ["config.nix", "flakes.nix"], ], ) def test_is_tarball_simple_not_tarball(files): """Simple check on tarball should discriminate between tarball and file""" urls = [f"http://example.org/{file}" for file in files] is_tar, origin = is_tarball(urls) assert is_tar is False assert origin == urls[0] def test_is_tarball_complex_with_no_result(requests_mock): """Complex tarball detection without proper information should fail.""" # No extension, this won't detect immediately the nature of the url url = "https://example.org/crates/package/download" urls = [url] with pytest.raises(ArtifactNatureUndetected): is_tarball(urls) # no request parameter, this cannot fallback, raises with pytest.raises(ArtifactNatureUndetected): requests_mock.head( url, status_code=404, # not found so cannot detect anything ) is_tarball(urls, requests) with pytest.raises(ArtifactNatureUndetected): requests_mock.head( url, headers={} ) # response ok without headers, cannot detect anything is_tarball(urls, requests) with pytest.raises(ArtifactNatureUndetected): fallback_url = "https://example.org/mirror/crates/package/download" requests_mock.head( url, headers={"location": fallback_url} # still no extension, cannot detect ) is_tarball(urls, requests) with pytest.raises(ArtifactNatureMistyped): is_tarball(["foo://example.org/unsupported-scheme"]) with pytest.raises(ArtifactNatureMistyped): fallback_url = "foo://example.org/unsupported-scheme" requests_mock.head( url, headers={"location": fallback_url} # still no extension, cannot detect ) is_tarball(urls, requests) @pytest.mark.parametrize( "fallback_url, expected_result", [ ("https://example.org/mirror/crates/package/download.tar.gz", True), ("https://example.org/mirror/package/download.lisp", False), ], ) def test_is_tarball_complex_with_location_result( requests_mock, fallback_url, expected_result ): """Complex tarball detection with information should detect artifact nature""" # No extension, this won't detect immediately the nature of the url url = "https://example.org/crates/package/download" urls = [url] # One scenario where the url renders a location with a proper extension requests_mock.head(url, headers={"location": fallback_url}) is_tar, origin = is_tarball(urls, requests) assert is_tar == expected_result if is_tar: assert origin == fallback_url @pytest.mark.parametrize( "content_type, expected_result", [("application/json", False), ("application/something", False)] + [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES], ) def test_is_tarball_complex_with_content_type_result( requests_mock, content_type, expected_result ): """Complex tarball detection with information should detect artifact nature""" # No extension, this won't detect immediately the nature of the url url = "https://example.org/crates/package/download" urls = [url] # One scenario where the url renders a location with a proper extension requests_mock.head(url, headers={"Content-Type": content_type}) is_tar, origin = is_tarball(urls, requests) assert is_tar == expected_result if is_tar: assert origin == url def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): """NixGuixLister should list all origins per visit type""" url = SOURCES["guix"]["manifest"] origin_upstream = SOURCES["guix"]["repo"] lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) response = page_response(datadir, "success") requests_mock.get( url, [{"json": response}], ) requests_mock.get( "https://api.github.com/repos/trie/trie", [{"json": {"html_url": "https://github.com/trie/trie.git"}}], ) requests_mock.head( "http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz", headers={"Content-Type": "application/gzip; charset=ISO-8859-1"}, ) + requests_mock.head( + "https://crates.io/api/v1/crates/syntect/4.6.0/download", + headers={ + "Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate" + }, + ) expected_visit_types = defaultdict(int) # origin upstream is added as origin expected_nb_origins = 1 expected_visit_types["git"] += 1 for artifact in response["sources"]: # Each artifact is considered an origin (even "url" artifacts with mirror urls) expected_nb_origins += 1 artifact_type = artifact["type"] if artifact_type in [ "git", "svn", "hg", ]: expected_visit_types[artifact_type] += 1 elif artifact_type == "url": url = artifact["urls"][0] if url.endswith(".git"): expected_visit_types["git"] += 1 elif url.endswith(".c") or url.endswith(".txt"): expected_visit_types["content"] += 1 elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless expected_visit_types["svn"] += 1 + elif "crates.io" in url: + expected_visit_types["directory"] += 1 else: # tarball artifacts expected_visit_types["directory"] += 1 assert set(expected_visit_types.keys()) == { "content", "git", "svn", "hg", "directory", } listed_result = lister.run() # 1 page read is 1 origin nb_pages = expected_nb_origins assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins mapping_visit_types = defaultdict(int) for listed_origin in scheduler_origins: assert listed_origin.visit_type in expected_visit_types # no last update is listed on those manifests assert listed_origin.last_update is None mapping_visit_types[listed_origin.visit_type] += 1 assert dict(mapping_visit_types) == expected_visit_types def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): """NixGuixLister should ignore unsupported or incomplete origins""" url = SOURCES["nixpkgs"]["manifest"] origin_upstream = SOURCES["nixpkgs"]["repo"] lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) response = page_response(datadir, "failure") requests_mock.get( url, [{"json": response}], ) # Amongst artifacts, this url does not allow to determine its nature (tarball, file) # It's ending up doing a http head query which ends up being 404, so it's skipped. requests_mock.head( "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped", status_code=404, ) # Invalid schema for that origin (and no extension), so skip origin # from its name requests_mock.head( "ftp://ftp.ourproject.org/file-with-no-extension", exc=InvalidSchema, ) # Cannot communicate with an expired cert, so skip origin requests_mock.head( "https://code.9front.org/hg/plan9front", exc=SSLError, ) # Cannot connect to the site, so skip origin requests_mock.head( "https://git-tails.immerda.ch/onioncircuits", exc=ConnectionError, ) listed_result = lister.run() # only the origin upstream is listed, every other entries are unsupported or incomplete assert listed_result == ListerStats(pages=1, origins=1) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].visit_type == "git" def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock): url = SOURCES["nixpkgs"]["manifest"] origin_upstream = SOURCES["nixpkgs"]["repo"] lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) requests_mock.get( url, status_code=404, ) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0