diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,2 @@
-swh.core[db,github] >= 2.8
+swh.core[db,github] >= 2.15
 swh.scheduler >= 0.8
diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py
--- a/swh/lister/__init__.py
+++ b/swh/lister/__init__.py
@@ -29,6 +29,28 @@
 
 SUPPORTED_LISTERS = list(LISTERS)
 
+TARBALL_EXTENSIONS = [
+    "crate",
+    "gem",
+    "jar",
+    "zip",
+    "tar",
+    "gz",
+    "tgz",
+    "tbz",
+    "bz2",
+    "bzip2",
+    "lzma",
+    "lz",
+    "txz",
+    "xz",
+    "z",
+    "Z",
+    "7z",
+    "zst",
+]
+"""Tarball recognition pattern"""
+
 
 def get_lister(lister_name, db_url=None, **conf):
     """Instantiate a lister given its name.
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -15,6 +15,8 @@
 
 import requests
 
+from swh.lister import TARBALL_EXTENSIONS
+
 logger = logging.getLogger(__name__)
 
 
@@ -186,21 +188,6 @@
     return False
 
 
-# to recognize existing naming pattern
-EXTENSIONS = [
-    "zip",
-    "tar",
-    "gz",
-    "tgz",
-    "bz2",
-    "bzip2",
-    "lzma",
-    "lz",
-    "xz",
-    "Z",
-    "7z",
-]
-
 VERSION_KEYWORDS = [
     "cygwin_me",
     "w32",
@@ -269,7 +256,7 @@
 (?P<extension>(?:\.(?:{extensions}))+)
 $
 """.format(
-    extensions="|".join(EXTENSIONS),
+    extensions="|".join(TARBALL_EXTENSIONS),
     vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS),
 )
 
diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    from .lister import NixGuixLister
+
+    return {
+        "lister": NixGuixLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/lister.py
@@ -0,0 +1,385 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""NixGuix lister definition.
+
+This lists artifacts out of manifest for Guix or Nixpkgs manifests.
+
+Artifacts can be of types:
+- upstream git repository (NixOS/nixpkgs, Guix)
+- VCS repositories (svn, git, hg, ...)
+- unique file
+- unique tarball
+
+"""
+
+import base64
+from dataclasses import dataclass
+from enum import Enum
+import logging
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+
+import requests
+
+from swh.core.github.utils import GitHubSession
+from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
+from swh.lister import TARBALL_EXTENSIONS
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+
+class ArtifactNatureUndetected(ValueError):
+    """Raised when a remote artifact's nature (tarball, file) cannot be detected."""
+
+    pass
+
+
+@dataclass
+class OriginUpstream:
+    """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
+
+    origin: str
+    """Canonical url of the repository"""
+    version: int
+    """Version of the repository (dismissed?)"""
+    revision: str
+    """Revision of the repository (dismissed?)"""
+
+
+@dataclass
+class Artifact:
+    """Metadata information on Remote Artifact with url (tarball or file)."""
+
+    origin: str
+    """Canonical url to retrieve the artifact from."""
+    visit_type: str
+    """Either 'directory' (tarball) or 'content' (file)"""
+    fallback_urls: List[str]
+    """List of urls to retrieve tarball artifact if canonical url no longer works."""
+    checksums: Dict[str, str]
+    """Integrity hash converted into a checksum dict."""
+
+
+@dataclass
+class VCS:
+    """Metadata information on VCS."""
+
+    origin: str
+    """Origin url of the vcs"""
+    ref: Optional[str]
+    """Reference either a svn commit id, a git commit, ..."""
+    type: str
+    """Type of (d)vcs, e.g. svn, git, hg, ..."""
+
+
+class ArtifactType(Enum):
+    """The possible artifact types listed out of the manifest."""
+
+    ARTIFACT = "artifact"
+    ORIGIN = "origin"
+    VCS = "vcs"
+
+
+PageResult = Tuple[ArtifactType, Union[Artifact, VCS, OriginUpstream]]
+
+
+VCS_SUPPORTED = ("git", "svn", "hg")
+
+# Rough approximation of what we can find of mimetypes for tarballs "out there"
+POSSIBLE_TARBALL_MIMETYPES = set(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
+
+
+def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
+    """Determine whether a list of files actually are tarballs or simple files.
+
+    The nature is detected out of the first url of the list, which is also the url
+    returned as origin (unless the `Location` header of the `HEAD` response is used).
+
+    When this cannot be answered simply out of the url, when request is provided, this
+    executes a HTTP `HEAD` query on the url to determine the information. If request is
+    not provided, this raises an ArtifactNatureUndetected exception.
+
+    Args:
+        urls: name of the remote files for which the extension needs to be checked.
+        request: object providing a `head(url)` method (e.g. the `requests` module or
+            a `requests.Session`), used as fallback when the url has no extension.
+
+    Raises:
+        ArtifactNatureUndetected when the artifact's nature cannot be detected out
+            of its url
+
+    Returns: A tuple (bool, url). The boolean represents whether the url is an archive
+        or not. The second parameter is the actual url once the head request is issued
+        as a fallback of not finding out whether the urls are tarballs or not.
+
+    """
+
+    def _is_tarball(url):
+        """Determine out of an extension whether url is a tarball.
+
+        Raises:
+            IndexError in case no extension is available
+
+        """
+        return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
+
+    if not urls:
+        raise ArtifactNatureUndetected("No url to determine artifact type from")
+
+    # Probe the canonical (first) url, the one returned as origin: picking a mirror
+    # at random made the detected nature and origin vary from one listing to another
+    url = urls[0]
+    try:
+        return _is_tarball(url), url
+    except IndexError:
+        if request is None:
+            raise ArtifactNatureUndetected(
+                f"Cannot determine artifact type from url {url}"
+            )
+        logger.warning(
+            "Cannot detect extension for '%s'. Fallback to http head query",
+            url,
+        )
+        response = request.head(url)
+
+        if not response.ok:  # any 4xx/5xx status, 404 included
+            raise ArtifactNatureUndetected(
+                f"Cannot determine artifact type from url {url}"
+            )
+        location = response.headers.get("Location")
+        if location:  # It's not always present
+            logger.debug("Location: %s", location)
+            try:
+                # FIXME: location is also returned as it's considered the true origin,
+                # true enough?
+                return _is_tarball(location), location
+            except IndexError:
+                logger.warning(
+                    "Still cannot detect extension through location '%s'...",
+                    location,
+                )
+
+        content_type = response.headers.get("Content-Type")
+        if content_type:
+            logger.debug("Content-Type: %s", content_type)
+            if content_type == "application/json":
+                return False, url
+            return content_type in POSSIBLE_TARBALL_MIMETYPES, url
+
+        raise ArtifactNatureUndetected(
+            f"Cannot determine artifact type from url {url}"
+        )
+
+
+VCS_KEYS_MAPPING = {
+    "git": {
+        "ref": "git_ref",
+        "url": "git_url",
+    },
+    "svn": {
+        "ref": "svn_revision",
+        "url": "svn_url",
+    },
+    "hg": {
+        "ref": "hg_changeset",
+        "url": "hg_url",
+    },
+}
+
+
+class NixGuixLister(StatelessLister[PageResult]):
+    """List Guix or Nix sources out of a public json manifest.
+
+    This lister can output:
+    - unique tarball (.tar.gz, .tbz, ...)
+    - vcs repositories (e.g. git, hg, svn)
+    - unique file (.lisp, .py, ...)
+
+    Note that no `last_update` is available in either manifest.
+
+    For `url` types artifacts, this tries to determine the artifact's nature, tarball or
+    file. It first tries to compute out of the "url" extension. In case of no extension,
+    it falls back to query (HEAD) the url, then checks the extension of the `Location`
+    response header if any, or else the `Content-Type` response header.
+
+    """
+
+    LISTER_NAME = "nixguix"
+
+    def __init__(
+        self,
+        scheduler,
+        url: str,
+        origin_upstream: str,
+        instance: Optional[str] = None,
+        credentials: Optional[CredentialsType] = None,
+        # canonicalize urls, can be turned off during docker runs
+        canonicalize: bool = True,
+        **kwargs: Any,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            url=url.rstrip("/"),
+            instance=instance,
+            credentials=credentials,
+        )
+        # either full fqdn NixOS/nixpkgs or guix repository urls
+        # maybe add an assert on those specific urls?
+        self.origin_upstream = origin_upstream
+
+        self.session = requests.Session()
+        # for testing purposes, we may want to skip this step (e.g. docker run and rate
+        # limit)
+        self.github_session = (
+            GitHubSession(
+                credentials=self.credentials,
+                user_agent=str(self.session.headers["User-Agent"]),
+            )
+            if canonicalize
+            else None
+        )
+
+    def get_pages(self) -> Iterator[PageResult]:
+        """Yield one page per "typed" origin referenced in manifest."""
+        # fetch and parse the manifest...
+        response = self.http_request(self.url)
+
+        # ... if any
+        raw_data = response.json()
+        version = raw_data["version"]
+        revision = raw_data["revision"]
+        yield ArtifactType.ORIGIN, OriginUpstream(
+            self.origin_upstream,
+            version,
+            revision,
+        )
+
+        # grep '"type"' guix-sources.json | sort | uniq
+        #       "type": false                    <<<<<<<<< noise
+        #       "type": "git",
+        #       "type": "hg",
+        #       "type": "no-origin",             <<<<<<<<< noise
+        #       "type": "svn",
+        #       "type": "url",
+
+        # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
+        #       "type": "url",
+
+        for artifact in raw_data["sources"]:
+            artifact_type = artifact["type"]
+            if artifact_type in VCS_SUPPORTED:
+                plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
+                plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
+                artifact_url = (
+                    self.github_session.get_canonical_url(plain_url)
+                    if self.github_session
+                    else plain_url
+                )
+                if not artifact_url:
+                    continue
+                yield ArtifactType.VCS, VCS(
+                    origin=artifact_url, type=artifact_type, ref=plain_ref
+                )
+            elif artifact_type == "url":
+                # It's either a tarball or a file
+                urls = artifact.get("urls")
+                if not urls:
+                    # Nothing to fetch
+                    logger.warning("Skipping url '%s': empty artifact", artifact)
+                    continue
+
+                assert urls is not None
+                # FIXME: T3294: Fix missing scheme in urls
+                origin, *fallback_urls = urls
+
+                integrity = artifact.get("integrity")
+                if integrity is None:
+                    logger.warning("Skipping url '%s': missing integrity field", origin)
+                    continue
+
+                try:
+                    is_tar, origin = is_tarball(urls, self.session)
+                except ArtifactNatureUndetected:
+                    logger.warning(
+                        "Skipping url '%s': undetected remote artifact type", origin
+                    )
+                    continue
+
+                # Determine the content checksum stored in the integrity field and
+                # convert into a dict of checksums. This only parses the
+                # `hash-expression` (<hash-algo>-<b64-value>) as defined in
+                # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
+                try:
+                    chksum_algo, chksum_b64 = integrity.split("-")
+                    checksums: Dict[str, str] = {
+                        chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
+                    }
+                except ValueError:  # no "-" separator or invalid base64 payload
+                    logger.warning(
+                        "Skipping url '%s': malformed integrity field '%s'",
+                        origin,
+                        integrity,
+                    )
+                    continue
+
+                logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
+                yield ArtifactType.ARTIFACT, Artifact(
+                    origin=origin,
+                    fallback_urls=fallback_urls,
+                    checksums=checksums,
+                    visit_type="directory" if is_tar else "content",
+                )
+            else:
+                logger.warning(
+                    "Skipping artifact '%s': unsupported type %s",
+                    artifact,
+                    artifact_type,
+                )
+
+    def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]:
+        """Given a vcs repository, yield a ListedOrigin."""
+        assert self.lister_obj.id is not None
+        # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=artifact.origin,
+            visit_type=artifact.type,
+        )
+
+    def origin_to_listed_origin(
+        self, origin_upstream: OriginUpstream
+    ) -> Iterator[ListedOrigin]:
+        """Given an upstream origin, yield a ListedOrigin."""
+        assert self.lister_obj.id is not None
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=origin_upstream.origin,
+            visit_type="git",  # both nixpkgs and guix are git origins so far
+        )
+
+    def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]:
+        """Given an artifact (tarball, file), yield one ListedOrigin."""
+        assert self.lister_obj.id is not None
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=artifact.origin,
+            visit_type=artifact.visit_type,
+            extra_loader_arguments={
+                "checksums": artifact.checksums,
+                "fallback_urls": artifact.fallback_urls,
+            },
+        )
+
+    def get_origins_from_page(
+        self, artifact_tuple: PageResult
+    ) -> Iterator[ListedOrigin]:
+        """Given an artifact tuple (type, artifact), yield a ListedOrigin."""
+        artifact_type, artifact = artifact_tuple
+        mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin")
+        yield from mapping_type_fn(artifact)
diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json
@@ -0,0 +1,19 @@
+{
+  "sources": [
+    {"type": "git", "git_url": "", "git_ref": ""},
+    {"type": false},
+    {"type": "no-origin"},
+    {"type": "url", "urls": []},
+    {
+      "type": "url",
+      "urls": ["https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped"],
+      "integrity": "sha256-HW6jxFlbljY8E5Q0l9s0r0Rg+0dKlcQ/REatNBuMl4U="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
+    }
+  ],
+  "version":"1",
+  "revision":"ab59155c5a38dda7efaceb47c7528578fcf0def4"
+}
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -0,0 +1,52 @@
+{
+  "sources": [
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://example.com/file.txt" ],
+      "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
+    },
+    {
+      "type": "url",
+      "urls": [
+        "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz"
+      ],
+      "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
+    },
+    {
+      "type": "url",
+      "urls": [
+        "http://downloads.sourceforge.net/project/nmon/lmon16n.c",
+        "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c",
+        "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c"
+      ],
+      "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+    },
+    {
+      "type": "git",
+      "git_url": "https://example.org/pali/0xffff",
+      "git_ref": "0.9"
+    },
+    {
+      "type": "hg",
+      "hg_url": "https://example.org/vityok/cl-string-match",
+      "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97"
+    },
+    {
+      "type": "svn",
+      "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
+      "svn_revision": 39057
+    }
+  ],
+  "version": "1",
+  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -0,0 +1,250 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+import pytest
+import requests
+
+from swh.lister import TARBALL_EXTENSIONS
+from swh.lister.nixguix.lister import (
+    POSSIBLE_TARBALL_MIMETYPES,
+    ArtifactNatureUndetected,
+    NixGuixLister,
+    is_tarball,
+)
+from swh.lister.pattern import ListerStats
+
+logger = logging.getLogger(__name__)
+
+
+def page_response(datadir, instance: str) -> List[Dict]:
+    """Return list of repositories (out of test dataset)"""
+    datapath = Path(datadir, f"{instance}-swh_sources.json")
+    return json.loads(datapath.read_text()) if datapath.exists() else []
+
+
+@pytest.mark.parametrize(
+    "urls",
+    [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
+    + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
+)
+def test_is_tarball_simple(urls):
+    """Simple check on tarball should discriminate between tarball and file"""
+    is_tar, origin = is_tarball(urls)
+    assert is_tar is True
+    assert origin == urls[0]
+
+
+@pytest.mark.parametrize(
+    "urls",
+    [
+        ["abc.lisp"],
+        ["one.abc", "two.bcd"],
+        ["abc.c", "other.c"],
+        ["one.scm?foo=bar", "two.scm?foo=bar"],
+        ["config.nix", "flakes.nix"],
+    ],
+)
+def test_is_tarball_simple_not_tarball(urls):
+    """Simple check on tarball should discriminate between tarball and file"""
+    is_tar, origin = is_tarball(urls)
+    assert is_tar is False
+    assert origin == urls[0]
+
+
+def test_is_tarball_without_url():
+    """Tarball detection out of an empty list of urls should fail."""
+    with pytest.raises(ArtifactNatureUndetected):
+        is_tarball([])
+
+
+def test_is_tarball_complex_with_no_result(requests_mock):
+    """Complex tarball detection without proper information should fail."""
+    # No extension, this won't detect immediately the nature of the url
+    url = "https://example.org/crates/package/download"
+    urls = [url]
+    with pytest.raises(ArtifactNatureUndetected):
+        is_tarball(urls)  # no request parameter, this cannot fallback, raises
+
+    with pytest.raises(ArtifactNatureUndetected):
+        requests_mock.head(
+            url,
+            status_code=404,  # not found so cannot detect anything
+        )
+        is_tarball(urls, requests)
+
+    with pytest.raises(ArtifactNatureUndetected):
+        requests_mock.head(
+            url, headers={}
+        )  # response ok without headers, cannot detect anything
+        is_tarball(urls, requests)
+
+    with pytest.raises(ArtifactNatureUndetected):
+        fallback_url = "https://example.org/mirror/crates/package/download"
+        requests_mock.head(
+            url, headers={"location": fallback_url}  # still no extension, cannot detect
+        )
+        is_tarball(urls, requests)
+
+
+@pytest.mark.parametrize(
+    "fallback_url, expected_result",
+    [
+        ("https://example.org/mirror/crates/package/download.tar.gz", True),
+        ("https://example.org/mirror/package/download.lisp", False),
+    ],
+)
+def test_is_tarball_complex_with_location_result(
+    requests_mock, fallback_url, expected_result
+):
+    """Complex tarball detection with information should detect artifact nature"""
+    # No extension, this won't detect immediately the nature of the url
+    url = "https://example.org/crates/package/download"
+    urls = [url]
+
+    # One scenario where the url renders a location with a proper extension
+    requests_mock.head(url, headers={"location": fallback_url})
+    is_tar, origin = is_tarball(urls, requests)
+    assert is_tar == expected_result
+    if is_tar:
+        assert origin == fallback_url
+
+
+@pytest.mark.parametrize(
+    "content_type, expected_result",
+    [("application/json", False), ("application/something", False)]
+    + [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES],
+)
+def test_is_tarball_complex_with_content_type_result(
+    requests_mock, content_type, expected_result
+):
+    """Complex tarball detection with information should detect artifact nature"""
+    # No extension, this won't detect immediately the nature of the url
+    url = "https://example.org/crates/package/download"
+    urls = [url]
+
+    # One scenario where the url renders a Content-Type header to decide upon
+    requests_mock.head(url, headers={"Content-Type": content_type})
+    is_tar, origin = is_tarball(urls, requests)
+    assert is_tar == expected_result
+    if is_tar:
+        assert origin == url
+
+
+def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
+    """NixGuixLister should list all origins per visit type"""
+    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+    origin_upstream = "https://github.com/NixOS/nixpkgs"
+    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+    response = page_response(datadir, "nixpkgs")
+    requests_mock.get(
+        url,
+        [{"json": response}],
+    )
+
+    expected_visit_types = defaultdict(int)
+    # origin upstream is added as origin
+    expected_nb_origins = 1
+    expected_visit_types["git"] += 1
+    for artifact in response["sources"]:
+        # Each artifact is considered an origin (even "url" artifacts with mirror urls)
+        expected_nb_origins += 1
+        artifact_type = artifact["type"]
+        if artifact_type in [
+            "git",
+            "svn",
+            "hg",
+        ]:
+            expected_visit_types[artifact_type] += 1
+        elif artifact_type == "url":
+            artifact_url = artifact["urls"][0]
+            if artifact_url.endswith(".c") or artifact_url.endswith(".txt"):
+                expected_visit_types["content"] += 1
+            else:
+                expected_visit_types["directory"] += 1
+
+    assert set(expected_visit_types.keys()) == {
+        "content",
+        "git",
+        "svn",
+        "hg",
+        "directory",
+    }
+
+    listed_result = lister.run()
+
+    # 1 page read is 1 origin
+    nb_pages = expected_nb_origins
+    assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins)
+
+    scheduler_origins = lister.scheduler.get_listed_origins(
+        lister.lister_obj.id
+    ).results
+    assert len(scheduler_origins) == expected_nb_origins
+
+    mapping_visit_types = defaultdict(int)
+
+    for listed_origin in scheduler_origins:
+        assert listed_origin.visit_type in expected_visit_types
+        # no last update is listed on those manifests
+        assert listed_origin.last_update is None
+
+        mapping_visit_types[listed_origin.visit_type] += 1
+
+    assert dict(mapping_visit_types) == expected_visit_types
+
+
+def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
+    """NixGuixLister should ignore unsupported or incomplete origins"""
+    url = "https://guix.gnu.org/sources.json"
+    origin_upstream = "https://git.savannah.gnu.org/git/guix.git"
+    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+    response = page_response(datadir, "guix")
+
+    requests_mock.get(
+        url,
+        [{"json": response}],
+    )
+    # Amongst artifacts, this url does not allow to determine its nature (tarball, file)
+    # It's ending up doing a http head query which ends up being 404, so it's skipped.
+    requests_mock.head(
+        "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
+        status_code=404,
+    )
+
+    listed_result = lister.run()
+    # only the origin upstream is listed, all other entries are unsupported or incomplete
+    assert listed_result == ListerStats(pages=1, origins=1)
+
+    scheduler_origins = lister.scheduler.get_listed_origins(
+        lister.lister_obj.id
+    ).results
+    assert len(scheduler_origins) == 1
+
+    assert scheduler_origins[0].visit_type == "git"
+
+
+def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock):
+    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+    origin_upstream = "https://github.com/NixOS/nixpkgs"
+    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+    requests_mock.get(
+        url,
+        status_code=404,
+    )
+
+    with pytest.raises(requests.HTTPError):  # listing cannot continue so stop
+        lister.run()
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(scheduler_origins) == 0