diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -72,6 +72,7 @@ lister.golang=swh.lister.golang:register lister.hackage=swh.lister.hackage:register lister.launchpad=swh.lister.launchpad:register + lister.nixguix=swh.lister.nixguix:register lister.npm=swh.lister.npm:register lister.nuget=swh.lister.nuget:register lister.opam=swh.lister.opam:register diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -29,6 +29,28 @@ SUPPORTED_LISTERS = list(LISTERS) +TARBALL_EXTENSIONS = [ + "crate", + "gem", + "jar", + "zip", + "tar", + "gz", + "tgz", + "tbz", + "bz2", + "bzip2", + "lzma", + "lz", + "txz", + "xz", + "z", + "Z", + "7z", + "zst", +] +"""Tarball recognition pattern""" + def get_lister(lister_name, db_url=None, **conf): """Instantiate a lister given its name. diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,6 +15,8 @@ import requests +from swh.lister import TARBALL_EXTENSIONS + logger = logging.getLogger(__name__) @@ -186,21 +188,6 @@ return False -# to recognize existing naming pattern -EXTENSIONS = [ - "zip", - "tar", - "gz", - "tgz", - "bz2", - "bzip2", - "lzma", - "lz", - "xz", - "Z", - "7z", -] - VERSION_KEYWORDS = [ "cygwin_me", "w32", @@ -269,7 +256,7 @@ (?P(?:\.(?:{extensions}))+) $ """.format( - extensions="|".join(EXTENSIONS), + extensions="|".join(TARBALL_EXTENSIONS), vkeywords="|".join("%s[-]?" 
def register():
    """Describe the NixGuix lister and its celery task module.

    This function is the target of the ``lister.nixguix`` entry declared in
    ``setup.py``.

    Returns:
        A dict with the lister class under "lister" and, under "task_modules", the
        list holding the dotted name of this package's ``tasks`` module.

    """
    # Imported here rather than at module level, as in the original code
    from .lister import NixGuixLister

    tasks_module = f"{__name__}.tasks"
    description = {
        "lister": NixGuixLister,
        "task_modules": [tasks_module],
    }
    return description
@dataclass
class Artifact:
    """Metadata information on Remote Artifact with url (tarball or file).

    Instances are built by the lister out of manifest entries of type "url" and are
    later converted into listed origins whose loader arguments carry ``checksums``
    and ``fallback_urls``.

    """

    # Url used as the listed origin url
    origin: str
    """Canonical url retrieve the tarball artifact."""
    # Used verbatim as the visit type of the listed origin
    visit_type: str
    """Either 'tar' or 'file' """
    # Remaining urls of the manifest entry, once the first one is taken as origin
    fallback_urls: List[str]
    """List of urls to retrieve tarball artifact if canonical url no longer works."""
    # Maps a hash algorithm name to the hex digest decoded from the integrity field
    checksums: Dict[str, str]
    """Integrity hash converted into a checksum dict."""
def is_tarball(urls: List[str], request) -> Tuple[bool, str]:
    """Determine whether the artifact behind ``urls`` is a tarball or a simple file.

    One url is picked at random among ``urls`` and the last extension of its path is
    looked up in ``TARBALL_EXTENSIONS``. When that url has no extension, a HEAD
    request is issued on it and the decision is taken, in that order, out of:

    - the extension of the ``Location`` response header, if any;
    - the ``Content-Type`` response header (parameters such as ``; charset=...``
      are ignored), looked up in ``POSSIBLE_TARBALL_MIMETYPES``.

    Args:
        urls: non-empty list of urls of the remote artifact to check.
        request: object exposing a ``head(url)`` method whose response has a
            ``headers`` mapping (the lister passes its ``requests.Session``).

    Returns:
        A tuple (bool, url). The boolean tells whether the artifact is a tarball.
        The url is the ``Location`` header value when the decision was taken out of
        it, the first url of ``urls`` otherwise.

    Raises:
        ValueError: when ``urls`` is empty (raised by ``random.randrange``).

    Example (``session`` being a ``requests.Session``; no request is issued for
    urls with an extension)::

        is_tarball(['abc.zip'], session)
        # -> (True, 'abc.zip')
        is_tarball(['one.tar.gz', 'two.tgz'], session)
        # -> (True, 'one.tar.gz')
        is_tarball(['abc.c', 'other.c'], session)
        # -> (False, 'abc.c')
        is_tarball(['one.tar.gz?foo=bar', 'two.tgz?foo=bar'], session)
        # -> (True, 'one.tar.gz?foo=bar')

    """

    def _extension(url: str) -> str:
        """Return the last extension of the url path without its leading dot, or
        the empty string when the path has no extension."""
        return Path(urlparse(url).path).suffix.lstrip(".")

    url = urls[random.randrange(len(urls))]

    extension = _extension(url)
    if extension:
        return extension in TARBALL_EXTENSIONS, urls[0]

    logger.warning(
        "Cannot detect extension for '%s'. Fallback to http head query",
        url,
    )
    response = request.head(url)

    location = response.headers.get("Location")
    if location:  # It's not always present
        logger.debug("Location: %s", location)
        extension = _extension(location)
        if extension:
            # FIXME: location is also returned as it's considered the true origin,
            # true enough?
            # The extension must be checked against the known tarball extensions:
            # returning the bare extension would flag any redirect target with an
            # extension as a tarball.
            return extension in TARBALL_EXTENSIONS, location
        logger.warning(
            "Still cannot detect extension through location '%s'...",
            location,
        )

    content_type = response.headers.get("Content-Type")
    if content_type:
        logger.debug("Content-Type: %s", content_type)
        # Only keep the media type, dropping optional parameters (e.g. charset)
        mimetype = content_type.split(";", 1)[0].strip().lower()
        return mimetype in POSSIBLE_TARBALL_MIMETYPES, urls[0]

    return False, urls[0]
def get_pages(self) -> Iterator[PageResult]:
    """Yield one page per "typed" origin referenced in manifest.

    The manifest at ``self.url`` is fetched and parsed as json, then:

    - the upstream repository ``self.origin_upstream`` is yielded first, as an
      ``ArtifactType.ORIGIN`` page carrying the manifest "version" and "revision";
    - each "sources" entry whose type is in ``VCS_SUPPORTED`` is yielded as an
      ``ArtifactType.VCS`` page;
    - each "sources" entry of type "url" is yielded as an ``ArtifactType.ARTIFACT``
      page, tarball or file as determined by ``is_tarball``.

    Entries that cannot be listed are skipped with a warning instead of aborting
    the whole listing: unsupported or missing type, "url" entry without urls, and
    "url" entry whose "integrity" field is missing or malformed.

    Raises:
        KeyError: when the manifest lacks the "version", "revision" or "sources"
            keys, or when a vcs entry lacks its url or ref key.

    """
    # fetch and parse the manifest...
    response = self.http_request(self.url, allow_redirects=True)

    # ... if any
    raw_data = response.json()
    version = raw_data["version"]
    revision = raw_data["revision"]
    yield ArtifactType.ORIGIN, OriginUpstream(
        self.origin_upstream,
        version,
        revision,
    )

    # grep '"type"' guix-sources.json | sort | uniq
    #      "type": false                    <<<<<<<<< noise
    #      "type": "git",
    #      "type": "hg",
    #      "type": "no-origin",             <<<<<<<<< noise
    #      "type": "svn",
    #      "type": "url",

    # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
    #      "type": "url",

    for artifact in raw_data["sources"]:
        # An entry without type ends up in the "unsupported" branch below
        artifact_type = artifact.get("type")
        if artifact_type in VCS_SUPPORTED:
            plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
            plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
            artifact_url = (
                self.github_session.get_canonical_url(plain_url)
                if self.github_session
                else plain_url
            )
            if not artifact_url:
                continue
            yield ArtifactType.VCS, VCS(
                origin=artifact_url, type=artifact_type, ref=plain_ref
            )
        elif artifact_type == "url":
            # It's either a tarball or a file
            urls = artifact.get("urls")
            if not urls:
                # Nothing to fetch
                logger.warning("Skipping empty artifact %s", artifact)
                continue

            # Determine the content checksum stored in the integrity field and
            # convert into a dict of checksums. This only parses the
            # `hash-expression` (hash-<b64-encoded-checksum>) as defined in
            # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
            integrity = artifact.get("integrity")
            if not integrity:
                logger.warning(
                    "Skipping artifact %s without integrity field", artifact
                )
                continue
            try:
                chksum_algo, chksum_b64 = integrity.split("-", 1)
                checksums: Dict[str, str] = {
                    chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
                }
            except ValueError:
                # Either no "-" separator or an invalid base64 payload
                # (binascii.Error is a ValueError)
                logger.warning(
                    "Skipping artifact %s with malformed integrity field",
                    artifact,
                )
                continue

            # Done after the integrity check so that no HEAD request is issued for
            # an artifact which is skipped anyway
            is_tar, true_tar_origin = is_tarball(urls, self.session)

            # FIXME: T3294: Fix missing scheme in urls
            origin, *fallback_urls = urls
            logger.info(
                "%s: %s -> %s",
                "tar " if is_tar else "file",
                origin,
                true_tar_origin if is_tar else None,
            )
            yield ArtifactType.ARTIFACT, Artifact(
                origin=true_tar_origin if is_tar else origin,
                fallback_urls=fallback_urls,
                checksums=checksums,
                visit_type="tar" if is_tar else "file",
            )
        else:
            logger.warning(
                "Skipping unsupported type %s for artifact %s",
                artifact_type,
                artifact,
            )
@shared_task(name=__name__ + ".NixGuixListerTask")
def list_nixguix(**lister_args):
    """Lister task for the NixGuix lister (Guix or Nixpkgs manifests).

    Instantiates ``NixGuixLister`` out of the configuration file and
    ``lister_args``, runs it, and returns the resulting stats as a dict.

    """
    # Imported within the task body rather than at module level
    from swh.lister.nixguix.lister import NixGuixLister

    return NixGuixLister.from_configfile(**lister_args).run().dict()
mode 100644 diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json @@ -0,0 +1,10 @@ +{ + "sources": [ + {"type": "git", "git_url": "", "git_ref": ""}, + {"type": false}, + {"type": "no-origin"}, + {"type": "url", "urls": []} + ], + "version":"1", + "revision":"ab59155c5a38dda7efaceb47c7528578fcf0def4" +} diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -0,0 +1,52 @@ +{ + "sources": [ + { + "type": "url", + "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" + }, + { + "type": "url", + "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" + }, + { + "type": "url", + "urls": [ "https://example.com/file.txt" ], + "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" + }, + { + "type": "url", + "urls": [ + "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz" + ], + "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0=" + }, + { + "type": "url", + "urls": [ + "http://downloads.sourceforge.net/project/nmon/lmon16n.c", + "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c", + "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c" + ], + "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" + }, + { + "type": "git", + "git_url": "https://example.org/pali/0xffff", + "git_ref": "0.9" + }, + { + "type": "hg", + "hg_url": "https://example.org/vityok/cl-string-match", + "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97" + }, + { + "type": "svn", + "svn_url": 
def page_response(datadir, instance: str) -> List[Dict]:
    """Return the parsed json manifest of the test dataset for ``instance``.

    Args:
        datadir: directory holding the test datasets.
        instance: dataset prefix; the file read is ``<instance>-swh_sources.json``.

    Returns:
        The decoded json content of the dataset file (the datasets shipped with
        these tests are json objects with a "sources" key), or an empty list when
        the file does not exist.

    """
    datapath = Path(datadir, f"{instance}-swh_sources.json")
    # `exists` must be called: the bare bound method is always truthy, which made
    # the empty-list fallback unreachable
    return json.loads(datapath.read_text()) if datapath.exists() else []
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
    """NixGuixLister should ignore unsupported or incomplete origins"""
    instance = "guix"
    url = "https://guix.gnu.org/sources.json"
    origin_upstream = "https://git.savannah.gnu.org/git/guix.git"
    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)

    # The guix dataset only holds entries with an empty url, an unsupported type or
    # no urls at all
    response = page_response(datadir, instance)

    # Serve the dataset as the manifest
    requests_mock.get(
        url,
        [{"json": response}],
    )

    listed_result = lister.run()
    # only the upstream origin is listed, every other entry is unsupported or
    # incomplete
    assert listed_result == ListerStats(pages=1, origins=1)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == 1

    # The upstream origin is listed with the "git" visit type
    assert scheduler_origins[0].visit_type == "git"
def test_nixguix_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The NixGuixListerTask celery task should run the lister and return its stats"""
    # setup the mocked NixGuixLister
    lister = mocker.patch(f"{NAMESPACE}.lister.NixGuixLister")
    lister.from_configfile.return_value = lister
    stats = ListerStats(pages=1, origins=42)
    lister.run.return_value = stats

    # Trigger the task by name, without any lister argument
    res = swh_scheduler_celery_app.send_task(
        f"{NAMESPACE}.tasks.NixGuixListerTask",
    )
    assert res
    res.wait()
    assert res.successful()
    # The task result is the dict form of the stats returned by the lister run
    assert res.result == stats.dict()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()