diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -30,6 +30,26 @@ SUPPORTED_LISTERS = list(LISTERS) +# to recognize existing naming pattern +TARBALL_EXTENSIONS = [ + "jar", + "zip", + "tar", + "gz", + "tgz", + "tbz", + "bz2", + "bzip2", + "lzma", + "lz", + "txz", + "xz", + "z", + "Z", + "7z", + "zst", +] + def get_lister(lister_name, db_url=None, **conf): """Instantiate a lister given its name. diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,6 +15,8 @@ import requests +from swh.lister import TARBALL_EXTENSIONS + logger = logging.getLogger(__name__) @@ -186,21 +188,6 @@ return False -# to recognize existing naming pattern -EXTENSIONS = [ - "zip", - "tar", - "gz", - "tgz", - "bz2", - "bzip2", - "lzma", - "lz", - "xz", - "Z", - "7z", -] - VERSION_KEYWORDS = [ "cygwin_me", "w32", @@ -269,7 +256,7 @@ (?P(?:\.(?:{extensions}))+) $ """.format( - extensions="|".join(EXTENSIONS), + extensions="|".join(TARBALL_EXTENSIONS), vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS), ) diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import NixGuixLister + + return { + "lister": NixGuixLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/lister.py @@ -0,0 +1,307 @@ +# Copyright (C) 2020-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""NixGuix lister definition. + +This lists artifacts out of manifest for Guix or Nixpkgs manifests. + +""" + +from dataclasses import dataclass +import logging +from pathlib import Path +import random +from typing import Any, Iterator, List, Optional, Tuple, Union + +import requests + +from swh.core.github.utils import GitHubSession +from swh.lister import TARBALL_EXTENSIONS, USER_AGENT +from swh.lister.pattern import CredentialsType, StatelessLister +from swh.scheduler.model import ListedOrigin + +logger = logging.getLogger(__name__) + + +@dataclass +class OriginUpstream: + """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix).""" + + url: str + version: int + revision: str + + +@dataclass +class Tarball: + """Metadata information on Tarball.""" + + origin: str + """Canonical url retrieve the tarball artifact.""" + fallback_urls: List[str] + """List of urls to retrieve tarball artifact if canonical url no longer works.""" + integrity: str + """Integrity hash of the tarball.""" + + +@dataclass +class File: + """Metadata information on File.""" + + origin: str + """Canonical url retrieve the tarball artifact.""" + fallback_urls: List[str] + """List of urls to retrieve tarball artifact if canonical url no longer works.""" + integrity: str + """Integrity hash of the file.""" + + +@dataclass +class DVCS: + """Metadata information on DVCS.""" + + origin: str + """Origin url of the dvcs""" + ref: Optional[str] + """Reference either a svn commit id, a git commit, ...""" + type: str + """Type of (d)vcs, e.g. svn, git, hg, ...""" + + +ArtifactTypes = Union[Tarball, File, DVCS, OriginUpstream] +PageResult = Tuple[str, ArtifactTypes] + + +DVCS_SUPPORTED = ("git", "svn", "hg") + + +def is_tarball(urls: List[str]) -> bool: + """Determine whether a list of files actually are tarballs or simple files. + + Args: + urls: name of the remote files for which the extensions needs to be + checked. + + Returns: + Whether filename is an archive or not + + Example: + + >>> is_tarball(['abc.zip']) + True + >>> is_tarball(['one.tar.gz', 'two.tgz']) + True + >>> is_tarball(['abc.c', 'other.c']) + False + + """ + index = random.randrange(len(urls)) + filepath = urls[index] + file_suffix = Path(filepath).suffixes[-1].lstrip(".") + return file_suffix in TARBALL_EXTENSIONS + + +class NixGuixLister(StatelessLister[PageResult]): + """List Guix or Nix sources out of a public json manifest. + + This lister can output: + - tarballs (.tar.gz, .tbz2, ...) + - dvcs repositories (e.g. git, hg, svn) + - files (.lisp, .py, ...) + + Note that no `last_update` is available in either manifest. + + """ + + LISTER_NAME = "NixGuixLister" + + def __init__( + self, + scheduler, + url: str, + origin_upstream: str, + name: Optional[str] = "nixpkgs", + instance: Optional[str] = None, + credentials: Optional[CredentialsType] = None, + **kwargs: Any, + ): + super().__init__( + scheduler=scheduler, + url=url.rstrip("/"), + instance=instance, + credentials=credentials, + ) + # either full fqdn NixOS/nixpkgs or guix repository urls + # maybe add an assert on those specific urls? + self.origin_upstream = origin_upstream + + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + self.github_session = GitHubSession( + credentials=self.credentials, user_agent=USER_AGENT + ) + + def get_pages(self) -> Iterator[PageResult]: + """Yield one page per "typed" origin referenced in manifest.""" + dvcs_keys = { + "git": { + "ref": "git_ref", + "url": "git_url", + }, + "svn": { + "ref": "svn_revision", + "url": "svn_url", + }, + "hg": { + "ref": "hg_changeset", + "url": "hg_url", + }, + } + # fetch the manifest to parse... + response = self.session.get(self.url, allow_redirects=True) + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + response.raise_for_status() + + # ... if any + raw_data = response.json() + version = raw_data["version"] + revision = raw_data["revision"] + yield "origin", OriginUpstream( + self.origin_upstream, + version, + revision, + ) + + # grep '"type"' guix-sources.json | sort | uniq + # "type": false <<<<<<<<< noise + # "type": "git", + # "type": "hg", + # "type": "no-origin", <<<<<<<<< noise + # "type": "svn", + # "type": "url", + + # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq + # "type": "url", + + for artifact in raw_data["sources"]: + artifact_type = artifact["type"] + if artifact_type in DVCS_SUPPORTED: + plain_url = artifact[dvcs_keys[artifact_type]["url"]] + plain_ref = artifact[dvcs_keys[artifact_type]["ref"]] + artifact_url = self.github_session.get_canonical_url(plain_url) + if not artifact_url: + continue + yield "dvcs", DVCS( + origin=artifact_url, type=artifact_type, ref=plain_ref + ) + elif artifact_type == "url": + # It's either a tarball or a file + urls = artifact.get("urls") + if not urls: + # Nothing to fetch + logger.warning("Skipping empty artifact %s", artifact) + continue + + assert urls is not None + integrity = artifact["integrity"] + + # FIXME: T3294: Fix missing scheme in urls + origin, *fallback_urls = urls + obj_type: str + obj_class: Union[Tarball, File] + if is_tarball(urls): + yield "tarball", Tarball( + origin=origin, + fallback_urls=fallback_urls, + integrity=integrity, + ) + else: + yield "file", File( + origin=origin, + fallback_urls=fallback_urls, + integrity=integrity, + ) + else: + logger.warning( + "Skipping unsupported type %s for artifact %s", + artifact_type, + artifact, + ) + + def dvcs_to_listed_origin(self, artifact: DVCS) -> Iterator[ListedOrigin]: + """Given a dvcs repository, yield a ListedOrigin.""" + assert self.lister_obj.id is not None + # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...) + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=artifact.origin, + visit_type=artifact.type, + ) + + def origin_to_listed_origin( + self, origin_upstream: OriginUpstream + ) -> Iterator[ListedOrigin]: + """Given an upstream origin, yield a ListedOrigin.""" + assert self.lister_obj.id is not None + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_upstream.url, + visit_type="git", # both nixpkgs and guix are git origins so far + ) + + def tarball_to_listed_origin(self, tarball: Tarball) -> Iterator[ListedOrigin]: + """Given a tarball, yield as many ListedOrigin as tarball urls.""" + # FIXME: maybe check or filter according to file extensions + assert self.lister_obj.id is not None + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=tarball.origin, + visit_type="tar", + extra_loader_arguments={ + "artifacts": [ + { + "url": tarball.origin, + } + ], + "extid_manifest_format": "$url $integrity", + "integrity": tarball.integrity, + "fallback_urls": tarball.fallback_urls, + }, + ) + + def file_to_listed_origin(self, simplefile: File) -> Iterator[ListedOrigin]: + """Given a remote file, yield a ListedOrigin.""" + assert self.lister_obj.id is not None + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=simplefile.origin, + visit_type="file", + extra_loader_arguments={ + "artifacts": [ + { + "url": simplefile.origin, + } + ], + "extid_manifest_format": "$url $integrity", + "integrity": simplefile.integrity, + "fallback_urls": simplefile.fallback_urls, + }, + ) + + def get_origins_from_page( + self, artifact_tuple: PageResult + ) -> Iterator[ListedOrigin]: + """Given an artifact tuple (type, artifact), yield a ListedOrigin.""" + artifact_type, artifact = artifact_tuple + mapping_type_fn = getattr(self, f"{artifact_type}_to_listed_origin") + yield from mapping_type_fn(artifact) diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py new file mode 100644 diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json @@ -0,0 +1,10 @@ +{ + "sources": [ + {"type": "git", "git_url": "", "git_ref": ""}, + {"type": false}, + {"type": "no-origin"}, + {"type": "url", "urls": []} + ], + "version":"1", + "revision":"ab59155c5a38dda7efaceb47c7528578fcf0def4" +} diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -0,0 +1,52 @@ +{ + "sources": [ + { + "type": "url", + "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" + }, + { + "type": "url", + "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" + }, + { + "type": "url", + "urls": [ "https://example.com/file.txt" ], + "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" + }, + { + "type": "url", + "urls": [ + "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz" + ], + "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0=" + }, + { + "type": "url", + "urls": [ + "http://downloads.sourceforge.net/project/nmon/lmon16n.c", + "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c", + "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c" + ], + "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" + }, + { + "type": "git", + "git_url": "https://example.org/pali/0xffff", + "git_ref": "0.9" + }, + { + "type": "hg", + "hg_url": "https://example.org/vityok/cl-string-match", + "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97" + }, + { + "type": "svn", + "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2", + "svn_revision": 39057 + } + ], + "version": "1", + "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" +} diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/nixguix/tests/test_lister.py @@ -0,0 +1,134 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from collections import defaultdict +import json +import logging +from pathlib import Path +from typing import Dict, List + +import pytest +import requests + +from swh.lister import USER_AGENT +from swh.lister.nixguix.lister import NixGuixLister +from swh.lister.pattern import ListerStats + +logger = logging.getLogger(__name__) + + +def _match_request(request): + return request.headers.get("User-Agent") == USER_AGENT + + +def page_response(datadir, instance: str) -> List[Dict]: + """Return list of repositories (out of test dataset)""" + datapath = Path(datadir, f"{instance}-swh_sources.json") + return json.loads(datapath.read_text()) if datapath.exists else [] + + +def test_lister_nixguix(datadir, swh_scheduler, requests_mock): + """NixGuixLister should list all origins per visit type""" + instance = "nixpkgs" + url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" + origin_upstream = "https://github.com/NixOS/nixpkgs" + lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) + + response = page_response(datadir, instance) + requests_mock.get( + url, + [{"json": response}], + additional_matcher=_match_request, + ) + + expected_visit_types = defaultdict(int) + # origin upstream is added as origin + expected_nb_origins = 1 + expected_visit_types["git"] += 1 + for artifact in response["sources"]: + # Each artifact is considered an origin (even "url" artifacts with mirror urls) + expected_nb_origins += 1 + artifact_type = artifact["type"] + if artifact_type in [ + "git", + "svn", + "hg", + ]: + expected_visit_types[artifact_type] += 1 + elif artifact_type == "url": + url = artifact["urls"][0] + if url.endswith(".c") or url.endswith(".txt"): + expected_visit_types["file"] += 1 + else: + expected_visit_types["tar"] += 1 + + assert set(expected_visit_types.keys()) == {"file", "git", "svn", "hg", "tar"} + + listed_result = lister.run() + + # 1 page read is 1 origin + nb_pages = expected_nb_origins + assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins) + + scheduler_origins = lister.scheduler.get_listed_origins( + lister.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + mapping_visit_types = defaultdict(int) + + for listed_origin in scheduler_origins: + assert listed_origin.visit_type in expected_visit_types + # no last update is listed on those manifests + assert listed_origin.last_update is None + + mapping_visit_types[listed_origin.visit_type] += 1 + + assert dict(mapping_visit_types) == expected_visit_types + + +def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): + """NixGuixLister should ignore unsupported or incomplete origins""" + instance = "guix" + url = "https://guix.gnu.org/sources.json" + origin_upstream = "https://git.savannah.gnu.org/git/guix.git" + lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) + + response = page_response(datadir, instance) + + requests_mock.get( + url, + [{"json": response}], + additional_matcher=_match_request, + ) + + listed_result = lister.run() + # only the origin upstream is listed, every other entries are unsupported or incomplete + assert listed_result == ListerStats(pages=1, origins=1) + + scheduler_origins = lister.scheduler.get_listed_origins( + lister.lister_obj.id + ).results + assert len(scheduler_origins) == 1 + + assert scheduler_origins[0].visit_type == "git" + + +def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock): + url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" + origin_upstream = "https://github.com/NixOS/nixpkgs" + lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) + + requests_mock.get( + url, + status_code=404, + additional_matcher=_match_request, + ) + + with pytest.raises(requests.HTTPError): # listing cannot continues so stop + lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 0