diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Return the lister class and task modules of the nixguix lister."""
+    # imported lazily so importing the package does not load the lister module
+    from .lister import NixGuixLister
+
+    return {
+        "lister": NixGuixLister,
+        "task_modules": [f"{__name__}.tasks"],
+    }
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/lister.py
@@ -0,0 +1,251 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import dataclass
+import logging
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+
+from swh.core.github.utils import GitHubSession
+from swh.lister import USER_AGENT
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class OriginUpstream:
+    """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
+
+    url: str
+    """Url of the upstream repository."""
+    version: int
+    """Value of the "version" field of the manifest."""
+    revision: str
+    """Value of the "revision" field of the manifest."""
+
+
+@dataclass
+class Tarball:
+    """Metadata information on Tarball."""
+
+    urls: List[str]
+    """List of urls to retrieve the tarball artifact."""
+    integrity: str
+    """Integrity hash of the tarball."""
+
+
+@dataclass
+class File:
+    """Metadata information on File."""
+
+    pass
+
+
+@dataclass
+class DVCS:
+    """Metadata information on DVCS."""
+
+    origin: str
+    """Origin url of the dvcs"""
+    ref: Optional[str]
+    """Reference either a svn commit id, a git commit, ..."""
+    type: str
+    """Type of (d)vcs, e.g. svn, git, hg, ..."""
+
+
+ArtifactTypes = Union[Tarball, File, DVCS, OriginUpstream]
+PageResult = Tuple[str, ArtifactTypes]
+
+
+DVCS_SUPPORTED = ("git", "svn", "hg")
+
+# For each supported (d)vcs type, the manifest keys holding the repository
+# url and the reference to fetch.
+DVCS_KEYS_MAPPING: Dict[str, Dict[str, str]] = {
+    "git": {
+        "ref": "git_ref",
+        "url": "git_url",
+    },
+    "svn": {
+        "ref": "svn_ref",
+        "url": "svn_url",
+    },
+    "hg": {
+        "ref": "hg_changeset",
+        "url": "hg_url",
+    },
+}
+
+
+class NixGuixLister(StatelessLister[PageResult]):
+    """List Guix or Nix sources out of a public json manifest.
+
+    This lister can output:
+    - tarballs (.tar.gz, .tbz2, ...)
+    - dvcs repositories (e.g. git, hg, svn)
+    - files (.lisp, .py, ...)
+
+    """
+
+    LISTER_NAME = "nixguix"
+
+    def __init__(
+        self,
+        scheduler,
+        url: str,
+        origin_upstream: str,
+        name: Optional[str] = "nixpkgs",
+        instance: Optional[str] = None,
+        credentials: Optional[CredentialsType] = None,
+        **kwargs: Any,
+    ):
+        """Initialize the lister.
+
+        Args:
+            scheduler: scheduler instance forwarded to the base lister
+            url: url of the json manifest to list (trailing "/" is stripped)
+            origin_upstream: url of the upstream origin of the manifest
+            name: currently unused, kept for backward compatibility
+            instance: instance name forwarded to the base lister
+            credentials: credentials forwarded to the base lister
+            kwargs: currently unused
+
+        """
+        super().__init__(
+            scheduler=scheduler,
+            url=url.rstrip("/"),
+            instance=instance,
+            credentials=credentials,
+        )
+        # either full fqdn NixOS/nixpkgs or guix repository urls
+        # maybe add an assert on those specific urls?
+        self.origin_upstream = origin_upstream
+
+        self.session = requests.Session()
+        self.session.headers.update(
+            {"Accept": "application/json", "User-Agent": USER_AGENT}
+        )
+        self.github_session = GitHubSession(
+            credentials=self.credentials, user_agent=USER_AGENT
+        )
+
+    def get_pages(self) -> Iterator[PageResult]:
+        """Yield one page per entry of interest found in the manifest.
+
+        The first page is the upstream origin. Then one "dvcs" page is yielded
+        for each source of a supported (d)vcs type for which the github session
+        returned a canonical url. Sources of any other type are skipped for now.
+
+        Raises:
+            ValueError: if the manifest could not be fetched
+
+        """
+        # fetch the manifest to parse
+        response = self.session.get(self.url, allow_redirects=True)
+        if not response.ok:
+            raise ValueError(f"Error during query to {self.url}")
+
+        raw_data = response.json()
+
+        yield "origin", OriginUpstream(
+            self.origin_upstream,
+            raw_data["version"],
+            raw_data["revision"],
+        )
+
+        # "type" values found in guix-sources.json: false and "no-origin" (both
+        # noise), "git", "hg", "svn" and "url". nixpkgs-sources-unstable.json
+        # only holds "url" entries.
+        for artifact in raw_data["sources"]:
+            artifact_type = artifact["type"]
+            if artifact_type in DVCS_SUPPORTED:
+                keys = DVCS_KEYS_MAPPING[artifact_type]
+                plain_url = artifact[keys["url"]]
+                plain_ref = artifact[keys["ref"]]
+                artifact_url = self.github_session.get_canonical_url(plain_url)
+                if not artifact_url:
+                    continue
+                yield "dvcs", DVCS(
+                    origin=artifact_url, type=artifact_type, ref=plain_ref
+                )
+            elif artifact_type == "url":
+                # TODO: yield tarballs and files out of "url" sources
+                logger.debug("Skipping not yet supported url source %s", artifact)
+            else:
+                logger.debug("Skipping unsupported source type %r", artifact_type)
+
+    def from_dvcs_to_listed_origin(self, artifact: DVCS) -> Iterator[ListedOrigin]:
+        """Given a dvcs repository, yield a ListedOrigin."""
+        assert self.lister_obj.id is not None
+        # the reference (artifact.ref) is not forwarded: the repository is
+        # listed as a regular origin of its own (d)vcs type
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=artifact.origin,
+            visit_type=artifact.type,
+        )
+
+    def from_origin_to_listed_origin(
+        self, origin_upstream: OriginUpstream
+    ) -> Iterator[ListedOrigin]:
+        """Given an upstream origin, yield a ListedOrigin."""
+        assert self.lister_obj.id is not None
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=origin_upstream.url,
+            visit_type="git",  # both nixpkgs and guix are git origins so far
+        )
+
+    def from_tarball_to_listed_origin(self, tarball: Tarball) -> Iterator[ListedOrigin]:
+        """Given a tarball, yield as many ListedOrigin as tarball urls."""
+        # FIXME: maybe check or filter according to file extensions
+        assert self.lister_obj.id is not None
+        for url in tarball.urls:
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=url,
+                visit_type="tar",
+                extra_loader_arguments={
+                    "artifacts": [
+                        {
+                            "url": url,
+                        }
+                    ],
+                    "extid_manifest_format": "$url $integrity",
+                    "integrity": tarball.integrity,
+                },
+            )
+
+    def from_file_to_listed_origin(self, file: File) -> Iterator[ListedOrigin]:
+        """Given a remote file, yield a ListedOrigin.
+
+        File artifacts carry no information yet, so nothing is yielded.
+
+        """
+        # must stay an (empty) iterator since get_origins_from_page consumes
+        # the result with "yield from"
+        yield from ()
+
+    def get_origins_from_page(
+        self, artifact_tuple: PageResult
+    ) -> Iterator[ListedOrigin]:
+        """Given an artifact tuple (type, artifact), yield a ListedOrigin.
+
+        Raises:
+            KeyError: if the artifact type is unknown
+
+        """
+        artifact_type, artifact = artifact_tuple
+        mapping_type_to_fn: Dict[str, Callable[[Any], Iterator[ListedOrigin]]] = {
+            "dvcs": self.from_dvcs_to_listed_origin,
+            "file": self.from_file_to_listed_origin,
+            "origin": self.from_origin_to_listed_origin,
+            "tarball": self.from_tarball_to_listed_origin,
+        }
+        yield from mapping_type_to_fn[artifact_type](artifact)
diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -0,0 +1,21 @@
+{
+  "sources": [
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ],
+      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+    },
+    {
+      "type": "url",
+      "urls": [ "https://example.com/file.txt" ],
+      "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
+    }
+  ],
+  "version": "1",
+  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}