Page MenuHomeSoftware Heritage

D8341.id30304.diff
No OneTemporary

D8341.id30304.diff

diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py
--- a/swh/lister/__init__.py
+++ b/swh/lister/__init__.py
@@ -30,6 +30,26 @@
SUPPORTED_LISTERS = list(LISTERS)
+# File extensions used to recognize tarballs/archives from their file name
+TARBALL_EXTENSIONS = [
+ "jar",
+ "zip",
+ "tar",
+ "gz",
+ "tgz",
+ "tbz",
+ "bz2",
+ "bzip2",
+ "lzma",
+ "lz",
+ "txz",
+ "xz",
+ "z",
+ "Z",
+ "7z",
+ "zst",
+]
+
def get_lister(lister_name, db_url=None, **conf):
"""Instantiate a lister given its name.
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -15,6 +15,8 @@
import requests
+from swh.lister import TARBALL_EXTENSIONS
+
logger = logging.getLogger(__name__)
@@ -186,21 +188,6 @@
return False
-# to recognize existing naming pattern
-EXTENSIONS = [
- "zip",
- "tar",
- "gz",
- "tgz",
- "bz2",
- "bzip2",
- "lzma",
- "lz",
- "xz",
- "Z",
- "7z",
-]
-
VERSION_KEYWORDS = [
"cygwin_me",
"w32",
@@ -269,7 +256,7 @@
(?P<extension>(?:\.(?:{extensions}))+)
$
""".format(
- extensions="|".join(EXTENSIONS),
+ extensions="|".join(TARBALL_EXTENSIONS),
vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS),
)
diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .lister import NixGuixLister
+
+ return {
+ "lister": NixGuixLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/lister.py
@@ -0,0 +1,285 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import dataclass
+import logging
+from pathlib import Path
+import random
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+
+from swh.core.github.utils import GitHubSession
+from swh.lister import TARBALL_EXTENSIONS, USER_AGENT
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class OriginUpstream:
+ """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
+
+ url: str
+ version: int
+ revision: str
+
+
+@dataclass
+class Tarball:
+ """Metadata information on Tarball."""
+
+ urls: List[str]
+ """List of urls to retrieve the tarball artifact."""
+ integrity: str
+ """Integrity hash of the tarball."""
+
+
+@dataclass
+class File:
+ """Metadata information on File."""
+
+ urls: List[str]
+ """List of urls to retrieve the file artifact."""
+ integrity: str
+ """Integrity hash of the file."""
+
+
+@dataclass
+class DVCS:
+ """Metadata information on DVCS."""
+
+ origin: str
+ """Origin url of the dvcs"""
+ ref: Optional[str]
+ """Reference either a svn commit id, a git commit, ..."""
+ type: str
+ """Type of (d)vcs, e.g. svn, git, hg, ..."""
+
+
+ArtifactTypes = Union[Tarball, File, DVCS, OriginUpstream]
+PageResult = Tuple[str, ArtifactTypes]
+
+
+DVCS_SUPPORTED = ("git", "svn", "hg")
+
+
+def is_tarball(urls: List[str]) -> bool:
+    """Determine whether a list of urls references tarballs or plain files.
+
+    Args:
+        urls: urls (mirrors) of one remote artifact; the extension of the
+            first one decides. Empty list or extension-less url: not a tarball.
+
+    Returns:
+        Whether the artifact is an archive or not
+
+    Example:
+
+        >>> is_tarball(['abc.zip'])
+        True
+        >>> is_tarball(['one.tar.gz', 'two.tgz'])
+        True
+        >>> is_tarball(['abc.c', 'other.c'])
+        False
+
+    """
+    # Deterministic on purpose: a random pick made the visit type of mixed
+    # mirror lists vary between runs; `.suffix` is "" when there is none.
+    file_suffix = Path(urls[0]).suffix.lstrip(".") if urls else ""
+    return file_suffix in TARBALL_EXTENSIONS
+
+
+class NixGuixLister(StatelessLister[PageResult]):
+ """List Guix or Nix sources out of a public json manifest.
+
+ This lister can output:
+ - tarballs (.tar.gz, .tbz, ...)
+ - dvcs repositories (e.g. git, hg, svn)
+ - files (.lisp, .py, ...)
+
+ Note that no `last_update` is available in either manifest.
+
+ """
+
+ LISTER_NAME = "NixGuixLister"
+
+ def __init__(
+ self,
+ scheduler,
+ url: str,
+ origin_upstream: str,
+ name: Optional[str] = "nixpkgs",
+ instance: Optional[str] = None,
+ credentials: Optional[CredentialsType] = None,
+ **kwargs: Any,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ url=url.rstrip("/"),
+ instance=instance,
+ credentials=credentials,
+ )
+ # either full fqdn NixOS/nixpkgs or guix repository urls
+ # maybe add an assert on those specific urls?
+ self.origin_upstream = origin_upstream
+
+ self.session = requests.Session()
+ self.session.headers.update(
+ {"Accept": "application/json", "User-Agent": USER_AGENT}
+ )
+ self.github_session = GitHubSession(
+ credentials=self.credentials, user_agent=USER_AGENT
+ )
+
+ def get_pages(self) -> Iterator[PageResult]:
+ """Yield a page listing all projects referenced in the manifest."""
+ dvcs_keys = {
+ "git": {
+ "ref": "git_ref",
+ "url": "git_url",
+ },
+ "svn": {
+ "ref": "svn_revision",
+ "url": "svn_url",
+ },
+ "hg": {
+ "ref": "hg_changeset",
+ "url": "hg_url",
+ },
+ }
+ # fetch the manifest to parse
+ response = self.session.get(self.url, allow_redirects=True)
+ if not response.ok:
+ raise ValueError(f"Error during query to {self.url}")
+
+ raw_data = response.json()
+
+ version = raw_data["version"]
+ revision = raw_data["revision"]
+ yield "origin", OriginUpstream(
+ self.origin_upstream,
+ version,
+ revision,
+ )
+
+ # grep '"type"' guix-sources.json | sort | uniq
+ # "type": false <<<<<<<<< noise
+ # "type": "git",
+ # "type": "hg",
+ # "type": "no-origin", <<<<<<<<< noise
+ # "type": "svn",
+ # "type": "url",
+
+ # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
+ # "type": "url",
+
+ for artifact in raw_data["sources"]:
+ artifact_type = artifact["type"]
+ if artifact_type in DVCS_SUPPORTED:
+ plain_url = artifact[dvcs_keys[artifact_type]["url"]]
+ plain_ref = artifact[dvcs_keys[artifact_type]["ref"]]
+ artifact_url = self.github_session.get_canonical_url(plain_url)
+ if not artifact_url:
+ continue
+ yield "dvcs", DVCS(
+ origin=artifact_url, type=artifact_type, ref=plain_ref
+ )
+ elif artifact_type == "url":
+ # It's either a tarball or a file
+ urls = artifact.get("urls")
+ if not urls:
+ # Nothing to fetch
+ logger.warning("Skipping empty artifact %s", artifact)
+ continue
+
+ assert urls is not None
+ integrity = artifact["integrity"]
+
+ if is_tarball(urls):
+ yield "tarball", Tarball(urls=urls, integrity=integrity)
+ else:
+ yield "file", File(urls=urls, integrity=integrity)
+ else:
+ logger.warning(
+ "Skipping unsupported type %s for artifact %s",
+ artifact_type,
+ artifact,
+ )
+
+ def from_dvcs_to_listed_origin(self, artifact: DVCS) -> Iterator[ListedOrigin]:
+ """Given a dvcs repository, yield a ListedOrigin."""
+ assert self.lister_obj.id is not None
+ # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=artifact.origin,
+ visit_type=artifact.type,
+ )
+
+ def from_origin_to_listed_origin(
+ self, origin_upstream: OriginUpstream
+ ) -> Iterator[ListedOrigin]:
+ """Given an upstream origin, yield a ListedOrigin."""
+ assert self.lister_obj.id is not None
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=origin_upstream.url,
+ visit_type="git", # both nixpkgs and guix are git origins so far
+ )
+
+ def from_tarball_to_listed_origin(self, tarball: Tarball) -> Iterator[ListedOrigin]:
+ """Given a tarball, yield as many ListedOrigin as tarball urls."""
+ # FIXME: maybe check or filter according to file extensions
+ assert self.lister_obj.id is not None
+ for url in tarball.urls:
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=url,
+ visit_type="tar",
+ extra_loader_arguments={
+ "artifacts": [
+ {
+ "url": url,
+ }
+ ],
+ "extid_manifest_format": "$url $integrity",
+ "integrity": tarball.integrity,
+ },
+ )
+
+ def from_file_to_listed_origin(self, simplefile: File) -> Iterator[ListedOrigin]:
+ """Given a remote file, yield a ListedOrigin."""
+ # FIXME: Actually same implem. as tarball, so do we need to separate those?
+ assert self.lister_obj.id is not None
+ for url in simplefile.urls:
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=url,
+ visit_type="file",
+ extra_loader_arguments={
+ "artifacts": [
+ {
+ "url": url,
+ }
+ ],
+ "extid_manifest_format": "$url $integrity",
+ "integrity": simplefile.integrity,
+ },
+ )
+
+ def get_origins_from_page(
+ self, artifact_tuple: PageResult
+ ) -> Iterator[ListedOrigin]:
+ """Given an artifact tuple (type, artifact), yield a ListedOrigin."""
+ artifact_type, artifact = artifact_tuple
+ mapping_type_to_fn: Dict[str, Callable[[Any], Iterator[ListedOrigin]]] = {
+ "dvcs": self.from_dvcs_to_listed_origin,
+ "file": self.from_file_to_listed_origin,
+ "origin": self.from_origin_to_listed_origin,
+ "tarball": self.from_tarball_to_listed_origin,
+ }
+ yield from mapping_type_to_fn[artifact_type](artifact)
diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -0,0 +1,52 @@
+{
+ "sources": [
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://example.com/file.txt" ],
+ "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz"
+ ],
+ "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "http://downloads.sourceforge.net/project/nmon/lmon16n.c",
+ "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c",
+ "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c"
+ ],
+ "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+ },
+ {
+ "type": "git",
+ "git_url": "https://example.org/pali/0xffff",
+ "git_ref": "0.9"
+ },
+ {
+ "type": "hg",
+ "hg_url": "https://example.org/vityok/cl-string-match",
+ "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97"
+ },
+ {
+ "type": "svn",
+ "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
+ "svn_revision": 39057
+ }
+ ],
+ "version": "1",
+ "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from swh.lister import USER_AGENT
+from swh.lister.nixguix.lister import NixGuixLister
+from swh.lister.pattern import ListerStats
+
+logger = logging.getLogger(__name__)
+
+
+def _match_request(request):
+ return request.headers.get("User-Agent") == USER_AGENT
+
+
+def page_response(datadir, instance: str) -> List[Dict]:
+    """Return the parsed test manifest of `instance` ([] if it is missing)"""
+    datapath = Path(datadir, f"{instance}-swh_sources.json")
+    return json.loads(datapath.read_text()) if datapath.exists() else []
+
+
+def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
+ """NixGuix lister supports full listing"""
+ instance = "nixpkgs"
+ url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+ origin_upstream = "https://github.com/NixOS/nixpkgs"
+ lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+ response = page_response(datadir, instance)
+
+ requests_mock.get(
+ url,
+ [{"json": response}],
+ additional_matcher=_match_request,
+ )
+
+ listed_result = lister.run()
+ expected_nb_origins = 1 # origin upstream is an origin as well
+ for source in response["sources"]:
+ if source["type"] == "url":
+ # for typed artifact "url", each url referenced is one new origin
+ expected_nb_origins += len(source["urls"])
+ else:
+ expected_nb_origins += 1
+
+ nb_pages = (
+ len(response["sources"]) + 1
+ ) # each origin read is one page + the upstream origin
+ assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins)
+
+ scheduler_origins = lister.scheduler.get_listed_origins(
+ lister.lister_obj.id
+ ).results
+ assert len(scheduler_origins) == expected_nb_origins
+
+ mapping_visit_types = defaultdict(int)
+
+ for listed_origin in scheduler_origins:
+ assert listed_origin.visit_type in ["file", "git", "svn", "hg", "tar"]
+ # no last update is listed on those manifests
+ assert listed_origin.last_update is None
+
+ mapping_visit_types[listed_origin.visit_type] += 1
+
+ assert dict(mapping_visit_types) == {
+ "git": 1 + 1, # origin_upstream again
+ "svn": 1,
+ "hg": 1,
+ "tar": 3,
+ "file": 4,
+ }

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 8:47 PM (3 h, 57 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226688

Event Timeline