Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9696597
D8341.id30304.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Subscribers
None
D8341.id30304.diff
View Options
diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py
--- a/swh/lister/__init__.py
+++ b/swh/lister/__init__.py
@@ -30,6 +30,26 @@
SUPPORTED_LISTERS = list(LISTERS)
# File extensions used to recognize tarball/archive artifacts from their
# file names (existing naming pattern shared across listers, e.g. gnu).
TARBALL_EXTENSIONS = [
    "jar",
    "zip",
    "tar",
    "gz",
    "tgz",
    "tbz",
    "bz2",
    "bzip2",
    "lzma",
    "lz",
    "txz",
    "xz",
    "z",
    "Z",  # case matters: both "z" and "Z" occur in the wild
    "7z",
    "zst",
]
+
def get_lister(lister_name, db_url=None, **conf):
"""Instantiate a lister given its name.
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -15,6 +15,8 @@
import requests
+from swh.lister import TARBALL_EXTENSIONS
+
logger = logging.getLogger(__name__)
@@ -186,21 +188,6 @@
return False
-# to recognize existing naming pattern
-EXTENSIONS = [
- "zip",
- "tar",
- "gz",
- "tgz",
- "bz2",
- "bzip2",
- "lzma",
- "lz",
- "xz",
- "Z",
- "7z",
-]
-
VERSION_KEYWORDS = [
"cygwin_me",
"w32",
@@ -269,7 +256,7 @@
(?P<extension>(?:\.(?:{extensions}))+)
$
""".format(
- extensions="|".join(EXTENSIONS),
+ extensions="|".join(TARBALL_EXTENSIONS),
vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS),
)
diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
def register():
    """Register the nixguix lister and its celery task module.

    Returns:
        The registration mapping expected by swh.lister (lister class and
        task modules).
    """
    from .lister import NixGuixLister

    return dict(
        lister=NixGuixLister,
        task_modules=[f"{__name__}.tasks"],
    )
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/lister.py
@@ -0,0 +1,285 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import dataclass
+import logging
+from pathlib import Path
+import random
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+
+from swh.core.github.utils import GitHubSession
+from swh.lister import TARBALL_EXTENSIONS, USER_AGENT
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+
@dataclass
class OriginUpstream:
    """The upstream origin the manifest was generated from
    (e.g. NixOS/nixpkgs, Guix/Guix)."""

    # Url of the upstream repository
    url: str
    # Version field as read from the manifest
    version: int
    # Revision of the upstream repository the manifest was built from
    revision: str
+
+
@dataclass
class Tarball:
    """A tarball artifact referenced in the manifest."""

    # List of (mirror) urls from which the tarball artifact can be fetched
    urls: List[str]
    # Integrity hash of the tarball
    integrity: str
+
+
@dataclass
class File:
    """A plain (non-archive) file artifact referenced in the manifest."""

    # List of (mirror) urls from which the file artifact can be fetched
    urls: List[str]
    # Integrity hash of the file
    integrity: str
+
+
@dataclass
class DVCS:
    """A (d)vcs repository artifact referenced in the manifest."""

    # Origin url of the dvcs repository
    origin: str
    # Reference to visit: an svn commit id, a git commit, a hg changeset, ...
    ref: Optional[str]
    # Type of (d)vcs, e.g. "svn", "git", "hg", ...
    type: str
+
+
# Any of the artifact kinds the lister can parse out of a manifest entry
ArtifactTypes = Union[Tarball, File, DVCS, OriginUpstream]
# A "page" is a single (kind, artifact) tuple; the kind string is one of
# "tarball", "file", "dvcs" or "origin" (dispatched in get_origins_from_page)
PageResult = Tuple[str, ArtifactTypes]


# (d)vcs artifact types this lister knows how to handle
DVCS_SUPPORTED = ("git", "svn", "hg")
+
+
def is_tarball(urls: List[str]) -> bool:
    """Determine whether a list of files actually are tarballs or simple files.

    The urls are expected to be mirrors of the same artifact, so only one of
    them needs to be inspected; the first one is checked deterministically
    (picking one at random made the result unstable for mixed lists and
    raised ValueError on an empty one).

    Args:
        urls: name of the remote files for which the extensions needs to be
            checked.

    Returns:
        Whether filename is an archive or not

    Example:

        >>> is_tarball(['abc.zip'])
        True
        >>> is_tarball(['one.tar.gz', 'two.tgz'])
        True
        >>> is_tarball(['abc.c', 'other.c'])
        False

    """
    if not urls:
        # nothing to inspect, hence not a tarball
        return False
    suffixes = Path(urls[0]).suffixes
    if not suffixes:
        # url without any extension (suffixes[-1] would raise IndexError)
        return False
    return suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
+
+
class NixGuixLister(StatelessLister[PageResult]):
    """List Guix or Nix sources out of a public json manifest.

    This lister can output:
    - tarballs (.tar.gz, .tbz2, ...)
    - dvcs repositories (e.g. git, hg, svn)
    - files (.lisp, .py, ...)

    Note that no `last_update` is available in either manifest, so listed
    origins carry no last-update information.

    """

    # NOTE(review): other swh listers appear to use a short lowercase
    # identifier here (e.g. "gnu") — confirm "NixGuixLister" is the intended
    # scheduler-facing name.
    LISTER_NAME = "NixGuixLister"

    def __init__(
        self,
        scheduler,
        url: str,
        origin_upstream: str,
        name: Optional[str] = "nixpkgs",  # NOTE(review): currently unused
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        **kwargs: Any,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url.rstrip("/"),
            instance=instance,
            credentials=credentials,
        )
        # either full fqdn NixOS/nixpkgs or guix repository urls
        # maybe add an assert on those specific urls?
        self.origin_upstream = origin_upstream

        # http session used to fetch the json manifest
        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )
        # session used to canonicalize github urls (rate-limit aware)
        self.github_session = GitHubSession(
            credentials=self.credentials, user_agent=USER_AGENT
        )

    def get_pages(self) -> Iterator[PageResult]:
        """Yield a page listing all projects referenced in the manifest.

        Each yielded "page" is a single (kind, artifact) tuple; the upstream
        origin itself is yielded first, then one page per manifest source.

        Raises:
            ValueError: if the manifest cannot be fetched (non-2xx response).
        """
        # per dvcs type, the manifest keys holding the repository url and the
        # reference to visit
        dvcs_keys = {
            "git": {
                "ref": "git_ref",
                "url": "git_url",
            },
            "svn": {
                "ref": "svn_revision",
                "url": "svn_url",
            },
            "hg": {
                "ref": "hg_changeset",
                "url": "hg_url",
            },
        }
        # fetch the manifest to parse
        response = self.session.get(self.url, allow_redirects=True)
        if not response.ok:
            raise ValueError(f"Error during query to {self.url}")

        raw_data = response.json()

        # the upstream origin is listed as well, carrying the manifest's
        # version and revision metadata
        version = raw_data["version"]
        revision = raw_data["revision"]
        yield "origin", OriginUpstream(
            self.origin_upstream,
            version,
            revision,
        )

        # grep '"type"' guix-sources.json | sort | uniq
        # "type": false <<<<<<<<< noise
        # "type": "git",
        # "type": "hg",
        # "type": "no-origin", <<<<<<<<< noise
        # "type": "svn",
        # "type": "url",

        # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
        # "type": "url",

        for artifact in raw_data["sources"]:
            artifact_type = artifact["type"]
            if artifact_type in DVCS_SUPPORTED:
                plain_url = artifact[dvcs_keys[artifact_type]["url"]]
                # NOTE(review): svn_revision is an int in the test fixture
                # while DVCS.ref is typed Optional[str] — confirm intended.
                plain_ref = artifact[dvcs_keys[artifact_type]["ref"]]
                # canonicalize github urls; a None result means the repository
                # is gone or not resolvable, so skip it
                artifact_url = self.github_session.get_canonical_url(plain_url)
                if not artifact_url:
                    continue
                yield "dvcs", DVCS(
                    origin=artifact_url, type=artifact_type, ref=plain_ref
                )
            elif artifact_type == "url":
                # It's either a tarball or a file
                urls = artifact.get("urls")
                if not urls:
                    # Nothing to fetch
                    logger.warning("Skipping empty artifact %s", artifact)
                    continue

                assert urls is not None
                # NOTE(review): raises KeyError if a "url" entry lacks the
                # "integrity" field — confirm the manifest guarantees it
                integrity = artifact["integrity"]

                if is_tarball(urls):
                    yield "tarball", Tarball(urls=urls, integrity=integrity)
                else:
                    yield "file", File(urls=urls, integrity=integrity)
            else:
                # e.g. "no-origin", false, ... (see greps above)
                logger.warning(
                    "Skipping unsupported type %s for artifact %s",
                    artifact_type,
                    artifact,
                )

    def from_dvcs_to_listed_origin(self, artifact: DVCS) -> Iterator[ListedOrigin]:
        """Given a dvcs repository, yield a ListedOrigin."""
        assert self.lister_obj.id is not None
        # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=artifact.origin,
            visit_type=artifact.type,
        )

    def from_origin_to_listed_origin(
        self, origin_upstream: OriginUpstream
    ) -> Iterator[ListedOrigin]:
        """Given an upstream origin, yield a ListedOrigin."""
        assert self.lister_obj.id is not None
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_upstream.url,
            visit_type="git",  # both nixpkgs and guix are git origins so far
        )

    def from_tarball_to_listed_origin(self, tarball: Tarball) -> Iterator[ListedOrigin]:
        """Given a tarball, yield as many ListedOrigin as tarball urls."""
        # FIXME: maybe check or filter according to file extensions
        assert self.lister_obj.id is not None
        # each mirror url becomes its own origin, all sharing the same
        # integrity hash
        for url in tarball.urls:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=url,
                visit_type="tar",
                extra_loader_arguments={
                    "artifacts": [
                        {
                            "url": url,
                        }
                    ],
                    "extid_manifest_format": "$url $integrity",
                    "integrity": tarball.integrity,
                },
            )

    def from_file_to_listed_origin(self, simplefile: File) -> Iterator[ListedOrigin]:
        """Given a remote file, yield a ListedOrigin."""
        # FIXME: Actually same implem. as tarball, so do we need to separate those?
        assert self.lister_obj.id is not None
        for url in simplefile.urls:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=url,
                visit_type="file",
                extra_loader_arguments={
                    "artifacts": [
                        {
                            "url": url,
                        }
                    ],
                    "extid_manifest_format": "$url $integrity",
                    "integrity": simplefile.integrity,
                },
            )

    def get_origins_from_page(
        self, artifact_tuple: PageResult
    ) -> Iterator[ListedOrigin]:
        """Given an artifact tuple (type, artifact), yield a ListedOrigin.

        Dispatches on the kind string emitted by :meth:`get_pages`; raises
        KeyError on an unknown kind (cannot happen with get_pages output).
        """
        artifact_type, artifact = artifact_tuple
        mapping_type_to_fn: Dict[str, Callable[[Any], Iterator[ListedOrigin]]] = {
            "dvcs": self.from_dvcs_to_listed_origin,
            "file": self.from_file_to_listed_origin,
            "origin": self.from_origin_to_listed_origin,
            "tarball": self.from_tarball_to_listed_origin,
        }
        yield from mapping_type_to_fn[artifact_type](artifact)
diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -0,0 +1,52 @@
+{
+ "sources": [
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://example.com/file.txt" ],
+ "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz"
+ ],
+ "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "http://downloads.sourceforge.net/project/nmon/lmon16n.c",
+ "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c",
+ "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c"
+ ],
+ "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+ },
+ {
+ "type": "git",
+ "git_url": "https://example.org/pali/0xffff",
+ "git_ref": "0.9"
+ },
+ {
+ "type": "hg",
+ "hg_url": "https://example.org/vityok/cl-string-match",
+ "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97"
+ },
+ {
+ "type": "svn",
+ "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
+ "svn_revision": 39057
+ }
+ ],
+ "version": "1",
+ "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from swh.lister import USER_AGENT
+from swh.lister.nixguix.lister import NixGuixLister
+from swh.lister.pattern import ListerStats
+
+logger = logging.getLogger(__name__)
+
+
def _match_request(request):
    """Only match mocked requests carrying the lister's User-Agent header."""
    user_agent = request.headers.get("User-Agent")
    return user_agent == USER_AGENT
+
+
def page_response(datadir, instance: str) -> List[Dict]:
    """Return the parsed manifest for ``instance`` (out of test dataset).

    Returns an empty list when no dataset file exists for the instance.
    """
    # NOTE(review): the manifest actually parses to a dict (the test indexes
    # response["sources"]); the List[Dict] annotation only matches the
    # missing-file fallback.
    datapath = Path(datadir, f"{instance}-swh_sources.json")
    # Bug fix: ``datapath.exists`` without the call is a bound method, hence
    # always truthy — the fallback branch was unreachable and a missing file
    # crashed in read_text(). It must be called.
    return json.loads(datapath.read_text()) if datapath.exists() else []
+
+
def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
    """NixGuix lister lists upstream origin, dvcs, tarball and file origins
    out of a sources manifest."""
    instance = "nixpkgs"
    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
    origin_upstream = "https://github.com/NixOS/nixpkgs"
    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)

    response = page_response(datadir, instance)

    # serve the test manifest for the listed url, only when the lister's
    # User-Agent header is set
    requests_mock.get(
        url,
        [{"json": response}],
        additional_matcher=_match_request,
    )

    listed_result = lister.run()
    expected_nb_origins = 1  # origin upstream is an origin as well
    for source in response["sources"]:
        if source["type"] == "url":
            # for typed artifact "url", each url referenced is one new origin
            expected_nb_origins += len(source["urls"])
        else:
            expected_nb_origins += 1

    nb_pages = (
        len(response["sources"]) + 1
    )  # each origin read is one page + the upstream origin
    assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    # count listed origins per visit type to check the dispatching
    mapping_visit_types = defaultdict(int)

    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type in ["file", "git", "svn", "hg", "tar"]
        # no last update is listed on those manifests
        assert listed_origin.last_update is None

        mapping_visit_types[listed_origin.visit_type] += 1

    # counts derived from the nixpkgs-swh_sources.json fixture
    assert dict(mapping_visit_types) == {
        "git": 1 + 1,  # origin_upstream again
        "svn": 1,
        "hg": 1,
        "tar": 3,
        "file": 4,
    }
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 17, 8:47 PM (3 h, 57 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226688
Attached To
D8341: Add nixguix lister
Event Timeline
Log In to Comment