Changeset View
Standalone View
swh/lister/nixguix/lister.py
- This file was added.
# Copyright (C) 2020-2022 The Software Heritage developers | ||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||||||
# See top-level LICENSE file for more information | ||||||||||||||||
from dataclasses import dataclass | ||||||||||||||||
import logging | ||||||||||||||||
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union | ||||||||||||||||
import requests | ||||||||||||||||
from swh.core.github.utils import GitHubSession | ||||||||||||||||
from swh.lister import USER_AGENT | ||||||||||||||||
from swh.lister.pattern import CredentialsType, StatelessLister | ||||||||||||||||
from swh.scheduler.model import ListedOrigin | ||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||
@dataclass | ||||||||||||||||
class OriginUpstream: | ||||||||||||||||
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix).""" | ||||||||||||||||
url: str | ||||||||||||||||
version: int | ||||||||||||||||
revision: str | ||||||||||||||||
@dataclass | ||||||||||||||||
class Tarball: | ||||||||||||||||
"""Metadata information on Tarball.""" | ||||||||||||||||
urls: List[str] | ||||||||||||||||
"""List of urls to retrieve the tarball artifact.""" | ||||||||||||||||
integrity: str | ||||||||||||||||
"""Integrity hash of the tarball.""" | ||||||||||||||||
@dataclass | ||||||||||||||||
class File: | ||||||||||||||||
"""Metadata information on File.""" | ||||||||||||||||
pass | ||||||||||||||||
@dataclass | ||||||||||||||||
class DVCS: | ||||||||||||||||
"""Metadata information on DVCS.""" | ||||||||||||||||
origin: str | ||||||||||||||||
"""Origin url of the dvcs""" | ||||||||||||||||
ref: Optional[str] | ||||||||||||||||
"""Reference either a svn commit id, a git commit, ...""" | ||||||||||||||||
type: str | ||||||||||||||||
"""Type of (d)vcs, e.g. svn, git, hg, ...""" | ||||||||||||||||
ArtifactTypes = Union[Tarball, File, DVCS, OriginUpstream] | ||||||||||||||||
PageResult = Tuple[str, ArtifactTypes] | ||||||||||||||||
DVCS_SUPPORTED = ("git", "svn", "hg") | ||||||||||||||||
class NixGuixLister(StatelessLister[PageResult]): | ||||||||||||||||
"""List Guix or Nix sources out of a public json manifest. | ||||||||||||||||
This lister can output: | ||||||||||||||||
- tarballs (.tar.gz, .tbz2, ...) | ||||||||||||||||
- dvcs repositories (e.g. git, hg, svn) | ||||||||||||||||
- files (.lisp, .py, ...) | ||||||||||||||||
""" | ||||||||||||||||
vlorentz: use an enum instead of `str`. (or `typing.Literal` if it really needs to be a string) | ||||||||||||||||
def __init__( | ||||||||||||||||
self, | ||||||||||||||||
scheduler, | ||||||||||||||||
Done Inline Actionssvn is not a DVCS, rename it to SUPPORTED_VCS vlorentz: svn is not a DVCS, rename it to `SUPPORTED_VCS` | ||||||||||||||||
url: str, | ||||||||||||||||
origin_upstream: str, | ||||||||||||||||
name: Optional[str] = "nixpkgs", | ||||||||||||||||
instance: Optional[str] = None, | ||||||||||||||||
credentials: Optional[CredentialsType] = None, | ||||||||||||||||
**kwargs: Any, | ||||||||||||||||
): | ||||||||||||||||
super().__init__( | ||||||||||||||||
scheduler=scheduler, | ||||||||||||||||
url=url.rstrip("/"), | ||||||||||||||||
instance=instance, | ||||||||||||||||
credentials=credentials, | ||||||||||||||||
) | ||||||||||||||||
# either full fqdn NixOS/nixpkgs or guix repository urls | ||||||||||||||||
# maybe add an assert on those specific urls? | ||||||||||||||||
self.origin_upstream = origin_upstream | ||||||||||||||||
self.session = requests.Session() | ||||||||||||||||
self.session.headers.update( | ||||||||||||||||
{"Accept": "application/json", "User-Agent": USER_AGENT} | ||||||||||||||||
Done Inline ActionsMaybe you could update and reuse tarball mimetypes declared in swh.core.tarball instead ? anlambert: Maybe you could update and reuse tarball mimetypes declared in [swh.core.tarball](https://forge. | ||||||||||||||||
Done Inline Actionsyeah, that'd make sense, can we iterate over this in another diff? That'd let me the time to open those mimetypes in swh.core.tarball (and release) so we can use a non private variable. ardumont: yeah, that'd make sense, can we iterate over this in another diff?
That'd let me the time to… | ||||||||||||||||
Done Inline ActionsOpened [1] in swh.core [1] D8603 ardumont: Opened [1] in swh.core
[1] D8603 | ||||||||||||||||
Done Inline ActionsI've amended that lister diff to do that immediatly in the end (the dev/diff/review/package got fast ;) ardumont: I've amended that lister diff to do that immediatly in the end (the dev/diff/review/package got… | ||||||||||||||||
) | ||||||||||||||||
self.github_session = GitHubSession( | ||||||||||||||||
credentials=self.credentials, user_agent=USER_AGENT | ||||||||||||||||
) | ||||||||||||||||
def get_pages(self) -> Iterator[PageResult]: | ||||||||||||||||
"""Yield a page listing all projects referenced in the manifest.""" | ||||||||||||||||
dvcs_keys = { | ||||||||||||||||
"git": { | ||||||||||||||||
"ref": "git_ref", | ||||||||||||||||
"url": "git_url", | ||||||||||||||||
}, | ||||||||||||||||
"svn": { | ||||||||||||||||
"ref": "svn_ref", | ||||||||||||||||
"url": "svn_url", | ||||||||||||||||
}, | ||||||||||||||||
"hg": { | ||||||||||||||||
"ref": "hg_changeset", | ||||||||||||||||
"url": "hg_url", | ||||||||||||||||
}, | ||||||||||||||||
} | ||||||||||||||||
# fetch the manifest to parse | ||||||||||||||||
response = self.session.get(self.url, allow_redirects=True) | ||||||||||||||||
if not response.ok: | ||||||||||||||||
raise ValueError(f"Error during query to {self.url}") | ||||||||||||||||
raw_data = response.json() | ||||||||||||||||
version = raw_data["version"] | ||||||||||||||||
revision = raw_data["revision"] | ||||||||||||||||
Not Done Inline Actions
vlorentz: | ||||||||||||||||
yield "origin", OriginUpstream( | ||||||||||||||||
self.origin_upstream, | ||||||||||||||||
version, | ||||||||||||||||
revision, | ||||||||||||||||
) | ||||||||||||||||
# grep '"type"' guix-sources.json | sort | uniq | ||||||||||||||||
# "type": false <<<<<<<<< noise | ||||||||||||||||
# "type": "git", | ||||||||||||||||
# "type": "hg", | ||||||||||||||||
# "type": "no-origin", <<<<<<<<< noise | ||||||||||||||||
# "type": "svn", | ||||||||||||||||
# "type": "url", | ||||||||||||||||
Not Done Inline ActionsCould you make it a required argument, to avoid accidental misconfiguration? vlorentz: Could you make it a required argument, to avoid accidental misconfiguration? | ||||||||||||||||
Done Inline Actionsyes ardumont: yes | ||||||||||||||||
Done Inline Actionsactually it got dropped as it was a misnamed "instance" equivalent... ardumont: actually it got dropped as it was a misnamed "instance" equivalent... | ||||||||||||||||
# grep '"type"' nixpkgs-sources-unstable.json | sort | uniq | ||||||||||||||||
# "type": "url", | ||||||||||||||||
for artifact in raw_data["sources"]: | ||||||||||||||||
artifact_type = artifact["type"] | ||||||||||||||||
if artifact_type in DVCS_SUPPORTED: | ||||||||||||||||
plain_url = artifact[dvcs_keys[artifact_type]["url"]] | ||||||||||||||||
plain_ref = artifact[dvcs_keys[artifact_type]["ref"]] | ||||||||||||||||
artifact_url = self.github_session.get_canonical_url(plain_url) | ||||||||||||||||
if not artifact_url: | ||||||||||||||||
continue | ||||||||||||||||
yield "dvcs", DVCS( | ||||||||||||||||
origin=artifact_url, type=artifact_type, ref=plain_ref | ||||||||||||||||
) | ||||||||||||||||
elif artifact_type == "url": | ||||||||||||||||
# TODO | ||||||||||||||||
pass | ||||||||||||||||
else: | ||||||||||||||||
# unsupported | ||||||||||||||||
pass | ||||||||||||||||
def from_dvcs_to_listed_origin(self, artifact: DVCS) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given a dvcs repository, yield a ListedOrigin.""" | ||||||||||||||||
pass | ||||||||||||||||
def from_origin_to_listed_origin( | ||||||||||||||||
self, origin_upstream: OriginUpstream | ||||||||||||||||
) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given an upstream origin, yield a ListedOrigin.""" | ||||||||||||||||
assert self.lister_obj.id is not None | ||||||||||||||||
yield ListedOrigin( | ||||||||||||||||
lister_id=self.lister_obj.id, | ||||||||||||||||
url=origin_upstream.url, | ||||||||||||||||
visit_type="git", # both nixpkgs and guix are git origins so far | ||||||||||||||||
) | ||||||||||||||||
def from_tarball_to_listed_origin(self, tarball: Tarball) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given a tarball, yield as many ListedOrigin as tarball urls.""" | ||||||||||||||||
# FIXME: maybe check or filter according to file extensions | ||||||||||||||||
assert self.lister_obj.id is not None | ||||||||||||||||
for url in tarball.urls: | ||||||||||||||||
yield ListedOrigin( | ||||||||||||||||
lister_id=self.lister_obj.id, | ||||||||||||||||
url=url, | ||||||||||||||||
visit_type="tar", | ||||||||||||||||
extra_loader_arguments={ | ||||||||||||||||
"artifacts": [ | ||||||||||||||||
{ | ||||||||||||||||
"url": url, | ||||||||||||||||
} | ||||||||||||||||
], | ||||||||||||||||
"extid_manifest_format": "$url $integrity", | ||||||||||||||||
Done Inline Actionsthis definitely must include $integrity to detect changes in the URL's content vlorentz: this definitely must include `$integrity` to detect changes in the URL's content | ||||||||||||||||
Done Inline Actionsdefinitely! ardumont: definitely! | ||||||||||||||||
"integrity": tarball.integrity, | ||||||||||||||||
Done Inline ActionsI know it's a noop on non-GitHub URLs, but I find it meh to call this unconditionally on non-GitHub URLs; because seemingly innocuous changes to GithubSession.get_canonical_url's code would make us leak our API tokens to random websites. Heh, I guess that's fine. vlorentz: I know it's a noop on non-GitHub URLs, but I find it meh to call this unconditionally on non… | ||||||||||||||||
Done Inline ActionsIt's up to reviews to block innocuous change that would do what you said. ardumont: It's up to reviews to block innocuous change that would do what you said. | ||||||||||||||||
}, | ||||||||||||||||
) | ||||||||||||||||
def from_file_to_listed_origin(self, file: File) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given a remote file, yield a ListedOrigin.""" | ||||||||||||||||
pass | ||||||||||||||||
def get_origins_from_page( | ||||||||||||||||
self, artifact_tuple: PageResult | ||||||||||||||||
) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given an artifact tuple (type, artifact), yield a ListedOrigin.""" | ||||||||||||||||
artifact_type, artifact = artifact_tuple | ||||||||||||||||
mapping_type_to_fn: Dict[str, Callable[[Any], Iterator[ListedOrigin]]] = { | ||||||||||||||||
Done Inline ActionsIt's really Callable[[ArtifactTypes], Iterator[ListedOrigins]]... [1] But then mypy
And i don't immediately see how to simply declare this ¯\_(ツ)_/¯. So Callable[[Any], Iterator[ListedOrigins]] it is. [1] as ArtifactTypes == Union[File, DVCS, Tarball, OriginUpstream] ardumont: It's really Callable[[ArtifactTypes], Iterator[ListedOrigins]]... [1] But then mypy
annoys me… | ||||||||||||||||
Done Inline Actionsannoys me with: swh/lister/nixguix/lister.py:210: error: Dict entry 0 has incompatible type "str": "Callable[[DVCS], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" swh/lister/nixguix/lister.py:211: error: Dict entry 1 has incompatible type "str": "Callable[[File], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" swh/lister/nixguix/lister.py:212: error: Dict entry 2 has incompatible type "str": "Callable[[OriginUpstream], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" swh/lister/nixguix/lister.py:213: error: Dict entry 3 has incompatible type "str": "Callable[[Tarball], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" Found 4 errors in 1 file (checked 154 source files) ardumont: annoys me with:
```
swh/lister/nixguix/lister.py:210: error: Dict entry 0 has incompatible type… | ||||||||||||||||
Done Inline ActionsIt's because Callable[[OriginUpstream], Iterator[ListedOrigin]] is not a subtype of Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]; the latter is a function that accepts strictly more argument types than the latter. (mypy calls is "contravariance" https://mypy.readthedocs.io/en/stable/generics.html#variance-of-generic-types ) In other words, Callable[[A], ...] is a subtype of Callable[[B], ...] iff B is a subtype of A. vlorentz: It's because `Callable[[OriginUpstream], Iterator[ListedOrigin]]` is not a subtype of `Callable… | ||||||||||||||||
Done Inline ActionsRight, thx. I knew it was related to one of those (co|contra)variance related ;) but did not want to dig in. ardumont: Right, thx.
I knew it was related to one of those (co|contra)variance related ;) but did not… | ||||||||||||||||
Done Inline Actions
It's also the only way without changing the code or considerably enhancing Python's type system :p vlorentz: > I feel like using Any is enough currently.
It's also the only way without changing the code… | ||||||||||||||||
"dvcs": self.from_dvcs_to_listed_origin, | ||||||||||||||||
"file": self.from_file_to_listed_origin, | ||||||||||||||||
"origin": self.from_origin_to_listed_origin, | ||||||||||||||||
"tarball": self.from_tarball_to_listed_origin, | ||||||||||||||||
} | ||||||||||||||||
Not Done Inline Actions
vlorentz: | ||||||||||||||||
yield from mapping_type_to_fn[artifact_type](artifact) | ||||||||||||||||
# callable_fn = mapping_fn[object_type] | ||||||||||||||||
# yield from callable_fn(artifacts) | ||||||||||||||||
Done Inline Actionsfwiw, I've really opened this only for docker because i got rate-limited almost immediately otherwise... ardumont: fwiw, I've really opened this only for docker because i got rate-limited almost immediately… | ||||||||||||||||
# origin_url = artifacts["url"] | ||||||||||||||||
# last_update = iso8601.parse_date(project_info["time_modified"]) | ||||||||||||||||
Done Inline ActionsLet's make a dict of hashes out of the integrity field [1]. @vlorentz any no go against this idea ^ (I'm not clear whether we drop the integrity field as well or if we keep it after the change ^, as it could serve as metadata too). ardumont: Let's make a dict of hashes out of the integrity field [1].
@vlorentz any no go against this… | ||||||||||||||||
Done Inline Actionsyeah, drop it. No point in having both the SRI field and the hash dict. vlorentz: yeah, drop it. No point in having both the SRI field and the hash dict. | ||||||||||||||||
# logger.debug("Found origin %s last updated on %s", origin_url, last_update) | ||||||||||||||||
# yield ListedOrigin( | ||||||||||||||||
# lister_id=self.lister_obj.id, | ||||||||||||||||
# url=origin_url, | ||||||||||||||||
# visit_type="tar", | ||||||||||||||||
# last_update=last_update, | ||||||||||||||||
# extra_loader_arguments={"artifacts": artifacts[project_name]}, | ||||||||||||||||
# ) | ||||||||||||||||
Done Inline Actions
which implies a new ContentLoader(PackageLoader). ardumont: which implies a new `ContentLoader(PackageLoader)`. | ||||||||||||||||
Done Inline ActionsYeah that makes sense. (Or make it inherit BaseLoader directly, whichever is more convenient) vlorentz: Yeah that makes sense. (Or make it inherit BaseLoader directly, whichever is more convenient) | ||||||||||||||||
Done Inline Actionsyou also need to pass a manifest type name (and version); otherwise multiple formats would be undistinguishable once in the DB. And a nitpick: have the $url last, for consistency with other formats vlorentz: you also need to pass a manifest type name (and version); otherwise multiple formats would be… | ||||||||||||||||
Done Inline Actions
as in the one information from this listing, specifically [1]? { ... "version": "1", "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" }
ok ardumont: > you also need to pass a manifest type name (and version); otherwise multiple formats would be… | ||||||||||||||||
Done Inline Actionsvlorentz: no, I mean these: https://forge.softwareheritage.org/source/swh-loader… | ||||||||||||||||
Done Inline ActionsMove this to artefacts.checksums for consistency, and update https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json to include integrity. vlorentz: Move this to `artefacts.checksums` for consistency, and update https://docs.softwareheritage. | ||||||||||||||||
Done Inline Actionsyou mean like this, right? "checksums": { "integrity": "<hex-encoded string>", }, ardumont: you mean like this, right?
```
"checksums": {
"integrity": "<hex-encoded… | ||||||||||||||||
Done Inline Actionsit's also prefixed by the hash name, but yes vlorentz: it's also prefixed by the hash name, but yes | ||||||||||||||||
Done Inline Actionsactually, instead of passing the integrity as-is, we could extract the hash name, check it's something we support (typically sha256) and pass it as a key; so we don't need to add nixguix-specific code to the package loader vlorentz: actually, instead of passing the integrity as-is, we could extract the hash name, check it's… | ||||||||||||||||
Done Inline Actionshmm no, scratch that. integrity is fine because it's more of a standard format for extid manifests than anything else we could come up with. vlorentz: hmm no, scratch that. integrity is fine because it's more of a standard format for extid… | ||||||||||||||||
Done Inline Actions
Actually, that's where it should go ^ (and not below). But then i don't get how i can put it under the "checksums" (what you asked below) while keeping the "extid_manifest_format" work... ardumont: Actually, that's where it should go ^ (and not below).
But then i don't get how i can put it… | ||||||||||||||||
Not Done Inline ActionsI don't think "extid_manifest_format" should be passed to the loader; instead the loader should detect the presence of integrity and switch its format when this happens. (Also, I'd like to switch all package listers to switch to integrity instead of custom hash keys, but that's for later) vlorentz: I don't think "extid_manifest_format" should be passed to the loader; instead the loader should… | ||||||||||||||||
Done Inline ActionsI see. Ok then, a new diff will be on its way for that ArchiveLoader then regarding this. ardumont: I see.
Ok then, a new diff will be on its way for that ArchiveLoader then regarding this.
And… | ||||||||||||||||
Not Done Inline Actionssounds good! vlorentz: sounds good! | ||||||||||||||||
Done Inline Actionsvcs, too vlorentz: vcs, too | ||||||||||||||||
Not Done Inline Actionsyou can remove that line, session is already initialized by the base Lister class. anlambert: you can remove that line, session is already initialized by the base Lister class. | ||||||||||||||||
Done Inline Actionsack ardumont: ack | ||||||||||||||||
Not Done Inline Actionsallow_redirects is set to True by default for GET requests, you can remove that parameter. anlambert: `allow_redirects` is set to `True` by default for GET requests, you can remove that parameter. | ||||||||||||||||
Done Inline Actionsright ardumont: right | ||||||||||||||||
Done Inline Actionsok (missing coverage), i can add that part as well... ardumont: ok (missing coverage), i can add that part as well... |
use an enum instead of str. (or typing.Literal if it really needs to be a string)