Changeset View
Standalone View
swh/lister/nixguix/lister.py
- This file was added.
# Copyright (C) 2020-2022 The Software Heritage developers | ||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||||||
# See top-level LICENSE file for more information | ||||||||||||||||
"""NixGuix lister definition. | ||||||||||||||||
This lists artifacts out of manifest for Guix or Nixpkgs manifests. | ||||||||||||||||
Artifacts can be of types: | ||||||||||||||||
- upstream git repository (NixOS/nixpkgs, Guix) | ||||||||||||||||
- DVCS repositories (svn, git, hg, ...) | ||||||||||||||||
- unique file | ||||||||||||||||
- unique tarball | ||||||||||||||||
""" | ||||||||||||||||
import base64 | ||||||||||||||||
from dataclasses import dataclass | ||||||||||||||||
import logging | ||||||||||||||||
from pathlib import Path | ||||||||||||||||
import random | ||||||||||||||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union | ||||||||||||||||
import requests | ||||||||||||||||
from swh.core.github.utils import GitHubSession | ||||||||||||||||
from swh.lister import TARBALL_EXTENSIONS | ||||||||||||||||
from swh.lister.pattern import CredentialsType, StatelessLister | ||||||||||||||||
from swh.scheduler.model import ListedOrigin | ||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||
@dataclass | ||||||||||||||||
class OriginUpstream: | ||||||||||||||||
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix).""" | ||||||||||||||||
origin: str | ||||||||||||||||
"""Canonical url of the repository""" | ||||||||||||||||
version: int | ||||||||||||||||
"""Version of the repository (dismissed?)""" | ||||||||||||||||
revision: str | ||||||||||||||||
"""Revision of the repository (dismissed?)""" | ||||||||||||||||
@dataclass | ||||||||||||||||
class Artifact: | ||||||||||||||||
"""Metadata information on Remote Artifact with url (tarball or file).""" | ||||||||||||||||
origin: str | ||||||||||||||||
"""Canonical url retrieve the tarball artifact.""" | ||||||||||||||||
visit_type: str | ||||||||||||||||
"""Either 'tar' or 'file' """ | ||||||||||||||||
fallback_urls: List[str] | ||||||||||||||||
"""List of urls to retrieve tarball artifact if canonical url no longer works.""" | ||||||||||||||||
checksums: Dict[str, str] | ||||||||||||||||
"""Integrity hash converted into a checksum dict.""" | ||||||||||||||||
@dataclass | ||||||||||||||||
class DVCS: | ||||||||||||||||
"""Metadata information on DVCS.""" | ||||||||||||||||
origin: str | ||||||||||||||||
"""Origin url of the dvcs""" | ||||||||||||||||
ref: Optional[str] | ||||||||||||||||
"""Reference either a svn commit id, a git commit, ...""" | ||||||||||||||||
type: str | ||||||||||||||||
"""Type of (d)vcs, e.g. svn, git, hg, ...""" | ||||||||||||||||
ArtifactTypes = Union[Artifact, DVCS, OriginUpstream] | ||||||||||||||||
PageResult = Tuple[str, ArtifactTypes] | ||||||||||||||||
vlorentz: use an enum instead of `str`. (or `typing.Literal` if it really needs to be a string) | ||||||||||||||||
DVCS_SUPPORTED = ("git", "svn", "hg") | ||||||||||||||||
Done Inline Actionssvn is not a DVCS, rename it to SUPPORTED_VCS vlorentz: svn is not a DVCS, rename it to `SUPPORTED_VCS` | ||||||||||||||||
def is_tarball(urls: List[str]) -> bool: | ||||||||||||||||
"""Determine whether a list of files actually are tarballs or simple files. | ||||||||||||||||
Args: | ||||||||||||||||
urls: name of the remote files for which the extensions needs to be | ||||||||||||||||
checked. | ||||||||||||||||
Returns: | ||||||||||||||||
Whether filename is an archive or not | ||||||||||||||||
Example: | ||||||||||||||||
>>> is_tarball(['abc.zip']) | ||||||||||||||||
True | ||||||||||||||||
>>> is_tarball(['one.tar.gz', 'two.tgz']) | ||||||||||||||||
True | ||||||||||||||||
>>> is_tarball(['abc.c', 'other.c']) | ||||||||||||||||
False | ||||||||||||||||
Done Inline ActionsMaybe you could update and reuse tarball mimetypes declared in swh.core.tarball instead ? anlambert: Maybe you could update and reuse tarball mimetypes declared in [swh.core.tarball](https://forge. | ||||||||||||||||
Done Inline Actionsyeah, that'd make sense, can we iterate over this in another diff? That'd let me the time to open those mimetypes in swh.core.tarball (and release) so we can use a non private variable. ardumont: yeah, that'd make sense, can we iterate over this in another diff?
That'd let me the time to… | ||||||||||||||||
Done Inline ActionsOpened [1] in swh.core [1] D8603 ardumont: Opened [1] in swh.core
[1] D8603 | ||||||||||||||||
Done Inline ActionsI've amended that lister diff to do that immediatly in the end (the dev/diff/review/package got fast ;) ardumont: I've amended that lister diff to do that immediatly in the end (the dev/diff/review/package got… | ||||||||||||||||
""" | ||||||||||||||||
index = random.randrange(len(urls)) | ||||||||||||||||
filepath = urls[index] | ||||||||||||||||
file_suffix = Path(filepath).suffixes[-1].lstrip(".") | ||||||||||||||||
return file_suffix in TARBALL_EXTENSIONS | ||||||||||||||||
DVCS_KEYS_MAPPING = { | ||||||||||||||||
"git": { | ||||||||||||||||
"ref": "git_ref", | ||||||||||||||||
"url": "git_url", | ||||||||||||||||
}, | ||||||||||||||||
"svn": { | ||||||||||||||||
"ref": "svn_revision", | ||||||||||||||||
"url": "svn_url", | ||||||||||||||||
}, | ||||||||||||||||
"hg": { | ||||||||||||||||
"ref": "hg_changeset", | ||||||||||||||||
"url": "hg_url", | ||||||||||||||||
}, | ||||||||||||||||
} | ||||||||||||||||
class NixGuixLister(StatelessLister[PageResult]): | ||||||||||||||||
"""List Guix or Nix sources out of a public json manifest. | ||||||||||||||||
This lister can output: | ||||||||||||||||
- unique tarball (.tar.gz, .tbz2, ...) | ||||||||||||||||
- dvcs repositories (e.g. git, hg, svn) | ||||||||||||||||
Not Done Inline Actions
vlorentz: | ||||||||||||||||
- unique file (.lisp, .py, ...) | ||||||||||||||||
Note that no `last_update` is available in either manifest. | ||||||||||||||||
""" | ||||||||||||||||
LISTER_NAME = "NixGuixLister" | ||||||||||||||||
def __init__( | ||||||||||||||||
self, | ||||||||||||||||
scheduler, | ||||||||||||||||
url: str, | ||||||||||||||||
origin_upstream: str, | ||||||||||||||||
name: Optional[str] = "nixpkgs", | ||||||||||||||||
Not Done Inline ActionsCould you make it a required argument, to avoid accidental misconfiguration? vlorentz: Could you make it a required argument, to avoid accidental misconfiguration? | ||||||||||||||||
Done Inline Actionsyes ardumont: yes | ||||||||||||||||
Done Inline Actionsactually it got dropped as it was a misnamed "instance" equivalent... ardumont: actually it got dropped as it was a misnamed "instance" equivalent... | ||||||||||||||||
instance: Optional[str] = None, | ||||||||||||||||
credentials: Optional[CredentialsType] = None, | ||||||||||||||||
**kwargs: Any, | ||||||||||||||||
): | ||||||||||||||||
super().__init__( | ||||||||||||||||
scheduler=scheduler, | ||||||||||||||||
url=url.rstrip("/"), | ||||||||||||||||
instance=instance, | ||||||||||||||||
credentials=credentials, | ||||||||||||||||
) | ||||||||||||||||
# either full fqdn NixOS/nixpkgs or guix repository urls | ||||||||||||||||
# maybe add an assert on those specific urls? | ||||||||||||||||
self.origin_upstream = origin_upstream | ||||||||||||||||
self.session = requests.Session() | ||||||||||||||||
self.session.headers.update({"Accept": "application/json"}) | ||||||||||||||||
self.github_session = GitHubSession( | ||||||||||||||||
credentials=self.credentials, | ||||||||||||||||
user_agent=str(self.session.headers["User-Agent"]), | ||||||||||||||||
) | ||||||||||||||||
def get_pages(self) -> Iterator[PageResult]: | ||||||||||||||||
"""Yield one page per "typed" origin referenced in manifest.""" | ||||||||||||||||
# fetch and parse the manifest... | ||||||||||||||||
response = self.http_request(self.url, allow_redirects=True) | ||||||||||||||||
# ... if any | ||||||||||||||||
raw_data = response.json() | ||||||||||||||||
version = raw_data["version"] | ||||||||||||||||
revision = raw_data["revision"] | ||||||||||||||||
yield "origin", OriginUpstream( | ||||||||||||||||
self.origin_upstream, | ||||||||||||||||
version, | ||||||||||||||||
revision, | ||||||||||||||||
) | ||||||||||||||||
# grep '"type"' guix-sources.json | sort | uniq | ||||||||||||||||
# "type": false <<<<<<<<< noise | ||||||||||||||||
# "type": "git", | ||||||||||||||||
# "type": "hg", | ||||||||||||||||
# "type": "no-origin", <<<<<<<<< noise | ||||||||||||||||
# "type": "svn", | ||||||||||||||||
# "type": "url", | ||||||||||||||||
# grep '"type"' nixpkgs-sources-unstable.json | sort | uniq | ||||||||||||||||
# "type": "url", | ||||||||||||||||
for artifact in raw_data["sources"]: | ||||||||||||||||
artifact_type = artifact["type"] | ||||||||||||||||
if artifact_type in DVCS_SUPPORTED: | ||||||||||||||||
plain_url = artifact[DVCS_KEYS_MAPPING[artifact_type]["url"]] | ||||||||||||||||
plain_ref = artifact[DVCS_KEYS_MAPPING[artifact_type]["ref"]] | ||||||||||||||||
Done Inline Actionsthis definitely must include $integrity to detect changes in the URL's content vlorentz: this definitely must include `$integrity` to detect changes in the URL's content | ||||||||||||||||
Done Inline Actionsdefinitely! ardumont: definitely! | ||||||||||||||||
artifact_url = self.github_session.get_canonical_url(plain_url) | ||||||||||||||||
Done Inline ActionsI know it's a noop on non-GitHub URLs, but I find it meh to call this unconditionally on non-GitHub URLs; because seemingly innocuous changes to GithubSession.get_canonical_url's code would make us leak our API tokens to random websites. Heh, I guess that's fine. vlorentz: I know it's a noop on non-GitHub URLs, but I find it meh to call this unconditionally on non… | ||||||||||||||||
Done Inline ActionsIt's up to reviews to block innocuous change that would do what you said. ardumont: It's up to reviews to block innocuous change that would do what you said. | ||||||||||||||||
if not artifact_url: | ||||||||||||||||
continue | ||||||||||||||||
yield "dvcs", DVCS( | ||||||||||||||||
origin=artifact_url, type=artifact_type, ref=plain_ref | ||||||||||||||||
) | ||||||||||||||||
elif artifact_type == "url": | ||||||||||||||||
# It's either a tarball or a file | ||||||||||||||||
urls = artifact.get("urls") | ||||||||||||||||
if not urls: | ||||||||||||||||
# Nothing to fetch | ||||||||||||||||
logger.warning("Skipping empty artifact %s", artifact) | ||||||||||||||||
continue | ||||||||||||||||
Done Inline ActionsIt's really Callable[[ArtifactTypes], Iterator[ListedOrigins]]... [1] But then mypy
And i don't immediately see how to simply declare this ¯\_(ツ)_/¯. So Callable[[Any], Iterator[ListedOrigins]] it is. [1] as ArtifactTypes == Union[File, DVCS, Tarball, OriginUpstream] ardumont: It's really Callable[[ArtifactTypes], Iterator[ListedOrigins]]... [1] But then mypy
annoys me… | ||||||||||||||||
Done Inline Actionsannoys me with: swh/lister/nixguix/lister.py:210: error: Dict entry 0 has incompatible type "str": "Callable[[DVCS], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" swh/lister/nixguix/lister.py:211: error: Dict entry 1 has incompatible type "str": "Callable[[File], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" swh/lister/nixguix/lister.py:212: error: Dict entry 2 has incompatible type "str": "Callable[[OriginUpstream], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" swh/lister/nixguix/lister.py:213: error: Dict entry 3 has incompatible type "str": "Callable[[Tarball], Iterator[ListedOrigin]]"; expected "str": "Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]" Found 4 errors in 1 file (checked 154 source files) ardumont: annoys me with:
```
swh/lister/nixguix/lister.py:210: error: Dict entry 0 has incompatible type… | ||||||||||||||||
Done Inline ActionsIt's because Callable[[OriginUpstream], Iterator[ListedOrigin]] is not a subtype of Callable[[Union[Tarball, File, DVCS, OriginUpstream]], Iterator[ListedOrigin]]; the latter is a function that accepts strictly more argument types than the latter. (mypy calls is "contravariance" https://mypy.readthedocs.io/en/stable/generics.html#variance-of-generic-types ) In other words, Callable[[A], ...] is a subtype of Callable[[B], ...] iff B is a subtype of A. vlorentz: It's because `Callable[[OriginUpstream], Iterator[ListedOrigin]]` is not a subtype of `Callable… | ||||||||||||||||
Done Inline ActionsRight, thx. I knew it was related to one of those (co|contra)variance related ;) but did not want to dig in. ardumont: Right, thx.
I knew it was related to one of those (co|contra)variance related ;) but did not… | ||||||||||||||||
Done Inline Actions
It's also the only way without changing the code or considerably enhancing Python's type system :p vlorentz: > I feel like using Any is enough currently.
It's also the only way without changing the code… | ||||||||||||||||
assert urls is not None | ||||||||||||||||
# Determine the content checksum stored in the integrity field | ||||||||||||||||
# (hash-<b64-encoded-checksum>) and convert into a dict of checksums | ||||||||||||||||
# https://w3c.github.io/webappsec-subresource-integrity/#grammardef-hash-algo | ||||||||||||||||
Not Done Inline Actions
vlorentz: | ||||||||||||||||
chksum_algo, chksum_b64 = artifact["integrity"].split("-") | ||||||||||||||||
checksums: Dict[str, str] = { | ||||||||||||||||
chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() | ||||||||||||||||
} | ||||||||||||||||
Done Inline Actionsfwiw, I've really opened this only for docker because i got rate-limited almost immediately otherwise... ardumont: fwiw, I've really opened this only for docker because i got rate-limited almost immediately… | ||||||||||||||||
# FIXME: T3294: Fix missing scheme in urls | ||||||||||||||||
origin, *fallback_urls = urls | ||||||||||||||||
Done Inline ActionsLet's make a dict of hashes out of the integrity field [1]. @vlorentz any no go against this idea ^ (I'm not clear whether we drop the integrity field as well or if we keep it after the change ^, as it could serve as metadata too). ardumont: Let's make a dict of hashes out of the integrity field [1].
@vlorentz any no go against this… | ||||||||||||||||
Done Inline Actionsyeah, drop it. No point in having both the SRI field and the hash dict. vlorentz: yeah, drop it. No point in having both the SRI field and the hash dict. | ||||||||||||||||
yield "artifact", Artifact( | ||||||||||||||||
origin=origin, | ||||||||||||||||
fallback_urls=fallback_urls, | ||||||||||||||||
checksums=checksums, | ||||||||||||||||
visit_type="tar" if is_tarball(urls) else "file", | ||||||||||||||||
) | ||||||||||||||||
else: | ||||||||||||||||
logger.warning( | ||||||||||||||||
"Skipping unsupported type %s for artifact %s", | ||||||||||||||||
artifact_type, | ||||||||||||||||
artifact, | ||||||||||||||||
Not Done Inline Actionsyou can remove that line, session is already initialized by the base Lister class. anlambert: you can remove that line, session is already initialized by the base Lister class. | ||||||||||||||||
Done Inline Actionsack ardumont: ack | ||||||||||||||||
) | ||||||||||||||||
def dvcs_to_listed_origin(self, artifact: DVCS) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given a dvcs repository, yield a ListedOrigin.""" | ||||||||||||||||
Done Inline Actionsvcs, too vlorentz: vcs, too | ||||||||||||||||
assert self.lister_obj.id is not None | ||||||||||||||||
# FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...) | ||||||||||||||||
yield ListedOrigin( | ||||||||||||||||
lister_id=self.lister_obj.id, | ||||||||||||||||
url=artifact.origin, | ||||||||||||||||
visit_type=artifact.type, | ||||||||||||||||
) | ||||||||||||||||
def origin_to_listed_origin( | ||||||||||||||||
self, origin_upstream: OriginUpstream | ||||||||||||||||
) -> Iterator[ListedOrigin]: | ||||||||||||||||
Not Done Inline Actionsallow_redirects is set to True by default for GET requests, you can remove that parameter. anlambert: `allow_redirects` is set to `True` by default for GET requests, you can remove that parameter. | ||||||||||||||||
Done Inline Actionsright ardumont: right | ||||||||||||||||
"""Given an upstream origin, yield a ListedOrigin.""" | ||||||||||||||||
assert self.lister_obj.id is not None | ||||||||||||||||
yield ListedOrigin( | ||||||||||||||||
lister_id=self.lister_obj.id, | ||||||||||||||||
url=origin_upstream.origin, | ||||||||||||||||
visit_type="git", # both nixpkgs and guix are git origins so far | ||||||||||||||||
) | ||||||||||||||||
Done Inline Actions
Actually, that's where it should go ^ (and not below). But then i don't get how i can put it under the "checksums" (what you asked below) while keeping the "extid_manifest_format" work... ardumont: Actually, that's where it should go ^ (and not below).
But then i don't get how i can put it… | ||||||||||||||||
Not Done Inline ActionsI don't think "extid_manifest_format" should be passed to the loader; instead the loader should detect the presence of integrity and switch its format when this happens. (Also, I'd like to switch all package listers to switch to integrity instead of custom hash keys, but that's for later) vlorentz: I don't think "extid_manifest_format" should be passed to the loader; instead the loader should… | ||||||||||||||||
Done Inline ActionsI see. Ok then, a new diff will be on its way for that ArchiveLoader then regarding this. ardumont: I see.
Ok then, a new diff will be on its way for that ArchiveLoader then regarding this.
And… | ||||||||||||||||
Not Done Inline Actionssounds good! vlorentz: sounds good! | ||||||||||||||||
def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given an artifact (tarball, file), yield one ListedOrigin.""" | ||||||||||||||||
assert self.lister_obj.id is not None | ||||||||||||||||
yield ListedOrigin( | ||||||||||||||||
lister_id=self.lister_obj.id, | ||||||||||||||||
url=artifact.origin, | ||||||||||||||||
visit_type=artifact.visit_type, | ||||||||||||||||
Done Inline Actions
which implies a new ContentLoader(PackageLoader). ardumont: which implies a new `ContentLoader(PackageLoader)`. | ||||||||||||||||
Done Inline ActionsYeah that makes sense. (Or make it inherit BaseLoader directly, whichever is more convenient) vlorentz: Yeah that makes sense. (Or make it inherit BaseLoader directly, whichever is more convenient) | ||||||||||||||||
extra_loader_arguments={ | ||||||||||||||||
"checksums": artifact.checksums, | ||||||||||||||||
"fallback_urls": artifact.fallback_urls, | ||||||||||||||||
}, | ||||||||||||||||
) | ||||||||||||||||
def get_origins_from_page( | ||||||||||||||||
self, artifact_tuple: PageResult | ||||||||||||||||
) -> Iterator[ListedOrigin]: | ||||||||||||||||
"""Given an artifact tuple (type, artifact), yield a ListedOrigin.""" | ||||||||||||||||
artifact_type, artifact = artifact_tuple | ||||||||||||||||
mapping_type_fn = getattr(self, f"{artifact_type}_to_listed_origin") | ||||||||||||||||
yield from mapping_type_fn(artifact) | ||||||||||||||||
Done Inline Actionsyou also need to pass a manifest type name (and version); otherwise multiple formats would be undistinguishable once in the DB. And a nitpick: have the $url last, for consistency with other formats vlorentz: you also need to pass a manifest type name (and version); otherwise multiple formats would be… | ||||||||||||||||
Done Inline Actions
as in the one information from this listing, specifically [1]? { ... "version": "1", "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" }
ok ardumont: > you also need to pass a manifest type name (and version); otherwise multiple formats would be… | ||||||||||||||||
Done Inline Actionsvlorentz: no, I mean these: https://forge.softwareheritage.org/source/swh-loader… | ||||||||||||||||
Done Inline ActionsMove this to artefacts.checksums for consistency, and update https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json to include integrity. vlorentz: Move this to `artefacts.checksums` for consistency, and update https://docs.softwareheritage. | ||||||||||||||||
Done Inline Actionsyou mean like this, right? "checksums": { "integrity": "<hex-encoded string>", }, ardumont: you mean like this, right?
```
"checksums": {
"integrity": "<hex-encoded… | ||||||||||||||||
Done Inline Actionsit's also prefixed by the hash name, but yes vlorentz: it's also prefixed by the hash name, but yes | ||||||||||||||||
Done Inline Actionsactually, instead of passing the integrity as-is, we could extract the hash name, check it's something we support (typically sha256) and pass it as a key; so we don't need to add nixguix-specific code to the package loader vlorentz: actually, instead of passing the integrity as-is, we could extract the hash name, check it's… | ||||||||||||||||
Done Inline Actionshmm no, scratch that. integrity is fine because it's more of a standard format for extid manifests than anything else we could come up with. vlorentz: hmm no, scratch that. integrity is fine because it's more of a standard format for extid… | ||||||||||||||||
Done Inline Actionsok (missing coverage), i can add that part as well... ardumont: ok (missing coverage), i can add that part as well... |
use an enum instead of str. (or typing.Literal if it really needs to be a string)