Changeset View
Standalone View
swh/lister/nixguix/lister.py
- This file was added.
# Copyright (C) 2020-2022 The Software Heritage developers | ||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||||||
# See top-level LICENSE file for more information | ||||||||||||||||
"""NixGuix lister definition. | ||||||||||||||||
This lists artifacts out of manifest for Guix or Nixpkgs manifests. | ||||||||||||||||
Artifacts can be of types: | ||||||||||||||||
- upstream git repository (NixOS/nixpkgs, Guix) | ||||||||||||||||
- VCS repositories (svn, git, hg, ...) | ||||||||||||||||
- unique file | ||||||||||||||||
- unique tarball | ||||||||||||||||
""" | ||||||||||||||||
import base64 | ||||||||||||||||
from dataclasses import dataclass | ||||||||||||||||
from enum import Enum | ||||||||||||||||
import logging | ||||||||||||||||
from pathlib import Path | ||||||||||||||||
import random | ||||||||||||||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union | ||||||||||||||||
from urllib.parse import urlparse | ||||||||||||||||
import requests | ||||||||||||||||
from swh.core.github.utils import GitHubSession | ||||||||||||||||
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT | ||||||||||||||||
from swh.lister import TARBALL_EXTENSIONS | ||||||||||||||||
from swh.lister.pattern import CredentialsType, StatelessLister | ||||||||||||||||
from swh.scheduler.model import ListedOrigin | ||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||
class ArtifactNatureUndetected(ValueError):
    """Raised when a remote artifact's nature (tarball, file) cannot be detected."""

    pass
@dataclass
class OriginUpstream:
    """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""

    origin: str
    """Canonical url of the repository"""
    version: int
    """Version of the repository (dismissed?)"""
    revision: str
    """Revision of the repository (dismissed?)"""
@dataclass
class Artifact:
    """Metadata information on Remote Artifact with url (tarball or file)."""

    origin: str
    """Canonical url retrieve the tarball artifact."""
    visit_type: str
    """Either 'tar' or 'file' """
    fallback_urls: List[str]
    """List of urls to retrieve tarball artifact if canonical url no longer works."""
    checksums: Dict[str, str]
    """Integrity hash converted into a checksum dict."""
@dataclass
class VCS:
    """Metadata information on VCS."""

    origin: str
    """Origin url of the vcs"""
    ref: Optional[str]
    """Reference either a svn commit id, a git commit, ..."""
    type: str
    """Type of (d)vcs, e.g. svn, git, hg, ..."""
class ArtifactType(Enum):
    """The possible artifact types listed out of the manifest."""

    ARTIFACT = "artifact"
    ORIGIN = "origin"
    VCS = "vcs"
# A "page" here is a single typed artifact out of the manifest.
PageResult = Tuple[ArtifactType, Union[Artifact, VCS, OriginUpstream]]


VCS_SUPPORTED = ("git", "svn", "hg")

# Rough approximation of what we can find of mimetypes for tarballs "out there"
POSSIBLE_TARBALL_MIMETYPES = set(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
    """Determine whether a list of files actually are tarballs or simple files.

    When this cannot be answered simply out of the url, when request is provided, this
    executes a HTTP `HEAD` query on the url to determine the information. If request is
    not provided, this raises an ArtifactNatureUndetected exception.

    Args:
        urls: name of the remote files for which the extension needs to be checked.
        request: optional requests-like session used for the fallback `HEAD` query.

    Raises:
        ArtifactNatureUndetected when the artifact's nature cannot be detected out
        of its url

    Returns: A tuple (bool, url). The boolean represents whether the url is an archive
        or not. The second parameter is the actual url once the head request is issued
        as a fallback of not finding out whether the urls are tarballs or not.

    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            IndexError in case no extension is available

        """
        return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS

    # Sniff the first url deterministically: it is also the url returned as the
    # origin, so extension detection and returned origin stay consistent.
    url = urls[0]
    try:
        return _is_tarball(url), url
    except IndexError:
        if request is None:
            raise ArtifactNatureUndetected(
                f"Cannot determine artifact type from url {url}"
            )
        logger.warning(
            "Cannot detect extension for '%s'. Fallback to http head query",
            url,
        )
        response = request.head(url)
        # response.ok already covers 404 (and any other 4xx/5xx status)
        if not response.ok:
            raise ArtifactNatureUndetected(
                f"Cannot determine artifact type from url {url}"
            )
        location = response.headers.get("Location")
        if location:  # It's not always present
            logger.debug("Location: %s", location)
            try:
                # FIXME: location is also returned as it's considered the true origin,
                # true enough?
                return _is_tarball(location), location
            except IndexError:
                logger.warning(
                    "Still cannot detect extension through location '%s'...",
                    url,
                )
        content_type = response.headers.get("Content-Type")
        if content_type:
            logger.debug("Content-Type: %s", content_type)
            if content_type == "application/json":
                return False, url
            return content_type in POSSIBLE_TARBALL_MIMETYPES, url
        raise ArtifactNatureUndetected(
            f"Cannot determine artifact type from url {url}"
        )
# Per-vcs mapping of the manifest keys holding the reference and the url of a
# vcs artifact (e.g. a "git" entry stores its url under the "git_url" key).
VCS_KEYS_MAPPING = {
    "git": {
        "ref": "git_ref",
        "url": "git_url",
    },
    "svn": {
        "ref": "svn_revision",
        "url": "svn_url",
    },
    "hg": {
        "ref": "hg_changeset",
        "url": "hg_url",
    },
}
class NixGuixLister(StatelessLister[PageResult]):
    """List Guix or Nix sources out of a public json manifest.

    This lister can output:

    - unique tarball (.tar.gz, .tbz2, ...)
    - vcs repositories (e.g. git, hg, svn)
    - unique file (.lisp, .py, ...)

    Note that no `last_update` is available in either manifest.

    For `url` types artifacts, this tries to determine the artifact's nature, tarball
    or file. It first tries to compute out of the "url" extension. In case of no
    extension, it fallbacks to query (HEAD) the url to retrieve the origin out of the
    `Location` response header, and then checks the extension again.

    """

    LISTER_NAME = "nixguix"

    def __init__(
        self,
        scheduler,
        url: str,
        origin_upstream: str,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        # canonicalize urls, can be turned off during docker runs
        canonicalize: bool = True,
        **kwargs: Any,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url.rstrip("/"),
            instance=instance,
            credentials=credentials,
        )
        # either full fqdn NixOS/nixpkgs or guix repository urls
        # maybe add an assert on those specific urls?
        self.origin_upstream = origin_upstream

        self.session = requests.Session()
        # for testing purposes, we may want to skip this step (e.g. docker run and
        # rate limit)
        self.github_session = (
            GitHubSession(
                credentials=self.credentials,
                user_agent=str(self.session.headers["User-Agent"]),
            )
            if canonicalize
            else None
        )

    def get_pages(self) -> Iterator[PageResult]:
        """Yield one page per "typed" origin referenced in manifest."""
        # fetch and parse the manifest...
        response = self.http_request(self.url)

        # ... if any
        raw_data = response.json()
        version = raw_data["version"]
        revision = raw_data["revision"]
        yield ArtifactType.ORIGIN, OriginUpstream(
            self.origin_upstream,
            version,
            revision,
        )

        # Observed "type" values in the manifests:
        # grep '"type"' guix-sources.json | sort | uniq
        #   "type": false       <<<<<<<<< noise
        #   "type": "git",
        #   "type": "hg",
        #   "type": "no-origin" <<<<<<<<< noise
        #   "type": "svn",
        #   "type": "url",
        # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
        #   "type": "url",
        for artifact in raw_data["sources"]:
            artifact_type = artifact["type"]
            if artifact_type in VCS_SUPPORTED:
                plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
                plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
                # Canonicalize GitHub urls when a github session is available;
                # get_canonical_url is a noop on non-GitHub urls.
                artifact_url = (
                    self.github_session.get_canonical_url(plain_url)
                    if self.github_session
                    else plain_url
                )
                if not artifact_url:
                    continue
                yield ArtifactType.VCS, VCS(
                    origin=artifact_url, type=artifact_type, ref=plain_ref
                )
            elif artifact_type == "url":
                # It's either a tarball or a file
                urls = artifact.get("urls")
                if not urls:
                    # Nothing to fetch
                    logger.warning("Skipping url '%s': empty artifact", artifact)
                    continue
                assert urls is not None

                # FIXME: T3294: Fix missing scheme in urls
                origin, *fallback_urls = urls

                integrity = artifact.get("integrity")
                if integrity is None:
                    logger.warning("Skipping url '%s': missing integrity field", origin)
                    continue

                try:
                    is_tar, origin = is_tarball(urls, self.session)
                except ArtifactNatureUndetected:
                    logger.warning(
                        "Skipping url '%s': undetected remote artifact type", origin
                    )
                    continue

                # Determine the content checksum stored in the integrity field and
                # convert into a dict of checksums. This only parses the
                # `hash-expression` (hash-<b64-encoded-checksum>) as defined in
                # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
                chksum_algo, chksum_b64 = integrity.split("-")
                checksums: Dict[str, str] = {
                    chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
                }

                logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
                yield ArtifactType.ARTIFACT, Artifact(
                    origin=origin,
                    fallback_urls=fallback_urls,
                    checksums=checksums,
                    visit_type="directory" if is_tar else "content",
                )
            else:
                logger.warning(
                    "Skipping artifact '%s': unsupported type %s",
                    artifact,
                    artifact_type,
                )

    def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]:
        """Given a vcs repository, yield a ListedOrigin."""
        assert self.lister_obj.id is not None
        # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=artifact.origin,
            visit_type=artifact.type,
        )

    def origin_to_listed_origin(
        self, origin_upstream: OriginUpstream
    ) -> Iterator[ListedOrigin]:
        """Given an upstream origin, yield a ListedOrigin."""
        assert self.lister_obj.id is not None
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_upstream.origin,
            visit_type="git",  # both nixpkgs and guix are git origins so far
        )

    def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]:
        """Given an artifact (tarball, file), yield one ListedOrigin."""
        assert self.lister_obj.id is not None
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=artifact.origin,
            visit_type=artifact.visit_type,
            extra_loader_arguments={
                "checksums": artifact.checksums,
                "fallback_urls": artifact.fallback_urls,
            },
        )

    def get_origins_from_page(
        self, artifact_tuple: PageResult
    ) -> Iterator[ListedOrigin]:
        """Given an artifact tuple (type, artifact), yield a ListedOrigin.

        Dispatches to the ``<artifact-type>_to_listed_origin`` method matching
        the artifact type's enum value.
        """
        artifact_type, artifact = artifact_tuple
        mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin")
        yield from mapping_type_fn(artifact)
use an enum instead of str. (or typing.Literal if it really needs to be a string)