diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
index 21cae67..0b8e8be 100644
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -1,555 +1,576 @@
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""NixGuix lister definition.
This lists artifacts out of the Guix or Nixpkgs manifests.
Artifacts can be of types:
- upstream git repository (NixOS/nixpkgs, Guix)
- VCS repositories (svn, git, hg, ...)
- unique file
- unique tarball
"""
import base64
import binascii
from dataclasses import dataclass
from enum import Enum
import logging
from pathlib import Path
import random
import re
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import parse_qsl, urlparse
import requests
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
from swh.core.github.utils import GitHubSession
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
# By default, ignore binary files and archives containing binaries
DEFAULT_EXTENSIONS_TO_IGNORE = [
"AppImage",
"bin",
"exe",
"iso",
"linux64",
"msi",
"png",
"dic",
"deb",
"rpm",
]
class ArtifactNatureUndetected(ValueError):
"""Raised when a remote artifact's nature (tarball, file) cannot be detected."""
pass
class ArtifactNatureMistyped(ValueError):
"""Raised when a remote artifact is neither a tarball nor a file.
Errors of this type are probably a misconfiguration in the manifest generation that
badly typed a vcs repository.
"""
pass
class ArtifactWithoutExtension(ValueError):
"""Raised when an artifact nature cannot be determined by its name."""
pass
class ChecksumsComputation(Enum):
"""The possible artifact types listed out of the manifest."""
STANDARD = "standard"
"""Standard checksums (e.g. sha1, sha256, ...) on the tarball or file."""
NAR = "nar"
"""The hash is computed over the NAR archive dump of the output (e.g. uncompressed
directory.)"""
MAPPING_CHECKSUMS_COMPUTATION = {
"flat": ChecksumsComputation.STANDARD,
"recursive": ChecksumsComputation.NAR,
}
"""Mapping between the outputHashMode from the manifest and how to compute checksums."""
@dataclass
class Artifact:
"""Metadata information on Remote Artifact with url (tarball or file)."""
origin: str
"""Canonical url retrieve the tarball artifact."""
visit_type: str
"""Either 'tar' or 'file' """
fallback_urls: List[str]
"""List of urls to retrieve tarball artifact if canonical url no longer works."""
checksums: Dict[str, str]
"""Integrity hash converted into a checksum dict."""
checksums_computation: ChecksumsComputation
"""Checksums computation mode to provide to loaders (e.g. nar, standard, ...)"""
@dataclass
class VCS:
"""Metadata information on VCS."""
origin: str
"""Origin url of the vcs"""
type: str
"""Type of (d)vcs, e.g. svn, git, hg, ..."""
ref: Optional[str] = None
"""Reference either a svn commit id, a git commit, ..."""
class ArtifactType(Enum):
"""The possible artifact types listed out of the manifest."""
ARTIFACT = "artifact"
VCS = "vcs"
PageResult = Tuple[ArtifactType, Union[Artifact, VCS]]
VCS_SUPPORTED = ("git", "svn", "hg")
# Rough approximation of what we can find of mimetypes for tarballs "out there"
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
def url_endswith(
urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
"""Determine whether urlparsed ends with one of the extensions passed as parameter.
This also accounts for the edge case of a filename with only a version as its name (so no
extension at the end).
Raises:
ArtifactWithoutExtension in case no extension is available and
raise_when_no_extension is True (the default)
"""
paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
match = any(path.suffix.endswith(tuple(extensions)) for path in paths)
if match:
return match
# Some false negatives can happen (e.g. https://<netloc>/path/0.1.5), so make sure
# to catch those
name = Path(urlparsed.path).name
if not PATTERN_VERSION.match(name):
return match
if raise_when_no_extension:
raise ArtifactWithoutExtension
return False
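# For instance (mirroring the tests below), for any ext in TARBALL_EXTENSIONS:
# url_endswith(urlparse(f"https://example.org/one.{ext}"), TARBALL_EXTENSIONS) is True, while
# url_endswith(urlparse("https://example.org/v1.2.3"), TARBALL_EXTENSIONS,
#              raise_when_no_extension=False) is False.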
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
"""Determine whether a list of files actually are tarballs or simple files.
When this cannot be answered simply from the url and a request object is provided, this
executes an HTTP `HEAD` query on the url to determine the information. If request is
not provided, this raises an ArtifactNatureUndetected exception.
Args:
urls: name of the remote files for which the extension needs to be checked.
Raises:
ArtifactNatureUndetected when the artifact's nature cannot be detected out
of its url
ArtifactNatureMistyped when the artifact is neither a tarball nor a file. It's up to
the caller to do what's right with it.
Returns: A tuple (bool, url). The boolean indicates whether the url points to an archive.
The second element is the actual url to use, possibly updated from the `Location` header
when the fallback HEAD request had to be issued.
"""
def _is_tarball(url):
"""Determine out of an extension whether url is a tarball.
Raises:
ArtifactWithoutExtension in case no extension is available
"""
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
return url_endswith(urlparsed, TARBALL_EXTENSIONS)
index = random.randrange(len(urls))
url = urls[index]
try:
return _is_tarball(url), urls[0]
except ArtifactWithoutExtension:
if request is None:
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
logger.warning(
"Cannot detect extension for <%s>. Fallback to http head query",
url,
)
try:
response = request.head(url)
except (InvalidSchema, SSLError, ConnectionError):
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
if not response.ok or response.status_code == 404:
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
location = response.headers.get("Location")
if location: # It's not always present
logger.debug("Location: %s", location)
try:
# FIXME: location is also returned as it's considered the true origin,
# true enough?
return _is_tarball(location), location
except ArtifactWithoutExtension:
logger.warning(
"Still cannot detect extension through location <%s>...",
url,
)
+ origin = urls[0]
+
content_type = response.headers.get("Content-Type")
if content_type:
logger.debug("Content-Type: %s", content_type)
if content_type == "application/json":
- return False, urls[0]
- return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), urls[0]
+ return False, origin
+ return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin
+
+ content_disposition = response.headers.get("Content-Disposition")
+ if content_disposition:
+ logger.debug("Content-Disposition: %s", content_disposition)
+ if "filename=" in content_disposition:
+ fields = content_disposition.split("; ")
+ for field in fields:
+ if "filename=" in field:
+ _, filename = field.split("filename=")
+ break
+
+ return (
+ url_endswith(
+ urlparse(filename),
+ TARBALL_EXTENSIONS,
+ raise_when_no_extension=False,
+ ),
+ origin,
+ )
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
VCS_KEYS_MAPPING = {
"git": {
"ref": "git_ref",
"url": "git_url",
},
"svn": {
"ref": "svn_revision",
"url": "svn_url",
},
"hg": {
"ref": "hg_changeset",
"url": "hg_url",
},
}
class NixGuixLister(StatelessLister[PageResult]):
"""List Guix or Nix sources out of a public json manifest.
This lister can output:
- unique tarball (.tar.gz, .tbz2, ...)
- vcs repositories (e.g. git, hg, svn)
- unique file (.lisp, .py, ...)
Note that no `last_update` is available in either manifest.
For `url` type artifacts, this tries to determine the artifact's nature, tarball or
file. It first tries to determine it from the "url" extension. When there is no extension,
it falls back to an HTTP (HEAD) query on the url to retrieve the origin from the `Location`
response header, and then checks the extension again.
Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
This can be used to drop further binary files detected in the wild.
"""
LISTER_NAME = "nixguix"
def __init__(
self,
scheduler,
url: str,
origin_upstream: str,
instance: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
# canonicalize urls, can be turned off during docker runs
canonicalize: bool = True,
extensions_to_ignore: List[str] = [],
**kwargs: Any,
):
super().__init__(
scheduler=scheduler,
url=url.rstrip("/"),
instance=instance,
credentials=credentials,
)
# either full fqdn NixOS/nixpkgs or guix repository urls
# maybe add an assert on those specific urls?
self.origin_upstream = origin_upstream
self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
self.session = requests.Session()
# for testing purposes, we may want to skip this step (e.g. docker run and rate
# limit)
self.github_session = (
GitHubSession(
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
if canonicalize
else None
)
def build_artifact(
self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None
) -> Optional[Tuple[ArtifactType, VCS]]:
"""Build a canonicalized vcs artifact when possible."""
origin = (
self.github_session.get_canonical_url(artifact_url)
if self.github_session
else artifact_url
)
if not origin:
return None
return ArtifactType.VCS, VCS(
origin=origin, type=artifact_type, ref=artifact_ref
)
def get_pages(self) -> Iterator[PageResult]:
"""Yield one page per "typed" origin referenced in manifest."""
# fetch and parse the manifest...
response = self.http_request(self.url)
# ... if any
raw_data = response.json()
yield ArtifactType.VCS, VCS(origin=self.origin_upstream, type="git")
# grep '"type"' guix-sources.json | sort | uniq
# "type": false <<<<<<<<< noise
# "type": "git",
# "type": "hg",
# "type": "no-origin", <<<<<<<<< noise
# "type": "svn",
# "type": "url",
# grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
# "type": "url",
sources = raw_data["sources"]
random.shuffle(sources)
for artifact in sources:
artifact_type = artifact["type"]
if artifact_type in VCS_SUPPORTED:
plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
built_artifact = self.build_artifact(
plain_url, artifact_type, plain_ref
)
if not built_artifact:
continue
yield built_artifact
elif artifact_type == "url":
# It's either a tarball or a file
origin_urls = artifact.get("urls")
if not origin_urls:
# Nothing to fetch
logger.warning("Skipping url <%s>: empty artifact", artifact)
continue
assert origin_urls is not None
# Deal with urls with empty scheme (basic fallback to http)
urls = []
for url in origin_urls:
urlparsed = urlparse(url)
if urlparsed.scheme == "":
logger.warning("Missing scheme for <%s>: fallback to http", url)
fixed_url = f"http://{url}"
else:
fixed_url = url
urls.append(fixed_url)
origin, *fallback_urls = urls
if origin.endswith(".git"):
built_artifact = self.build_artifact(origin, "git")
if not built_artifact:
continue
yield built_artifact
continue
outputHash = artifact.get("outputHash")
integrity = artifact.get("integrity")
if integrity is None and outputHash is None:
logger.warning(
"Skipping url <%s>: missing integrity and outputHash field",
origin,
)
continue
# Falls back to outputHash field if integrity is missing
if integrity is None and outputHash:
# We'll deal with outputHash as integrity field
integrity = outputHash
try:
is_tar, origin = is_tarball(urls, self.session)
except ArtifactNatureMistyped:
logger.warning(
"Mistyped url <%s>: trying to deal with it properly", origin
)
urlparsed = urlparse(origin)
artifact_type = urlparsed.scheme
if artifact_type in VCS_SUPPORTED:
built_artifact = self.build_artifact(origin, artifact_type)
if not built_artifact:
continue
yield built_artifact
else:
logger.warning(
"Skipping url <%s>: undetected remote artifact type", origin
)
continue
except ArtifactNatureUndetected:
logger.warning(
"Skipping url <%s>: undetected remote artifact type", origin
)
continue
# Determine the content checksum stored in the integrity field and
# convert into a dict of checksums. This only parses the
# `hash-expression` (hash-<b64-encoded-checksum>) as defined in
# https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
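# For instance, an integrity value "sha256-<base64-encoded digest>" becomes
# {"sha256": "<hex-encoded digest>"}.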
try:
chksum_algo, chksum_b64 = integrity.split("-")
checksums: Dict[str, str] = {
chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
}
except binascii.Error:
logger.exception(
"Skipping url: <%s>: integrity computation failure for <%s>",
url,
artifact,
)
continue
# The 'outputHashMode' attribute determines how the hash is computed. It
# must be one of the following two values:
# - "flat": (default) The output must be a non-executable regular file.
# If it isn’t, the build fails. The hash is simply computed over the
# contents of that file (so it’s equal to what Unix commands like
# `sha256sum` or `sha1sum` produce).
# - "recursive": The hash is computed over the NAR archive dump of the
# output (i.e., the result of `nix-store --dump`). In this case,
# the output can be anything, including a directory tree.
outputHashMode = artifact.get("outputHashMode", "flat")
if not is_tar and outputHashMode == "recursive":
# T4608: Cannot deal with those properly yet as some can be missing
# 'critical' information about how to recompute the hash (e.g. fs
# layout, executable bit, ...)
logger.warning(
"Skipping artifact <%s>: 'file' artifact of type <%s> is"
" missing information to properly check its integrity",
artifact,
artifact_type,
)
continue
# At this point plenty of heuristics happened and we should have found
# the right origin and its nature.
# Let's check and filter it out if it is to be ignored (if possible).
# Some origin urls may not have an extension at this point (e.g.
# http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
if url_endswith(
urlparse(origin),
self.extensions_to_ignore,
raise_when_no_extension=False,
):
logger.warning(
"Skipping artifact <%s>: 'file' artifact of type <%s> is"
" ignored due to lister configuration. It should ignore"
" origins with extension [%s]",
origin,
artifact_type,
",".join(self.extensions_to_ignore),
)
continue
logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
yield ArtifactType.ARTIFACT, Artifact(
origin=origin,
fallback_urls=fallback_urls,
checksums=checksums,
checksums_computation=MAPPING_CHECKSUMS_COMPUTATION[outputHashMode],
visit_type="directory" if is_tar else "content",
)
else:
logger.warning(
"Skipping artifact <%s>: unsupported type %s",
artifact,
artifact_type,
)
def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]:
"""Given a vcs repository, yield a ListedOrigin."""
assert self.lister_obj.id is not None
# FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=artifact.origin,
visit_type=artifact.type,
)
def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]:
"""Given an artifact (tarball, file), yield one ListedOrigin."""
assert self.lister_obj.id is not None
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=artifact.origin,
visit_type=artifact.visit_type,
extra_loader_arguments={
"checksums": artifact.checksums,
"checksums_computation": artifact.checksums_computation.value,
"fallback_urls": artifact.fallback_urls,
},
)
def get_origins_from_page(
self, artifact_tuple: PageResult
) -> Iterator[ListedOrigin]:
"""Given an artifact tuple (type, artifact), yield a ListedOrigin."""
artifact_type, artifact = artifact_tuple
mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin")
yield from mapping_type_fn(artifact)
diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json
index 3178159..05fdd79 100644
--- a/swh/lister/nixguix/tests/data/sources-success.json
+++ b/swh/lister/nixguix/tests/data/sources-success.json
@@ -1,279 +1,293 @@
{
"sources": [
{
"type": "url",
"urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
"integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
},
{
"type": "url",
"urls": [ "https://github.com/owner-3/repository-1/revision-1.tar" ],
"integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
},
{
"type": "url",
"urls": [ "https://example.com/file.txt" ],
"integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
},
{
"type": "url",
"urls": [
"https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz"
],
"integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
},
{
"type": "url",
"urls": [
"ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
],
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
},
{
"type": "url",
"urls": [
"www.roudoudou.com/export/cpc/rasm/rasm_v0117_src.zip"
],
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
},
{
"type": "url",
"outputHashMode": "flat",
"urls": [
"http://downloads.sourceforge.net/project/nmon/lmon16n.c",
"http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c",
"http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c"
],
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
},
{
"outputHash": "0s7p9swjqjsqddylmgid6cv263ggq7pmb734z4k84yfcrgb6kg4g",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"type": "url",
"urls": [
"https://github.com/kandu/trie/archive/1.0.0.txz"
],
"integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=",
"inferredFetcher": "fetchzip"
},
{
"type": "url",
"urls": [
"https://github.com/trie/trie.git"
],
"integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g="
},
{
"type": "git",
"git_url": "https://example.org/pali/0xffff",
"git_ref": "0.9"
},
{
"type": "hg",
"hg_url": "https://example.org/vityok/cl-string-match",
"hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97"
},
{
"type": "svn",
"svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
"svn_revision": 39057
},
{
"outputHash": "sha256-LxVcYj2WKHbhNu5x/DFkxQPOYrVkNvwiE/qcODq52Lc=",
"outputHashAlgo": null,
"outputHashMode": "recursive",
"type": "url",
"urls": [
"https://github.com/julian-klode/triehash/archive/debian/0.3-3.tbz"
],
"inferredFetcher": "fetchzip"
},
{
"type": "url",
"urls": [
"http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz"
],
"integrity": "sha256-G/7oY5qdCSJ59VlwHtIbvMdT6+mriXhMqQIHNx65J+E="
},
{
"type": "url",
"urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],
"integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk="
},
{
"outputHash": "0w2qkrrkzfy4h4jld18apypmbi8a8r89y2l11axlv808i2rg68fk",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://github.com/josefnpat/vapor/releases/download/0.2.3/vapor_dbf509f.love"
],
"integrity": "sha256-0yHzsogIoE27CoEKn1BGCsVVr78KhUYlgcS7P3OeWHA=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "0rf06axz1hxssg942w2g66avak30jy6rfdwxynhriqv3vrf17bja",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"http://mirrors.jenkins.io/war-stable/2.303.1/jenkins.war"
],
"integrity": "sha256-Sq4TXN5j45ih9Z03l42XYEy1lTFPcEHS07rD8LsywGU=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "1filqm050ixy53kdv81bd4n80vjvfapnmzizy7jg8a6pilv17gfc",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://files.pythonhosted.org/packages/py2.py3/g/geojson/geojson-2.5.0-py2.py3-none-any.whl"
],
"integrity": "sha256-zL0TNo3XKPTk8T/+aq9yW26ALGkroN3mKL5HUEDFNLo=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "sha256:0i1cw0nfg24b0sg2yc3q7315ng5vc5245nvh0l1cndkn2c9z4978",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://stavekontrolden.dk/dictionaries/da_DK/da_DK-2.5.189.oxt"
],
"integrity": "sha256-6CTyExN2NssCBXDbQkRhuzxbwjh4MC+eBouI5yzgLEQ=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "0y2HN4WGYUUXBfqp8Xb4oaA0hbLZmE3kDUXMBAOjvPQ=",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://github.com/microsoft/vscode-python/releases/download/2021.5.829140558/ms-python-release.vsix"
],
"integrity": "sha256-0y2HN4WGYUUXBfqp8Xb4oaA0hbLZmE3kDUXMBAOjvPQ=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "08dfl5h1k6s542qw5qx2czm1wb37ck9w2vpjz44kp2az352nmksb",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://zxh404.gallery.vsassets.io/_apis/public/gallery/publisher/zxh404/extension/vscode-proto3/0.5.4/assetbyname/Microsoft.VisualStudio.Services.VSIXPackage"
],
"integrity": "sha256-S89qRRlfiTsJ+fJuwdNkZywe6mei48KxIEWbGWChriE=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "0kaz8j85wjjnf18z0lz69xr1z8makg30jn2dzdyicd1asrj0q1jm",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://github.com/yvt/openspades/releases/download/v0.1.1b/NotoFonts.pak"
],
"integrity": "sha256-VQYMZNYqNBZ9+01YCcabqqIfck/mU/BRcFZKXpBEX00=",
"inferredFetcher": "unclassified"
},
{
"type": "url",
"urls": [
"https://crates.io/api/v1/crates/syntect/4.6.0/download"
],
"integrity": "sha256-iyCBW76A7gvgbmlXRQqEEYX89pD+AXjxTXegXOLKoDE="
},
{
"outputHash": "0x5l2pn4x92734k6i2wcjbn2klmwgkiqaajvxadh35k74dgnyh18",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://rubygems.org/gems/wdm-0.1.1.gem"
],
"integrity": "sha256-KEBvXyNnlgGb6lsqheN8vNIp7JKMi2gmGUekTuwVtHQ=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "2al10188nwrdmi9zk3bid4ijjfsa8ymh6m9hin5jsja7hx7anbvs3i2y7kall56h4qn7j1rj73f8499x3i2k6x53kszmksvd2a1pkd4",
"outputHashAlgo": "sha512",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://repo1.maven.org/maven2/org/codehaus/plexus/plexus-compiler-manager/2.4/plexus-compiler-manager-2.4.jar"
],
"integrity": "sha512-pM0blGhbz/r1HKWbKeLoKRHkxpE5yGMxgaZQqubxIg69l1Wnw6OklsVGmKqB1SOlnZSRtLjG/CnWlrlFKIBAlQ==",
"inferredFetcher": "unclassified"
},
{
"outputHash": "19mnq9a1yr16srqs8n6hddahr4f9d2gbpmld62pvlw1ps7nfrp9w",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"type": "url",
"urls": [
"https://bitbucket.org/zandoye/charinfo_width/get/1.1.0.tar.bz2"
],
"integrity": "sha256-PN3s7NE3cLqvMI3Wu55oyZEMVWvQWKRx1iZkH1TCtqY=",
"inferredFetcher": "fetchzip"
},
{
"type": "url",
"urls": [
"https://ftpmirror.gnu.org/gnu/texinfo/texinfo-4.13a.tar.lzma",
"ftp://ftp.cs.tu-berlin.de/pub/gnu/texinfo/texinfo-4.13a.tar.lzma"
],
"integrity": "sha256-bSiwzq6GbjU2FC/FUuejvJ+EyDAxGcJXMbJHju9kyeU="
},
{
"type": "url",
"urls": [
"https://download.savannah.gnu.org/releases/zutils/zutils-1.10.tar.lz",
"https://nongnu.freemirror.org/nongnu/zutils/zutils-1.10.tar.lz"
],
"integrity": "sha256-DdRBOCktV1dkgDcZW2lFw99wsxYiG0KFUgrTjy6usZU="
},
{
"type": "url",
"urls": [
"http://www.rle.mit.edu/cpg/codes/fasthenry-3.0-12Nov96.tar.z"
],
"integrity": "sha256-8V9YKMP4A50xYvmFlzh5sbQv6L39hD+znfAD0rzvBqg="
},
{
"type": "url",
"urls": [
"http://ftp.x.org/contrib/utilities/unclutter-8.tar.Z"
],
"integrity": "sha256-uFWnjURlqy+GKH6srGOnPxUEsIUihAqjdxh3bn7JGSo="
},
{
"outputHash": "sha256-Y40oLjddunrd7ZF1JbCcgjSCn8jFTubq69jhAVxInXw=",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://github.com/vk-cli/vk/releases/download/0.7.6/vk-0.7.6-64-bin.7z"
],
"integrity": "sha256-Y40oLjddunrd7ZF1JbCcgjSCn8jFTubq69jhAVxInXw=",
"inferredFetcher": "unclassified"
},
{
"type": "url",
"urls": [
"https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst"
],
"integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18="
},
{
"type": "url",
"urls": [
"https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5"
],
"integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://codeload.github.com/unknown-horizons/unknown-horizons/tar.gz/2019.1"
+ ],
+ "integrity": "sha256-pBf9PTQiEv0ZDk8hvoLvE8EOHtfCiPu+RuRiAM895Ng="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://codeload.github.com/fifengine/fifengine/tar.gz/0.4.2"
+ ],
+ "integrity": "sha256-6IK1W++jauLxqJraFq8PgUobePfL5gIexbFgVgTPj/g="
}
],
"version": "1",
"revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
}
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
index 13ee116..fdb7210 100644
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -1,368 +1,381 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
import json
import logging
from pathlib import Path
from typing import Dict, List
from urllib.parse import urlparse
import pytest
import requests
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
DEFAULT_EXTENSIONS_TO_IGNORE,
POSSIBLE_TARBALL_MIMETYPES,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
ArtifactWithoutExtension,
NixGuixLister,
is_tarball,
url_endswith,
)
from swh.lister.pattern import ListerStats
logger = logging.getLogger(__name__)
SOURCES = {
"guix": {
"repo": "https://git.savannah.gnu.org/cgit/guix.git/",
"manifest": "https://guix.gnu.org/sources.json",
},
"nixpkgs": {
"repo": "https://github.com/NixOS/nixpkgs",
"manifest": "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
},
}
def page_response(datadir, instance: str = "success") -> List[Dict]:
"""Return list of repositories (out of test dataset)"""
datapath = Path(datadir, f"sources-{instance}.json")
return json.loads(datapath.read_text()) if datapath.exists() else []
@pytest.mark.parametrize(
"name,expected_result",
[(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
+ [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
+ [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
+ [
("two?file=something.el", False),
("foo?two=two&three=three", False),
("v1.2.3", False), # with raise_when_no_extension is False
("2048-game-20151026.1233", False),
("v2048-game-20151026.1233", False),
],
)
def test_url_endswith(name, expected_result):
"""It should detect whether url or query params of the urls ends with extensions"""
urlparsed = urlparse(f"https://example.org/{name}")
assert (
url_endswith(
urlparsed,
TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
raise_when_no_extension=False,
)
is expected_result
)
@pytest.mark.parametrize(
"name", ["foo?two=two&three=three", "tar.gz/0.1.5", "tar.gz/v10.3.1"]
)
def test_url_endswith_raise(name):
"""It should raise when the tested url has no extension"""
urlparsed = urlparse(f"https://example.org/{name}")
with pytest.raises(ArtifactWithoutExtension):
url_endswith(urlparsed, ["unimportant"])
@pytest.mark.parametrize(
"tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
def test_is_tarball_simple(tarballs):
"""Simple check on tarball should discriminate between tarball and file"""
urls = [f"https://example.org/{tarball}" for tarball in tarballs]
is_tar, origin = is_tarball(urls)
assert is_tar is True
assert origin == urls[0]
@pytest.mark.parametrize(
"query_param",
["file", "f", "url", "name", "anykeyreally"],
)
def test_is_tarball_not_so_simple(query_param):
"""More involved check on tarball should discriminate between tarball and file"""
url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz"
is_tar, origin = is_tarball([url])
assert is_tar is True
assert origin == url
@pytest.mark.parametrize(
"files",
[
["abc.lisp"],
["one.abc", "two.bcd"],
["abc.c", "other.c"],
["one.scm?foo=bar", "two.scm?foo=bar"],
["config.nix", "flakes.nix"],
],
)
def test_is_tarball_simple_not_tarball(files):
"""Simple check on tarball should discriminate between tarball and file"""
urls = [f"http://example.org/{file}" for file in files]
is_tar, origin = is_tarball(urls)
assert is_tar is False
assert origin == urls[0]
def test_is_tarball_complex_with_no_result(requests_mock):
"""Complex tarball detection without proper information should fail."""
# No extension, this won't detect immediately the nature of the url
url = "https://example.org/crates/package/download"
urls = [url]
with pytest.raises(ArtifactNatureUndetected):
is_tarball(urls) # no request parameter, this cannot fallback, raises
with pytest.raises(ArtifactNatureUndetected):
requests_mock.head(
url,
status_code=404, # not found so cannot detect anything
)
is_tarball(urls, requests)
with pytest.raises(ArtifactNatureUndetected):
requests_mock.head(
url, headers={}
) # response ok without headers, cannot detect anything
is_tarball(urls, requests)
with pytest.raises(ArtifactNatureUndetected):
fallback_url = "https://example.org/mirror/crates/package/download"
requests_mock.head(
url, headers={"location": fallback_url} # still no extension, cannot detect
)
is_tarball(urls, requests)
with pytest.raises(ArtifactNatureMistyped):
is_tarball(["foo://example.org/unsupported-scheme"])
with pytest.raises(ArtifactNatureMistyped):
fallback_url = "foo://example.org/unsupported-scheme"
requests_mock.head(
url, headers={"location": fallback_url} # still no extension, cannot detect
)
is_tarball(urls, requests)
@pytest.mark.parametrize(
"fallback_url, expected_result",
[
("https://example.org/mirror/crates/package/download.tar.gz", True),
("https://example.org/mirror/package/download.lisp", False),
],
)
def test_is_tarball_complex_with_location_result(
requests_mock, fallback_url, expected_result
):
"""Complex tarball detection with information should detect artifact nature"""
# No extension, this won't detect immediately the nature of the url
url = "https://example.org/crates/package/download"
urls = [url]
# One scenario where the url renders a location with a proper extension
requests_mock.head(url, headers={"location": fallback_url})
is_tar, origin = is_tarball(urls, requests)
assert is_tar == expected_result
if is_tar:
assert origin == fallback_url
@pytest.mark.parametrize(
"content_type, expected_result",
[("application/json", False), ("application/something", False)]
+ [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES],
)
def test_is_tarball_complex_with_content_type_result(
requests_mock, content_type, expected_result
):
"""Complex tarball detection with information should detect artifact nature"""
# No extension, this won't detect immediately the nature of the url
url = "https://example.org/crates/package/download"
urls = [url]
# One scenario where the response carries a proper content-type header
requests_mock.head(url, headers={"Content-Type": content_type})
is_tar, origin = is_tarball(urls, requests)
assert is_tar == expected_result
if is_tar:
assert origin == url
def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"""NixGuixLister should list all origins per visit type"""
url = SOURCES["guix"]["manifest"]
origin_upstream = SOURCES["guix"]["repo"]
lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
response = page_response(datadir, "success")
requests_mock.get(
url,
[{"json": response}],
)
requests_mock.get(
"https://api.github.com/repos/trie/trie",
[{"json": {"html_url": "https://github.com/trie/trie.git"}}],
)
requests_mock.head(
"http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz",
headers={"Content-Type": "application/gzip; charset=ISO-8859-1"},
)
requests_mock.head(
"https://crates.io/api/v1/crates/syntect/4.6.0/download",
headers={
"Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate"
},
)
requests_mock.head(
"https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5",
headers={
"Content-Type": "application/x-gzip",
},
)
+ requests_mock.head(
+ "https://codeload.github.com/unknown-horizons/unknown-horizons/tar.gz/2019.1",
+ headers={
+ "Content-Disposition": "attachment; filename=unknown-horizons-2019.1.tar.gz",
+ },
+ )
+ requests_mock.head(
+ "https://codeload.github.com/fifengine/fifengine/tar.gz/0.4.2",
+ headers={
+ "Content-Disposition": "attachment; name=fieldName; "
+ "filename=fifengine-0.4.2.tar.gz; other=stuff",
+ },
+ )
expected_visit_types = defaultdict(int)
# origin upstream is added as origin
expected_nb_origins = 1
expected_visit_types["git"] += 1
for artifact in response["sources"]:
# Each artifact is considered an origin (even "url" artifacts with mirror urls)
expected_nb_origins += 1
artifact_type = artifact["type"]
if artifact_type in [
"git",
"svn",
"hg",
]:
expected_visit_types[artifact_type] += 1
elif artifact_type == "url":
url = artifact["urls"][0]
if url.endswith(".git"):
expected_visit_types["git"] += 1
elif url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1
elif "crates.io" in url or "codeload.github.com" in url:
expected_visit_types["directory"] += 1
else: # tarball artifacts
expected_visit_types["directory"] += 1
assert set(expected_visit_types.keys()) == {
"content",
"git",
"svn",
"hg",
"directory",
}
listed_result = lister.run()
# 1 page read is 1 origin
nb_pages = expected_nb_origins
assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins)
scheduler_origins = lister.scheduler.get_listed_origins(
lister.lister_obj.id
).results
assert len(scheduler_origins) == expected_nb_origins
mapping_visit_types = defaultdict(int)
for listed_origin in scheduler_origins:
assert listed_origin.visit_type in expected_visit_types
# no last update is listed on those manifests
assert listed_origin.last_update is None
mapping_visit_types[listed_origin.visit_type] += 1
assert dict(mapping_visit_types) == expected_visit_types
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
"""NixGuixLister should ignore unsupported or incomplete or to ignore origins"""
url = SOURCES["nixpkgs"]["manifest"]
origin_upstream = SOURCES["nixpkgs"]["repo"]
lister = NixGuixLister(
swh_scheduler,
url=url,
origin_upstream=origin_upstream,
extensions_to_ignore=["foobar"],
)
response = page_response(datadir, "failure")
requests_mock.get(
url,
[{"json": response}],
)
# Amongst artifacts, this url does not allow determining its nature (tarball, file).
# It ends up doing an HTTP HEAD query which returns a 404, so it is skipped.
requests_mock.head(
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
status_code=404,
)
# Invalid schema for that origin (and no extension), so skip origin
# from its name
requests_mock.head(
"ftp://ftp.ourproject.org/file-with-no-extension",
exc=InvalidSchema,
)
# Cannot communicate with an expired cert, so skip origin
requests_mock.head(
"https://code.9front.org/hg/plan9front",
exc=SSLError,
)
# Cannot connect to the site, so skip origin
requests_mock.head(
"https://git-tails.immerda.ch/onioncircuits",
exc=ConnectionError,
)
listed_result = lister.run()
# only the upstream origin is listed, every other entry is unsupported or incomplete
assert listed_result == ListerStats(pages=1, origins=1)
scheduler_origins = lister.scheduler.get_listed_origins(
lister.lister_obj.id
).results
assert len(scheduler_origins) == 1
assert scheduler_origins[0].visit_type == "git"
def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock):
url = SOURCES["nixpkgs"]["manifest"]
origin_upstream = SOURCES["nixpkgs"]["repo"]
lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
requests_mock.get(
url,
status_code=404,
)
with pytest.raises(requests.HTTPError): # listing cannot continue, so stop
lister.run()
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 0