diff --git a/requirements-swh.txt b/requirements-swh.txt index 3281b3e..1678cb4 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db,github] >= 2.8 +swh.core[db,github] >= 2.15 swh.scheduler >= 0.8 diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py index be53d8b..eaa5efd 100644 --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -1,62 +1,84 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import pkg_resources logger = logging.getLogger(__name__) try: __version__ = pkg_resources.get_distribution("swh.lister").version except pkg_resources.DistributionNotFound: __version__ = "devel" USER_AGENT_TEMPLATE = ( f"Software Heritage %s lister v{__version__}" " (+https://www.softwareheritage.org/contact)" ) LISTERS = { entry_point.name.split(".", 1)[1]: entry_point for entry_point in pkg_resources.iter_entry_points("swh.workers") if entry_point.name.split(".", 1)[0] == "lister" } SUPPORTED_LISTERS = list(LISTERS) +TARBALL_EXTENSIONS = [ + "crate", + "gem", + "jar", + "zip", + "tar", + "gz", + "tgz", + "tbz", + "bz2", + "bzip2", + "lzma", + "lz", + "txz", + "xz", + "z", + "Z", + "7z", + "zst", +] +"""Tarball recognition pattern""" + def get_lister(lister_name, db_url=None, **conf): """Instantiate a lister given its name. Args: lister_name (str): Lister's name conf (dict): Configuration dict (lister db cnx, policy, priority...) Returns: Tuple (instantiated lister, drop_tables function, init schema function, insert minimum data function) """ if lister_name not in LISTERS: raise ValueError( "Invalid lister %s: only supported listers are %s" % (lister_name, SUPPORTED_LISTERS) ) if db_url: conf["lister"] = {"cls": "local", "args": {"db": db_url}} registry_entry = LISTERS[lister_name].load()() lister_cls = registry_entry["lister"] from swh.lister import pattern if issubclass(lister_cls, pattern.Lister): return lister_cls.from_config(**conf) else: # Old-style lister return lister_cls(override_config=conf) diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py index f414ef3..ec48cf0 100644 --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -1,332 +1,319 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import gzip import json import logging from os import path from pathlib import Path import re from typing import Any, List, Mapping, Sequence, Tuple from urllib.parse import urlparse import requests +from swh.lister import TARBALL_EXTENSIONS + logger = logging.getLogger(__name__) class GNUTree: """Gnu Tree's representation""" def __init__(self, url: str): self.url = url # filepath or uri u = urlparse(url) self.base_url = "%s://%s" % (u.scheme, u.netloc) # Interesting top level directories self.top_level_directories = ["gnu", "old-gnu"] # internal state self._artifacts = {} # type: Mapping[str, Any] self._projects = {} # type: Mapping[str, Any] @property def projects(self) -> Mapping[str, Any]: if not self._projects: self._projects, self._artifacts = self._load() return self._projects @property def artifacts(self) -> 
Mapping[str, Any]:
         if not self._artifacts:
             self._projects, self._artifacts = self._load()
         return self._artifacts
 
     def _load(self) -> Tuple[Mapping[str, Any], Mapping[str, Any]]:
         """Compute projects and artifacts per project
 
         Returns:
             Tuple of dict projects (key project url, value the associated
             information) and a dict artifacts (key project url, value the
             info_file list)
 
         """
         projects = {}
         artifacts = {}
 
         raw_data = load_raw_data(self.url)[0]
         for directory in raw_data["contents"]:
             if directory["name"] not in self.top_level_directories:
                 continue
             infos = directory["contents"]
             for info in infos:
                 if info["type"] == "directory":
                     package_url = "%s/%s/%s/" % (
                         self.base_url,
                         directory["name"],
                         info["name"],
                     )
                     package_artifacts = find_artifacts(info["contents"], package_url)
                     if package_artifacts != []:
                         repo_details = {
                             "name": info["name"],
                             "url": package_url,
                             "time_modified": format_date(info["time"]),
                         }
                         artifacts[package_url] = package_artifacts
                         projects[package_url] = repo_details
 
         return projects, artifacts
 
 
 def find_artifacts(
     filesystem: List[Mapping[str, Any]], url: str
 ) -> List[Mapping[str, Any]]:
     """Recursively list artifacts present in the folder and subfolders for a
     particular package url.
 
     Args:
         filesystem: File structure of the package root directory. This is a
             list of Dict representing either file or directory information as
             dict (keys: name, size, time, type).
         url: URL of the corresponding package
 
     Returns:
         List of tarball urls and their associated metadata (time, length,
         etc...). For example:
 
         .. code-block:: python
 
             [
                 {
                     'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                     'time': 1071002600,
                     'filename': '3DLDF-1.1.3.tar.gz',
                     'version': '1.1.3',
                     'length': 543
                 },
                 {
                     'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
                     'time': 1071078759,
                     'filename': '3DLDF-1.1.4.tar.gz',
                     'version': '1.1.4',
                     'length': 456
                 },
                 {
                     'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
                     'time': 1074278633,
                     'filename': '3DLDF-1.1.5.tar.gz',
                     'version': '1.1.5',
                     'length': 251
                 },
                 ...
             ]
 
     """
     artifacts = []  # type: List[Mapping[str, Any]]
     for info_file in filesystem:
         filetype = info_file["type"]
         filename = info_file["name"]
         if filetype == "file":
             if check_filename_is_archive(filename):
                 uri = url + filename
                 artifacts.append(
                     {
                         "url": uri,
                         "filename": filename,
                         "time": format_date(info_file["time"]),
                         "length": int(info_file["size"]),
                         "version": get_version(filename),
                     }
                 )
         # It will recursively check for artifacts in all sub-folders
         elif filetype == "directory":
             tarballs_in_dir = find_artifacts(
                 info_file["contents"], url + filename + "/"
             )
             artifacts.extend(tarballs_in_dir)
 
     return artifacts
 
 
 def check_filename_is_archive(filename: str) -> bool:
     """
     Check the file's extension: if the file is in zip format or in .tar.x
     format (where x can be any compression scheme), return True.
 
     Args:
         filename: name of the file whose extension needs to be checked.
 
     Returns:
         Whether filename is an archive or not
 
     Example:
 
     >>> check_filename_is_archive('abc.zip')
     True
     >>> check_filename_is_archive('abc.tar.gz')
     True
     >>> check_filename_is_archive('bac.tar')
     True
     >>> check_filename_is_archive('abc.tar.gz.sig')
     False
     >>> check_filename_is_archive('foobar.tar.')
     False
 
     """
     file_suffixes = Path(filename).suffixes
     if len(file_suffixes) == 1 and file_suffixes[-1] in (".zip", ".tar"):
         return True
     elif len(file_suffixes) > 1:
         if file_suffixes[-1] == ".zip" or file_suffixes[-2] == ".tar":
             return True
     return False
 
 
-# to recognize existing naming pattern
-EXTENSIONS = [
-    "zip",
-    "tar",
-    "gz",
-    "tgz",
-    "bz2",
-    "bzip2",
-    "lzma",
-    "lz",
-    "xz",
-    "Z",
-    "7z",
-]
-
 VERSION_KEYWORDS = [
     "cygwin_me",
     "w32",
     "win32",
     "nt",
     "cygwin",
     "mingw",
     "latest",
     "alpha",
     "beta",
     "release",
     "stable",
     "hppa",
     "solaris",
     "sunos",
     "sun4u",
     "sparc",
     "sun",
     "aix",
     "ibm",
     "rs6000",
     "i386",
     "i686",
     "linux",
     "redhat",
     "linuxlibc",
     "mips",
     "powerpc",
     "macos",
     "apple",
     "darwin",
     "macosx",
     "powermacintosh",
     "unknown",
     "netbsd",
     "freebsd",
     "sgi",
     "irix",
 ]
 
 # Match a filename into components.
 #
 # We use Debian's release number heuristic: A release number starts
 # with a digit, and is followed by alphanumeric characters or any of
 # ., +, :, ~ and -
 #
 # We hardcode a list of possible extensions, as this release number
 # scheme would match them too... We match on any combination of those.
 #
 # Greedy matching is done right to left (we only match the extension
 # greedily with +, software_name and release_number are matched lazily
 # with +? and *?).
 PATTERN = r"""
 ^
 (?:
     # We have a software name and a release number, separated with a
     # -, _ or dot.
     (?P<software_name1>.+?[-_.])
     (?P<release_number>({vkeywords}|[0-9][0-9a-zA-Z_.+:~-]*?)+)
 |
     # We couldn't match a release number, put everything in the
     # software name.
     (?P<software_name2>.+?)
 )
 (?P<extension>(?:\.(?:{extensions}))+)
 $
 """.format(
-    extensions="|".join(EXTENSIONS),
+    extensions="|".join(TARBALL_EXTENSIONS),
     vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS),
 )
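
An illustrative aside, not part of the diff: with PATTERN as defined above, re.match decomposes a filename into the named groups that get_version() consumes just below. A quick check against the doctest example:

import re

m = re.match(PATTERN, "8sync-0.2.0.tar.gz", flags=re.VERBOSE | re.IGNORECASE)
assert m is not None
print(m.groupdict())
# {'software_name1': '8sync-', 'release_number': '0.2.0',
#  'software_name2': None, 'extension': '.tar.gz'}
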
 
 
 def get_version(uri: str) -> str:
     """Extract the version number from a tarball uri.
 
     Args:
         uri (str): Tarball URI
 
     Returns:
         Version detected
 
     Example:
         >>> uri = 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz'
         >>> get_version(uri)
         '0.2.0'
 
         >>> uri = '8sync-0.3.0.tar.gz'
         >>> get_version(uri)
         '0.3.0'
 
     """
     filename = path.split(uri)[-1]
     m = re.match(PATTERN, filename, flags=re.VERBOSE | re.IGNORECASE)
     if m:
         d = m.groupdict()
         if d["software_name1"] and d["release_number"]:
             return d["release_number"]
         if d["software_name2"]:
             return d["software_name2"]
 
     return ""
 
 
 def load_raw_data(url: str) -> Sequence[Mapping]:
     """Load the raw json from the tree.json.gz
 
     Args:
         url: Tree.json.gz url or path
 
     Returns:
         The raw json list
 
     """
     if url.startswith("http://") or url.startswith("https://"):
         response = requests.get(url, allow_redirects=True)
         if not response.ok:
             raise ValueError("Error during query to %s" % url)
         raw = gzip.decompress(response.content)
     else:
         with gzip.open(url, "r") as f:
             raw = f.read()
     raw_data = json.loads(raw.decode("utf-8"))
     return raw_data
 
 
 def format_date(timestamp: str) -> str:
     """Format a string timestamp to an isoformat string"""
     return datetime.fromtimestamp(int(timestamp), tz=timezone.utc).isoformat()
diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py
new file mode 100644
index 0000000..5dd39c5
--- /dev/null
+++ b/swh/lister/nixguix/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    from .lister import NixGuixLister
+
+    return {
+        "lister": NixGuixLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
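
A hedged sketch of how register() above gets discovered: the LISTERS registry in swh/lister/__init__.py scans `swh.workers` entry points whose names start with "lister.". Assuming standard setuptools packaging (the exact swh.lister packaging metadata may differ), the new lister would be declared roughly like this:

# hypothetical excerpt of a setup.py exposing the nixguix register() hook
from setuptools import setup

setup(
    name="swh.lister",
    entry_points="""
        [swh.workers]
        lister.nixguix=swh.lister.nixguix:register
    """,
)
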
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
new file mode 100644
index 0000000..6873977
--- /dev/null
+++ b/swh/lister/nixguix/lister.py
@@ -0,0 +1,370 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""NixGuix lister definition.
+
+This lists artifacts out of the Guix or Nixpkgs manifests.
+
+Artifacts can be of types:
+- upstream git repository (NixOS/nixpkgs, Guix)
+- VCS repositories (svn, git, hg, ...)
+- unique file
+- unique tarball
+
+"""
+
+import base64
+from dataclasses import dataclass
+from enum import Enum
+import logging
+from pathlib import Path
+import random
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+
+import requests
+
+from swh.core.github.utils import GitHubSession
+from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
+from swh.lister import TARBALL_EXTENSIONS
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+
+class ArtifactNatureUndetected(ValueError):
+    """Raised when a remote artifact's nature (tarball, file) cannot be detected."""
+
+    pass
+
+
+@dataclass
+class OriginUpstream:
+    """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
+
+    origin: str
+    """Canonical url of the repository"""
+    version: int
+    """Version of the repository (dismissed?)"""
+    revision: str
+    """Revision of the repository (dismissed?)"""
+
+
+@dataclass
+class Artifact:
+    """Metadata information on Remote Artifact with url (tarball or file)."""
+
+    origin: str
+    """Canonical url to retrieve the tarball artifact."""
+    visit_type: str
+    """Either 'directory' (tarball) or 'content' (file)"""
+    fallback_urls: List[str]
+    """List of urls to retrieve tarball artifact if canonical url no longer works."""
+    checksums: Dict[str, str]
+    """Integrity hash converted into a checksum dict."""
+
+
+@dataclass
+class VCS:
+    """Metadata information on VCS."""
+
+    origin: str
+    """Origin url of the vcs"""
+    ref: Optional[str]
+    """Reference either a svn commit id, a git commit, ..."""
+    type: str
+    """Type of (d)vcs, e.g. svn, git, hg, ..."""
+
+
+class ArtifactType(Enum):
+    """The possible artifact types listed out of the manifest."""
+
+    ARTIFACT = "artifact"
+    ORIGIN = "origin"
+    VCS = "vcs"
+
+
+PageResult = Tuple[ArtifactType, Union[Artifact, VCS, OriginUpstream]]
+
+
+VCS_SUPPORTED = ("git", "svn", "hg")
+
+# Rough approximation of what we can find of mimetypes for tarballs "out there"
+POSSIBLE_TARBALL_MIMETYPES = set(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
+
+
+def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
+    """Determine whether a list of files actually are tarballs or simple files.
+
+    When this cannot be answered simply out of the url and a request is
+    provided, this executes an HTTP `HEAD` query on the url to determine the
+    information. If request is not provided, this raises an
+    ArtifactNatureUndetected exception.
+
+    Args:
+        urls: names of the remote files whose extensions need to be checked.
+
+    Raises:
+        ArtifactNatureUndetected when the artifact's nature cannot be detected
+            out of its url
+
+    Returns: A tuple (bool, url). The boolean represents whether the url is an
+        archive or not. The second element is the actual url: the original one,
+        or the redirect location when a head request had to be issued to settle
+        the question.
+
+    """
+
+    def _is_tarball(url):
+        """Determine out of an extension whether url is a tarball.
+
+        Raises:
+            IndexError in case no extension is available
+
+        """
+        return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
+
+    index = random.randrange(len(urls))
+    url = urls[index]
+    try:
+        is_tar = _is_tarball(url)
+        return is_tar, urls[0]
+    except IndexError:
+        if request is None:
+            raise ArtifactNatureUndetected(
+                "Cannot determine artifact type from url %s" % url
+            )
+        logger.warning(
+            "Cannot detect extension for '%s'. Fallback to http head query",
+            url,
+        )
+        response = request.head(url)
+
+        if not response.ok or response.status_code == 404:
+            raise ArtifactNatureUndetected(
+                "Cannot determine artifact type from url %s" % url
+            )
+        location = response.headers.get("Location")
+        if location:  # It's not always present
+            logger.debug("Location: %s", location)
+            try:
+                # FIXME: location is also returned as it's considered the true
+                # origin, true enough?
+                return _is_tarball(location), location
+            except IndexError:
+                logger.warning(
+                    "Still cannot detect extension through location '%s'...",
+                    url,
+                )
+
+        content_type = response.headers.get("Content-Type")
+        if content_type:
+            logger.debug("Content-Type: %s", content_type)
+            if content_type == "application/json":
+                return False, urls[0]
+            return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]
+
+        raise ArtifactNatureUndetected(
+            "Cannot determine artifact type from url %s" % url
+        )
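
An illustrative sketch of the three detection paths, not part of the diff; the example urls are hypothetical, and the last call performs a real HTTP HEAD request:

import requests
from swh.lister.nixguix.lister import ArtifactNatureUndetected, is_tarball

# 1. decided from the extension alone
is_tar, origin = is_tarball(["https://example.org/pkg-1.0.tar.gz"])
# -> (True, "https://example.org/pkg-1.0.tar.gz")

# 2. no extension and no request object to fall back on: raises
try:
    is_tarball(["https://example.org/crates/pkg/download"])
except ArtifactNatureUndetected:
    pass

# 3. with a request object, a HEAD query is issued and the Location /
# Content-Type headers are used as fallbacks
is_tar, origin = is_tarball(["https://example.org/crates/pkg/download"], requests)
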
+VCS_KEYS_MAPPING = {
+    "git": {
+        "ref": "git_ref",
+        "url": "git_url",
+    },
+    "svn": {
+        "ref": "svn_revision",
+        "url": "svn_url",
+    },
+    "hg": {
+        "ref": "hg_changeset",
+        "url": "hg_url",
+    },
+}
+
+
+class NixGuixLister(StatelessLister[PageResult]):
+    """List Guix or Nix sources out of a public json manifest.
+
+    This lister can output:
+    - unique tarball (.tar.gz, .tbz2, ...)
+    - vcs repositories (e.g. git, hg, svn)
+    - unique file (.lisp, .py, ...)
+
+    Note that no `last_update` is available in either manifest.
+
+    For `url` type artifacts, this tries to determine the artifact's nature,
+    tarball or file. It first tries to compute it out of the "url" extension.
+    In case of no extension, it falls back to querying (HEAD) the url to
+    retrieve the origin out of the `Location` response header, and then checks
+    the extension again.
+
+    """
+
+    LISTER_NAME = "nixguix"
+
+    def __init__(
+        self,
+        scheduler,
+        url: str,
+        origin_upstream: str,
+        instance: Optional[str] = None,
+        credentials: Optional[CredentialsType] = None,
+        # canonicalize urls, can be turned off during docker runs
+        canonicalize: bool = True,
+        **kwargs: Any,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            url=url.rstrip("/"),
+            instance=instance,
+            credentials=credentials,
+        )
+        # either full fqdn NixOS/nixpkgs or guix repository urls
+        # maybe add an assert on those specific urls?
+        self.origin_upstream = origin_upstream
+
+        self.session = requests.Session()
+        # for testing purposes, we may want to skip this step (e.g. docker run
+        # and rate limit)
+        self.github_session = (
+            GitHubSession(
+                credentials=self.credentials,
+                user_agent=str(self.session.headers["User-Agent"]),
+            )
+            if canonicalize
+            else None
+        )
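
A hypothetical usage sketch, not part of the diff: instantiating the lister against the public nixpkgs manifest, assuming a scheduler instance obtained via swh.scheduler.get_scheduler (the scheduler url is an assumption). canonicalize=False skips GitHub url canonicalization, which is useful without API credentials:

from swh.scheduler import get_scheduler

scheduler = get_scheduler(cls="remote", url="http://localhost:5008")
lister = NixGuixLister(
    scheduler,
    url="https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
    origin_upstream="https://github.com/NixOS/nixpkgs",
    canonicalize=False,
)
stats = lister.run()  # returns a ListerStats(pages=..., origins=...)
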
+    def get_pages(self) -> Iterator[PageResult]:
+        """Yield one page per "typed" origin referenced in manifest."""
+        # fetch and parse the manifest...
+        response = self.http_request(self.url)
+
+        # ... if any
+        raw_data = response.json()
+        version = raw_data["version"]
+        revision = raw_data["revision"]
+        yield ArtifactType.ORIGIN, OriginUpstream(
+            self.origin_upstream,
+            version,
+            revision,
+        )
+
+        # grep '"type"' guix-sources.json | sort | uniq
+        #     "type": false          <<<<<<<<< noise
+        #     "type": "git",
+        #     "type": "hg",
+        #     "type": "no-origin",   <<<<<<<<< noise
+        #     "type": "svn",
+        #     "type": "url",
+
+        # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
+        #     "type": "url",
+
+        for artifact in raw_data["sources"]:
+            artifact_type = artifact["type"]
+            if artifact_type in VCS_SUPPORTED:
+                plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
+                plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
+                artifact_url = (
+                    self.github_session.get_canonical_url(plain_url)
+                    if self.github_session
+                    else plain_url
+                )
+                if not artifact_url:
+                    continue
+                yield ArtifactType.VCS, VCS(
+                    origin=artifact_url, type=artifact_type, ref=plain_ref
+                )
+            elif artifact_type == "url":
+                # It's either a tarball or a file
+                urls = artifact.get("urls")
+                if not urls:
+                    # Nothing to fetch
+                    logger.warning("Skipping url '%s': empty artifact", artifact)
+                    continue
+
+                assert urls is not None
+                # FIXME: T3294: Fix missing scheme in urls
+                origin, *fallback_urls = urls
+
+                integrity = artifact.get("integrity")
+                if integrity is None:
+                    logger.warning("Skipping url '%s': missing integrity field", origin)
+                    continue
+
+                try:
+                    is_tar, origin = is_tarball(urls, self.session)
+                except ArtifactNatureUndetected:
+                    logger.warning(
+                        "Skipping url '%s': undetected remote artifact type", origin
+                    )
+                    continue
+
+                # Determine the content checksum stored in the integrity field
+                # and convert into a dict of checksums. This only parses the
+                # `hash-expression` (hash-<b64-encoded-checksum>) as defined in
+                # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
+                chksum_algo, chksum_b64 = integrity.split("-")
+                checksums: Dict[str, str] = {
+                    chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
+                }
+
+                logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
+                yield ArtifactType.ARTIFACT, Artifact(
+                    origin=origin,
+                    fallback_urls=fallback_urls,
+                    checksums=checksums,
+                    visit_type="directory" if is_tar else "content",
+                )
+            else:
+                logger.warning(
+                    "Skipping artifact '%s': unsupported type %s",
+                    artifact,
+                    artifact_type,
+                )
+
+    def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]:
+        """Given a vcs repository, yield a ListedOrigin."""
+        assert self.lister_obj.id is not None
+        # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
+ yield ListedOrigin( + lister_id=self.lister_obj.id, + url=artifact.origin, + visit_type=artifact.type, + ) + + def origin_to_listed_origin( + self, origin_upstream: OriginUpstream + ) -> Iterator[ListedOrigin]: + """Given an upstream origin, yield a ListedOrigin.""" + assert self.lister_obj.id is not None + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_upstream.origin, + visit_type="git", # both nixpkgs and guix are git origins so far + ) + + def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]: + """Given an artifact (tarball, file), yield one ListedOrigin.""" + assert self.lister_obj.id is not None + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=artifact.origin, + visit_type=artifact.visit_type, + extra_loader_arguments={ + "checksums": artifact.checksums, + "fallback_urls": artifact.fallback_urls, + }, + ) + + def get_origins_from_page( + self, artifact_tuple: PageResult + ) -> Iterator[ListedOrigin]: + """Given an artifact tuple (type, artifact), yield a ListedOrigin.""" + artifact_type, artifact = artifact_tuple + mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin") + yield from mapping_type_fn(artifact) diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json new file mode 100644 index 0000000..3cd5ae9 --- /dev/null +++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json @@ -0,0 +1,19 @@ +{ + "sources": [ + {"type": "git", "git_url": "", "git_ref": ""}, + {"type": false}, + {"type": "no-origin"}, + {"type": "url", "urls": []}, + { + "type": "url", + "urls": ["https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped"], + "integrity": "sha256-HW6jxFlbljY8E5Q0l9s0r0Rg+0dKlcQ/REatNBuMl4U=" + }, + { + "type": "url", + "urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ] + } + ], + "version":"1", + "revision":"ab59155c5a38dda7efaceb47c7528578fcf0def4" +} diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json new file mode 100644 index 0000000..5e8cf22 --- /dev/null +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -0,0 +1,52 @@ +{ + "sources": [ + { + "type": "url", + "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" + }, + { + "type": "url", + "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ], + "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" + }, + { + "type": "url", + "urls": [ "https://example.com/file.txt" ], + "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" + }, + { + "type": "url", + "urls": [ + "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz" + ], + "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0=" + }, + { + "type": "url", + "urls": [ + "http://downloads.sourceforge.net/project/nmon/lmon16n.c", + "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c", + "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c" + ], + "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" + }, + { + "type": "git", + "git_url": "https://example.org/pali/0xffff", + "git_ref": "0.9" + }, + { + "type": "hg", + "hg_url": "https://example.org/vityok/cl-string-match", + "hg_changeset": 
"5048480a61243e6f1b02884012c8f25cdbee6d97" + }, + { + "type": "svn", + "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2", + "svn_revision": 39057 + } + ], + "version": "1", + "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" +} diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py new file mode 100644 index 0000000..3dee3d2 --- /dev/null +++ b/swh/lister/nixguix/tests/test_lister.py @@ -0,0 +1,244 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from collections import defaultdict +import json +import logging +from pathlib import Path +from typing import Dict, List + +import pytest +import requests + +from swh.lister import TARBALL_EXTENSIONS +from swh.lister.nixguix.lister import ( + POSSIBLE_TARBALL_MIMETYPES, + ArtifactNatureUndetected, + NixGuixLister, + is_tarball, +) +from swh.lister.pattern import ListerStats + +logger = logging.getLogger(__name__) + + +def page_response(datadir, instance: str) -> List[Dict]: + """Return list of repositories (out of test dataset)""" + datapath = Path(datadir, f"{instance}-swh_sources.json") + return json.loads(datapath.read_text()) if datapath.exists else [] + + +@pytest.mark.parametrize( + "urls", + [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] + + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], +) +def test_is_tarball_simple(urls): + """Simple check on tarball should discriminate betwenn tarball and file""" + is_tar, origin = is_tarball(urls) + assert is_tar is True + assert origin == urls[0] + + +@pytest.mark.parametrize( + "urls", + [ + ["abc.lisp"], + ["one.abc", "two.bcd"], + ["abc.c", "other.c"], + ["one.scm?foo=bar", "two.scm?foo=bar"], + ["config.nix", "flakes.nix"], + ], +) +def test_is_tarball_simple_not_tarball(urls): + """Simple check on tarball should discriminate betwenn tarball and file""" + is_tar, origin = is_tarball(urls) + assert is_tar is False + assert origin == urls[0] + + +def test_is_tarball_complex_with_no_result(requests_mock): + """Complex tarball detection without proper information should fail.""" + # No extension, this won't detect immediately the nature of the url + url = "https://example.org/crates/package/download" + urls = [url] + with pytest.raises(ArtifactNatureUndetected): + is_tarball(url) # no request parameter, this cannot fallback, raises + + with pytest.raises(ArtifactNatureUndetected): + requests_mock.head( + url, + status_code=404, # not found so cannot detect anything + ) + is_tarball(urls, requests) + + with pytest.raises(ArtifactNatureUndetected): + requests_mock.head( + url, headers={} + ) # response ok without headers, cannot detect anything + is_tarball(urls, requests) + + with pytest.raises(ArtifactNatureUndetected): + fallback_url = "https://example.org/mirror/crates/package/download" + requests_mock.head( + url, headers={"location": fallback_url} # still no extension, cannot detect + ) + is_tarball(urls, requests) + + +@pytest.mark.parametrize( + "fallback_url, expected_result", + [ + ("https://example.org/mirror/crates/package/download.tar.gz", True), + ("https://example.org/mirror/package/download.lisp", False), + ], +) +def test_is_tarball_complex_with_location_result( + requests_mock, fallback_url, expected_result +): + """Complex tarball detection with information should detect artifact 
nature""" + # No extension, this won't detect immediately the nature of the url + url = "https://example.org/crates/package/download" + urls = [url] + + # One scenario where the url renders a location with a proper extension + requests_mock.head(url, headers={"location": fallback_url}) + is_tar, origin = is_tarball(urls, requests) + assert is_tar == expected_result + if is_tar: + assert origin == fallback_url + + +@pytest.mark.parametrize( + "content_type, expected_result", + [("application/json", False), ("application/something", False)] + + [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES], +) +def test_is_tarball_complex_with_content_type_result( + requests_mock, content_type, expected_result +): + """Complex tarball detection with information should detect artifact nature""" + # No extension, this won't detect immediately the nature of the url + url = "https://example.org/crates/package/download" + urls = [url] + + # One scenario where the url renders a location with a proper extension + requests_mock.head(url, headers={"Content-Type": content_type}) + is_tar, origin = is_tarball(urls, requests) + assert is_tar == expected_result + if is_tar: + assert origin == url + + +def test_lister_nixguix(datadir, swh_scheduler, requests_mock): + """NixGuixLister should list all origins per visit type""" + url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" + origin_upstream = "https://github.com/NixOS/nixpkgs" + lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) + + response = page_response(datadir, "nixpkgs") + requests_mock.get( + url, + [{"json": response}], + ) + + expected_visit_types = defaultdict(int) + # origin upstream is added as origin + expected_nb_origins = 1 + expected_visit_types["git"] += 1 + for artifact in response["sources"]: + # Each artifact is considered an origin (even "url" artifacts with mirror urls) + expected_nb_origins += 1 + artifact_type = artifact["type"] + if artifact_type in [ + "git", + "svn", + "hg", + ]: + expected_visit_types[artifact_type] += 1 + elif artifact_type == "url": + url = artifact["urls"][0] + if url.endswith(".c") or url.endswith(".txt"): + expected_visit_types["content"] += 1 + else: + expected_visit_types["directory"] += 1 + + assert set(expected_visit_types.keys()) == { + "content", + "git", + "svn", + "hg", + "directory", + } + + listed_result = lister.run() + + # 1 page read is 1 origin + nb_pages = expected_nb_origins + assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins) + + scheduler_origins = lister.scheduler.get_listed_origins( + lister.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + mapping_visit_types = defaultdict(int) + + for listed_origin in scheduler_origins: + assert listed_origin.visit_type in expected_visit_types + # no last update is listed on those manifests + assert listed_origin.last_update is None + + mapping_visit_types[listed_origin.visit_type] += 1 + + assert dict(mapping_visit_types) == expected_visit_types + + +def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): + """NixGuixLister should ignore unsupported or incomplete origins""" + url = "https://guix.gnu.org/sources.json" + origin_upstream = "https://git.savannah.gnu.org/git/guix.git" + lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) + + response = page_response(datadir, "guix") + + requests_mock.get( + url, + [{"json": response}], + ) + # Amongst artifacts, this url does not allow to determine 
+    requests_mock.head(
+        "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
+        status_code=404,
+    )
+
+    listed_result = lister.run()
+    # only the origin upstream is listed; every other entry is unsupported or
+    # incomplete
+    assert listed_result == ListerStats(pages=1, origins=1)
+
+    scheduler_origins = lister.scheduler.get_listed_origins(
+        lister.lister_obj.id
+    ).results
+    assert len(scheduler_origins) == 1
+
+    assert scheduler_origins[0].visit_type == "git"
+
+
+def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock):
+    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+    origin_upstream = "https://github.com/NixOS/nixpkgs"
+    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+    requests_mock.get(
+        url,
+        status_code=404,
+    )
+
+    with pytest.raises(requests.HTTPError):  # listing cannot continue, so stop
+        lister.run()
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(scheduler_origins) == 0
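
A closing aside on the integrity handling in get_pages above: a minimal sketch, using only the Python standard library, of how an SRI integrity attribute is converted into the checksums dict handed to loaders. The integrity value is taken from the nixpkgs test manifest; the decoded hex digest is left unspecified here rather than guessed:

import base64

integrity = "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="

# split into <alg> and <b64-value>, base64-decode, then hex-encode
chksum_algo, chksum_b64 = integrity.split("-")
checksums = {chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()}
print(checksums)
# -> {'sha256': '<64 hex characters>'}
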