Page MenuHomeSoftware Heritage

D8341.diff
No OneTemporary

D8341.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,2 @@
-swh.core[db,github] >= 2.8
+swh.core[db,github] >= 2.15
swh.scheduler >= 0.8
diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py
--- a/swh/lister/__init__.py
+++ b/swh/lister/__init__.py
@@ -29,6 +29,28 @@
SUPPORTED_LISTERS = list(LISTERS)
+# File extensions that listers treat as archive/tarball artifacts (note the
+# deliberate lowercase/uppercase pair "z"/"Z"; matching is case-sensitive).
+TARBALL_EXTENSIONS = [
+ "crate",
+ "gem",
+ "jar",
+ "zip",
+ "tar",
+ "gz",
+ "tgz",
+ "tbz",
+ "bz2",
+ "bzip2",
+ "lzma",
+ "lz",
+ "txz",
+ "xz",
+ "z",
+ "Z",
+ "7z",
+ "zst",
+]
+"""Tarball recognition pattern: extensions identifying archive artifacts"""
+
def get_lister(lister_name, db_url=None, **conf):
"""Instantiate a lister given its name.
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -15,6 +15,8 @@
import requests
+from swh.lister import TARBALL_EXTENSIONS
+
logger = logging.getLogger(__name__)
@@ -186,21 +188,6 @@
return False
-# to recognize existing naming pattern
-EXTENSIONS = [
- "zip",
- "tar",
- "gz",
- "tgz",
- "bz2",
- "bzip2",
- "lzma",
- "lz",
- "xz",
- "Z",
- "7z",
-]
-
VERSION_KEYWORDS = [
"cygwin_me",
"w32",
@@ -269,7 +256,7 @@
(?P<extension>(?:\.(?:{extensions}))+)
$
""".format(
- extensions="|".join(EXTENSIONS),
+ extensions="|".join(TARBALL_EXTENSIONS),
vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS),
)
diff --git a/swh/lister/nixguix/__init__.py b/swh/lister/nixguix/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
def register():
    """Entry point for swh.lister's plugin registry: expose the NixGuix
    lister class and its celery task module."""
    # Imported lazily so merely importing the package stays cheap.
    from .lister import NixGuixLister

    return dict(
        lister=NixGuixLister,
        task_modules=["%s.tasks" % __name__],
    )
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/lister.py
@@ -0,0 +1,370 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""NixGuix lister definition.
+
+This lists artifacts out of manifest for Guix or Nixpkgs manifests.
+
+Artifacts can be of types:
+- upstream git repository (NixOS/nixpkgs, Guix)
+- VCS repositories (svn, git, hg, ...)
+- unique file
+- unique tarball
+
+"""
+
+import base64
+from dataclasses import dataclass
+from enum import Enum
+import logging
+from pathlib import Path
+import random
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+
+import requests
+
+from swh.core.github.utils import GitHubSession
+from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
+from swh.lister import TARBALL_EXTENSIONS
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+
+class ArtifactNatureUndetected(ValueError):
+ """Raised when a remote artifact's nature (tarball, file) cannot be detected."""
+
+ pass
+
+
+@dataclass
+class OriginUpstream:
+ """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix) read from the manifest."""
+
+ origin: str
+ """Canonical url of the repository"""
+ version: int
+ """Version as read from the manifest (not used when building the ListedOrigin)"""
+ revision: str
+ """Revision as read from the manifest (not used when building the ListedOrigin)"""
+
+
+@dataclass
+class Artifact:
+ """Metadata information on Remote Artifact with url (tarball or file)."""
+
+ origin: str
+ """Canonical url to retrieve the tarball artifact."""
+ visit_type: str
+ """Either 'tar' or 'file' """
+ fallback_urls: List[str]
+ """List of urls to retrieve tarball artifact if canonical url no longer works."""
+ checksums: Dict[str, str]
+ """Integrity hash converted into a checksum dict."""
+
+
+@dataclass
+class VCS:
+ """Metadata information on a VCS repository listed from the manifest."""
+
+ origin: str
+ """Origin url of the vcs"""
+ ref: Optional[str]
+ """Reference: either an svn revision, a git commit id, an hg changeset, ..."""
+ type: str
+ """Type of (d)vcs, e.g. svn, git, hg, ..."""
+
+
+class ArtifactType(Enum):
+ """The possible artifact types listed out of the manifest.
+
+ The enum value is also used to dispatch to the matching
+ `<value>_to_listed_origin` method of the lister.
+ """
+
+ ARTIFACT = "artifact"
+ ORIGIN = "origin"
+ VCS = "vcs"
+
+
+PageResult = Tuple[ArtifactType, Union[Artifact, VCS, OriginUpstream]]
+
+
+VCS_SUPPORTED = ("git", "svn", "hg")
+
+# Rough approximation of what we can find of mimetypes for tarballs "out there"
+POSSIBLE_TARBALL_MIMETYPES = set(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
+
+
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
    """Determine whether a list of files actually are tarballs or simple files.

    When this cannot be answered simply out of the url and `request` is
    provided, this executes a HTTP `HEAD` query on the url to determine the
    information. If `request` is not provided, this raises an
    ArtifactNatureUndetected exception.

    Args:
        urls: name of the remote files for which the extension needs to be checked.
        request: optional object exposing a `head(url)` method (e.g. a requests
            session), used as a fallback when the url carries no extension.

    Raises:
        ArtifactNatureUndetected when the artifact's nature cannot be detected
        out of its url (also raised when `urls` is empty).

    Returns: A tuple (bool, url). The boolean represents whether the url is an
        archive or not. The second parameter is the actual url once the head
        request is issued as a fallback of not finding out whether the urls are
        tarballs or not.

    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            IndexError in case no extension is available

        """
        return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS

    if not urls:
        # previously this fell through to random.randrange and raised a bare
        # ValueError; surface the same exception type callers already handle
        raise ArtifactNatureUndetected("No url provided to detect artifact type")

    # All urls are expected to point at the same artifact (mirrors), so a
    # randomly picked one is considered representative of the whole list.
    url = random.choice(urls)
    try:
        return _is_tarball(url), urls[0]
    except IndexError:
        if request is None:
            # f-string: the previous ("msg %s", url) form was never interpolated
            raise ArtifactNatureUndetected(
                f"Cannot determine artifact type from url {url}"
            )
        logger.warning(
            "Cannot detect extension for '%s'. Fallback to http head query",
            url,
        )
        response = request.head(url)

        # `response.ok` is already False for every 4xx/5xx status, 404 included
        if not response.ok:
            raise ArtifactNatureUndetected(
                f"Cannot determine artifact type from url {url}"
            )

        location = response.headers.get("Location")
        if location:  # It's not always present
            logger.debug("Location: %s", location)
            try:
                # FIXME: location is also returned as it's considered the true
                # origin, true enough?
                return _is_tarball(location), location
            except IndexError:
                logger.warning(
                    "Still cannot detect extension through location '%s'...",
                    url,
                )

        content_type = response.headers.get("Content-Type")
        if content_type:
            logger.debug("Content-Type: %s", content_type)
            if content_type == "application/json":
                return False, urls[0]
            return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]

        raise ArtifactNatureUndetected(
            f"Cannot determine artifact type from url {url}"
        )
+
+
+# For each supported (d)vcs type, the manifest keys holding the clone url and
+# the reference (git commit / svn revision / hg changeset) of an artifact entry.
+VCS_KEYS_MAPPING = {
+ "git": {
+ "ref": "git_ref",
+ "url": "git_url",
+ },
+ "svn": {
+ "ref": "svn_revision",
+ "url": "svn_url",
+ },
+ "hg": {
+ "ref": "hg_changeset",
+ "url": "hg_url",
+ },
+}
+
+
+class NixGuixLister(StatelessLister[PageResult]):
+ """List Guix or Nix sources out of a public json manifest.
+
+ This lister can output:
+ - unique tarball (.tar.gz, .tbz2, ...)
+ - vcs repositories (e.g. git, hg, svn)
+ - unique file (.lisp, .py, ...)
+
+ Note that no `last_update` is available in either manifest.
+
+ For `url` types artifacts, this tries to determine the artifact's nature, tarball or
+ file. It first tries to compute out of the "url" extension. In case of no extension,
+ it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
+ response header, and then checks the extension again.
+
+ """
+
+ LISTER_NAME = "nixguix"
+
+ def __init__(
+ self,
+ scheduler,
+ url: str,
+ origin_upstream: str,
+ instance: Optional[str] = None,
+ credentials: Optional[CredentialsType] = None,
+ # canonicalize urls, can be turned off during docker runs
+ canonicalize: bool = True,
+ **kwargs: Any,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ url=url.rstrip("/"),
+ instance=instance,
+ credentials=credentials,
+ )
+ # either full fqdn NixOS/nixpkgs or guix repository urls
+ # maybe add an assert on those specific urls?
+ self.origin_upstream = origin_upstream
+
+ self.session = requests.Session()
+ # for testing purposes, we may want to skip this step (e.g. docker run and rate
+ # limit)
+ # When None, vcs urls are yielded as-is instead of being canonicalized
+ # through the GitHub API (see get_pages).
+ self.github_session = (
+ GitHubSession(
+ credentials=self.credentials,
+ user_agent=str(self.session.headers["User-Agent"]),
+ )
+ if canonicalize
+ else None
+ )
+
+ def get_pages(self) -> Iterator[PageResult]:
+ """Yield one page per "typed" origin referenced in manifest."""
+ # fetch and parse the manifest...
+ response = self.http_request(self.url)
+
+ # ... if any
+ raw_data = response.json()
+ version = raw_data["version"]
+ revision = raw_data["revision"]
+ # the manifest's own upstream repository is always the first page
+ yield ArtifactType.ORIGIN, OriginUpstream(
+ self.origin_upstream,
+ version,
+ revision,
+ )
+
+ # grep '"type"' guix-sources.json | sort | uniq
+ # "type": false <<<<<<<<< noise
+ # "type": "git",
+ # "type": "hg",
+ # "type": "no-origin", <<<<<<<<< noise
+ # "type": "svn",
+ # "type": "url",
+
+ # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
+ # "type": "url",
+
+ for artifact in raw_data["sources"]:
+ artifact_type = artifact["type"]
+ if artifact_type in VCS_SUPPORTED:
+ plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
+ plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
+ # get_canonical_url returns None for unresolvable github urls,
+ # in which case the entry is skipped below
+ artifact_url = (
+ self.github_session.get_canonical_url(plain_url)
+ if self.github_session
+ else plain_url
+ )
+ if not artifact_url:
+ continue
+ yield ArtifactType.VCS, VCS(
+ origin=artifact_url, type=artifact_type, ref=plain_ref
+ )
+ elif artifact_type == "url":
+ # It's either a tarball or a file
+ urls = artifact.get("urls")
+ if not urls:
+ # Nothing to fetch
+ logger.warning("Skipping url '%s': empty artifact", artifact)
+ continue
+
+ assert urls is not None
+ # FIXME: T3294: Fix missing scheme in urls
+ # first url is treated as the canonical origin; the remaining
+ # mirrors are kept as fallback urls for the loader
+ origin, *fallback_urls = urls
+
+ integrity = artifact.get("integrity")
+ if integrity is None:
+ logger.warning("Skipping url '%s': missing integrity field", origin)
+ continue
+
+ try:
+ # NOTE: is_tarball may rebind `origin` to the HEAD-redirect
+ # Location when the url itself carries no extension
+ is_tar, origin = is_tarball(urls, self.session)
+ except ArtifactNatureUndetected:
+ logger.warning(
+ "Skipping url '%s': undetected remote artifact type", origin
+ )
+ continue
+
+ # Determine the content checksum stored in the integrity field and
+ # convert into a dict of checksums. This only parses the
+ # `hash-expression` (hash-<b64-encoded-checksum>) as defined in
+ # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
+ chksum_algo, chksum_b64 = integrity.split("-")
+ checksums: Dict[str, str] = {
+ chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
+ }
+
+ logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
+ yield ArtifactType.ARTIFACT, Artifact(
+ origin=origin,
+ fallback_urls=fallback_urls,
+ checksums=checksums,
+ visit_type="directory" if is_tar else "content",
+ )
+ else:
+ logger.warning(
+ "Skipping artifact '%s': unsupported type %s",
+ artifact,
+ artifact_type,
+ )
+
+ def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]:
+ """Given a vcs repository, yield a ListedOrigin."""
+ assert self.lister_obj.id is not None
+ # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=artifact.origin,
+ visit_type=artifact.type,
+ )
+
+ def origin_to_listed_origin(
+ self, origin_upstream: OriginUpstream
+ ) -> Iterator[ListedOrigin]:
+ """Given an upstream origin, yield a ListedOrigin."""
+ assert self.lister_obj.id is not None
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=origin_upstream.origin,
+ visit_type="git", # both nixpkgs and guix are git origins so far
+ )
+
+ def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]:
+ """Given an artifact (tarball, file), yield one ListedOrigin."""
+ assert self.lister_obj.id is not None
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=artifact.origin,
+ visit_type=artifact.visit_type,
+ extra_loader_arguments={
+ "checksums": artifact.checksums,
+ "fallback_urls": artifact.fallback_urls,
+ },
+ )
+
+ def get_origins_from_page(
+ self, artifact_tuple: PageResult
+ ) -> Iterator[ListedOrigin]:
+ """Given an artifact tuple (type, artifact), yield a ListedOrigin."""
+ artifact_type, artifact = artifact_tuple
+ # dispatch on the enum value, e.g. ArtifactType.VCS -> vcs_to_listed_origin
+ mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin")
+ yield from mapping_type_fn(artifact)
diff --git a/swh/lister/nixguix/tests/__init__.py b/swh/lister/nixguix/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json
@@ -0,0 +1,19 @@
+{
+ "sources": [
+ {"type": "git", "git_url": "", "git_ref": ""},
+ {"type": false},
+ {"type": "no-origin"},
+ {"type": "url", "urls": []},
+ {
+ "type": "url",
+ "urls": ["https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped"],
+ "integrity": "sha256-HW6jxFlbljY8E5Q0l9s0r0Rg+0dKlcQ/REatNBuMl4U="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
+ }
+ ],
+ "version":"1",
+ "revision":"ab59155c5a38dda7efaceb47c7528578fcf0def4"
+}
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -0,0 +1,52 @@
+{
+ "sources": [
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://example.com/file.txt" ],
+ "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz"
+ ],
+ "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
+ },
+ {
+ "type": "url",
+ "urls": [
+ "http://downloads.sourceforge.net/project/nmon/lmon16n.c",
+ "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c",
+ "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c"
+ ],
+ "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+ },
+ {
+ "type": "git",
+ "git_url": "https://example.org/pali/0xffff",
+ "git_ref": "0.9"
+ },
+ {
+ "type": "hg",
+ "hg_url": "https://example.org/vityok/cl-string-match",
+ "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97"
+ },
+ {
+ "type": "svn",
+ "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
+ "svn_revision": 39057
+ }
+ ],
+ "version": "1",
+ "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -0,0 +1,244 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+import pytest
+import requests
+
+from swh.lister import TARBALL_EXTENSIONS
+from swh.lister.nixguix.lister import (
+ POSSIBLE_TARBALL_MIMETYPES,
+ ArtifactNatureUndetected,
+ NixGuixLister,
+ is_tarball,
+)
+from swh.lister.pattern import ListerStats
+
+logger = logging.getLogger(__name__)
+
+
def page_response(datadir, instance: str) -> Dict:
    """Return the parsed ``{instance}-swh_sources.json`` manifest out of the
    test dataset, or an empty list when the fixture file is absent.
    """
    datapath = Path(datadir, f"{instance}-swh_sources.json")
    # `exists` must be *called*: the bound method object is always truthy,
    # which previously made the `else []` fallback unreachable (a missing
    # fixture raised FileNotFoundError from read_text instead).
    return json.loads(datapath.read_text()) if datapath.exists() else []
+
+
+@pytest.mark.parametrize(
+ "urls",
+ [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
+ + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
+)
+def test_is_tarball_simple(urls):
+ """Simple check on tarball should discriminate between tarball and file"""
+ is_tar, origin = is_tarball(urls)
+ assert is_tar is True
+ assert origin == urls[0]
+
+
+@pytest.mark.parametrize(
+ "urls",
+ [
+ ["abc.lisp"],
+ ["one.abc", "two.bcd"],
+ ["abc.c", "other.c"],
+ ["one.scm?foo=bar", "two.scm?foo=bar"],
+ ["config.nix", "flakes.nix"],
+ ],
+)
+def test_is_tarball_simple_not_tarball(urls):
+ """Simple check on tarball should discriminate between tarball and file"""
+ is_tar, origin = is_tarball(urls)
+ assert is_tar is False
+ assert origin == urls[0]
+
+
def test_is_tarball_complex_with_no_result(requests_mock):
    """Complex tarball detection without proper information should fail."""
    # No extension, this won't detect immediately the nature of the url
    url = "https://example.org/crates/package/download"
    urls = [url]
    with pytest.raises(ArtifactNatureUndetected):
        # fix: pass the url *list* (the previous call passed the bare string,
        # which only raised by accident); no request parameter, this cannot
        # fallback, raises
        is_tarball(urls)

    with pytest.raises(ArtifactNatureUndetected):
        requests_mock.head(
            url,
            status_code=404,  # not found so cannot detect anything
        )
        is_tarball(urls, requests)

    with pytest.raises(ArtifactNatureUndetected):
        # response ok without headers, cannot detect anything
        requests_mock.head(url, headers={})
        is_tarball(urls, requests)

    with pytest.raises(ArtifactNatureUndetected):
        fallback_url = "https://example.org/mirror/crates/package/download"
        # still no extension, cannot detect
        requests_mock.head(url, headers={"location": fallback_url})
        is_tarball(urls, requests)
+
+
+@pytest.mark.parametrize(
+ "fallback_url, expected_result",
+ [
+ ("https://example.org/mirror/crates/package/download.tar.gz", True),
+ ("https://example.org/mirror/package/download.lisp", False),
+ ],
+)
+def test_is_tarball_complex_with_location_result(
+ requests_mock, fallback_url, expected_result
+):
+ """Complex tarball detection with information should detect artifact nature"""
+ # No extension, this won't detect immediately the nature of the url
+ url = "https://example.org/crates/package/download"
+ urls = [url]
+
+ # One scenario where the url renders a location with a proper extension
+ requests_mock.head(url, headers={"location": fallback_url})
+ # the `requests` module itself serves as the head()-capable client here
+ is_tar, origin = is_tarball(urls, requests)
+ assert is_tar == expected_result
+ if is_tar:
+ assert origin == fallback_url
+
+
+@pytest.mark.parametrize(
+ "content_type, expected_result",
+ [("application/json", False), ("application/something", False)]
+ + [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES],
+)
+def test_is_tarball_complex_with_content_type_result(
+ requests_mock, content_type, expected_result
+):
+ """Complex tarball detection with information should detect artifact nature"""
+ # No extension, this won't detect immediately the nature of the url
+ url = "https://example.org/crates/package/download"
+ urls = [url]
+
+ # One scenario where no Location is returned but a Content-Type header is
+ requests_mock.head(url, headers={"Content-Type": content_type})
+ is_tar, origin = is_tarball(urls, requests)
+ assert is_tar == expected_result
+ if is_tar:
+ assert origin == url
+
+
def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
    """NixGuixLister should list all origins per visit type"""
    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
    origin_upstream = "https://github.com/NixOS/nixpkgs"
    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)

    response = page_response(datadir, "nixpkgs")
    requests_mock.get(
        url,
        [{"json": response}],
    )

    expected_visit_types = defaultdict(int)
    # origin upstream is added as origin
    expected_nb_origins = 1
    expected_visit_types["git"] += 1
    for artifact in response["sources"]:
        # Each artifact is considered an origin (even "url" artifacts with mirror urls)
        expected_nb_origins += 1
        artifact_type = artifact["type"]
        if artifact_type in [
            "git",
            "svn",
            "hg",
        ]:
            expected_visit_types[artifact_type] += 1
        elif artifact_type == "url":
            # fix: do not rebind `url` (the mocked manifest endpoint above);
            # use a dedicated name for the artifact's own url
            artifact_url = artifact["urls"][0]
            if artifact_url.endswith((".c", ".txt")):
                expected_visit_types["content"] += 1
            else:
                expected_visit_types["directory"] += 1

    assert set(expected_visit_types.keys()) == {
        "content",
        "git",
        "svn",
        "hg",
        "directory",
    }

    listed_result = lister.run()

    # 1 page read is 1 origin
    nb_pages = expected_nb_origins
    assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    mapping_visit_types = defaultdict(int)

    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type in expected_visit_types
        # no last update is listed on those manifests
        assert listed_origin.last_update is None

        mapping_visit_types[listed_origin.visit_type] += 1

    assert dict(mapping_visit_types) == expected_visit_types
+
+
+def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
+ """NixGuixLister should ignore unsupported or incomplete origins"""
+ url = "https://guix.gnu.org/sources.json"
+ origin_upstream = "https://git.savannah.gnu.org/git/guix.git"
+ lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+ response = page_response(datadir, "guix")
+
+ requests_mock.get(
+ url,
+ [{"json": response}],
+ )
+ # Amongst artifacts, this url does not allow to determine its nature (tarball, file)
+ # It's ending up doing a http head query which ends up being 404, so it's skipped.
+ requests_mock.head(
+ "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
+ status_code=404,
+ )
+
+ listed_result = lister.run()
+ # only the origin upstream is listed; every other entry is unsupported or incomplete
+ assert listed_result == ListerStats(pages=1, origins=1)
+
+ scheduler_origins = lister.scheduler.get_listed_origins(
+ lister.lister_obj.id
+ ).results
+ assert len(scheduler_origins) == 1
+
+ assert scheduler_origins[0].visit_type == "git"
+
+
+def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock):
+ """NixGuixLister should propagate a manifest fetch failure and list nothing."""
+ url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+ origin_upstream = "https://github.com/NixOS/nixpkgs"
+ lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+ requests_mock.get(
+ url,
+ status_code=404,
+ )
+
+ with pytest.raises(requests.HTTPError): # listing cannot continue so stop
+ lister.run()
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+ assert len(scheduler_origins) == 0

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 7:39 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218510

Event Timeline