Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/nixguix/tests/test_lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
import json | import json | ||||
import logging | import logging | ||||
from pathlib import Path | from pathlib import Path | ||||
from typing import Dict, List | from typing import Dict, List | ||||
from urllib.parse import urlparse | |||||
import pytest | import pytest | ||||
import requests | import requests | ||||
from requests.exceptions import ConnectionError, InvalidSchema, SSLError | from requests.exceptions import ConnectionError, InvalidSchema, SSLError | ||||
from swh.lister import TARBALL_EXTENSIONS | from swh.lister import TARBALL_EXTENSIONS | ||||
from swh.lister.nixguix.lister import ( | from swh.lister.nixguix.lister import ( | ||||
DEFAULT_EXTENSIONS_TO_IGNORE, | |||||
POSSIBLE_TARBALL_MIMETYPES, | POSSIBLE_TARBALL_MIMETYPES, | ||||
ArtifactNatureMistyped, | ArtifactNatureMistyped, | ||||
ArtifactNatureUndetected, | ArtifactNatureUndetected, | ||||
ArtifactWithoutExtension, | |||||
NixGuixLister, | NixGuixLister, | ||||
is_tarball, | is_tarball, | ||||
url_endswith, | |||||
) | ) | ||||
from swh.lister.pattern import ListerStats | from swh.lister.pattern import ListerStats | ||||
logger = logging.getLogger(__name__)

# Upstream repository url and sources manifest url, keyed by instance name.
SOURCES = {
    name: {"repo": repo, "manifest": manifest}
    for name, repo, manifest in (
        (
            "guix",
            "https://git.savannah.gnu.org/cgit/guix.git/",
            "https://guix.gnu.org/sources.json",
        ),
        (
            "nixpkgs",
            "https://github.com/NixOS/nixpkgs",
            "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
        ),
    )
}
def page_response(datadir, instance: str = "success") -> List[Dict]:
    """Return list of repositories (out of test dataset).

    Args:
        datadir: Directory holding the ``sources-<instance>.json`` fixtures.
        instance: Suffix selecting which fixture file to load.

    Returns:
        The decoded JSON content of the fixture file, or an empty list when
        that file does not exist.
    """
    datapath = Path(datadir, f"sources-{instance}.json")
    # ``exists`` has to be called: the bare bound method is always truthy,
    # which made the empty-list fallback unreachable.
    return json.loads(datapath.read_text()) if datapath.exists() else []
@pytest.mark.parametrize(
    "name,expected_result",
    [
        *((f"one.{ext}", True) for ext in TARBALL_EXTENSIONS),
        *((f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS),
        *(
            (f"one?p0=1&foo=bar.{ext}", True)
            for ext in DEFAULT_EXTENSIONS_TO_IGNORE
        ),
        ("two?file=something.el", False),
        ("foo?two=two&three=three", False),
    ],
)
def test_url_endswith(name, expected_result):
    """Check url_endswith against urls whose path or query params carry an extension.

    Each case builds ``https://example.org/<name>`` and expects the call, made
    with ``raise_when_no_extension=False``, to return ``expected_result``.
    """
    parsed_url = urlparse(f"https://example.org/{name}")
    known_extensions = TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE

    outcome = url_endswith(
        parsed_url,
        known_extensions,
        raise_when_no_extension=False,
    )

    assert outcome is expected_result
def test_url_endswith_raise():
    """url_endswith called with its default flags must raise
    ArtifactWithoutExtension for this url without extension."""
    extensionless_url = "https://example.org/foo?two=two&three=three"
    parsed_url = urlparse(extensionless_url)

    with pytest.raises(ArtifactWithoutExtension):
        url_endswith(parsed_url, ["unimportant"])
@pytest.mark.parametrize( | |||||
"tarballs", | "tarballs", | ||||
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] | [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] | ||||
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], | + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], | ||||
) | ) | ||||
def test_is_tarball_simple(tarballs): | def test_is_tarball_simple(tarballs): | ||||
"""Simple check on tarball should discriminate between tarball and file""" | """Simple check on tarball should discriminate between tarball and file""" | ||||
urls = [f"https://example.org/{tarball}" for tarball in tarballs] | urls = [f"https://example.org/{tarball}" for tarball in tarballs] | ||||
is_tar, origin = is_tarball(urls) | is_tar, origin = is_tarball(urls) | ||||
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines | for listed_origin in scheduler_origins: | ||||
assert listed_origin.last_update is None | assert listed_origin.last_update is None | ||||
mapping_visit_types[listed_origin.visit_type] += 1 | mapping_visit_types[listed_origin.visit_type] += 1 | ||||
assert dict(mapping_visit_types) == expected_visit_types | assert dict(mapping_visit_types) == expected_visit_types | ||||
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): | def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock): | ||||
"""NixGuixLister should ignore unsupported or incomplete origins""" | """NixGuixLister should ignore unsupported or incomplete or to ignore origins""" | ||||
url = SOURCES["nixpkgs"]["manifest"] | url = SOURCES["nixpkgs"]["manifest"] | ||||
origin_upstream = SOURCES["nixpkgs"]["repo"] | origin_upstream = SOURCES["nixpkgs"]["repo"] | ||||
lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream) | lister = NixGuixLister( | ||||
swh_scheduler, | |||||
url=url, | |||||
origin_upstream=origin_upstream, | |||||
extensions_to_ignore=["foobar"], | |||||
) | |||||
response = page_response(datadir, "failure") | response = page_response(datadir, "failure") | ||||
anlambert: I think those could be set as default in the lister constructor as we are not interested in… | |||||
ardumont: Yes, it was on my mind initially, but I was unsure whether we wanted to hard-code it immediately. Note that I've also locally refactored the filtering a bit to reuse some code. As usual, thanks.
ardumont: done
requests_mock.get( | requests_mock.get( | ||||
url, | url, | ||||
[{"json": response}], | [{"json": response}], | ||||
) | ) | ||||
# Amongst artifacts, this url does not allow to determine its nature (tarball, file) | # Amongst artifacts, this url does not allow to determine its nature (tarball, file) | ||||
# It's ending up doing a http head query which ends up being 404, so it's skipped. | # It's ending up doing a http head query which ends up being 404, so it's skipped. | ||||
requests_mock.head( | requests_mock.head( | ||||
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines |
I think those could be set as default in the lister constructor as we are not interested in archiving binary files. You could then extend that default list through configuration if we encounter other extensions not related to source code archives.