Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/nixguix/tests/test_lister.py
Show All 9 Lines | |||||
from typing import Dict, List | from typing import Dict, List | ||||
import pytest | import pytest | ||||
import requests | import requests | ||||
from swh.lister import TARBALL_EXTENSIONS | from swh.lister import TARBALL_EXTENSIONS | ||||
from swh.lister.nixguix.lister import ( | from swh.lister.nixguix.lister import ( | ||||
POSSIBLE_TARBALL_MIMETYPES, | POSSIBLE_TARBALL_MIMETYPES, | ||||
ArtifactNatureMistyped, | |||||
ArtifactNatureUndetected, | ArtifactNatureUndetected, | ||||
NixGuixLister, | NixGuixLister, | ||||
is_tarball, | is_tarball, | ||||
) | ) | ||||
from swh.lister.pattern import ListerStats | from swh.lister.pattern import ListerStats | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def page_response(datadir, instance: str) -> List[Dict]:
    """Return the repositories read from the test dataset of an instance.

    Args:
        datadir: Directory holding the test data files.
        instance: Instance name, used as the prefix of the
            ``<instance>-swh_sources.json`` file to load.

    Returns:
        The decoded JSON content of the dataset file, or an empty list when
        that file does not exist.
    """
    datapath = Path(datadir, f"{instance}-swh_sources.json")
    # ``exists`` has to be called: the bare bound method is always truthy,
    # which made the empty-list fallback unreachable and let a missing
    # dataset surface as a FileNotFoundError instead.
    return json.loads(datapath.read_text()) if datapath.exists() else []
@pytest.mark.parametrize(
    "tarballs",
    [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
    + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
def test_is_tarball_simple(tarballs):
    """Urls whose filename carries a tarball extension are detected as tarballs.

    Each parametrized case is a list of filenames (with or without a query
    string) which gets expanded into https urls before the check.
    """
    urls = [f"https://example.org/{name}" for name in tarballs]

    detected, origin_url = is_tarball(urls)

    assert detected is True
    # The first url of the list is the one reported as origin
    assert origin_url == urls[0]
@pytest.mark.parametrize(
    "files",
    [
        ["abc.lisp"],
        ["one.abc", "two.bcd"],
        ["abc.c", "other.c"],
        ["one.scm?foo=bar", "two.scm?foo=bar"],
        ["config.nix", "flakes.nix"],
    ],
)
def test_is_tarball_simple_not_tarball(files):
    """Urls whose filename has a non-tarball extension are detected as plain files.

    Each parametrized case is a list of filenames (with or without a query
    string) which gets expanded into http urls before the check.
    """
    urls = [f"http://example.org/{name}" for name in files]

    detected, origin_url = is_tarball(urls)

    assert detected is False
    # The first url of the list is still the one reported as origin
    assert origin_url == urls[0]
def test_is_tarball_complex_with_no_result(requests_mock):
    """Complex tarball detection without proper information should fail.

    Covers urls whose nature can be determined neither from their name nor
    from a HEAD query (expects ``ArtifactNatureUndetected``), and urls using an
    unsupported scheme (expects ``ArtifactNatureMistyped``).

    Mock registrations are deliberately kept outside the ``pytest.raises``
    blocks so that only the ``is_tarball`` call itself can satisfy the
    expected exception.
    """
    # No extension, this won't detect immediately the nature of the url
    url = "https://example.org/crates/package/download"
    urls = [url]

    # No request parameter, this cannot fallback, raises
    with pytest.raises(ArtifactNatureUndetected):
        is_tarball(urls)

    # Not found so cannot detect anything
    requests_mock.head(url, status_code=404)
    with pytest.raises(ArtifactNatureUndetected):
        is_tarball(urls, requests)

    # Response ok without headers, cannot detect anything
    requests_mock.head(url, headers={})
    with pytest.raises(ArtifactNatureUndetected):
        is_tarball(urls, requests)

    # Location header targets a url with still no extension, cannot detect
    fallback_url = "https://example.org/mirror/crates/package/download"
    requests_mock.head(url, headers={"location": fallback_url})
    with pytest.raises(ArtifactNatureUndetected):
        is_tarball(urls, requests)

    # Url with an unsupported scheme, checked without any request fallback
    with pytest.raises(ArtifactNatureMistyped):
        is_tarball(["foo://example.org/unsupported-scheme"])

    # Location header targets a url with an unsupported scheme
    fallback_url = "foo://example.org/unsupported-scheme"
    requests_mock.head(url, headers={"location": fallback_url})
    with pytest.raises(ArtifactNatureMistyped):
        is_tarball(urls, requests)
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"fallback_url, expected_result", | "fallback_url, expected_result", | ||||
[ | [ | ||||
("https://example.org/mirror/crates/package/download.tar.gz", True), | ("https://example.org/mirror/crates/package/download.tar.gz", True), | ||||
("https://example.org/mirror/package/download.lisp", False), | ("https://example.org/mirror/package/download.lisp", False), | ||||
], | ], | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | def test_lister_nixguix(datadir, swh_scheduler, requests_mock): | ||||
requests_mock.get( | requests_mock.get( | ||||
url, | url, | ||||
[{"json": response}], | [{"json": response}], | ||||
) | ) | ||||
expected_visit_types = defaultdict(int) | expected_visit_types = defaultdict(int) | ||||
# origin upstream is added as origin | # origin upstream is added as origin | ||||
expected_nb_origins = 1 | expected_nb_origins = 1 | ||||
expected_visit_types["git"] += 1 | expected_visit_types["git"] += 1 | ||||
ardumont: can be dropped now. | |||||
for artifact in response["sources"]: | for artifact in response["sources"]: | ||||
# Each artifact is considered an origin (even "url" artifacts with mirror urls) | # Each artifact is considered an origin (even "url" artifacts with mirror urls) | ||||
expected_nb_origins += 1 | expected_nb_origins += 1 | ||||
artifact_type = artifact["type"] | artifact_type = artifact["type"] | ||||
if artifact_type in [ | if artifact_type in [ | ||||
"git", | "git", | ||||
"svn", | "svn", | ||||
"hg", | "hg", | ||||
]: | ]: | ||||
expected_visit_types[artifact_type] += 1 | expected_visit_types[artifact_type] += 1 | ||||
elif artifact_type == "url": | elif artifact_type == "url": | ||||
url = artifact["urls"][0] | url = artifact["urls"][0] | ||||
if url.endswith(".c") or url.endswith(".txt"): | if url.endswith(".c") or url.endswith(".txt"): | ||||
expected_visit_types["content"] += 1 | expected_visit_types["content"] += 1 | ||||
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless | |||||
expected_visit_types["svn"] += 1 | |||||
else: | else: | ||||
expected_visit_types["directory"] += 1 | expected_visit_types["directory"] += 1 | ||||
assert set(expected_visit_types.keys()) == { | assert set(expected_visit_types.keys()) == { | ||||
"content", | "content", | ||||
"git", | "git", | ||||
"svn", | "svn", | ||||
"hg", | "hg", | ||||
Show All 36 Lines | requests_mock.get( | ||||
[{"json": response}], | [{"json": response}], | ||||
) | ) | ||||
# Amongst artifacts, this url does not allow to determine its nature (tarball, file) | # Amongst artifacts, this url does not allow to determine its nature (tarball, file) | ||||
# It's ending up doing a http head query which ends up being 404, so it's skipped. | # It's ending up doing a http head query which ends up being 404, so it's skipped. | ||||
requests_mock.head( | requests_mock.head( | ||||
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped", | "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped", | ||||
status_code=404, | status_code=404, | ||||
) | ) | ||||
# Will raise for that origin, this will get ignored as we cannot determine anything | |||||
# from its name | |||||
requests_mock.head( | |||||
"ftp://ftp.ourproject.org/file-with-no-extension", | |||||
exc=requests.exceptions.InvalidSchema, | |||||
) | |||||
listed_result = lister.run() | listed_result = lister.run() | ||||
# only the origin upstream is listed, every other entries are unsupported or incomplete | # only the origin upstream is listed, every other entries are unsupported or incomplete | ||||
assert listed_result == ListerStats(pages=1, origins=1) | assert listed_result == ListerStats(pages=1, origins=1) | ||||
scheduler_origins = lister.scheduler.get_listed_origins( | scheduler_origins = lister.scheduler.get_listed_origins( | ||||
lister.lister_obj.id | lister.lister_obj.id | ||||
).results | ).results | ||||
Show All 20 Lines |
can be dropped now.