diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
index 78f58f2..0253a32 100644
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -1,183 +1,187 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
from swh.lister import USER_AGENT
-from swh.lister.pattern import StatelessLister
+from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
Repositories = List[Dict[str, Any]]
class CGitLister(StatelessLister[Repositories]):
"""Lister class for CGit repositories.
This lister will retrieve the list of published git repositories by
parsing the HTML page(s) of the index retrieved at `url`.
For each found git repository, a query is made at the given url found
in this index to gather published "Clone" URLs to be used as origin
URL for that git repo.
If several "Clone" urls are provided, prefer the http/https one, if
any, otherwise fallback to the first one.
"""
LISTER_NAME = "cgit"
def __init__(
- self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None
+ self,
+ scheduler: SchedulerInterface,
+ url: str,
+ instance: Optional[str] = None,
+ credentials: Optional[CredentialsType] = None,
):
"""Lister class for CGit repositories.
Args:
url (str): main URL of the CGit instance, i.e. url of the index
of published git repositories on this instance.
instance (str): Name of cgit instance. Defaults to url's hostname
if unset.
"""
if not instance:
instance = urlparse(url).hostname
assert instance is not None # Make mypy happy
super().__init__(
- scheduler=scheduler, credentials=None, url=url, instance=instance,
+ scheduler=scheduler, url=url, instance=instance, credentials=credentials,
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
def _get_and_parse(self, url: str) -> BeautifulSoup:
    """Fetch ``url`` with the lister's session and return the parsed HTML.

    The response status is checked with ``raise_for_status`` before the
    body is handed to BeautifulSoup's "html.parser" backend.
    """
    page = self.session.get(url)
    page.raise_for_status()
    html = page.text
    return BeautifulSoup(html, features="html.parser")
def get_pages(self) -> Iterator[Repositories]:
    """Generate git 'project' URLs found on the current CGit server.

    The last update date is read from the repository index itself, because
    the repository detail pages only give one date per branch.

    Yields:
        One list per index page. Each entry is a dict with the keys
        "url" (repository page URL, resolved against the index URL) and
        "last_updated_date" (raw ``title`` of the row's age span, or None
        when the row has no such span).
    """
    next_page: Optional[str] = self.url
    while next_page:
        bs_idx = self._get_and_parse(next_page)
        page_results = []

        for tr in bs_idx.find("div", {"class": "content"}).find_all(
            "tr", {"class": ""}
        ):
            url = urljoin(self.url, tr.find("a")["href"])
            span = tr.find("span", {"class": re.compile("age-")})
            last_updated_date = span["title"] if span else None

            page_results.append(
                {"url": url, "last_updated_date": last_updated_date}
            )

        yield page_results

        # Assume there is no further page until the pager proves otherwise.
        # Without this reset, a pager lacking a "current" link would leave
        # next_page untouched and the same page would be fetched forever.
        next_page = None
        try:
            pager = bs_idx.find("ul", {"class": "pager"})
            current_page = pager.find("a", {"class": "current"})
            if current_page:
                next_href = current_page.parent.next_sibling.a["href"]
                next_page = urljoin(self.url, next_href)
        except (AttributeError, KeyError, TypeError):
            # no pager, or no next page (TypeError: the sibling element
            # holds no anchor, so subscripting None failed)
            next_page = None
def get_origins_from_page(
    self, repositories: Repositories
) -> Iterator[ListedOrigin]:
    """Turn one index page of cgit repositories into ListedOrigin objects.

    Repositories for which no clone URL could be determined are skipped.
    """
    assert self.lister_obj.id is not None

    for repo in repositories:
        clone_url = self._get_origin_from_repository_url(repo["url"])
        if clone_url is not None:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=clone_url,
                visit_type="git",
                last_update=_parse_last_updated_date(repo),
            )
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
    """Return the clone URL advertised on a repository page, if any.

    None is returned when the page cannot be fetched (the HTTP error is
    logged as a warning) or when it has no ``rel="vcs-git"`` link.
    Otherwise the first http/https link wins, falling back to the first
    advertised link of any scheme.
    """
    try:
        bs = self._get_and_parse(repository_url)
    except HTTPError as e:
        logger.warning(
            "Unexpected HTTP status code %s on %s",
            e.response.status_code,
            e.response.url,
        )
        return None

    # origin urls are listed on the repository page
    # TODO check if forcing https is better or not ?
    clone_urls = [link["href"] for link in bs.find_all("a", {"rel": "vcs-git"})]
    if not clone_urls:
        return None

    # lazily scan for an http/https url; default to the first url listed
    web_urls = (
        candidate
        for candidate in clone_urls
        if urlparse(candidate).scheme in ("http", "https")
    )
    return next(web_urls, clone_urls[0])
def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]:
"""Parse the last updated date"""
date = repository.get("last_updated_date")
if not date:
return None
parsed_date = None
for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"):
try:
parsed_date = datetime.strptime(date, date_format)
# force UTC to avoid naive datetime
if not parsed_date.tzinfo:
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
break
except Exception:
pass
if not parsed_date:
logger.warning(
"Could not parse %s last_updated date: %s", repository["url"], date,
)
return parsed_date
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
index f36f956..cf4c9fc 100644
--- a/swh/lister/cgit/tests/test_lister.py
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -1,162 +1,198 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
from typing import List
import pytest
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.lister import __version__
from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date
from swh.lister.pattern import ListerStats
def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler):
    """get_pages on the savannah index returns every listed repository,
    with hrefs resolved against the index URL."""
    index_url = "https://git.savannah.gnu.org/cgit/"
    lister_cgit = CGitLister(swh_scheduler, url=index_url)

    pages = list(lister_cgit.get_pages())
    all_repos = [repo for page in pages for repo in page]
    assert len(all_repos) == 977

    first_repo = all_repos[0]
    assert first_repo["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"

    # note the url below is NOT a subpath of /cgit/
    last_repo = all_repos[-1]
    assert last_repo["url"] == "https://git.savannah.gnu.org/path/to/yetris.git/"

    # note the url below is NOT on the same server
    second_to_last_repo = all_repos[-2]
    assert second_to_last_repo["url"] == "http://example.org/cgit/xstarcastle.git/"
def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
    """get_pages follows the pager and yields one result list per page."""
    lister_cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit/")

    pages = list(lister_cgit.get_pages())

    # we should have 16 repos (listed on 3 pages)
    assert len(pages) == 3
    assert sum(len(page) for page in pages) == 16
def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
    """cgit lister supports pagination"""
    expected_nb_origins = 16
    lister_cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit/")

    stats = lister_cgit.run()
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)

    # test page parsing
    listed = swh_scheduler.get_listed_origins(lister_cgit.lister_obj.id)
    scheduler_origins = listed.results
    assert len(scheduler_origins) == expected_nb_origins

    # test listed repositories
    for origin in scheduler_origins:
        assert origin.visit_type == "git"
        assert origin.url.startswith("https://git.tizen")

    # test user agent content
    history = requests_mock_datadir.request_history
    assert len(history) != 0
    for sent_request in history:
        assert "User-Agent" in sent_request.headers
        user_agent = sent_request.headers["User-Agent"]
        assert "Software Heritage Lister" in user_agent
        assert __version__ in user_agent
def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler):
    """cgit lister returns last updated date"""
    expected_nb_origins = 16
    urls_without_date = [
        f"https://git.tizen.org/cgit/{suffix_url}"
        for suffix_url in ("All-Projects", "All-Users", "Lock-Projects")
    ]

    lister_cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit")
    stats = lister_cgit.run()
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)

    # test page parsing
    listed = swh_scheduler.get_listed_origins(lister_cgit.lister_obj.id)
    scheduler_origins = listed.results
    assert len(scheduler_origins) == expected_nb_origins

    # test listed repositories: a date is expected exactly for the origins
    # that are not in the dateless list
    for origin in scheduler_origins:
        expects_date = origin.url not in urls_without_date
        assert (origin.last_update is not None) == expects_date
@pytest.mark.parametrize(
    "date_str,expected_date",
    [
        ({}, None),
        ("unexpected date", None),
        ("2020-0140-10 10:10:10 (GMT)", None),
        (
            "2020-01-10 10:10:10 (GMT)",
            datetime(2020, 1, 10, 10, 10, 10, tzinfo=timezone.utc),
        ),
        (
            "2019-08-04 05:10:41 +0100",
            datetime(2019, 8, 4, 5, 10, 41, tzinfo=timezone(timedelta(hours=1))),
        ),
    ],
)
def test_lister_cgit_date_parsing(date_str, expected_date):
    """test cgit lister date parsing"""
    scraped_row = {"url": "url", "last_updated_date": date_str}

    parsed = _parse_last_updated_date(scraped_row)

    assert parsed == expected_date
# Same canned responses as requests_mock_datadir, minus one repository page.
requests_mock_datadir_missing_url = requests_mock_datadir_factory(
    ignore_urls=["https://git.tizen/cgit/adaptation/ap_samsung/audio-hal-e4x12/"]
)


def test_lister_cgit_get_origin_from_repo_failing(
    requests_mock_datadir_missing_url, swh_scheduler
):
    """A repository page that cannot be fetched is skipped, not fatal."""
    lister_cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit/")

    stats = lister_cgit.run()

    expected_nb_origins = 15
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)
+
+
+@pytest.mark.parametrize(
+ "credentials, expected_credentials",
+ [
+ (None, []),
+ ({"key": "value"}, []),
+ (
+ {"cgit": {"tizen": [{"username": "user", "password": "pass"}]}},
+ [{"username": "user", "password": "pass"}],
+ ),
+ ],
+)
+def test_lister_cgit_instantiation_with_credentials(
+ credentials, expected_credentials, swh_scheduler
+):
+ url = "https://git.tizen/cgit/"
+ lister = CGitLister(
+ swh_scheduler, url=url, instance="tizen", credentials=credentials
+ )
+
+ # Credentials are allowed in constructor
+ assert lister.credentials == expected_credentials
+
+
+def test_lister_cgit_from_configfile(swh_scheduler_config, mocker):
+ load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
+ load_from_envvar.return_value = {
+ "scheduler": {"cls": "local", **swh_scheduler_config},
+ "url": "https://git.tizen/cgit/",
+ "instance": "tizen",
+ "credentials": {},
+ }
+ lister = CGitLister.from_configfile()
+ assert lister.scheduler is not None
+ assert lister.credentials is not None
diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py
index 17d34a9..f348221 100644
--- a/swh/lister/cgit/tests/test_tasks.py
+++ b/swh/lister/cgit/tests/test_tasks.py
@@ -1,37 +1,35 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from unittest.mock import patch
-
from swh.lister.pattern import ListerStats
def test_cgit_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The cgit ping task runs to completion and answers "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.cgit.tasks.ping")
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"
-@patch("swh.lister.cgit.tasks.CGitLister")
def test_cgit_lister_task(
- lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
+ swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
# setup the mocked CGitLister
+ lister = mocker.patch("swh.lister.cgit.tasks.CGitLister")
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=10, origins=500)
kwargs = dict(url="https://git.kernel.org/", instance="kernel")
res = swh_scheduler_celery_app.send_task(
"swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,
)
assert res
res.wait()
assert res.successful()
lister.from_configfile.assert_called_once_with(**kwargs)
lister.run.assert_called_once_with()
diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
index 7dd8b08..89468c6 100644
--- a/swh/lister/cran/lister.py
+++ b/swh/lister/cran/lister.py
@@ -1,125 +1,129 @@
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import json
import logging
import subprocess
from typing import Dict, Iterator, List, Optional, Tuple
import pkg_resources
-from swh.lister.pattern import StatelessLister
+from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
CRAN_MIRROR = "https://cran.r-project.org"
PageType = List[Dict[str, str]]
class CRANLister(StatelessLister[PageType]):
"""
List all packages hosted on The Comprehensive R Archive Network.
"""
LISTER_NAME = "CRAN"
def __init__(
- self, scheduler: SchedulerInterface,
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
):
- super().__init__(scheduler, url=CRAN_MIRROR, instance="cran")
+ super().__init__(
+ scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials
+ )
def get_pages(self) -> Iterator[PageType]:
    """
    Yields a single page containing all CRAN packages info.
    """
    all_packages = read_cran_data()
    yield all_packages
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
    """Build one "tar" ListedOrigin per package entry of the page.

    Each origin carries a single artifact (tarball URL and version) in its
    extra loader arguments.
    """
    assert self.lister_obj.id is not None

    for package_info in page:
        origin_url, artifact_url = compute_origin_urls(package_info)
        artifact = {"url": artifact_url, "version": package_info["Version"]}

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_url,
            visit_type="tar",
            last_update=parse_packaged_date(package_info),
            extra_loader_arguments={"artifacts": [artifact]},
        )
def read_cran_data() -> List[Dict[str, str]]:
    """
    Execute the packaged ``list_all_packages.R`` script and decode the JSON
    it prints on its standard output.

    Returns:
        List of Dict about R packages. For example::

            [
                {
                    'Package': 'A3',
                    'Version': '1.0.0'
                },
                {
                    'Package': 'abbyyR',
                    'Version': '0.5.4'
                },
                ...
            ]
    """
    script_path = pkg_resources.resource_filename(
        "swh.lister.cran", "list_all_packages.R"
    )
    logger.debug("Executing R script %s", script_path)

    completed = subprocess.run(script_path, stdout=subprocess.PIPE, shell=False)
    raw_json = completed.stdout.decode("utf-8")
    return json.loads(raw_json)
def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]:
    """Compute the origin and artifact urls of a package.

    Args:
        package_info: dict with keys 'Package' and 'Version'

    Returns:
        the tuple project url, artifact url

    Raises:
        KeyError: if 'Package' or 'Version' is missing

    """
    name, release = package_info["Package"], package_info["Version"]
    return (
        f"{CRAN_MIRROR}/package={name}",
        f"{CRAN_MIRROR}/src/contrib/{name}_{release}.tar.gz",
    )
def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]:
    """Parse the "Packaged" field of a CRAN package description.

    Args:
        package_info: package description; the optional "Packaged" entry
            holds the date text followed by ";" and further text.

    Returns:
        The packaging date as a UTC-aware datetime, or None when the field
        is missing, empty, or in none of the known formats (a debug message
        is logged in the latter case).
    """
    packaged_at_str = package_info.get("Packaged", "")
    if not packaged_at_str:
        return None

    # (text ending the date part, strptime format), tried in this order
    known_layouts = (
        # Packaged field format: "%Y-%m-%d %H:%M:%S UTC; "
        (" UTC;", "%Y-%m-%d %H:%M:%S"),
        # Some old packages have a different date format:
        # "%a %b %d %H:%M:%S %Y; "
        (";", "%a %b %d %H:%M:%S %Y"),
    )
    for terminator, date_format in known_layouts:
        try:
            date_part = packaged_at_str.split(terminator)[0]
            return datetime.strptime(date_part, date_format).replace(
                tzinfo=timezone.utc
            )
        except (ValueError, TypeError, AttributeError):
            # ValueError: layout mismatch; the other two cover a field that
            # is not a string, so parsing stays best-effort
            continue

    # .get(): a missing "Package" must not make the log call itself raise
    logger.debug(
        "Could not parse %s package release date: %s",
        package_info.get("Package"),
        packaged_at_str,
    )
    return None
diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py
index 1530a8c..700545e 100644
--- a/swh/lister/cran/tests/test_lister.py
+++ b/swh/lister/cran/tests/test_lister.py
@@ -1,91 +1,122 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import json
from os import path
import pytest
from swh.lister.cran.lister import (
CRAN_MIRROR,
CRANLister,
compute_origin_urls,
parse_packaged_date,
)
def test_cran_compute_origin_urls():
    """Origin and artifact urls are derived from the package name/version."""
    name, release = "something", "0.0.1"
    package_info = {"Package": name, "Version": release}

    origin_url, artifact_url = compute_origin_urls(package_info)

    assert origin_url == f"{CRAN_MIRROR}/package={name}"
    assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{name}_{release}.tar.gz"
def test_cran_compute_origin_urls_failure():
    """A description lacking 'Package' or 'Version' raises KeyError."""
    incomplete_infos = ({"Version": "0.0.1"}, {"Package": "package"}, {})
    for package_info in incomplete_infos:
        with pytest.raises(KeyError):
            compute_origin_urls(package_info)
def test_parse_packaged_date():
    """Both known "Packaged" layouts parse to UTC; anything else gives None."""
    dated_cases = [
        # common date format
        (
            {"Package": "test", "Packaged": "2017-04-26 11:36:15 UTC; Jonathan"},
            datetime(2017, 4, 26, 11, 36, 15, tzinfo=timezone.utc),
        ),
        # old date format
        (
            {"Package": "test", "Packaged": "Thu Mar 30 10:48:35 2006; hornik"},
            datetime(2006, 3, 30, 10, 48, 35, tzinfo=timezone.utc),
        ),
    ]
    for package_info, expected in dated_cases:
        assert parse_packaged_date(package_info) == expected

    # invalid date format
    assert parse_packaged_date({"Package": "test", "Packaged": "foo"}) is None

    # missing date
    assert parse_packaged_date({"Package": "test"}) is None
def test_cran_lister_cran(datadir, swh_scheduler, mocker):
    """Run the CRAN lister on canned package data and check every origin
    recorded in the scheduler (url, artifacts and last update)."""
    with open(path.join(datadir, "list-r-packages.json")) as f:
        cran_data = json.loads(f.read())

    lister = CRANLister(swh_scheduler)

    # serve the canned data instead of executing the R script
    mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
    mock_cran.return_value = cran_data

    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == len(cran_data)

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == len(cran_data)

    for package_info in cran_data:
        origin_url, artifact_url = compute_origin_urls(package_info)

        filtered_origins = [o for o in scheduler_origins if o.url == origin_url]
        assert len(filtered_origins) == 1

        assert filtered_origins[0].extra_loader_arguments == {
            "artifacts": [{"url": artifact_url, "version": package_info["Version"]}]
        }

        # this comparison used to be a bare expression whose result was
        # discarded, so last_update was never actually verified
        assert filtered_origins[0].last_update == parse_packaged_date(package_info)
+
+
+@pytest.mark.parametrize(
+ "credentials, expected_credentials",
+ [
+ (None, []),
+ ({"key": "value"}, []),
+ (
+ {"CRAN": {"cran": [{"username": "user", "password": "pass"}]}},
+ [{"username": "user", "password": "pass"}],
+ ),
+ ],
+)
+def test_lister_cran_instantiation_with_credentials(
+ credentials, expected_credentials, swh_scheduler
+):
+ lister = CRANLister(swh_scheduler, credentials=credentials)
+
+ # Credentials are allowed in constructor
+ assert lister.credentials == expected_credentials
+
+
+def test_lister_cran_from_configfile(swh_scheduler_config, mocker):
+ load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
+ load_from_envvar.return_value = {
+ "scheduler": {"cls": "local", **swh_scheduler_config},
+ "credentials": {},
+ }
+ lister = CRANLister.from_configfile()
+ assert lister.scheduler is not None
+ assert lister.credentials is not None
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
index c680556..ae9874b 100644
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -1,72 +1,76 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
-from typing import Iterator, List
+from typing import Iterator, List, Optional
import requests
import xmltodict
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
-from ..pattern import StatelessLister
+from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
PackageListPage = List[str]
class PyPILister(StatelessLister[PackageListPage]):
"""List origins from PyPI.
"""
LISTER_NAME = "pypi"
INSTANCE = "pypi" # As of today only the main pypi.org is used
PACKAGE_LIST_URL = "https://pypi.org/simple/"
PACKAGE_URL = "https://pypi.org/project/{package_name}/"
- def __init__(self, scheduler: SchedulerInterface):
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ ):
super().__init__(
scheduler=scheduler,
- credentials=None,
url=self.PACKAGE_LIST_URL,
instance=self.INSTANCE,
+ credentials=credentials,
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
def get_pages(self) -> Iterator[PackageListPage]:
    """Fetch the package index and yield one page with the text of every
    anchor found in its body."""
    response = self.session.get(self.PACKAGE_LIST_URL)
    response.raise_for_status()

    index = xmltodict.parse(response.content)
    anchors = index["html"]["body"]["a"]

    yield [anchor["#text"] for anchor in anchors]
def get_origins_from_page(
    self, packages_name: PackageListPage
) -> Iterator[ListedOrigin]:
    """Convert a page of PyPI package names into ListedOrigin objects."""
    assert self.lister_obj.id is not None

    for name in packages_name:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=self.PACKAGE_URL.format(package_name=name),
            visit_type="pypi",
            last_update=None,  # available on PyPI JSON API
        )
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
index 43b301c..1e613de 100644
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -1,80 +1,111 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from typing import List
import pytest
import requests
from swh.lister.pypi.lister import PyPILister
from swh.scheduler.model import ListedOrigin
@pytest.fixture
def pypi_packages_testdata(datadir):
    """Return (raw index bytes, expected package names, expected urls)."""
    names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
    urls = [PyPILister.PACKAGE_URL.format(package_name=name) for name in names]
    content = Path(datadir, "https_pypi.org", "simple").read_bytes()
    return content, names, urls
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
    """Asserts that the two collections have the same origin URLs"""
    expected_urls = sorted(lister_urls)
    origins = sorted(scheduler_origins)

    assert len(expected_urls) == len(origins)

    for expected_url, origin in zip(expected_urls, origins):
        assert expected_url == origin.url
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
    """Listing the mocked index records one origin per package, using a
    single HTTP request and a single page."""
    index_bytes, package_names, package_urls = pypi_packages_testdata
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=index_bytes)

    lister = PyPILister(scheduler=swh_scheduler)
    lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
    lister.session.get = mocker.spy(lister.session, "get")

    stats = lister.run()

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
    lister.get_origins_from_page.assert_called_once_with(package_names)

    assert stats.pages == 1
    assert stats.origins == 4
    assert len(scheduler_origins) == 4

    check_listed_origins(package_urls, scheduler_origins)

    assert lister.get_state_from_scheduler() is None
@pytest.mark.parametrize("http_code", [400, 429, 500])
def test_pypi_list_http_error(swh_scheduler, requests_mock, mocker, http_code):
    """An HTTP error on the index aborts the run and records no origin."""
    error_response = {"content": None, "status_code": http_code}
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, [error_response])

    lister = PyPILister(scheduler=swh_scheduler)
    lister.session.get = mocker.spy(lister.session, "get")

    with pytest.raises(requests.HTTPError):
        lister.run()

    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)

    recorded = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == 0
+
+
+@pytest.mark.parametrize(
+ "credentials, expected_credentials",
+ [
+ (None, []),
+ ({"key": "value"}, []),
+ (
+ {"pypi": {"pypi": [{"username": "user", "password": "pass"}]}},
+ [{"username": "user", "password": "pass"}],
+ ),
+ ],
+)
+def test_lister_pypi_instantiation_with_credentials(
+ credentials, expected_credentials, swh_scheduler
+):
+ lister = PyPILister(swh_scheduler, credentials=credentials)
+
+ # Credentials are allowed in constructor
+ assert lister.credentials == expected_credentials
+
+
+def test_lister_pypi_from_configfile(swh_scheduler_config, mocker):
+ load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
+ load_from_envvar.return_value = {
+ "scheduler": {"cls": "local", **swh_scheduler_config},
+ "credentials": {},
+ }
+ lister = PyPILister.from_configfile()
+ assert lister.scheduler is not None
+ assert lister.credentials is not None