Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pypi/tests/test_lister.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from pathlib import Path | from collections import defaultdict | ||||
from datetime import datetime, timezone | |||||
from typing import List | from typing import List | ||||
import pytest | import pytest | ||||
import requests | |||||
from swh.lister.pypi.lister import PyPILister | from swh.lister.pypi.lister import ChangelogEntry, PyPILister, pypi_url | ||||
from swh.scheduler.interface import SchedulerInterface | |||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
@pytest.fixture
def pypi_packages_testdata(datadir):
    """Provide the canned PyPI "simple" index along with the matching packages.

    Returns:
        A ``(content, names, urls)`` triple: the raw bytes of the
        ``https_pypi.org/simple`` file located under ``datadir``, a hard-coded
        list of package names, and the origin URLs built for those names out of
        ``PyPILister.PACKAGE_URL``.

    """
    simple_index = Path(datadir) / "https_pypi.org" / "simple"
    raw_index = simple_index.read_bytes()

    package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
    package_urls = [
        PyPILister.PACKAGE_URL.format(package_name=name) for name in package_names
    ]

    return raw_index, package_names, package_urls
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
    """Assert that the lister URLs and the scheduler origins match one to one.

    Both collections are compared irrespective of their initial order.

    Args:
        lister_urls: Origin URLs the lister is expected to have produced
        scheduler_origins: Origins recorded in the scheduler; only their ``url``
            attribute is read

    Raises:
        AssertionError: if the collections differ in size or in any URL

    """
    expected_urls = sorted(lister_urls)
    # Order on the URL itself, the only field under comparison, instead of
    # depending on whatever ordering the origin objects happen to implement.
    actual_urls = sorted(origin.url for origin in scheduler_origins)

    assert len(expected_urls) == len(actual_urls)

    for expected_url, actual_url in zip(expected_urls, actual_urls):
        assert expected_url == actual_url
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
    """A listing run fetches the package index once and records every package."""
    index_content, package_names, package_urls = pypi_packages_testdata
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=index_content)

    lister = PyPILister(scheduler=swh_scheduler)
    # Spy on the page processing and on the HTTP call to check how they are used
    lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
    lister.session.get = mocker.spy(lister.session, "get")

    stats = lister.run()

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
    scheduler_origins = listed.results

    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
    lister.get_origins_from_page.assert_called_once_with(package_names)

    assert stats.pages == 1
    assert stats.origins == 4

    assert len(scheduler_origins) == 4
    check_listed_origins(package_urls, scheduler_origins)

    # This listing is expected to leave no state behind in the scheduler
    assert lister.get_state_from_scheduler() is None
@pytest.mark.parametrize("http_code", [400, 429, 500])
def test_pypi_list_http_error(swh_scheduler, requests_mock, mocker, http_code):
    """An HTTP error on the package index aborts the run and lists no origin."""
    error_response = {"content": None, "status_code": http_code}
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, [error_response])

    lister = PyPILister(scheduler=swh_scheduler)
    lister.session.get = mocker.spy(lister.session, "get")

    with pytest.raises(requests.HTTPError):
        lister.run()

    # The index got requested exactly once before the run failed
    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
    assert len(listed.results) == 0
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"credentials, expected_credentials", | "credentials, expected_credentials", | ||||
[ | [ | ||||
(None, []), | (None, []), | ||||
({"key": "value"}, []), | ({"key": "value"}, []), | ||||
( | ( | ||||
{"pypi": {"pypi": [{"username": "user", "password": "pass"}]}}, | {"pypi": {"pypi": [{"username": "user", "password": "pass"}]}}, | ||||
[{"username": "user", "password": "pass"}], | [{"username": "user", "password": "pass"}], | ||||
Show All 13 Lines | def test_lister_pypi_from_configfile(swh_scheduler_config, mocker): | ||||
load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") | load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") | ||||
load_from_envvar.return_value = { | load_from_envvar.return_value = { | ||||
"scheduler": {"cls": "local", **swh_scheduler_config}, | "scheduler": {"cls": "local", **swh_scheduler_config}, | ||||
"credentials": {}, | "credentials": {}, | ||||
} | } | ||||
lister = PyPILister.from_configfile() | lister = PyPILister.from_configfile() | ||||
assert lister.scheduler is not None | assert lister.scheduler is not None | ||||
assert lister.credentials is not None | assert lister.credentials is not None | ||||
def to_serial(changelog_entry: ChangelogEntry) -> int:
    """Extract the serial out of a changelog entry.

    Args:
        changelog_entry: Changelog entry to read data from

    Returns:
        The serial from the entry

    """
    # The serial is the fifth element of the entry
    serial_position = 4
    return changelog_entry[serial_position]
def configure_scheduler_state(
    scheduler: SchedulerInterface, data: List[ChangelogEntry]
):
    """Seed the scheduler with a lister state that predates the test data set.

    The stored ``last_serial`` is set 10 below the smallest serial found in
    ``data``, so it is lower than every serial of the data set.

    Args:
        scheduler: The actual scheduler instance used during test
        data: The actual dataset used during test

    """
    # The smallest serial of the data set drives the state to store
    smallest_serial = min(to_serial(entry) for entry in data)
    initial_state = {"last_serial": smallest_serial - 10}

    # Register (or fetch) the lister, then persist its pre-configured state
    lister_obj = scheduler.get_or_create_lister(
        name=PyPILister.LISTER_NAME, instance_name=PyPILister.INSTANCE
    )
    lister_obj.current_state = initial_state
    scheduler.update_lister(lister_obj)
@pytest.fixture
def mock_pypi_xmlrpc(mocker, swh_scheduler):
    """Fake the calls to the PyPI XML-RPC service made during an incremental listing.

    ``swh.lister.pypi.lister.ServerProxy`` is patched so that instantiating it
    yields a fake proxy serving the canned changelog below, and
    ``swh.lister.pypi.lister.sleep`` is patched out.

    To retrieve or update the faked data, open a python3 toplevel and execute the
    following:

    .. code:: python

        from datetime import timezone, datetime, timedelta

        from xmlrpc.client import ServerProxy

        from swh.scheduler.utils import utcnow

        RPC_URL = "https://pypi.org/pypi"
        cli = ServerProxy(RPC_URL)

        last_serial = cli.changelog_last_serial()
        # 10854808

        last_state_serial = 2168587
        results = cli.changelog_since_serial(last_state_serial)

    Returns:
        A ``(highest_serial, data, mock_serverproxy)`` tuple: the highest serial
        found in the canned changelog, the canned changelog entries themselves,
        and the mock standing in for ``ServerProxy``.

    """
    data = [
        ["wordsmith", None, 1465998124, "add Owner DoublePlusAwks", 2168628],
        ["wordsmith", "0.1", 1465998123, "new release", 2168629],
        ["wordsmith", "0.1", 1465998131, "update classifiers", 2168630],
        [
            "UFx",
            "1.0",
            1465998207,
            "update author_email, home_page, summary, description",
            2168631,
        ],
        ["UFx", "1.0", 1465998236, "remove file UFx-1.0.tar.gz", 2168632],
        ["wordsmith", "0.1", 1465998309, "update classifiers", 2168633],
        [
            "wordsmith",
            "0.1",
            1465998406,
            "update summary, description, classifiers",
            2168634,
        ],
        ["property-manager", "2.0", 1465998436, "new release", 2168635],
        [
            "property-manager",
            "2.0",
            1465998439,
            "add source file property-manager-2.0.tar.gz",
            2168636,
        ],
        ["numtest", "2.0.0", 1465998446, "new release", 2168637],
        ["property-manager", "2.1", 1465998468, "new release", 2168638],
        [
            "property-manager",
            "2.1",
            1465998472,
            "add source file property-manager-2.1.tar.gz",
            2168639,
        ],
        ["kafka-utils", "0.2.0", 1465998477, "new release", 2168640],
        [
            "kafka-utils",
            "0.2.0",
            1465998480,
            "add source file kafka-utils-0.2.0.tar.gz",
            2168641,
        ],
        ["numtest", "2.0.1", 1465998520, "new release", 2168642],
        ["coala-bears", "0.3.0.dev20160615134909", 1465998552, "new release", 2168643],
        [
            "coala-bears",
            "0.3.0.dev20160615134909",
            1465998556,
            "add py3 file coala_bears-0.3.0.dev20160615134909-py3-none-any.whl",
            2168644,
        ],
        ["django_sphinxsearch", "0.4.0", 1465998571, "new release", 2168645],
        [
            "django_sphinxsearch",
            "0.4.0",
            1465998573,
            "add source file django_sphinxsearch-0.4.0.tar.gz",
            2168646,
        ],
        [
            "coala-bears",
            "0.3.0.dev20160615134909",
            1465998589,
            "add source file coala-bears-0.3.0.dev20160615134909.tar.gz",
            2168647,
        ],
    ]

    # The fake service must advertise the *highest* serial of its changelog as
    # the last one; taking the minimum here would advertise the oldest entry.
    highest_serial = max(map(to_serial, data))

    # Replace ``sleep`` with a mock so nothing actually waits during the test
    mocker.patch("swh.lister.pypi.lister.sleep")

    class FakeServerProxy:
        """Fake Server Proxy"""

        def changelog_last_serial(self):
            return highest_serial

        def changelog_since_serial(self, serial):
            # The whole canned changelog is served whatever the requested serial
            return data

    mock_serverproxy = mocker.patch("swh.lister.pypi.lister.ServerProxy")
    mock_serverproxy.return_value = FakeServerProxy()

    return highest_serial, data, mock_serverproxy
@pytest.mark.parametrize("configure_state", [True, False])
def test_lister_pypi_run(mock_pypi_xmlrpc, swh_scheduler, configure_state):
    """A run against the faked XML-RPC service lists each updated package once.

    The scenario is played both with and without a lister state pre-configured
    in the scheduler.

    """
    highest_serial, data, mock_serverproxy = mock_pypi_xmlrpc

    if configure_state:
        configure_scheduler_state(swh_scheduler, data)

    # Gather the release timestamps of each package mentioned in the changelog
    release_dates_by_package = defaultdict(list)
    for package, _, release_date, _, _ in data:
        release_dates_by_package[package].append(release_date)

    assert len(release_dates_by_package) > 0

    # NOTE(review): fromtimestamp() without tz yields a naive *local* datetime
    # which is then merely labelled as UTC; presumably this mirrors the lister's
    # own conversion -- confirm before changing either side.
    expected_last_updates = {}
    for package, release_dates in release_dates_by_package.items():
        origin_url = pypi_url(package)
        latest_release = datetime.fromtimestamp(max(release_dates))
        expected_last_updates[origin_url] = latest_release.replace(
            tzinfo=timezone.utc
        )

    expected_pypi_urls = [pypi_url(package) for package in release_dates_by_package]

    lister = PyPILister(scheduler=swh_scheduler)
    stats = lister.run()

    assert mock_serverproxy.called
    assert stats.pages == 1
    assert stats.origins == len(release_dates_by_package)

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
    scheduler_origins = listed.results

    assert len(scheduler_origins) == stats.origins
    check_listed_origins(expected_pypi_urls, scheduler_origins)

    # This new visit updated the state to the new one
    new_state = lister.get_state_from_scheduler()
    assert new_state.last_serial == highest_serial

    for origin in scheduler_origins:
        assert origin.last_update is not None
        assert origin.last_update == expected_last_updates[origin.url]
def test__if_rate_limited():
    """Placeholder: this test has no body yet and checks nothing."""
    # TODO
mock_pypi_xmlrpc?