Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pypi/tests/test_lister.py
| # Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
| # See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| from pathlib import Path | from collections import defaultdict | ||||
| from datetime import datetime, timezone | |||||
| from typing import List | from typing import List | ||||
| import pytest | import pytest | ||||
| import requests | |||||
| from swh.lister.pypi.lister import PyPILister | from swh.lister.pypi.lister import ChangelogEntry, PyPILister, pypi_url | ||||
| from swh.scheduler.interface import SchedulerInterface | |||||
| from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
@pytest.fixture
def pypi_packages_testdata(datadir):
    """Provide the canned package index page used by the full-listing tests.

    Returns:
        A 3-tuple made of the raw bytes of the ``https_pypi.org/simple`` file found
        under ``datadir``, a hard-coded list of package names, and the origin URLs
        built from those names with ``PyPILister.PACKAGE_URL``.

    """
    index_page = (Path(datadir) / "https_pypi.org" / "simple").read_bytes()
    package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
    package_urls = []
    for package_name in package_names:
        package_urls.append(PyPILister.PACKAGE_URL.format(package_name=package_name))
    return index_page, package_names, package_urls
def check_listed_origins(
    lister_urls: List[str], scheduler_origins: List[ListedOrigin]
):
    """Assert that both collections hold the same origin URLs.

    Each collection is sorted on its own and the two are then compared pairwise,
    so the order in which the caller provides the elements does not matter.

    Args:
        lister_urls: URLs the lister is expected to have produced
        scheduler_origins: Origins read back from the scheduler; each one must
          be orderable and expose a ``url`` attribute

    """
    expected_urls = sorted(lister_urls)
    recorded_origins = sorted(scheduler_origins)

    assert len(expected_urls) == len(recorded_origins)

    for expected_url, recorded_origin in zip(expected_urls, recorded_origins):
        assert expected_url == recorded_origin.url
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
    """Run a full listing against a mocked package index and check the outcome."""
    page_content, package_names, package_urls = pypi_packages_testdata

    # Serve the canned index page instead of hitting the network.
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=page_content)

    lister = PyPILister(scheduler=swh_scheduler)

    # Spy on both collaborators so that their calls can be checked after the run.
    lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
    lister.session.get = mocker.spy(lister.session, "get")

    stats = lister.run()

    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
    lister.get_origins_from_page.assert_called_once_with(package_names)

    assert stats.pages == 1
    assert stats.origins == 4
    assert len(listed_origins) == 4

    check_listed_origins(package_urls, listed_origins)

    # This listing is expected to leave no state behind in the scheduler.
    assert lister.get_state_from_scheduler() is None
@pytest.mark.parametrize("http_code", [400, 429, 500])
def test_pypi_list_http_error(swh_scheduler, requests_mock, mocker, http_code):
    """Check that an HTTP error on the package index aborts the listing.

    The run must raise ``requests.HTTPError`` after a single request, and no
    origin may have been recorded in the scheduler.
    """
    error_response = {"content": None, "status_code": http_code}
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, [error_response])

    lister = PyPILister(scheduler=swh_scheduler)
    lister.session.get = mocker.spy(lister.session, "get")

    with pytest.raises(requests.HTTPError):
        lister.run()

    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)

    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed_origins) == 0
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
| "credentials, expected_credentials", | "credentials, expected_credentials", | ||||
| [ | [ | ||||
| (None, []), | (None, []), | ||||
| ({"key": "value"}, []), | ({"key": "value"}, []), | ||||
| ( | ( | ||||
| {"pypi": {"pypi": [{"username": "user", "password": "pass"}]}}, | {"pypi": {"pypi": [{"username": "user", "password": "pass"}]}}, | ||||
| [{"username": "user", "password": "pass"}], | [{"username": "user", "password": "pass"}], | ||||
# (review view: "Show All 13 Lines" -- unchanged lines are collapsed above this point)
def test_lister_pypi_from_configfile(swh_scheduler_config, mocker):
    """Check that a lister built from the configuration is fully wired.

    The configuration loader is patched to return a local scheduler configuration
    and empty credentials.
    """
    load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
    scheduler_config = {"cls": "local", **swh_scheduler_config}
    load_from_envvar.return_value = {
        "scheduler": scheduler_config,
        "credentials": {},
    }

    lister = PyPILister.from_configfile()

    assert lister.scheduler is not None
    assert lister.credentials is not None
# NOTE(review): the diff view shows an inline comment at this position --
# olasd: `mock_pypi_xmlrpc`? (its anchor line may be stale; verify before acting)
def to_serial(changelog_entry: ChangelogEntry) -> int:
    """Read the serial out of a changelog entry.

    Args:
        changelog_entry: Changelog entry to read data from

    Returns:
        The serial, i.e. the element stored at index 4 of the entry

    """
    serial_index = 4
    return changelog_entry[serial_index]
# NOTE(review): the diff view shows two resolved ("Done") inline comments inside
# this function; their anchor lines may be stale:
#   ardumont: you were right earlier, see this ^
#   ardumont: mypy did not want it so fine it complained it wanted
#   `Tuple[datetime, List[object], Any]` which is not as informative as ^.
def configure_scheduler_state(
    scheduler: SchedulerInterface, data: List[ChangelogEntry]
):
    """Pre-configure a lister state in the scheduler, consistent with the test data.

    The stored ``last_serial`` is set 10 below the smallest serial found in the data
    set, so every entry of the data set is more recent than the recorded state.

    Args:
        scheduler: The actual scheduler instance used during test
        data: The actual dataset used during test

    """
    # The smallest serial of the data set is the reference for the stored state
    smallest_serial = min(to_serial(entry) for entry in data)

    # Fetch (or create) the lister entry, then persist the state on it
    lister_obj = scheduler.get_or_create_lister(
        name=PyPILister.LISTER_NAME, instance_name=PyPILister.INSTANCE
    )
    lister_obj.current_state = {"last_serial": smallest_serial - 10}
    scheduler.update_lister(lister_obj)
@pytest.fixture
def mock_pypi_xmlrpc(mocker, swh_scheduler):
    """Set up fakes so a lister can run an incremental listing without calling the
    real rpc service.

    ``ServerProxy`` and ``sleep`` are both patched in ``swh.lister.pypi.lister``.

    To retrieve or update the faked data, open a python3 toplevel and execute the
    following:

    .. code:: python

        from xmlrpc.client import ServerProxy

        RPC_URL = "https://pypi.org/pypi"
        cli = ServerProxy(RPC_URL)

        last_serial = cli.changelog_last_serial()
        # 10854808
        last_state_serial = 2168587
        results = cli.changelog_since_serial(last_state_serial)

    Returns:
        A 3-tuple ``(highest_serial, data, mock_serverproxy)``: the highest serial
        present in the faked changelog, the faked changelog entries themselves, and
        the mock which replaces ``ServerProxy``.

    """
    # Each entry is [package name, version, timestamp, action, serial]
    data = [
        ["wordsmith", None, 1465998124, "add Owner DoublePlusAwks", 2168628],
        ["wordsmith", "0.1", 1465998123, "new release", 2168629],
        ["wordsmith", "0.1", 1465998131, "update classifiers", 2168630],
        [
            "UFx",
            "1.0",
            1465998207,
            "update author_email, home_page, summary, description",
            2168631,
        ],
        ["UFx", "1.0", 1465998236, "remove file UFx-1.0.tar.gz", 2168632],
        ["wordsmith", "0.1", 1465998309, "update classifiers", 2168633],
        [
            "wordsmith",
            "0.1",
            1465998406,
            "update summary, description, classifiers",
            2168634,
        ],
        ["property-manager", "2.0", 1465998436, "new release", 2168635],
        [
            "property-manager",
            "2.0",
            1465998439,
            "add source file property-manager-2.0.tar.gz",
            2168636,
        ],
        ["numtest", "2.0.0", 1465998446, "new release", 2168637],
        ["property-manager", "2.1", 1465998468, "new release", 2168638],
        [
            "property-manager",
            "2.1",
            1465998472,
            "add source file property-manager-2.1.tar.gz",
            2168639,
        ],
        ["kafka-utils", "0.2.0", 1465998477, "new release", 2168640],
        [
            "kafka-utils",
            "0.2.0",
            1465998480,
            "add source file kafka-utils-0.2.0.tar.gz",
            2168641,
        ],
        ["numtest", "2.0.1", 1465998520, "new release", 2168642],
        ["coala-bears", "0.3.0.dev20160615134909", 1465998552, "new release", 2168643],
        [
            "coala-bears",
            "0.3.0.dev20160615134909",
            1465998556,
            "add py3 file coala_bears-0.3.0.dev20160615134909-py3-none-any.whl",
            2168644,
        ],
        ["django_sphinxsearch", "0.4.0", 1465998571, "new release", 2168645],
        [
            "django_sphinxsearch",
            "0.4.0",
            1465998573,
            "add source file django_sphinxsearch-0.4.0.tar.gz",
            2168646,
        ],
        [
            "coala-bears",
            "0.3.0.dev20160615134909",
            1465998589,
            "add source file coala-bears-0.3.0.dev20160615134909.tar.gz",
            2168647,
        ],
    ]

    # The last serial advertised by the fake server must be the most recent one of
    # the changelog, hence the maximum (not the minimum) of the serials.
    highest_serial = max(map(to_serial, data))

    # Replace sleep with a mock so that the test never actually waits.
    mocker.patch("swh.lister.pypi.lister.sleep")

    class FakeServerProxy:
        """Fake Server Proxy"""

        def changelog_last_serial(self):
            return highest_serial

        def changelog_since_serial(self, serial):
            # The whole data set is returned, whatever serial is requested
            return data

    mock_serverproxy = mocker.patch("swh.lister.pypi.lister.ServerProxy")
    mock_serverproxy.return_value = FakeServerProxy()

    return highest_serial, data, mock_serverproxy
@pytest.mark.parametrize("configure_state", [True, False])
def test_lister_pypi_run(mock_pypi_xmlrpc, swh_scheduler, configure_state):
    """Run the lister against the faked rpc service and check what it recorded.

    The test runs twice: once with a lister state pre-configured in the scheduler
    and once without any.
    """
    highest_serial, data, mock_serverproxy = mock_pypi_xmlrpc

    if configure_state:
        configure_scheduler_state(swh_scheduler, data)

    # Group the timestamps of the changelog entries per package
    release_dates_by_package = defaultdict(list)
    for package, _version, release_date, _action, _serial in data:
        release_dates_by_package[package].append(release_date)

    assert len(release_dates_by_package) > 0

    # NOTE(review): fromtimestamp() without tz yields a naive local time which is
    # then labelled as UTC -- presumably this mirrors what the lister does; confirm.
    expected_last_updates = {}
    for package, release_dates in release_dates_by_package.items():
        most_recent = datetime.fromtimestamp(max(release_dates))
        expected_last_updates[pypi_url(package)] = most_recent.replace(
            tzinfo=timezone.utc
        )

    expected_pypi_urls = list(map(pypi_url, release_dates_by_package))

    lister = PyPILister(scheduler=swh_scheduler)

    stats = lister.run()

    assert mock_serverproxy.called
    assert stats.pages == 1
    assert stats.origins == len(release_dates_by_package)

    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(listed_origins) == stats.origins

    check_listed_origins(expected_pypi_urls, listed_origins)

    # This new visit updated the state to the new one
    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state.last_serial == highest_serial

    for origin in listed_origins:
        assert origin.last_update is not None
        assert origin.last_update == expected_last_updates[origin.url]
def test__if_rate_limited():
    """Placeholder for a test which is not written yet.

    Judging by its name it is presumably meant to cover an ``_if_rate_limited``
    helper of the lister module -- confirm before implementing.
    """
    # TODO: write the actual test. Until then, skip explicitly so the missing
    # coverage shows up in the test report instead of counting as a pass.
    pytest.skip("TODO: test not implemented yet")
mock_pypi_xmlrpc?