diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py --- a/swh/lister/pypi/__init__.py +++ b/swh/lister/pypi/__init__.py @@ -5,10 +5,9 @@ def register(): from .lister import PyPILister - from .models import PyPIModel return { - "models": [PyPIModel], + "models": [], "lister": PyPILister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -3,65 +3,92 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging import random -from typing import Any, Dict +import time +from typing import Any, Dict, Iterator, List -from requests import Response +import requests import xmltodict -from swh.lister.core.lister_transports import ListerOnePageApiTransport -from swh.lister.core.simple_lister import SimpleLister -from swh.scheduler import utils +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin -from .models import PyPIModel +from .. import USER_AGENT +from ..pattern import StatelessLister +logger = logging.getLogger(__name__) + +PyPIPageType = List[Dict[str, Any]] + + +class PyPILister(StatelessLister[PyPIPageType]): + """List origins from PyPI. + + """ -class PyPILister(ListerOnePageApiTransport, SimpleLister): - MODEL = PyPIModel LISTER_NAME = "pypi" - PAGE = "https://pypi.org/simple/" - instance = "pypi" # As of today only the main pypi.org is used + INSTANCE = "pypi" # As of today only the main pypi.org is used + + PACKAGE_LIST_URL = "https://pypi.org/simple/" + PACKAGE_URL = "https://pypi.org/project/{package_name}/" + + BACKOFF_FACTOR = 10 + MAX_RETRIES = 5 + + def __init__(self, scheduler: SchedulerInterface): + super().__init__( + scheduler=scheduler, + credentials=None, + url=self.PACKAGE_LIST_URL, + instance=self.INSTANCE, + ) + + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/html", "User-Agent": USER_AGENT} + ) + + def get_pages(self) -> Iterator[PyPIPageType]: - def __init__(self, override_config=None): - ListerOnePageApiTransport.__init__(self) - SimpleLister.__init__(self, override_config=override_config) + backoff = self.BACKOFF_FACTOR - def task_dict(self, origin_type: str, origin_url: str, **kwargs): - """(Override) Return task format dict + for request_count in range(self.MAX_RETRIES): + response = self.session.get(self.PACKAGE_LIST_URL) - This is overridden from the lister_base as more information is - needed for the ingestion task creation. + # handle rate-limiting + if response.status_code == 429: + logger.info("Rate limit was hit, sleeping %ss", backoff) + time.sleep(backoff) - """ - _type = "load-%s" % origin_type - _policy = kwargs.get("policy", "recurring") - return utils.create_task_dict(_type, _policy, url=origin_url) + backoff *= self.BACKOFF_FACTOR + else: + # not rate-limited + break + else: + logger.info("Max number of attempts hit (%s), giving up", self.MAX_RETRIES) + return - def list_packages(self, response: Response) -> list: - """(Override) List the actual pypi origins from the response. + response.raise_for_status() - """ result = xmltodict.parse(response.content) - _packages = [p["#text"] for p in result["html"]["body"]["a"]] - random.shuffle(_packages) - return _packages - - def origin_url(self, repo_name: str) -> str: - """Returns origin_url - - """ - return "https://pypi.org/project/%s/" % repo_name - - def get_model_from_repo(self, repo_name: str) -> Dict[str, Any]: - """(Override) Transform from repository representation to model - - """ - origin_url = self.origin_url(repo_name) - return { - "uid": origin_url, - "name": repo_name, - "full_name": repo_name, - "html_url": origin_url, - "origin_url": origin_url, - "origin_type": "pypi", - } + packages_name = [p["#text"] for p in result["html"]["body"]["a"]] + random.shuffle(packages_name) + + yield packages_name + + def get_origins_from_page( + self, packages_name: PyPIPageType + ) -> Iterator[ListedOrigin]: + """Convert a page of PyPI repositories into a list of ListedOrigins.""" + assert self.lister_obj.id is not None + + for package_name in packages_name: + package_url = self.PACKAGE_URL.format(package_name=package_name) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=package_url, + visit_type="pypi", + last_update=None, # available but costs +1 request per project + ) diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py --- a/swh/lister/pypi/tasks.py +++ b/swh/lister/pypi/tasks.py @@ -6,11 +6,17 @@ from .lister import PyPILister +# @shared_task(name=__name__ + ".PyPIListerTask") +# def list_pypi(**lister_args): +# "Full update of the PyPI (python) registry" +# return PyPILister(**lister_args).run() + @shared_task(name=__name__ + ".PyPIListerTask") -def list_pypi(**lister_args): +def list_pypi(): "Full update of the PyPI (python) registry" - return PyPILister(**lister_args).run() + lister = PyPILister.from_configfile() + return lister.run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py --- a/swh/lister/pypi/tests/test_lister.py +++ b/swh/lister/pypi/tests/test_lister.py @@ -3,7 +3,21 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os +import pytest + +from swh.lister.pypi.lister import PyPILister + + +@pytest.fixture +def pypi_packages_page_content(datadir): + data_file_path = os.path.join(datadir, "https_pypi.org", "simple") + with open(data_file_path, "rb") as data_file: + return data_file.read() + + +""" def test_pypi_lister(lister_pypi, requests_mock_datadir): lister_pypi.run() @@ -25,3 +39,42 @@ assert row["policy"] == "recurring" assert row["priority"] is None +""" + + +def test_pypi_lister(swh_scheduler, requests_mock_datadir): + lister = PyPILister(scheduler=swh_scheduler) + stats = lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins + + assert stats.pages == 1 + assert stats.origins == 4 + assert len(scheduler_origins) == 4 + + +def test_pypi_lister_rate_limit_hit( + swh_scheduler, requests_mock, mocker, pypi_packages_page_content, +): + + mock_sleep = mocker.patch("swh.lister.pypi.lister.time.sleep") + + requests_mock.get( + PyPILister.PACKAGE_LIST_URL, + [ + {"content": None, "status_code": 429}, + {"content": None, "status_code": 429}, + {"content": pypi_packages_page_content, "status_code": 200}, + ], + ) + + lister = PyPILister(scheduler=swh_scheduler) + + stats = lister.run() + + mock_sleep.assert_has_calls( + [mocker.call(lister.BACKOFF_FACTOR), mocker.call(lister.BACKOFF_FACTOR ** 2),] + ) + assert stats.pages == 1 + assert stats.origins == 4 + assert len(swh_scheduler.get_listed_origins(lister.lister_obj.id).origins) == 4 diff --git a/swh/lister/pypi/tests/test_tasks.py b/swh/lister/pypi/tests/test_tasks.py --- a/swh/lister/pypi/tests/test_tasks.py +++ b/swh/lister/pypi/tests/test_tasks.py @@ -5,6 +5,8 @@ from unittest.mock import patch +from swh.lister.pattern import ListerStats + def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.ping") @@ -17,14 +19,13 @@ @patch("swh.lister.pypi.tasks.PyPILister") def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): # setup the mocked PypiLister - lister.return_value = lister - lister.run.return_value = None + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=1, origins=0) res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.PyPIListerTask") assert res res.wait() assert res.successful() - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with() lister.run.assert_called_once_with()