diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py
--- a/swh/lister/pypi/__init__.py
+++ b/swh/lister/pypi/__init__.py
@@ -5,10 +5,9 @@
 
 def register():
     from .lister import PyPILister
-    from .models import PyPIModel
 
     return {
-        "models": [PyPIModel],
+        "models": [],
         "lister": PyPILister,
         "task_modules": ["%s.tasks" % __name__],
     }
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -3,65 +3,70 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import random
-from typing import Any, Dict
+import logging
+from typing import Iterator, List
 
-from requests import Response
+import requests
 import xmltodict
 
-from swh.lister.core.lister_transports import ListerOnePageApiTransport
-from swh.lister.core.simple_lister import SimpleLister
-from swh.scheduler import utils
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
 
-from .models import PyPIModel
+from .. import USER_AGENT
+from ..pattern import StatelessLister
 
+logger = logging.getLogger(__name__)
+
+PackageListPage = List[str]
+
+
+class PyPILister(StatelessLister[PackageListPage]):
+    """List origins from PyPI.
+
+    """
 
-class PyPILister(ListerOnePageApiTransport, SimpleLister):
-    MODEL = PyPIModel
     LISTER_NAME = "pypi"
-    PAGE = "https://pypi.org/simple/"
-    instance = "pypi"  # As of today only the main pypi.org is used
-
-    def __init__(self, override_config=None):
-        ListerOnePageApiTransport.__init__(self)
-        SimpleLister.__init__(self, override_config=override_config)
-
-    def task_dict(self, origin_type: str, origin_url: str, **kwargs):
-        """(Override) Return task format dict
-
-        This is overridden from the lister_base as more information is
-        needed for the ingestion task creation.
-
-        """
-        _type = "load-%s" % origin_type
-        _policy = kwargs.get("policy", "recurring")
-        return utils.create_task_dict(_type, _policy, url=origin_url)
-
-    def list_packages(self, response: Response) -> list:
-        """(Override) List the actual pypi origins from the response.
-
-        """
-        result = xmltodict.parse(response.content)
-        _packages = [p["#text"] for p in result["html"]["body"]["a"]]
-        random.shuffle(_packages)
-        return _packages
-
-    def origin_url(self, repo_name: str) -> str:
-        """Returns origin_url
-
-        """
-        return "https://pypi.org/project/%s/" % repo_name
-
-    def get_model_from_repo(self, repo_name: str) -> Dict[str, Any]:
-        """(Override) Transform from repository representation to model
-
-        """
-        origin_url = self.origin_url(repo_name)
-        return {
-            "uid": origin_url,
-            "name": repo_name,
-            "full_name": repo_name,
-            "html_url": origin_url,
-            "origin_url": origin_url,
-            "origin_type": "pypi",
-        }
+    INSTANCE = "pypi"  # As of today only the main pypi.org is used
+
+    PACKAGE_LIST_URL = "https://pypi.org/simple/"
+    PACKAGE_URL = "https://pypi.org/project/{package_name}/"
+
+    def __init__(self, scheduler: SchedulerInterface):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=None,
+            url=self.PACKAGE_LIST_URL,
+            instance=self.INSTANCE,
+        )
+
+        self.session = requests.Session()
+        self.session.headers.update(
+            {"Accept": "text/html", "User-Agent": USER_AGENT}
+        )
+
+    def get_pages(self) -> Iterator[PackageListPage]:
+        """Yield the one page holding every package name listed on PyPI."""
+        response = self.session.get(self.PACKAGE_LIST_URL)
+
+        response.raise_for_status()
+
+        page_xmldict = xmltodict.parse(response.content, force_list=("a",))
+        page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
+
+        yield page_results
+
+    def get_origins_from_page(
+        self, packages_name: PackageListPage
+    ) -> Iterator[ListedOrigin]:
+        """Convert a page of PyPI repositories into a list of ListedOrigins."""
+        assert self.lister_obj.id is not None
+
+        for package_name in packages_name:
+            package_url = self.PACKAGE_URL.format(package_name=package_name)
+
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=package_url,
+                visit_type="pypi",
+                last_update=None,  # available on PyPI JSON API
+            )
diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py
deleted file mode 100644
--- a/swh/lister/pypi/models.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (C) 2018 the Software Heritage developers
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from sqlalchemy import Column, String
-
-from ..core.models import ModelBase
-
-
-class PyPIModel(ModelBase):
-    """a PyPI repository representation
-
-    """
-
-    __tablename__ = "pypi_repo"
-
-    uid = Column(String, primary_key=True)
diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py
--- a/swh/lister/pypi/tasks.py
+++ b/swh/lister/pypi/tasks.py
@@ -8,9 +8,10 @@
 
 
 @shared_task(name=__name__ + ".PyPIListerTask")
-def list_pypi(**lister_args):
-    "Full update of the PyPI (python) registry"
-    return PyPILister(**lister_args).run()
+def list_pypi():
+    "Full listing of the PyPI registry"
+    lister = PyPILister.from_configfile()
+    return lister.run().dict()
 
 
 @shared_task(name=__name__ + ".ping")
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -3,25 +3,80 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from pathlib import Path
+from typing import List
 
-def test_pypi_lister(lister_pypi, requests_mock_datadir):
-    lister_pypi.run()
+import pytest
+import requests
 
-    r = lister_pypi.scheduler.search_tasks(task_type="load-pypi")
-    assert len(r) == 4
+from swh.lister.pypi.lister import PyPILister
+from swh.scheduler.model import ListedOrigin
 
-    for row in r:
-        assert row["type"] == "load-pypi"
-        # arguments check
-        args = row["arguments"]["args"]
-        assert len(args) == 0
 
-        # kwargs
-        kwargs = row["arguments"]["kwargs"]
-        assert len(kwargs) == 1
+@pytest.fixture
+def pypi_packages_testdata(datadir):
+    content = Path(datadir, "https_pypi.org", "simple").read_bytes()
+    names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
+    urls = [PyPILister.PACKAGE_URL.format(package_name=n) for n in names]
+    return content, names, urls
 
-        origin_url = kwargs["url"]
-        assert "https://pypi.org/project" in origin_url
 
-        assert row["policy"] == "recurring"
-        assert row["priority"] is None
+def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
+    """Asserts that the two collections have the same origin URLs"""
+
+    sorted_lister_urls = sorted(lister_urls)
+    sorted_scheduler_origins = sorted(scheduler_origins, key=lambda o: o.url)
+
+    assert len(sorted_lister_urls) == len(sorted_scheduler_origins)
+
+    for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins):
+        assert l_url == s_origin.url
+
+
+def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
+    """A successful listing records one origin per package of the index page."""
+    t_content, t_names, t_urls = pypi_packages_testdata
+
+    requests_mock.get(
+        PyPILister.PACKAGE_LIST_URL, [{"content": t_content, "status_code": 200},],
+    )
+
+    lister = PyPILister(scheduler=swh_scheduler)
+
+    lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
+    lister.session.get = mocker.spy(lister.session, "get")
+
+    stats = lister.run()
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins
+
+    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
+    lister.get_origins_from_page.assert_called_once_with(t_names)
+
+    assert stats.pages == 1
+    assert stats.origins == 4
+    assert len(scheduler_origins) == 4
+
+    check_listed_origins(t_urls, scheduler_origins)
+
+    assert lister.get_state_from_scheduler() is None
+
+
+@pytest.mark.parametrize("http_code", [400, 429, 500])
+def test_pypi_list_http_error(swh_scheduler, requests_mock, mocker, http_code):
+    """An HTTP error on the index page aborts the listing without any origin."""
+    requests_mock.get(
+        PyPILister.PACKAGE_LIST_URL, [{"content": None, "status_code": http_code},],
+    )
+
+    lister = PyPILister(scheduler=swh_scheduler)
+
+    lister.session.get = mocker.spy(lister.session, "get")
+
+    with pytest.raises(requests.HTTPError):
+        lister.run()
+
+    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins
+    assert len(scheduler_origins) == 0
diff --git a/swh/lister/pypi/tests/test_tasks.py b/swh/lister/pypi/tests/test_tasks.py
--- a/swh/lister/pypi/tests/test_tasks.py
+++ b/swh/lister/pypi/tests/test_tasks.py
@@ -5,6 +5,8 @@
 
 from unittest.mock import patch
 
+from swh.lister.pattern import ListerStats
+
 
 def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
     res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.ping")
@@ -17,14 +19,13 @@
 @patch("swh.lister.pypi.tasks.PyPILister")
 def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
     # setup the mocked PypiLister
-    lister.return_value = lister
-    lister.run.return_value = None
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=1, origins=0)
 
     res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.PyPIListerTask")
     assert res
     res.wait()
     assert res.successful()
 
-    lister.assert_called_once_with()
-    lister.db_last_index.assert_not_called()
+    lister.from_configfile.assert_called_once_with()
     lister.run.assert_called_once_with()