diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -3,24 +3,37 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
 import logging
-from typing import Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+from xmlrpc.client import ServerProxy
 
 from bs4 import BeautifulSoup
+import iso8601
 import requests
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
-PackageListPage = List[str]
+PackageListPage = List[Dict]
 
 
-class PyPILister(StatelessLister[PackageListPage]):
+@dataclass
+class PyPIListerState:
+    """State of PyPI lister"""
+
+    last_visit: Optional[datetime] = None
+    """Last visit date and time since we visited the pypi instance (incremental pass)"""
+
+
+class PyPILister(Lister[PyPIListerState, PackageListPage]):
     """List origins from PyPI.
 
     """
@@ -30,11 +43,13 @@
 
     PACKAGE_LIST_URL = "https://pypi.org/simple/"
     PACKAGE_URL = "https://pypi.org/project/{package_name}/"
+    RPC_URL = "https://pypi.org/pypi"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         credentials: Optional[CredentialsType] = None,
+        incremental: bool = True,
     ):
         super().__init__(
             scheduler=scheduler,
@@ -42,36 +57,109 @@
             instance=self.INSTANCE,
             credentials=credentials,
         )
+        self.incremental = incremental
 
         self.session = requests.Session()
         self.session.headers.update(
             {"Accept": "application/html", "User-Agent": USER_AGENT}
         )
+        self.client: Optional[ServerProxy] = None
+        self.current_visit: Optional[datetime] = None
 
-    def get_pages(self) -> Iterator[PackageListPage]:
+    def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState:
+        last_visit = d.get("last_visit")
+        if last_visit is not None:
+            d["last_visit"] = iso8601.parse_date(last_visit)
+        return PyPIListerState(**d)
+
+    def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]:
+        d = asdict(state)
+        last_visit = d.get("last_visit")
+        if last_visit is not None:
+            d["last_visit"] = last_visit.isoformat()
+        return d
+
+    def _last_updates_since(
+        self, last_visit_timestamp: int
+    ) -> List[Tuple[str, str, int, str]]:
+        """Execute the listing of the last update since the last_visit_timestamp.
 
-        response = self.session.get(self.PACKAGE_LIST_URL)
+        The indirection method exists so the testing is actually doable. Technically,
+        the ServerProxy class does not expose the changelog method due to internal
+        implementation detail which makes the testing hard for no good reason.
 
-        response.raise_for_status()
+        Args:
+            last_visit_timestamp: The last timestamp since we visited
 
-        page = BeautifulSoup(response.content, features="html.parser")
+        Returns:
+            The list of tuple information (package-name, version, last-update,
+            description)
 
-        page_results = [p.text for p in page.find_all("a")]
+        """
+        if not self.client:
+            self.client = ServerProxy(self.RPC_URL)
+
+        return self.client.changelog(last_visit_timestamp)  # type: ignore
+
+    def finalize(self):
+        """Finalize incremental visit state with the current visit we did
+
+        """
+        if self.incremental and self.current_visit:
+            self.updated = True
+            self.state.last_visit = self.current_visit
+
+    def get_pages(self) -> Iterator[PackageListPage]:
 
-        yield page_results
+        if (
+            self.incremental
+            and self.state is not None
+            and self.state.last_visit is not None
+        ):  # incremental behavior will do its best to fetch latest change with
+            # last_update information
+
+            last_visit_timestamp: int = int(self.state.last_visit.timestamp())
+
+            updated_packages = defaultdict(list)
+            self.current_visit = datetime.now(tz=timezone.utc)
+            for package, _, last_update, _ in self._last_updates_since(
+                last_visit_timestamp
+            ):
+                updated_packages[package].append(last_update)
+
+            yield [
+                {
+                    "name": package,
+                    "last_update": datetime.fromtimestamp(
+                        max(releases), tz=timezone.utc
+                    ),
+                }
+                for package, releases in updated_packages.items()
+            ]
+
+        else:  # Full lister behavior
+            response = self.session.get(self.PACKAGE_LIST_URL)
+            response.raise_for_status()
+
+            page = BeautifulSoup(response.content, features="html.parser")
+
+            yield [
+                {"name": package.text, "last_update": None}
+                for package in page.find_all("a")
+            ]
 
     def get_origins_from_page(
-        self, packages_name: PackageListPage
+        self, packages: PackageListPage
     ) -> Iterator[ListedOrigin]:
         """Convert a page of PyPI repositories into a list of ListedOrigins."""
         assert self.lister_obj.id is not None
 
-        for package_name in packages_name:
-            package_url = self.PACKAGE_URL.format(package_name=package_name)
+        for package in packages:
+            package_url = self.PACKAGE_URL.format(package_name=package["name"])
 
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=package_url,
                 visit_type="pypi",
-                last_update=None,  # available on PyPI JSON API
+                last_update=package["last_update"],
             )
diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py
--- a/swh/lister/pypi/tasks.py
+++ b/swh/lister/pypi/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 the Software Heritage developers
+# Copyright (C) 2018-2021 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -7,9 +7,16 @@
 from .lister import PyPILister
 
 
-@shared_task(name=__name__ + ".PyPIListerTask")
+@shared_task(name=f"{__name__}.PyPIListerTask")
 def list_pypi():
     "Full listing of the PyPI registry"
+    lister = PyPILister.from_configfile(incremental=False)
+    return lister.run().dict()
+
+
+@shared_task(name=f"{__name__}.IncrementalPyPILister")
+def list_pypi_incremental():
+    "Incremental listing of the PyPI registry"
     lister = PyPILister.from_configfile()
     return lister.run().dict()
 
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -1,24 +1,40 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from collections import defaultdict
+from datetime import datetime, timezone
 from pathlib import Path
-from typing import List
+from typing import List, Tuple
 
 import pytest
 import requests
 
-from swh.lister.pypi.lister import PyPILister
+from swh.lister.pypi.lister import PackageListPage, PyPILister, PyPIListerState
 from swh.scheduler.model import ListedOrigin
 
 
+def pypi_url(package_name: str) -> str:
+    """Build pypi url out of a package name.
+
+    """
+    return PyPILister.PACKAGE_URL.format(package_name=package_name)
+
+
+def pypi_urls(packages: List[str]) -> List[str]:
+    """Build pypi urls out of package names
+
+    """
+    return [pypi_url(package_name) for package_name in packages]
+
+
 @pytest.fixture
-def pypi_packages_testdata(datadir):
-    content = Path(datadir, "https_pypi.org", "simple").read_bytes()
-    names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
-    urls = [PyPILister.PACKAGE_URL.format(package_name=n) for n in names]
-    return content, names, urls
+def pypi_packages_testdata(datadir) -> Tuple[bytes, PackageListPage, List[str]]:
+    raw_content = Path(datadir, "https_pypi.org", "simple").read_bytes()
+    package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
+    packages = [{"name": name, "last_update": None} for name in package_names]
+    return raw_content, packages, pypi_urls(package_names)
 
 
 def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
@@ -35,11 +51,11 @@
 
 
 def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
-    t_content, t_names, t_urls = pypi_packages_testdata
+    t_raw_content, t_packages, t_urls = pypi_packages_testdata
 
-    requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=t_content)
+    requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=t_raw_content)
 
-    lister = PyPILister(scheduler=swh_scheduler)
+    lister = PyPILister(scheduler=swh_scheduler, incremental=False)
 
     lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
     lister.session.get = mocker.spy(lister.session, "get")
@@ -49,7 +65,7 @@
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
-    lister.get_origins_from_page.assert_called_once_with(t_names)
+    lister.get_origins_from_page.assert_called_once_with(t_packages)
 
     assert stats.pages == 1
     assert stats.origins == 4
@@ -57,7 +73,7 @@
 
     check_listed_origins(t_urls, scheduler_origins)
 
-    assert lister.get_state_from_scheduler() is None
+    assert lister.get_state_from_scheduler() == PyPIListerState(last_visit=None)
 
 
 @pytest.mark.parametrize("http_code", [400, 429, 500])
@@ -67,7 +83,7 @@
         PyPILister.PACKAGE_LIST_URL, [{"content": None, "status_code": http_code},],
     )
 
-    lister = PyPILister(scheduler=swh_scheduler)
+    lister = PyPILister(scheduler=swh_scheduler, incremental=False)
 
     lister.session.get = mocker.spy(lister.session, "get")
 
@@ -94,7 +110,7 @@
 def test_lister_pypi_instantiation_with_credentials(
     credentials, expected_credentials, swh_scheduler
 ):
-    lister = PyPILister(swh_scheduler, credentials=credentials)
+    lister = PyPILister(swh_scheduler, credentials=credentials, incremental=False)
 
     # Credentials are allowed in constructor
     assert lister.credentials == expected_credentials
@@ -109,3 +125,123 @@
     lister = PyPILister.from_configfile()
     assert lister.scheduler is not None
     assert lister.credentials is not None
+
+
+@pytest.fixture
+def mock_rpc_call(mocker, swh_scheduler):
+    """This setups a lister so it can actually fake the call to the rpc service executed
+    during an incremental listing.
+
+    To retrieve or update the faked data, open a python3 toplevel and execute the
+    following:
+
+    .. code:: python
+
+        from datetime import timezone, datetime, timedelta
+        from xmlrpc.client import ServerProxy
+        from swh.scheduler.utils import utcnow
+        RPC_URL = "https://pypi.org/pypi"
+        cli = ServerProxy(RPC_URL)
+        date_yesterday = utcnow() - timedelta(days=1)
+        date_yesterday
+        datetime.datetime(2021, 7, 6, 12, 54, 49, 673346, tzinfo=datetime.timezone.utc)
+        recent_changes = cli.changelog(int(date_yesterday.timestamp()))
+        # recent_changes[0:20] should have sufficient data to update the tests
+
+    Returns:
+        the following Tuple[datetime, List[Tuple[str, str, int, str]], MagicMock] type.
+
+    """
+    date_yesterday = datetime(2021, 7, 6, 12, 54, 49, 673346, tzinfo=timezone.utc)
+    # Set the lister state to the last visit as date_yesterday
+    lister_obj = swh_scheduler.get_or_create_lister(
+        name=PyPILister.LISTER_NAME, instance_name=PyPILister.INSTANCE
+    )
+    lister_obj.current_state = {"last_visit": date_yesterday.isoformat()}
+    swh_scheduler.update_lister(lister_obj)
+
+    data = [
+        ["coordinate-geometry", "1.0.7", 1625576111, "new release"],
+        [
+            "coordinate-geometry",
+            "1.0.7",
+            1625576111,
+            "add source file coordinate_geometry-1.0.7.tar.gz",
+        ],
+        ["py-Ultroid", "45.5b0", 1625576137, "new release"],
+        [
+            "py-Ultroid",
+            "45.5b0",
+            1625576137,
+            "add py3 file py_Ultroid-45.5b0-py3-none-any.whl",
+        ],
+        [
+            "py-Ultroid",
+            "45.5b0",
+            1625576139,
+            "add source file py-Ultroid-45.5b0.tar.gz",
+        ],
+        ["bdrk", "0.9.0", 1625576160, "new release"],
+        ["bdrk", "0.9.0", 1625576160, "add py3 file bdrk-0.9.0-py3-none-any.whl"],
+        ["bdrk", "0.9.0", 1625576163, "add source file bdrk-0.9.0.tar.gz"],
+        ["dantro", "0.17.1", 1625576165, "new release"],
+        ["dantro", "0.17.1", 1625576165, "add py3 file dantro-0.17.1-py3-none-any.whl"],
+        ["dantro", "0.17.1", 1625576167, "add source file dantro-0.17.1.tar.gz"],
+        ["bamr", "0.1.1", 1625576222, "new release"],
+        ["bamr", "0.1.1", 1625576222, "add source file bamr-0.1.1.tar.gz"],
+        ["niozdaspy", None, 1625576223, "create"],
+        ["niozdaspy", None, 1625576223, "add Owner nioz"],
+        ["niozdaspy", "1.0", 1625576223, "new release"],
+        ["niozdaspy", "1.0", 1625576223, "add py3 file niozdaspy-1.0-py3-none-any.whl"],
+        ["niozdaspy", "1.0", 1625576225, "add source file niozdaspy-1.0.tar.gz"],
+        ["analysis-engine", "0.0.30", 1625576277, "new release"],
+        [
+            "analysis-engine",
+            "0.0.30",
+            1625576277,
+            "add py3 file analysis_engine-0.0.30-py3-none-any.whl",
+        ],
+    ]
+
+    mock = mocker.patch("swh.lister.pypi.lister.PyPILister._last_updates_since")
+    mock.return_value = data
+
+    return date_yesterday, data, mock
+
+
+def test_lister_pypi_incremental(mock_rpc_call, swh_scheduler):
+    date_yesterday, data, mock = mock_rpc_call
+
+    updated_packages = defaultdict(list)
+    for [package, _, release_date, _] in data:
+        updated_packages[package].append(release_date)
+
+    assert len(updated_packages) > 0
+
+    expected_last_updates = {
+        pypi_url(package): datetime.fromtimestamp(max(releases), tz=timezone.utc)
+        for package, releases in updated_packages.items()
+    }
+
+    expected_pypi_urls = pypi_urls(updated_packages)
+
+    lister = PyPILister(scheduler=swh_scheduler, incremental=True)
+
+    stats = lister.run()
+
+    assert mock.called
+    assert stats.pages == 1
+    assert stats.origins == len(updated_packages)
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(scheduler_origins) == stats.origins
+
+    check_listed_origins(expected_pypi_urls, scheduler_origins)
+
+    actual_scheduler_state = lister.get_state_from_scheduler()
+    # our visit is most recent now
+    assert actual_scheduler_state.last_visit > date_yesterday
+
+    for listed_origin in scheduler_origins:
+        assert listed_origin.last_update is not None
+        assert listed_origin.last_update == expected_last_updates[listed_origin.url]
diff --git a/swh/lister/pypi/tests/test_tasks.py b/swh/lister/pypi/tests/test_tasks.py
--- a/swh/lister/pypi/tests/test_tasks.py
+++ b/swh/lister/pypi/tests/test_tasks.py
@@ -1,10 +1,8 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from unittest.mock import patch
-
 from swh.lister.pattern import ListerStats
 
 
@@ -16,9 +14,10 @@
     assert res.result == "OK"
 
 
-@patch("swh.lister.pypi.tasks.PyPILister")
-def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
-    # setup the mocked PypiLister
+def test_pypi_full_lister(
+    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+    lister = mocker.patch("swh.lister.pypi.tasks.PyPILister")
     lister.from_configfile.return_value = lister
     lister.run.return_value = ListerStats(pages=1, origins=0)
 
@@ -27,5 +26,23 @@
     res.wait()
     assert res.successful()
 
+    lister.from_configfile.assert_called_once_with(incremental=False)
+    lister.run.assert_called_once_with()
+
+
+def test_pypi_incremental_lister(
+    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+    lister = mocker.patch("swh.lister.pypi.tasks.PyPILister")
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=1, origins=0)
+
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.pypi.tasks.IncrementalPyPILister"
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
     lister.from_configfile.assert_called_once_with()
     lister.run.assert_called_once_with()