Page MenuHomeSoftware Heritage

D5977.id21538.diff
No OneTemporary

D5977.id21538.diff

diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -3,24 +3,37 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
import logging
-from typing import Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+from xmlrpc.client import ServerProxy
from bs4 import BeautifulSoup
+import iso8601
import requests
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
-PackageListPage = List[str]
+PackageListPage = List[Dict]
-class PyPILister(StatelessLister[PackageListPage]):
+@dataclass
+class PyPIListerState:
+    """Persistent state of the PyPI lister."""
+
+    # Date and time of the previous visit of the PyPI instance (read by the
+    # incremental pass); None as long as no visit has been recorded.
+    last_visit: Optional[datetime] = None
+
+
+class PyPILister(Lister[PyPIListerState, PackageListPage]):
"""List origins from PyPI.
"""
@@ -30,11 +43,13 @@
PACKAGE_LIST_URL = "https://pypi.org/simple/"
PACKAGE_URL = "https://pypi.org/project/{package_name}/"
+ RPC_URL = "https://pypi.org/pypi"
def __init__(
self,
scheduler: SchedulerInterface,
credentials: Optional[CredentialsType] = None,
+ incremental: bool = False,
):
super().__init__(
scheduler=scheduler,
@@ -42,36 +57,109 @@
instance=self.INSTANCE,
credentials=credentials,
)
+ self.incremental = incremental
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
+ self.client: Optional[ServerProxy] = None
+ self.current_visit: Optional[datetime] = None
- def get_pages(self) -> Iterator[PackageListPage]:
+    def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState:
+        """Build the lister state out of its serialized (dict) form.
+
+        Args:
+            d: Serialized state; its optional ``last_visit`` entry is a date string
+              parsed with ``iso8601.parse_date``
+
+        Returns:
+            The state, with ``last_visit`` parsed into a datetime when present
+
+        """
+        # Work on a shallow copy so the caller's dict is not modified in place
+        state = dict(d)
+        last_visit = state.get("last_visit")
+        if last_visit is not None:
+            state["last_visit"] = iso8601.parse_date(last_visit)
+        return PyPIListerState(**state)
+
+    def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]:
+        """Serialize the lister state as a dict.
+
+        When set, ``last_visit`` is rendered with its ``isoformat()`` string.
+
+        """
+        serialized = asdict(state)
+        visit_date = serialized.get("last_visit")
+        if visit_date is None:
+            return serialized
+        serialized["last_visit"] = visit_date.isoformat()
+        return serialized
+
+    def _last_updates_since(
+        self, last_visit_timestamp: int
+    ) -> List[Tuple[str, str, int, str]]:
+        """List the changes recorded by PyPI since last_visit_timestamp.
- response = self.session.get(self.PACKAGE_LIST_URL)
+        This indirection exists to make testing doable: the ServerProxy class does
+        not expose the changelog method as a regular attribute (internal
+        implementation detail), which makes it hard to mock directly.
- response.raise_for_status()
+        Args:
+            last_visit_timestamp: Timestamp (in seconds) of the last visit
- page = BeautifulSoup(response.content, features="html.parser")
+        Returns:
+            The list of tuple information (package-name, version, last-update,
+            description)
- page_results = [p.text for p in page.find_all("a")]
+        """
+        if not self.client:
+            self.client = ServerProxy(self.RPC_URL)
+
+        # ``changelog`` is resolved dynamically by ServerProxy (hence the ignore)
+        return self.client.changelog(last_visit_timestamp)  # type: ignore
+
+    def finalize(self):
+        """Record the date of the visit just done as the new incremental state.
+
+        Nothing is recorded for a non-incremental lister or when no visit date
+        has been set.
+
+        """
+        if not self.incremental or not self.current_visit:
+            return
+        self.updated = True
+        self.state.last_visit = self.current_visit
+
+    def get_pages(self) -> Iterator[PackageListPage]:
- yield page_results
+        """Yield one page of packages: those updated since the last visit when
+        listing incrementally with a known last visit, all packages otherwise.
+
+        Each package is a dict with the keys "name" and "last_update".
+
+        """
+        # Record the visit date before fetching anything, and for both behaviors:
+        # an incremental lister without prior state falls back to the full
+        # listing below, and finalize() needs that date to seed the state the
+        # next incremental pass starts from.
+        self.current_visit = datetime.now(tz=timezone.utc)
+
+        if (
+            self.incremental
+            and self.state is not None
+            and self.state.last_visit is not None
+        ):  # incremental behavior will do its best to fetch latest change with
+            # last_update information
+
+            last_visit_timestamp: int = int(self.state.last_visit.timestamp())
+
+            updated_packages = defaultdict(list)
+            for package, _, last_update, _ in self._last_updates_since(
+                last_visit_timestamp
+            ):
+                updated_packages[package].append(last_update)
+
+            yield [
+                {
+                    "name": package,
+                    # last_update values are epoch timestamps: build an aware
+                    # UTC datetime directly instead of tagging local time as UTC
+                    "last_update": datetime.fromtimestamp(
+                        max(releases), tz=timezone.utc
+                    ),
+                }
+                for package, releases in updated_packages.items()
+            ]
+
+        else:  # Full lister behavior
+            response = self.session.get(self.PACKAGE_LIST_URL)
+            response.raise_for_status()
+
+            page = BeautifulSoup(response.content, features="html.parser")
+
+            yield [
+                {"name": package.text, "last_update": None}
+                for package in page.find_all("a")
+            ]
def get_origins_from_page(
- self, packages_name: PackageListPage
+ self, packages: PackageListPage
) -> Iterator[ListedOrigin]:
"""Convert a page of PyPI repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
- for package_name in packages_name:
- package_url = self.PACKAGE_URL.format(package_name=package_name)
+ for package in packages:
+ package_url = self.PACKAGE_URL.format(package_name=package["name"])
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=package_url,
visit_type="pypi",
- last_update=None, # available on PyPI JSON API
+ last_update=package["last_update"],
)
diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py
--- a/swh/lister/pypi/tasks.py
+++ b/swh/lister/pypi/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 the Software Heritage developers
+# Copyright (C) 2018-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,13 +7,20 @@
from .lister import PyPILister
-@shared_task(name=__name__ + ".PyPIListerTask")
+@shared_task(name=f"{__name__}.PyPIListerTask")
def list_pypi():
"Full listing of the PyPI registry"
lister = PyPILister.from_configfile()
return lister.run().dict()
+@shared_task(name=f"{__name__}.IncrementalPyPILister")
+def list_pypi_incremental():
+    """Incrementally list the PyPI registry and return the run result as a dict."""
+    stats = PyPILister.from_configfile(incremental=True).run()
+    return stats.dict()
+
+
@shared_task(name=__name__ + ".ping")
def _ping():
return "OK"
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -1,24 +1,40 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from datetime import datetime, timezone
from pathlib import Path
-from typing import List
+from typing import List, Tuple
import pytest
import requests
-from swh.lister.pypi.lister import PyPILister
+from swh.lister.pypi.lister import PackageListPage, PyPILister, PyPIListerState
from swh.scheduler.model import ListedOrigin
+def pypi_url(package_name: str) -> str:
+    """Return the PyPI project url of the package named package_name.
+
+    """
+    url_template = PyPILister.PACKAGE_URL
+    return url_template.format(package_name=package_name)
+
+
+def pypi_urls(packages: List[str]) -> List[str]:
+    """Return the PyPI project urls of the given package names, in order.
+
+    """
+    return list(map(pypi_url, packages))
+
+
@pytest.fixture
-def pypi_packages_testdata(datadir):
- content = Path(datadir, "https_pypi.org", "simple").read_bytes()
- names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
- urls = [PyPILister.PACKAGE_URL.format(package_name=n) for n in names]
- return content, names, urls
+def pypi_packages_testdata(datadir) -> Tuple[bytes, PackageListPage, List[str]]:
+ raw_content = Path(datadir, "https_pypi.org", "simple").read_bytes()
+ package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
+ packages = [{"name": name, "last_update": None} for name in package_names]
+ return raw_content, packages, pypi_urls(package_names)
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
@@ -35,9 +51,9 @@
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
- t_content, t_names, t_urls = pypi_packages_testdata
+ t_raw_content, t_packages, t_urls = pypi_packages_testdata
- requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=t_content)
+ requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=t_raw_content)
lister = PyPILister(scheduler=swh_scheduler)
@@ -49,7 +65,7 @@
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
- lister.get_origins_from_page.assert_called_once_with(t_names)
+ lister.get_origins_from_page.assert_called_once_with(t_packages)
assert stats.pages == 1
assert stats.origins == 4
@@ -57,7 +73,7 @@
check_listed_origins(t_urls, scheduler_origins)
- assert lister.get_state_from_scheduler() is None
+ assert lister.get_state_from_scheduler() == PyPIListerState(last_visit=None)
@pytest.mark.parametrize("http_code", [400, 429, 500])
@@ -109,3 +125,125 @@
lister = PyPILister.from_configfile()
assert lister.scheduler is not None
assert lister.credentials is not None
+
+
+@pytest.fixture
+def mock_rpc_call(mocker, swh_scheduler):
+ """This sets up a lister so it can actually fake the call to the rpc service executed
+ during an incremental listing.
+
+ To retrieve or update the faked data, open a python3 toplevel and execute the
+ following:
+
+ .. code:: python
+
+ from datetime import timezone, datetime, timedelta
+ from xmlrpc.client import ServerProxy
+ from swh.scheduler.utils import utcnow
+ RPC_URL = "https://pypi.org/pypi"
+ cli = ServerProxy(RPC_URL)
+ date_yesterday = utcnow() - timedelta(days=1)
+ date_yesterday
+ datetime.datetime(2021, 7, 6, 12, 54, 49, 673346, tzinfo=datetime.timezone.utc)
+ recent_changes = cli.changelog(int(date_yesterday.timestamp()))
+ # recent_changes[0:20] should have sufficient data to update the tests
+
+ Returns:
+ the following Tuple[datetime, List[Tuple[str, str, int, str]], MagicMock] type.
+
+ """
+ date_yesterday = datetime(2021, 7, 6, 12, 54, 49, 673346, tzinfo=timezone.utc)
+ # Set the lister state to the last visit as date_yesterday
+ lister_obj = swh_scheduler.get_or_create_lister(
+ name=PyPILister.LISTER_NAME, instance_name=PyPILister.INSTANCE
+ )
+ lister_obj.current_state = {"last_visit": date_yesterday.isoformat()}
+ swh_scheduler.update_lister(lister_obj)
+
+ data = [
+ ["coordinate-geometry", "1.0.7", 1625576111, "new release"],
+ [
+ "coordinate-geometry",
+ "1.0.7",
+ 1625576111,
+ "add source file coordinate_geometry-1.0.7.tar.gz",
+ ],
+ ["py-Ultroid", "45.5b0", 1625576137, "new release"],
+ [
+ "py-Ultroid",
+ "45.5b0",
+ 1625576137,
+ "add py3 file py_Ultroid-45.5b0-py3-none-any.whl",
+ ],
+ [
+ "py-Ultroid",
+ "45.5b0",
+ 1625576139,
+ "add source file py-Ultroid-45.5b0.tar.gz",
+ ],
+ ["bdrk", "0.9.0", 1625576160, "new release"],
+ ["bdrk", "0.9.0", 1625576160, "add py3 file bdrk-0.9.0-py3-none-any.whl"],
+ ["bdrk", "0.9.0", 1625576163, "add source file bdrk-0.9.0.tar.gz"],
+ ["dantro", "0.17.1", 1625576165, "new release"],
+ ["dantro", "0.17.1", 1625576165, "add py3 file dantro-0.17.1-py3-none-any.whl"],
+ ["dantro", "0.17.1", 1625576167, "add source file dantro-0.17.1.tar.gz"],
+ ["bamr", "0.1.1", 1625576222, "new release"],
+ ["bamr", "0.1.1", 1625576222, "add source file bamr-0.1.1.tar.gz"],
+ ["niozdaspy", None, 1625576223, "create"],
+ ["niozdaspy", None, 1625576223, "add Owner nioz"],
+ ["niozdaspy", "1.0", 1625576223, "new release"],
+ ["niozdaspy", "1.0", 1625576223, "add py3 file niozdaspy-1.0-py3-none-any.whl"],
+ ["niozdaspy", "1.0", 1625576225, "add source file niozdaspy-1.0.tar.gz"],
+ ["analysis-engine", "0.0.30", 1625576277, "new release"],
+ [
+ "analysis-engine",
+ "0.0.30",
+ 1625576277,
+ "add py3 file analysis_engine-0.0.30-py3-none-any.whl",
+ ],
+ ]
+
+ mock = mocker.patch("swh.lister.pypi.lister.PyPILister._last_updates_since")
+ mock.return_value = data
+
+ return date_yesterday, data, mock
+
+
+def test_lister_pypi_incremental(mock_rpc_call, swh_scheduler):
+    """An incremental listing lists only the recently updated packages, with their
+    last update date, and records the new last visit in the scheduler.
+
+    """
+    date_yesterday, data, mock = mock_rpc_call
+
+    updated_packages = defaultdict(list)
+    for [package, _, release_date, _] in data:
+        updated_packages[package].append(release_date)
+
+    assert len(updated_packages) > 0
+
+    # The faked changelog dates are epoch timestamps: convert them to aware UTC
+    # datetimes directly so the expectation does not depend on the local timezone
+    expected_last_updates = {
+        pypi_url(package): datetime.fromtimestamp(max(releases), tz=timezone.utc)
+        for package, releases in updated_packages.items()
+    }
+
+    expected_pypi_urls = pypi_urls(list(updated_packages))
+
+    lister = PyPILister(scheduler=swh_scheduler, incremental=True)
+
+    stats = lister.run()
+
+    assert mock.called
+    assert stats.pages == 1
+    assert stats.origins == len(updated_packages)
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(scheduler_origins) == stats.origins
+
+    check_listed_origins(expected_pypi_urls, scheduler_origins)
+
+    actual_scheduler_state = lister.get_state_from_scheduler()
+    # our visit is most recent now
+    assert actual_scheduler_state.last_visit > date_yesterday
+
+    for listed_origin in scheduler_origins:
+        assert listed_origin.last_update is not None
+        assert listed_origin.last_update == expected_last_updates[listed_origin.url]
diff --git a/swh/lister/pypi/tests/test_tasks.py b/swh/lister/pypi/tests/test_tasks.py
--- a/swh/lister/pypi/tests/test_tasks.py
+++ b/swh/lister/pypi/tests/test_tasks.py
@@ -1,10 +1,8 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from unittest.mock import patch
-
from swh.lister.pattern import ListerStats
@@ -16,9 +14,10 @@
assert res.result == "OK"
-@patch("swh.lister.pypi.tasks.PyPILister")
-def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
- # setup the mocked PypiLister
+def test_pypi_full_lister(
+ swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+ lister = mocker.patch("swh.lister.pypi.tasks.PyPILister")
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=1, origins=0)
@@ -29,3 +28,21 @@
lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()
+
+
+def test_pypi_incremental_lister(
+    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+    """The incremental task builds the lister with incremental=True and runs it."""
+    mocked_lister = mocker.patch("swh.lister.pypi.tasks.PyPILister")
+    mocked_lister.from_configfile.return_value = mocked_lister
+    mocked_lister.run.return_value = ListerStats(pages=1, origins=0)
+
+    task_name = "swh.lister.pypi.tasks.IncrementalPyPILister"
+    result = swh_scheduler_celery_app.send_task(task_name)
+    assert result
+    result.wait()
+    assert result.successful()
+
+    mocked_lister.from_configfile.assert_called_once_with(incremental=True)
+    mocked_lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 11:42 AM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234434

Event Timeline