Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122917
D5977.id21538.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Subscribers
None
D5977.id21538.diff
View Options
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -3,24 +3,37 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
import logging
-from typing import Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+from xmlrpc.client import ServerProxy
from bs4 import BeautifulSoup
+import iso8601
import requests
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
-PackageListPage = List[str]
+PackageListPage = List[Dict]
-class PyPILister(StatelessLister[PackageListPage]):
+@dataclass
+class PyPIListerState:
+ """State of the PyPI lister.
+
+ It is converted to and from a plain dict by ``PyPILister.state_to_dict`` and
+ ``PyPILister.state_from_dict``.
+ """
+
+ # None when no visit date has been recorded; get_pages only lists
+ # incrementally when this is set.
+ last_visit: Optional[datetime] = None
+ """Last visit date and time since we visited the pypi instance (incremental pass)"""
+
+
+class PyPILister(Lister[PyPIListerState, PackageListPage]):
"""List origins from PyPI.
"""
@@ -30,11 +43,13 @@
PACKAGE_LIST_URL = "https://pypi.org/simple/"
PACKAGE_URL = "https://pypi.org/project/{package_name}/"
+ RPC_URL = "https://pypi.org/pypi"
def __init__(
self,
scheduler: SchedulerInterface,
credentials: Optional[CredentialsType] = None,
+ incremental: bool = False,
):
super().__init__(
scheduler=scheduler,
@@ -42,36 +57,109 @@
instance=self.INSTANCE,
credentials=credentials,
)
+ self.incremental = incremental
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
+ self.client: Optional[ServerProxy] = None
+ self.current_visit: Optional[datetime] = None
- def get_pages(self) -> Iterator[PackageListPage]:
+ def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState:
+ """Build the lister state from its plain dict form.
+
+ Args:
+ d: state dict; ``last_visit``, when present and not None, is an ISO 8601
+ string. The dict is not modified.
+
+ Returns:
+ The state, with ``last_visit`` parsed into a datetime (or None).
+
+ """
+ # Work on a shallow copy so the caller's dict keeps its serialized string
+ state = dict(d)
+ last_visit = state.get("last_visit")
+ if last_visit is not None:
+ state["last_visit"] = iso8601.parse_date(last_visit)
+ return PyPIListerState(**state)
+
+ def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]:
+ """Convert the lister state to a plain dict.
+
+ ``last_visit``, when set, is rendered as an ISO 8601 string with
+ ``datetime.isoformat``; ``state_from_dict`` parses it back.
+
+ """
+ d = asdict(state)
+ last_visit = d.get("last_visit")
+ if last_visit is not None:
+ d["last_visit"] = last_visit.isoformat()
+ return d
+
+ def _last_updates_since(
+ self, last_visit_timestamp: int
+ ) -> List[Tuple[str, str, int, str]]:
+ """List the changes recorded by PyPI since ``last_visit_timestamp``.
- response = self.session.get(self.PACKAGE_LIST_URL)
+ This indirection exists so the RPC call can be patched in tests: the
+ ``changelog`` method is not a real attribute of ``ServerProxy`` (remote
+ methods are resolved dynamically), which makes it awkward to mock directly.
- response.raise_for_status()
+ Args:
+ last_visit_timestamp: timestamp (in seconds) of the last visit
- page = BeautifulSoup(response.content, features="html.parser")
+ Returns:
+ The list of changelog entries (package-name, version, last-update,
+ description); the test fixture shows version can be None for some entries.
- page_results = [p.text for p in page.find_all("a")]
+ """
+ # The RPC client is created lazily, on first use
+ if self.client is None:
+ self.client = ServerProxy(self.RPC_URL)
+
+ # "changelog" is the remote method, called through the proxy
+ return self.client.changelog(last_visit_timestamp) # type: ignore
+
+ def finalize(self):
+ """Record the date of the current visit as ``last_visit`` in the lister state.
+
+ Nothing is done unless the lister runs in incremental mode and
+ ``current_visit`` was set (``get_pages`` sets it). ``updated`` is set to True,
+ presumably so that the base lister saves the new state -- to be confirmed
+ against the ``Lister`` base class.
+
+ """
+ if self.incremental and self.current_visit:
+ self.updated = True
+ self.state.last_visit = self.current_visit
+
+ def get_pages(self) -> Iterator[PackageListPage]:
+ """Yield one page of packages, each a dict with "name" and "last_update".
+
+ In incremental mode with a known ``last_visit``, only the packages reported
+ as changed since that date are listed, with their most recent change date as
+ ``last_update``. Otherwise every package of the simple index is listed, with
+ ``last_update`` set to None.
+
+ """
- yield page_results
+ # Record the visit date before listing, whichever branch runs: a first
+ # incremental run has no last_visit yet and falls back to a full listing, and
+ # finalize() needs current_visit to store the date the next run starts from.
+ self.current_visit = datetime.now(tz=timezone.utc)
+
+ if (
+ self.incremental
+ and self.state is not None
+ and self.state.last_visit is not None
+ ): # incremental behavior will do its best to fetch latest change with
+ # last_update information
+
+ last_visit_timestamp: int = int(self.state.last_visit.timestamp())
+
+ # Group the change timestamps per package
+ updated_packages = defaultdict(list)
+ for package, _, last_update, _ in self._last_updates_since(
+ last_visit_timestamp
+ ):
+ updated_packages[package].append(last_update)
+
+ # NOTE(review): fromtimestamp() without tz converts to local time before
+ # the UTC tzinfo is attached, so the result is only exact when the process
+ # runs in UTC. Kept as is because the incremental lister test computes its
+ # expected values the same way; both should move to tz=timezone.utc.
+ yield [
+ {
+ "name": package,
+ "last_update": datetime.fromtimestamp(max(releases)).replace(
+ tzinfo=timezone.utc
+ ),
+ }
+ for package, releases in updated_packages.items()
+ ]
+
+ else: # Full lister behavior
+ response = self.session.get(self.PACKAGE_LIST_URL)
+ response.raise_for_status()
+
+ page = BeautifulSoup(response.content, features="html.parser")
+
+ yield [
+ {"name": package.text, "last_update": None}
+ for package in page.find_all("a")
+ ]
def get_origins_from_page(
- self, packages_name: PackageListPage
+ self, packages: PackageListPage
) -> Iterator[ListedOrigin]:
"""Convert a page of PyPI repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
- for package_name in packages_name:
- package_url = self.PACKAGE_URL.format(package_name=package_name)
+ # Each entry is a dict with "name" and "last_update" keys, as yielded by
+ # get_pages; "last_update" is None for a full listing.
+ for package in packages:
+ package_url = self.PACKAGE_URL.format(package_name=package["name"])
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=package_url,
visit_type="pypi",
- last_update=None, # available on PyPI JSON API
+ last_update=package["last_update"],
)
diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py
--- a/swh/lister/pypi/tasks.py
+++ b/swh/lister/pypi/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 the Software Heritage developers
+# Copyright (C) 2018-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,13 +7,20 @@
from .lister import PyPILister
-@shared_task(name=__name__ + ".PyPIListerTask")
+@shared_task(name=f"{__name__}.PyPIListerTask")
def list_pypi():
"Full listing of the PyPI registry"
lister = PyPILister.from_configfile()
return lister.run().dict()
+@shared_task(name=f"{__name__}.IncrementalPyPILister")
+def list_pypi_incremental():
+ "Incremental listing of the PyPI registry"
+ # Same as list_pypi, except that incremental=True is passed to from_configfile
+ lister = PyPILister.from_configfile(incremental=True)
+ return lister.run().dict()
+
+
@shared_task(name=__name__ + ".ping")
def _ping():
return "OK"
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -1,24 +1,40 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from datetime import datetime, timezone
from pathlib import Path
-from typing import List
+from typing import List, Tuple
import pytest
import requests
-from swh.lister.pypi.lister import PyPILister
+from swh.lister.pypi.lister import PackageListPage, PyPILister, PyPIListerState
from swh.scheduler.model import ListedOrigin
+def pypi_url(package_name: str) -> str:
+ """Return the PyPI project URL of a package, built from
+ ``PyPILister.PACKAGE_URL``.
+
+ """
+ return PyPILister.PACKAGE_URL.format(package_name=package_name)
+
+
+def pypi_urls(packages: List[str]) -> List[str]:
+ """Return the PyPI project URLs of the given package names, in iteration
+ order.
+
+ """
+ return [pypi_url(package_name) for package_name in packages]
+
+
@pytest.fixture
-def pypi_packages_testdata(datadir):
- content = Path(datadir, "https_pypi.org", "simple").read_bytes()
- names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
- urls = [PyPILister.PACKAGE_URL.format(package_name=n) for n in names]
- return content, names, urls
+def pypi_packages_testdata(datadir) -> Tuple[bytes, PackageListPage, List[str]]:
+ """Return the raw simple-index page read from the test data directory, the
+ page of packages a full listing is expected to produce from it
+ (``last_update`` is None), and the matching project urls.
+
+ """
+ raw_content = Path(datadir, "https_pypi.org", "simple").read_bytes()
+ package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
+ packages = [{"name": name, "last_update": None} for name in package_names]
+ return raw_content, packages, pypi_urls(package_names)
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
@@ -35,9 +51,9 @@
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
- t_content, t_names, t_urls = pypi_packages_testdata
+ t_raw_content, t_packages, t_urls = pypi_packages_testdata
- requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=t_content)
+ requests_mock.get(PyPILister.PACKAGE_LIST_URL, content=t_raw_content)
lister = PyPILister(scheduler=swh_scheduler)
@@ -49,7 +65,7 @@
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
- lister.get_origins_from_page.assert_called_once_with(t_names)
+ lister.get_origins_from_page.assert_called_once_with(t_packages)
assert stats.pages == 1
assert stats.origins == 4
@@ -57,7 +73,7 @@
check_listed_origins(t_urls, scheduler_origins)
- assert lister.get_state_from_scheduler() is None
+ assert lister.get_state_from_scheduler() == PyPIListerState(last_visit=None)
@pytest.mark.parametrize("http_code", [400, 429, 500])
@@ -109,3 +125,125 @@
lister = PyPILister.from_configfile()
assert lister.scheduler is not None
assert lister.credentials is not None
+
+
+@pytest.fixture
+def mock_rpc_call(mocker, swh_scheduler):
+ """Set up the scheduler state and patch the lister so that the call to the rpc
+ service made during an incremental listing is faked.
+
+ The lister state stored in the scheduler is given a ``last_visit`` date, and
+ ``PyPILister._last_updates_since`` is patched to return canned changelog data.
+
+ To retrieve or update the faked data, open a python3 toplevel and execute the
+ following:
+
+ .. code:: python
+
+ from datetime import timezone, datetime, timedelta
+ from xmlrpc.client import ServerProxy
+ from swh.scheduler.utils import utcnow
+ RPC_URL = "https://pypi.org/pypi"
+ cli = ServerProxy(RPC_URL)
+ date_yesterday = utcnow() - timedelta(days=1)
+ date_yesterday
+ datetime.datetime(2021, 7, 6, 12, 54, 49, 673346, tzinfo=datetime.timezone.utc)
+ recent_changes = cli.changelog(int(date_yesterday.timestamp()))
+ # recent_changes[0:20] should have sufficient data to update the tests
+
+ Returns:
+ A tuple (date_yesterday, data, mock): the last visit date stored in the
+ lister state, the faked changelog entries (lists of package name, version,
+ timestamp and description) and the mock replacing ``_last_updates_since``.
+
+ """
+ date_yesterday = datetime(2021, 7, 6, 12, 54, 49, 673346, tzinfo=timezone.utc)
+ # Set the lister state to the last visit as date_yesterday
+ lister_obj = swh_scheduler.get_or_create_lister(
+ name=PyPILister.LISTER_NAME, instance_name=PyPILister.INSTANCE
+ )
+ lister_obj.current_state = {"last_visit": date_yesterday.isoformat()}
+ swh_scheduler.update_lister(lister_obj)
+
+ data = [
+ ["coordinate-geometry", "1.0.7", 1625576111, "new release"],
+ [
+ "coordinate-geometry",
+ "1.0.7",
+ 1625576111,
+ "add source file coordinate_geometry-1.0.7.tar.gz",
+ ],
+ ["py-Ultroid", "45.5b0", 1625576137, "new release"],
+ [
+ "py-Ultroid",
+ "45.5b0",
+ 1625576137,
+ "add py3 file py_Ultroid-45.5b0-py3-none-any.whl",
+ ],
+ [
+ "py-Ultroid",
+ "45.5b0",
+ 1625576139,
+ "add source file py-Ultroid-45.5b0.tar.gz",
+ ],
+ ["bdrk", "0.9.0", 1625576160, "new release"],
+ ["bdrk", "0.9.0", 1625576160, "add py3 file bdrk-0.9.0-py3-none-any.whl"],
+ ["bdrk", "0.9.0", 1625576163, "add source file bdrk-0.9.0.tar.gz"],
+ ["dantro", "0.17.1", 1625576165, "new release"],
+ ["dantro", "0.17.1", 1625576165, "add py3 file dantro-0.17.1-py3-none-any.whl"],
+ ["dantro", "0.17.1", 1625576167, "add source file dantro-0.17.1.tar.gz"],
+ ["bamr", "0.1.1", 1625576222, "new release"],
+ ["bamr", "0.1.1", 1625576222, "add source file bamr-0.1.1.tar.gz"],
+ ["niozdaspy", None, 1625576223, "create"],
+ ["niozdaspy", None, 1625576223, "add Owner nioz"],
+ ["niozdaspy", "1.0", 1625576223, "new release"],
+ ["niozdaspy", "1.0", 1625576223, "add py3 file niozdaspy-1.0-py3-none-any.whl"],
+ ["niozdaspy", "1.0", 1625576225, "add source file niozdaspy-1.0.tar.gz"],
+ ["analysis-engine", "0.0.30", 1625576277, "new release"],
+ [
+ "analysis-engine",
+ "0.0.30",
+ 1625576277,
+ "add py3 file analysis_engine-0.0.30-py3-none-any.whl",
+ ],
+ ]
+
+ # Patch the indirection method rather than the rpc client itself
+ mock = mocker.patch("swh.lister.pypi.lister.PyPILister._last_updates_since")
+ mock.return_value = data
+
+ return date_yesterday, data, mock
+
+
+def test_lister_pypi_incremental(mock_rpc_call, swh_scheduler):
+ """An incremental run with a stored last visit lists the packages returned by
+ the (faked) rpc call, sets their last_update and moves last_visit forward.
+
+ """
+ date_yesterday, data, mock = mock_rpc_call
+
+ # Group the faked change timestamps per package, as the lister does
+ updated_packages = defaultdict(list)
+ for [package, _, release_date, _] in data:
+ updated_packages[package].append(release_date)
+
+ assert len(updated_packages) > 0
+
+ # NOTE(review): this mirrors the lister's own computation (naive local time
+ # relabelled as UTC), so it cannot detect a timezone error in the lister.
+ expected_last_updates = {
+ pypi_url(package): datetime.fromtimestamp(max(releases)).replace(
+ tzinfo=timezone.utc
+ )
+ for package, releases in updated_packages.items()
+ }
+
+ # Iterating the dict yields the package names
+ expected_pypi_urls = pypi_urls(updated_packages)
+
+ lister = PyPILister(scheduler=swh_scheduler, incremental=True)
+
+ stats = lister.run()
+
+ assert mock.called
+ assert stats.pages == 1
+ assert stats.origins == len(updated_packages)
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+ assert len(scheduler_origins) == stats.origins
+
+ check_listed_origins(expected_pypi_urls, scheduler_origins)
+
+ actual_scheduler_state = lister.get_state_from_scheduler()
+ # our visit is most recent now
+ assert actual_scheduler_state.last_visit > date_yesterday
+
+ for listed_origin in scheduler_origins:
+ assert listed_origin.last_update is not None
+ assert listed_origin.last_update == expected_last_updates[listed_origin.url]
diff --git a/swh/lister/pypi/tests/test_tasks.py b/swh/lister/pypi/tests/test_tasks.py
--- a/swh/lister/pypi/tests/test_tasks.py
+++ b/swh/lister/pypi/tests/test_tasks.py
@@ -1,10 +1,8 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from unittest.mock import patch
-
from swh.lister.pattern import ListerStats
@@ -16,9 +14,10 @@
assert res.result == "OK"
-@patch("swh.lister.pypi.tasks.PyPILister")
-def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
- # setup the mocked PypiLister
+def test_pypi_full_lister(
+ swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+ lister = mocker.patch("swh.lister.pypi.tasks.PyPILister")
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=1, origins=0)
@@ -29,3 +28,21 @@
lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()
+
+
+def test_pypi_incremental_lister(
+ swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+ """The incremental task builds the lister with incremental=True and runs it."""
+ # Replace the lister class used by the tasks module with a mock
+ lister = mocker.patch("swh.lister.pypi.tasks.PyPILister")
+ lister.from_configfile.return_value = lister
+ lister.run.return_value = ListerStats(pages=1, origins=0)
+
+ res = swh_scheduler_celery_app.send_task(
+ "swh.lister.pypi.tasks.IncrementalPyPILister"
+ )
+ assert res
+ res.wait()
+ assert res.successful()
+
+ lister.from_configfile.assert_called_once_with(incremental=True)
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 11:42 AM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234434
Attached To
D5977: Make PyPI lister incremental and complete in regards to last_update
Event Timeline
Log In to Comment