Page MenuHomeSoftware Heritage

D8287.id30050.diff
No OneTemporary

D8287.id30050.diff

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
lister.opam=swh.lister.opam:register
lister.packagist=swh.lister.packagist:register
lister.phabricator=swh.lister.phabricator:register
+ lister.pubdev=swh.lister.pubdev:register
lister.pypi=swh.lister.pypi:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
diff --git a/swh/lister/pubdev/__init__.py b/swh/lister/pubdev/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/__init__.py
@@ -0,0 +1,71 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Pub.dev lister
+==============
+
+The Pubdev lister lists origins from `pub.dev`_, the `Dart`_ and `Flutter`_ package registry.
+
+The registry provides an `http api`_ from which the lister retrieves package names.
+
+As of August 2022, `pub.dev`_ lists 33535 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call the `https://pub.dev/api/package-names` endpoint.
+There is no other way for discovery (no archive index, no database dump, no dvcs repository).
+
+Page listing
+------------
+
+There is only one page; it lists every package name, from which an origin url is built
+as `https://pub.dev/api/packages/{pkgname}`.
+The origin url corresponds to the http api endpoint that returns complete information
+about the package versions (name, version, author, description, release date).
+
+Origins from page
+-----------------
+
+The lister yields all origin urls from that single page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within the swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/pubdev/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker-compose up -d
+
+Then connect to the lister::
+
+ docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+ swh lister run -l pubdev
+
+.. _pub.dev: https://pub.dev
+.. _Dart: https://dart.dev
+.. _Flutter: https://flutter.dev
+.. _http api: https://pub.dev/help/api
+"""
+
+
+def register():
+    """Describe the pub.dev lister plugin: its lister class and its task modules.
+
+    The lister class is imported inside the function, so it is only loaded when
+    ``register`` is called.
+    """
+    from .lister import PubDevLister
+
+    tasks_module = f"{__name__}.tasks"
+    return {"lister": PubDevLister, "task_modules": [tasks_module]}
diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/lister.py
@@ -0,0 +1,100 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Type alias for one page yielded by the lister's `get_pages` method: a list of package names.
+PubDevListerPage = List[str]
+
+
+class PubDevLister(StatelessLister[PubDevListerPage]):
+    """List pub.dev (Dart, Flutter) origins.
+
+    The lister fetches the package names from the ``package-names`` endpoint as a
+    single page and yields one origin per package name.
+    """
+
+    LISTER_NAME = "pubdev"
+    VISIT_TYPE = "pubdev"
+    INSTANCE = "pubdev"
+
+    BASE_URL = "https://pub.dev/api/"
+    PACKAGE_NAMES_URL_PATTERN = "{base_url}package-names"
+    PACKAGE_INFO_URL_PATTERN = "{base_url}packages/{pkgname}"
+
+    # Timeout, in seconds, applied to every HTTP request. Without one, `requests`
+    # waits indefinitely on a server that stops responding.
+    REQUEST_TIMEOUT = 60
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        """Set up the lister and the HTTP session shared by all its requests.
+
+        Args:
+            scheduler: scheduler instance passed to the parent lister
+            credentials: optional credentials passed to the parent lister
+        """
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.BASE_URL,
+        )
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "Accept": "application/json",
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """Send a GET request to ``url`` and return the response.
+
+        Any status code other than 200 is logged as a warning before the error
+        check is performed.
+
+        Args:
+            url: URL to fetch
+            params: query parameters sent with the request
+
+        Returns:
+            The HTTP response.
+
+        Raises:
+            requests.HTTPError: if the response carries an error status code
+            requests.Timeout: if connecting to the server or waiting for its data
+              exceeds ``REQUEST_TIMEOUT`` seconds
+        """
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        # Always bound the request: a stalled connection must fail, not hang.
+        response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def get_pages(self) -> Iterator[PubDevListerPage]:
+        """Yield the single page of package names published on pub.dev.
+
+        It uses the api provided by https://pub.dev/api/ to find Dart and Flutter
+        package origins.
+
+        A GET request on "{base_url}package-names" returns a JSON document whose
+        "packages" entry holds the package names. That list is the only page;
+        origin urls are later built from it as "{base_url}packages/{pkgname}".
+        """
+        response = self.page_request(
+            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+        )
+        yield response.json()["packages"]
+
+    def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
+        """Yield one ListedOrigin per package name found in ``page``.
+
+        Args:
+            page: list of package names, as yielded by ``get_pages``
+        """
+        # The lister id must be set before origins can be attached to it.
+        assert self.lister_obj.id is not None
+
+        for pkgname in page:
+            url = self.PACKAGE_INFO_URL_PATTERN.format(
+                base_url=self.url, pkgname=pkgname
+            )
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=url,
+                last_update=None,
+            )
diff --git a/swh/lister/pubdev/tasks.py b/swh/lister/pubdev/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.pubdev.lister import PubDevLister
+
+
+@shared_task(name=__name__ + ".PubDevListerTask")
+def list_pubdev(**lister_args):
+    """Celery task listing the pub.dev (Dart, Flutter) registry.
+
+    ``lister_args`` are forwarded to ``PubDevLister.from_configfile``; the task
+    returns the result of calling ``.dict()`` on the run statistics.
+    """
+    lister = PubDevLister.from_configfile(**lister_args)
+    stats = lister.run()
+    return stats.dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ """Trivial task that always returns "OK"."""
+ return "OK"
diff --git a/swh/lister/pubdev/tests/__init__.py b/swh/lister/pubdev/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
@@ -0,0 +1 @@
+{"packages":["Autolinker","pdf"],"nextUrl":null}
diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_lister.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.pubdev.lister import PubDevLister
+
+# Origins the lister is expected to record for the package names served by the test data.
+expected_origins = [
+    {"url": "https://pub.dev/api/packages/Autolinker"},
+    {"url": "https://pub.dev/api/packages/pdf"},
+]
+
+
+def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
+    """Run the lister and check the origins recorded in the scheduler."""
+    lister = PubDevLister(scheduler=swh_scheduler)
+    stats = lister.run()
+
+    assert stats.pages == 1
+    assert stats.origins == 1 + 1
+
+    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(listed) == len(expected_origins)
+
+    # Compare as sets: only the (visit type, url) pairs matter, not their order.
+    actual = {(origin.visit_type, origin.url) for origin in listed}
+    wanted = {("pubdev", entry["url"]) for entry in expected_origins}
+    assert actual == wanted
diff --git a/swh/lister/pubdev/tests/test_tasks.py b/swh/lister/pubdev/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task can be sent, completes successfully and answers "OK"."""
+    ping_result = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping")
+    assert ping_result
+    ping_result.wait()
+    assert ping_result.successful()
+    assert ping_result.result == "OK"
+
+
+def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The lister task builds the lister, runs it once and returns its stats as a dict."""
+    # Replace the lister class referenced by the tasks module with a mock whose
+    # `from_configfile` returns the mock itself, so `run` can be observed on it.
+    mocked_lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister")
+    mocked_lister.from_configfile.return_value = mocked_lister
+    expected_stats = ListerStats(pages=42, origins=42)
+    mocked_lister.run.return_value = expected_stats
+
+    task_result = swh_scheduler_celery_app.send_task(
+        "swh.lister.pubdev.tasks.PubDevListerTask"
+    )
+    assert task_result
+    task_result.wait()
+    assert task_result.successful()
+    assert task_result.result == expected_stats.dict()
+
+    mocked_lister.from_configfile.assert_called_once_with()
+    mocked_lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 10:51 AM (18 h, 7 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218140

Event Timeline