Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163582
D8287.id30050.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D8287.id30050.diff
View Options
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
lister.opam=swh.lister.opam:register
lister.packagist=swh.lister.packagist:register
lister.phabricator=swh.lister.phabricator:register
+ lister.pubdev=swh.lister.pubdev:register
lister.pypi=swh.lister.pypi:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
diff --git a/swh/lister/pubdev/__init__.py b/swh/lister/pubdev/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/__init__.py
@@ -0,0 +1,71 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Pub.dev lister
+==============
+
+The Pubdev lister lists origins from `pub.dev`_, the `Dart`_ and `Flutter`_ package registry.
+
+The registry provides an `http api`_ from which the lister retrieves package names.
+
+As of August 2022, `pub.dev`_ lists 33535 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call the `https://pub.dev/api/package-names` endpoint.
+There is no other way for discovery (no archive index, no database dump, no dvcs repository).
+
+Page listing
+------------
+
+There is only one page, which lists all origin URLs, each based
+on `https://pub.dev/api/packages/{pkgname}`.
+The origin url corresponds to the http api endpoint that returns complete information
+about the package versions (name, version, author, description, release date).
+
+Origins from page
+-----------------
+
+The lister yields all origin URLs from that single page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/pubdev/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker-compose up -d
+
+Then connect to the lister::
+
+ docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+ swh lister run -l pubdev
+
+.. _pub.dev: https://pub.dev
+.. _Dart: https://dart.dev
+.. _Flutter: https://flutter.dev
+.. _http api: https://pub.dev/help/api
+"""
+
+
+def register():
+ from .lister import PubDevLister
+
+ return {
+ "lister": PubDevLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/lister.py
@@ -0,0 +1,100 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Alias for the type of a page as returned by the lister's `get_pages` method.
+PubDevListerPage = List[str]
+
+
+class PubDevLister(StatelessLister[PubDevListerPage]):
+ """List pub.dev (Dart, Flutter) origins."""
+
+ LISTER_NAME = "pubdev"
+ VISIT_TYPE = "pubdev"
+ INSTANCE = "pubdev"
+
+ BASE_URL = "https://pub.dev/api/"
+ PACKAGE_NAMES_URL_PATTERN = "{base_url}package-names"
+ PACKAGE_INFO_URL_PATTERN = "{base_url}packages/{pkgname}"
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=self.BASE_URL,
+ )
+ self.session = requests.Session()
+ self.session.headers.update(
+ {
+ "Accept": "application/json",
+ "User-Agent": USER_AGENT,
+ }
+ )
+
+ @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+
+ logger.info("Fetching URL %s with params %s", url, params)
+
+ response = self.session.get(url, params=params)
+ if response.status_code != 200:
+ logger.warning(
+ "Unexpected HTTP status code %s on %s: %s",
+ response.status_code,
+ response.url,
+ response.content,
+ )
+ response.raise_for_status()
+
+ return response
+
+ def get_pages(self) -> Iterator[PubDevListerPage]:
+ """Yield the single page of package names.
+
+ It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package
+ origins.
+
+ The lister sends an HTTP GET to "{base_url}package-names" to retrieve a sorted list
+ of all package names.
+
+ There is only one page; it lists the package names from which origin URLs are built following "{base_url}packages/{pkgname}".
+ """
+ response = self.page_request(
+ url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+ )
+ yield response.json()["packages"]
+
+ def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
+ """Iterate over the package names of a page and yield ListedOrigin instances."""
+ assert self.lister_obj.id is not None
+
+ for pkgname in page:
+ url = self.PACKAGE_INFO_URL_PATTERN.format(
+ base_url=self.url, pkgname=pkgname
+ )
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=url,
+ last_update=None,
+ )
diff --git a/swh/lister/pubdev/tasks.py b/swh/lister/pubdev/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.pubdev.lister import PubDevLister
+
+
+@shared_task(name=__name__ + ".PubDevListerTask")
+def list_pubdev(**lister_args):
+ """Lister task for pub.dev (Dart, Flutter) registry"""
+ return PubDevLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/pubdev/tests/__init__.py b/swh/lister/pubdev/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
@@ -0,0 +1 @@
+{"packages":["Autolinker","pdf"],"nextUrl":null}
diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_lister.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.pubdev.lister import PubDevLister
+
+expected_origins = [
+ {
+ "url": "https://pub.dev/api/packages/Autolinker",
+ },
+ {
+ "url": "https://pub.dev/api/packages/pdf",
+ },
+]
+
+
+def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
+ lister = PubDevLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 1 + 1
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ assert len(scheduler_origins) == len(expected_origins)
+
+ assert {
+ (
+ scheduled.visit_type,
+ scheduled.url,
+ )
+ for scheduled in scheduler_origins
+ } == {
+ (
+ "pubdev",
+ expected["url"],
+ )
+ for expected in expected_origins
+ }
diff --git a/swh/lister/pubdev/tests/test_tasks.py b/swh/lister/pubdev/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked PubDevLister
+ lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.PubDevListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 10:51 AM (18 h, 7 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218140
Attached To
D8287: Pub.dev lister for Dart and Flutter packages
Event Timeline
Log In to Comment