diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
         lister.opam=swh.lister.opam:register
         lister.packagist=swh.lister.packagist:register
         lister.phabricator=swh.lister.phabricator:register
+        lister.pubdev=swh.lister.pubdev:register
         lister.pypi=swh.lister.pypi:register
         lister.sourceforge=swh.lister.sourceforge:register
         lister.tuleap=swh.lister.tuleap:register
diff --git a/swh/lister/pubdev/__init__.py b/swh/lister/pubdev/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/__init__.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Pub.dev lister
+==============
+
+The Pubdev lister lists origins from `pub.dev`_, the `Dart`_ and `Flutter`_ package registry.
+
+The registry provides an `http api`_ from which the lister retrieves package names.
+
+As of August 2022 `pub.dev`_ lists 33535 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call the `https://pub.dev/api/package-names` endpoint.
+There is no other way for discovery (no archive index, no database dump, no dvcs repository).
+
+Page listing
+------------
+
+There is only one page that lists all origin urls, based
+on `https://pub.dev/api/packages/{pkgname}`.
+The origin url corresponds to the http api endpoint that returns complete information
+about the package versions (name, version, author, description, release date).
+
+Origins from page
+-----------------
+
+The lister yields all origin urls from one page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+   pytest -s -vv --log-cli-level=DEBUG swh/lister/pubdev/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+   docker-compose up -d
+
+Then connect to the lister::
+
+   docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+   swh lister run -l pubdev
+
+.. _pub.dev: https://pub.dev
+.. _Dart: https://dart.dev
+.. _Flutter: https://flutter.dev
+.. _http api: https://pub.dev/help/api
+"""
+
+
+def register():
+    """Return the lister class and its task modules for plugin registration."""
+    from .lister import PubDevLister
+
+    return {
+        "lister": PubDevLister,
+        "task_modules": [f"{__name__}.tasks"],
+    }
diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/lister.py
@@ -0,0 +1,100 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Alias for the page results returned by the lister's `get_pages` method.
+PubDevListerPage = List[str]
+
+
+class PubDevLister(StatelessLister[PubDevListerPage]):
+    """List pub.dev (Dart, Flutter) origins."""
+
+    LISTER_NAME = "pubdev"
+    VISIT_TYPE = "pubdev"
+    INSTANCE = "pubdev"
+
+    BASE_URL = "https://pub.dev/api/"
+    PACKAGE_NAMES_URL_PATTERN = "{base_url}package-names"
+    PACKAGE_INFO_URL_PATTERN = "{base_url}packages/{pkgname}"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.BASE_URL,
+        )
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "Accept": "application/json",
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """GET ``url`` with the lister session; HTTP error statuses raise."""
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def get_pages(self) -> Iterator[PubDevListerPage]:
+        """Yield the single page holding every pub.dev package name.
+
+        It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package
+        origins.
+
+        A GET request on "{base_url}package-names" retrieves a sorted list
+        of all package names.
+
+        There is only one page: the "nextUrl" field of the response is not followed.
+        """
+        response = self.page_request(
+            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+        )
+        yield response.json()["packages"]
+
+    def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
+        """Yield a ListedOrigin for each package name found in ``page``."""
+        assert self.lister_obj.id is not None
+
+        for pkgname in page:
+            url = self.PACKAGE_INFO_URL_PATTERN.format(
+                base_url=self.url, pkgname=pkgname
+            )
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=url,
+                last_update=None,
+            )
diff --git a/swh/lister/pubdev/tasks.py b/swh/lister/pubdev/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tasks.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.pubdev.lister import PubDevLister
+
+
+@shared_task(name=__name__ + ".PubDevListerTask")
+def list_pubdev(**lister_args):
+    """Lister task for pub.dev (Dart, Flutter) registry"""
+    return PubDevLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    """Trivial task returning "OK", used to check that task dispatch works."""
+    return "OK"
diff --git a/swh/lister/pubdev/tests/__init__.py b/swh/lister/pubdev/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
@@ -0,0 +1 @@
+{"packages":["Autolinker","pdf"],"nextUrl":null}
diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_lister.py
@@ -0,0 +1,41 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.pubdev.lister import PubDevLister
+
+expected_origins = [
+    {
+        "url": "https://pub.dev/api/packages/Autolinker",
+    },
+    {
+        "url": "https://pub.dev/api/packages/pdf",
+    },
+]
+
+
+def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
+    """Listing the mocked package-names response records one origin per package."""
+    lister = PubDevLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == len(expected_origins)
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins)
+
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+        )
+        for scheduled in scheduler_origins
+    } == {
+        (
+            "pubdev",
+            expected["url"],
+        )
+        for expected in expected_origins
+    }
diff --git a/swh/lister/pubdev/tests/test_tasks.py b/swh/lister/pubdev/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task can be dispatched and answers "OK"."""
+    res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The lister task runs a configured lister and returns its stats as a dict."""
+    # setup the mocked PubDevLister
+    lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister")
+    lister.from_configfile.return_value = lister
+    stats = ListerStats(pages=42, origins=42)
+    lister.run.return_value = stats
+
+    res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.PubDevListerTask")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == stats.dict()
+
+    lister.from_configfile.assert_called_once_with()
+    lister.run.assert_called_once_with()