diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
         lister.opam=swh.lister.opam:register
         lister.packagist=swh.lister.packagist:register
         lister.phabricator=swh.lister.phabricator:register
+        lister.pubdev=swh.lister.pubdev:register
         lister.pypi=swh.lister.pypi:register
         lister.sourceforge=swh.lister.sourceforge:register
         lister.tuleap=swh.lister.tuleap:register
diff --git a/swh/lister/pubdev/__init__.py b/swh/lister/pubdev/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Pub.dev lister
+==============
+"""
+
+
+def register():
+    """Describe the pub.dev lister plugin.
+
+    Returns a dict with the lister class under "lister" and, under
+    "task_modules", a one-element list naming this package's tasks module.
+    """
+    from .lister import PubDevLister
+
+    task_modules = ["%s.tasks" % __name__]
+    return {"lister": PubDevLister, "task_modules": task_modules}
diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/lister.py
@@ -0,0 +1,106 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import datetime
+import logging
+from typing import Any, Dict, Iterator, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+PubDevListerPage = str
+
+
+class PubDevLister(StatelessLister[PubDevListerPage]):
+    """List pub.dev (Dart, Flutter) origins."""
+
+    LISTER_NAME = "pubdev"
+    VISIT_TYPE = "pubdev"
+    INSTANCE = "pubdev"
+
+    BASE_URL = "https://pub.dev/api/"
+    PACKAGE_NAMES_URL_PATTERN = "{base_url}package-names"
+    PACKAGE_INFO_URL_PATTERN = "{base_url}packages/{pkgname}"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.BASE_URL,
+        )
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "Accept": "application/json",
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """Fetch ``url`` through the shared session, raising on HTTP error statuses."""
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def get_pages(self) -> Iterator[PubDevListerPage]:
+        """Yield one page per package, each page being that package's origin url.
+
+        It uses the API provided by https://pub.dev/api/ to find Dart and Flutter
+        package origins.
+
+        A single HTTP GET on "{base_url}package-names" retrieves the list of all
+        package names.
+
+        Each page is an origin url built from "{base_url}packages/{pkgname}".
+        """
+        response = self.page_request(
+            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+        )
+
+        pkgnames = response.json()["packages"]
+
+        for pkgname in pkgnames:
+            origin_url: str = self.PACKAGE_INFO_URL_PATTERN.format(
+                base_url=self.url, pkgname=pkgname
+            )
+            yield origin_url
+
+    def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
+        """Yield the single ListedOrigin whose url is the given page."""
+        assert self.lister_obj.id is not None
+        # A page is only a url, so listing time stands in for the real update date.
+        last_update = datetime.datetime.now(tz=datetime.timezone.utc)
+
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            visit_type=self.VISIT_TYPE,
+            url=page,
+            last_update=last_update,
+        )
diff --git a/swh/lister/pubdev/tasks.py b/swh/lister/pubdev/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.pubdev.lister import PubDevLister
+
+
+@shared_task(name=__name__ + ".PubDevListerTask")
+def list_pubdev(**lister_args):
+    """Lister task for pub.dev (Dart, Flutter) registry"""
+    return PubDevLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    return "OK"
diff --git a/swh/lister/pubdev/tests/__init__.py b/swh/lister/pubdev/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names
@@ -0,0 +1 @@
+{"packages":["Autolinker","pdf"],"nextUrl":null}
diff --git a/swh/lister/pubdev/tests/test_lister.py
b/swh/lister/pubdev/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_lister.py
@@ -0,0 +1,45 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.pubdev.lister import PubDevLister
+
+expected_origins = [
+    {
+        "url": "https://pub.dev/api/packages/Autolinker",
+    },
+    {
+        "url": "https://pub.dev/api/packages/pdf",
+    },
+]
+
+
+def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
+    """Listing the mocked package-names endpoint records one origin per package."""
+    lister = PubDevLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1 + 1
+    assert res.origins == 1 + 1
+
+    scheduler_origins_sorted = sorted(
+        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
+        key=lambda x: x.url,
+    )
+    expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))
+
+    assert len(scheduler_origins_sorted) == len(expected_origins_sorted)
+
+    assert [
+        (
+            scheduled.visit_type,
+            scheduled.url,
+        )
+        for scheduled in scheduler_origins_sorted
+    ] == [
+        (
+            "pubdev",
+            expected.get("url"),
+        )
+        for expected in expected_origins_sorted
+    ]
diff --git a/swh/lister/pubdev/tests/test_tasks.py b/swh/lister/pubdev/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pubdev/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task is registered and returns "OK"."""
+    res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The lister task runs a config-built lister and returns its stats as a dict."""
+    # setup the mocked PubDevLister
+    lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister")
+    lister.from_configfile.return_value = lister
+    stats = ListerStats(pages=42, origins=42)
+    lister.run.return_value = stats
+
+    res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.PubDevListerTask")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == stats.dict()
+
+    lister.from_configfile.assert_called_once_with()
+    lister.run.assert_called_once_with()