def register():
    """Describe the pub.dev lister plugin.

    Returns:
        A dict with two entries: ``"lister"``, the :class:`PubDevLister`
        class, and ``"task_modules"``, a one-element list naming this
        package's ``tasks`` module.
    """
    # Imported inside the function, as in the original layout, so that
    # importing this package does not by itself import the lister module.
    from .lister import PubDevLister

    task_modules = ["%s.tasks" % __name__]
    return {"lister": PubDevLister, "task_modules": task_modules}
PubDevListerPage = Dict[str, Any]


class PubDevLister(StatelessLister[PubDevListerPage]):
    """List pub.dev (Dart, Flutter) origins.

    One "page" is the JSON metadata document of a single package, as returned
    by ``{BASE_URL}packages/{pkgname}``; each page yields one origin.
    """

    LISTER_NAME = "pubdev"
    VISIT_TYPE = "pubdev"
    INSTANCE = "pubdev"

    BASE_URL = "https://pub.dev/api/"
    PACKAGE_NAMES_URL_PATTERN = "{base_url}package-names"
    PACKAGE_INFO_URL_PATTERN = "{base_url}packages/{pkgname}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister and the HTTP session used for every request.

        Args:
            scheduler: scheduler interface forwarded to the parent lister.
            credentials: optional credentials forwarded to the parent lister.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )
        # A single session is shared by all requests so that the JSON
        # "Accept" header and the user agent are always sent.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Perform a GET request on ``url`` and return the response.

        Args:
            url: URL to fetch.
            params: query parameters passed to the request.

        Returns:
            The response object.

        Raises:
            requests.HTTPError: raised by ``raise_for_status()`` on an error
                status code.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            # Log the body before raising so the failure can be diagnosed.
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[PubDevListerPage]:
        """Yield one page per package known to pub.dev.

        A first HTTP call on "{BASE_URL}package-names" retrieves the list of
        package names. For each name, a second call on
        "{BASE_URL}packages/{pkgname}" retrieves the package metadata
        (versions, archive urls, publication dates). That JSON document, with
        the requested url stored under the ``"url"`` key, is the page.

        A package whose metadata request fails with an HTTP error is logged
        and skipped, so that one broken package does not abort the listing.
        """
        response = self.page_request(
            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
        )
        # NOTE(review): the package-names payload also carries a "nextUrl"
        # key (null in the test fixture). It is not followed here; confirm
        # against the pub.dev API whether this endpoint can be paginated.
        for pkgname in response.json()["packages"]:
            package_info_url: str = self.PACKAGE_INFO_URL_PATTERN.format(
                base_url=self.url, pkgname=pkgname
            )
            try:
                res = self.page_request(url=package_info_url, params={})
            except requests.HTTPError:
                logger.warning(
                    "Failed to fetch metadata for package %s, skipping it from listing.",
                    pkgname,
                )
                continue
            page = res.json()
            page["url"] = package_info_url
            yield page

    def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
        """Yield the ListedOrigin instance described by a single page.

        The origin url is the `packages/{pkgname}` http api endpoint built
        from the page's ``"name"``. Two entries are added to
        'extra_loader_arguments':

        - `artifacts`: for each version, its version string and the url of
          the archive file to download;
        - `pubdev_metadata`: the page's ``"versions"`` list, unmodified, for
          use by the loader.

        ``last_update`` is the publication date of the ``"latest"`` version.
        """
        assert self.lister_obj.id is not None

        url: str = self.PACKAGE_INFO_URL_PATTERN.format(
            base_url=self.url, pkgname=page["name"]
        )
        latest: Dict[str, Any] = page["latest"]
        versions: List[Dict[str, Any]] = page["versions"]

        # The "Z" suffix is rewritten to an explicit UTC offset before the
        # timestamp is handed to fromisoformat().
        last_update = datetime.datetime.fromisoformat(
            latest["published"].replace("Z", "+00:00")
        )

        artifacts: List[Dict[str, Any]] = [
            {"version": version["version"], "url": version["archive_url"]}
            for version in versions
        ]

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=url,
            last_update=last_update,
            extra_loader_arguments={
                "artifacts": artifacts,
                "pubdev_metadata": versions,
            },
        )
a/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker new file mode 100644 --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker @@ -0,0 +1 @@ +{"name":"Autolinker","latest":{"version":"0.1.1","pubspec":{"version":"0.1.1","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz","published":"2014-12-24T22:34:02.534090Z"},"versions":[{"version":"0.1.0","pubspec":{"version":"0.1.0","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.0.tar.gz","published":"2014-12-24T21:16:03.118270Z"},{"version":"0.1.1","pubspec":{"version":"0.1.1","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz","published":"2014-12-24T22:34:02.534090Z"}]} \ No newline at end of file diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_pdf b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_pdf new file mode 100644 --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_pdf @@ -0,0 +1 @@ +{"name":"pdf","latest":{"version":"3.8.2","pubspec":{"name":"pdf","description":"A pdf producer for Dart. 
It can create pdf files for both web or flutter.","homepage":"https://github.com/DavBfr/dart_pdf/tree/master/pdf","repository":"https://github.com/DavBfr/dart_pdf","issue_tracker":"https://github.com/DavBfr/dart_pdf/issues","version":"3.8.2","environment":{"sdk":">=2.12.0 <3.0.0"},"dependencies":{"archive":"^3.1.0","barcode":">=2.2.0 <3.0.0","crypto":"^3.0.0","image":">=3.0.1 <4.0.0","meta":">=1.3.0 <2.0.0","path_parsing":">=0.2.0 <2.0.0","vector_math":"^2.1.0","xml":">=5.1.0 <7.0.0"},"dev_dependencies":{"flutter_lints":"^1.0.4","test":">=1.16.0 <2.0.0"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz","published":"2022-07-25T11:38:25.983876Z"},"versions":[{"version":"1.0.0","pubspec":{"version":"1.0.0","name":"pdf","dependencies":{"ttf_parser":"^1.0.0","vector_math":"^2.0.7","meta":"^1.1.5"},"author":"David PHAM-VAN ","description":"A pdf producer for Dart","homepage":"https://github.com/davbfr/dart_pdf","environment":{"sdk":">=1.8.0 <2.0.0"},"dev_dependencies":{"test":"any"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz","published":"2018-07-16T21:12:28.894137Z"},{"version":"3.8.2","pubspec":{"name":"pdf","description":"A pdf producer for Dart. 
It can create pdf files for both web or flutter.","homepage":"https://github.com/DavBfr/dart_pdf/tree/master/pdf","repository":"https://github.com/DavBfr/dart_pdf","issue_tracker":"https://github.com/DavBfr/dart_pdf/issues","version":"3.8.2","environment":{"sdk":">=2.12.0 <3.0.0"},"dependencies":{"archive":"^3.1.0","barcode":">=2.2.0 <3.0.0","crypto":"^3.0.0","image":">=3.0.1 <4.0.0","meta":">=1.3.0 <2.0.0","path_parsing":">=0.2.0 <2.0.0","vector_math":"^2.1.0","xml":">=5.1.0 <7.0.0"},"dev_dependencies":{"flutter_lints":"^1.0.4","test":">=1.16.0 <2.0.0"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz","published":"2022-07-25T11:38:25.983876Z"}]} diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/pubdev/tests/test_lister.py @@ -0,0 +1,153 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.lister.pubdev.lister import PubDevLister + +expected_origins = [ + { + "url": "https://pub.dev/api/packages/Autolinker", + "visit_type": "pubdev", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.0.tar.gz", + "version": "0.1.0", + }, + { + "url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "version": "0.1.1", + }, + ], + "pubdev_metadata": [ + { + "pubspec": { + "name": "Autolinker", + "author": "hackcave ", + "version": "0.1.0", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + }, + "version": "0.1.0", + "published": "2014-12-24T21:16:03.118270Z", + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.0.tar.gz", # noqa: B950 + }, + { + "pubspec": { + "name": "Autolinker", + "author": 
"hackcave ", + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + }, + "version": "0.1.1", + "published": "2014-12-24T22:34:02.534090Z", + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", # noqa: B950 + }, + ], + }, + }, + { + "url": "https://pub.dev/api/packages/pdf", + "visit_type": "pubdev", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz", + "version": "1.0.0", + }, + { + "url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "version": "3.8.2", + }, + ], + "pubdev_metadata": [ + { + "pubspec": { + "name": "pdf", + "author": "David PHAM-VAN ", + "version": "1.0.0", + "homepage": "https://github.com/davbfr/dart_pdf", + "description": "A pdf producer for Dart", + "environment": {"sdk": ">=1.8.0 <2.0.0"}, + "dependencies": { + "meta": "^1.1.5", + "ttf_parser": "^1.0.0", + "vector_math": "^2.0.7", + }, + "dev_dependencies": {"test": "any"}, + }, + "version": "1.0.0", + "published": "2018-07-16T21:12:28.894137Z", + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz", # noqa: B950 + }, + { + "pubspec": { + "name": "pdf", + "version": "3.8.2", + "homepage": "https://github.com/DavBfr/dart_pdf/tree/master/pdf", + "repository": "https://github.com/DavBfr/dart_pdf", + "description": "A pdf producer for Dart. 
def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister against the mocked pub.dev API and check what it recorded.

    The stats returned by the run and the origins recorded in the scheduler
    are compared with ``expected_origins``.
    """
    lister = PubDevLister(scheduler=swh_scheduler)
    res = lister.run()

    # One page and one origin are produced per package.
    assert res.pages == len(expected_origins)
    assert res.origins == len(expected_origins)

    # Both sides are sorted by url so the comparison does not depend on the
    # order in which the scheduler returns the origins.
    scheduler_origins_sorted = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda x: x.url,
    )
    expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))

    assert len(scheduler_origins_sorted) == len(expected_origins_sorted)

    assert [
        (
            scheduled.visit_type,
            scheduled.url,
            scheduled.extra_loader_arguments.get("artifacts"),
            scheduled.extra_loader_arguments.get("pubdev_metadata"),
        )
        for scheduled in scheduler_origins_sorted
    ] == [
        (
            "pubdev",
            expected.get("url"),
            expected.get("extra_loader_arguments").get("artifacts"),
            expected.get("extra_loader_arguments").get("pubdev_metadata"),
        )
        for expected in expected_origins_sorted
    ]
def test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task of the pubdev lister answers "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping")
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"


def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task builds the lister from its config file and runs it."""
    # Replace PubDevLister in the tasks module with a mock whose
    # from_configfile() returns the mock itself, and whose run() returns
    # fixed stats.
    expected_stats = ListerStats(pages=42, origins=42)
    mocked_lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.pubdev.tasks.PubDevListerTask"
    )
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    # The task was sent without arguments, so neither call received any.
    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()