diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index 25ed934..7c75967 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,101 +1,119 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional +import iso8601 import requests +from requests.exceptions import HTTPError from tenacity.before_sleep import before_sleep_log from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. PubDevListerPage = List[str] class PubDevLister(StatelessLister[PubDevListerPage]): """List pub.dev (Dart, Flutter) origins.""" LISTER_NAME = "pubdev" VISIT_TYPE = "pubdev" INSTANCE = "pubdev" BASE_URL = "https://pub.dev/" PACKAGE_NAMES_URL_PATTERN = "{base_url}api/package-names" PACKAGE_INFO_URL_PATTERN = "{base_url}api/packages/{pkgname}" ORIGIN_URL_PATTERN = "{base_url}packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package origins. The http api call get "{base_url}package-names" to retrieve a sorted list of all package names. There is only one page that list all origins url based on "{base_url}packages/{pkgname}" """ response = self.page_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={} ) yield response.json()["packages"] def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for pkgname in page: + package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( + base_url=self.url, pkgname=pkgname + ) + try: + response = self.page_request(url=package_info_url, params={}) + except HTTPError: + logger.warning( + "Failed to fetch metadata for package %s, skipping it from listing.", + pkgname, + ) + continue + package_metadata = response.json() + package_versions = package_metadata["versions"] + last_published = max( + package_version["published"] for package_version in package_versions + ) origin_url = self.ORIGIN_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin_url, - last_update=None, + last_update=iso8601.parse_date(last_published), ) diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker new file mode 100644 index 0000000..5d19592 --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker @@ -0,0 +1,44 @@ +{ + "name": "Autolinker", + "latest": { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "archive_sha256": "0a5209a2d5a292a26fc65d7edb430163f209a7c7c24ba4f301676f1afd79fa3f", + "published": "2014-12-24T22:34:02.534090Z" + }, + "versions": [ + { + "version": "0.1.0", + "pubspec": { + "version": "0.1.0", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.0.tar.gz", + "archive_sha256": "717b30e27311c775293d4795ce33d15cedb5e5d21fa140f2cb46b30f3e969041", + "published": "2014-12-24T21:16:03.118270Z" + }, + { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "archive_sha256": "0a5209a2d5a292a26fc65d7edb430163f209a7c7c24ba4f301676f1afd79fa3f", + "published": "2014-12-24T22:34:02.534090Z" + } + ] +} \ No newline at end of file diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon new file mode 100644 index 0000000..770d7ee --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon @@ -0,0 +1,51 @@ +{ + "name": "Babylon", + "latest": { + "version": "0.0.3", + "pubspec": { + "version": "0.0.3", + "name": "Babylon", + "dependencies": { + "js": ">=0.6.0", + "browser": ">=0.10.0+2" + }, + "author": "Cedric Krause ", + "description": "A starting point for Dart libraries or applications.", + "homepage": "https://www.cedware.com", + "environment": { + "sdk": ">=1.0.0 <2.0.0" + }, + "dev_dependencies": { + "test": ">=0.12.0 <0.13.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/Babylon/versions/0.0.3.tar.gz", + "archive_sha256": "a18166c8082d795f22c38270b7fed0c306d5cb59fe390ce3a34c300770c4a8b3", + "published": "2016-06-01T19:15:38.052Z" + }, + "versions": [ + { + "version": "0.0.3", + "pubspec": { + "version": "0.0.3", + "name": "Babylon", + "dependencies": { + "js": ">=0.6.0", + "browser": ">=0.10.0+2" + }, + "author": "Cedric Krause ", + "description": "A starting point for Dart libraries or applications.", + "homepage": "https://www.cedware.com", + "environment": { + "sdk": ">=1.0.0 <2.0.0" + }, + "dev_dependencies": { + "test": ">=0.12.0 <0.13.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/Babylon/versions/0.0.3.tar.gz", + "archive_sha256": "a18166c8082d795f22c38270b7fed0c306d5cb59fe390ce3a34c300770c4a8b3", + "published": "2016-06-01T19:15:38.052Z" + } + ] +} \ No newline at end of file diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py index 75ad197..1591efb 100644 --- a/swh/lister/pubdev/tests/test_lister.py +++ b/swh/lister/pubdev/tests/test_lister.py @@ -1,27 +1,41 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.pubdev.lister import PubDevLister expected_origins = { "https://pub.dev/packages/Autolinker", "https://pub.dev/packages/Babylon", } def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler): lister = PubDevLister(scheduler=swh_scheduler) res = lister.run() assert res.pages == 1 assert res.origins == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(expected_origins) for origin in scheduler_origins: assert origin.visit_type == "pubdev" assert origin.url in expected_origins + assert origin.last_update is not None + + +def test_pubdev_lister_skip_package( + datadir, requests_mock_datadir, swh_scheduler, requests_mock +): + + requests_mock.get("https://pub.dev/api/packages/Autolinker", status_code=404) + + lister = PubDevLister(scheduler=swh_scheduler) + res = lister.run() + + assert res.pages == 1 + assert res.origins == 1