Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pubdev/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Iterator, List, Optional | ||||
import iso8601 | import iso8601 | ||||
import requests | |||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||
from tenacity.before_sleep import before_sleep_log | |||||
from swh.lister.utils import http_retry | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import __version__ | from .. import __version__ | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
# https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers | # https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers | ||||
USER_AGENT = ( | USER_AGENT = ( | ||||
Show All 25 Lines | def __init__( | ||||
credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
credentials=credentials, | credentials=credentials, | ||||
instance=self.INSTANCE, | instance=self.INSTANCE, | ||||
url=self.BASE_URL, | url=self.BASE_URL, | ||||
) | ) | ||||
self.session = requests.Session() | |||||
self.session.headers.update( | self.session.headers.update( | ||||
{ | { | ||||
"Accept": "application/json", | "Accept": "application/json", | ||||
"User-Agent": USER_AGENT, | "User-Agent": USER_AGENT, | ||||
} | } | ||||
) | ) | ||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
    """Fetch ``url`` with the lister's HTTP session and return the response.

    The call is wrapped by ``http_retry``; the retry policy itself lives in
    ``swh.lister.utils`` and is not visible here. ``before_sleep_log`` is
    configured to log at WARNING level before each retry sleep.

    Args:
        url: address to query.
        params: query-string parameters forwarded to ``session.get``.

    Returns:
        The response object obtained from ``self.session.get``.

    Raises:
        requests.exceptions.HTTPError: when ``raise_for_status`` rejects the
            response (4xx / 5xx status codes).
    """
    logger.debug("Fetching URL %s with params %s", url, params)

    reply = self.session.get(url, params=params)

    # Fast path: nothing to report on a plain 200.
    if reply.status_code == 200:
        return reply

    # Anything else is logged; only error statuses actually raise below, so
    # other non-200 codes are still handed back to the caller.
    logger.warning(
        "Unexpected HTTP status code %s on %s: %s",
        reply.status_code,
        reply.url,
        reply.content,
    )
    reply.raise_for_status()
    return reply
def get_pages(self) -> Iterator[PubDevListerPage]:
    """Yield the single page listing every package name.

    The pub.dev API (https://pub.dev/api/) is used to discover Dart and
    Flutter package origins. One GET on "{base_url}package-names" returns a
    sorted list of all package names, so exactly one page is produced.

    Origin urls are later derived from each name, following the
    "{base_url}packages/{pkgname}" pattern.
    """
    names_url = self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)
    listing = self.page_request(url=names_url, params={})
    # The JSON payload keeps the names under its "packages" key.
    yield listing.json()["packages"]
def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: | ||||
"""Iterate on all pages and yield ListedOrigin instances.""" | """Iterate on all pages and yield ListedOrigin instances.""" | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
for pkgname in page: | for pkgname in page: | ||||
package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( | package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( | ||||
base_url=self.url, pkgname=pkgname | base_url=self.url, pkgname=pkgname | ||||
) | ) | ||||
try: | try: | ||||
response = self.page_request(url=package_info_url, params={}) | response = self.http_request(url=package_info_url) | ||||
except HTTPError: | except HTTPError: | ||||
logger.warning( | logger.warning( | ||||
"Failed to fetch metadata for package %s, skipping it from listing.", | "Failed to fetch metadata for package %s, skipping it from listing.", | ||||
pkgname, | pkgname, | ||||
) | ) | ||||
continue | continue | ||||
package_metadata = response.json() | package_metadata = response.json() | ||||
package_versions = package_metadata["versions"] | package_versions = package_metadata["versions"] | ||||
Show All 12 Lines |