Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pubdev/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
import iso8601 | |||||
import requests | import requests | ||||
from requests.exceptions import HTTPError | |||||
from tenacity.before_sleep import before_sleep_log | from tenacity.before_sleep import before_sleep_log | ||||
from swh.lister.utils import throttling_retry | from swh.lister.utils import throttling_retry | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines | def get_pages(self) -> Iterator[PubDevListerPage]: | ||||
) | ) | ||||
yield response.json()["packages"] | yield response.json()["packages"] | ||||
def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
    """Yield one ListedOrigin per package name found in ``page``.

    For every package name, the package metadata endpoint is queried to
    find the most recent publication date among its versions; that date is
    used as the ``last_update`` of the listed origin.

    Packages whose metadata request fails with an ``HTTPError`` are logged
    and left out of the listing. Packages whose metadata carries no usable
    ``published`` date are still listed, with ``last_update=None``.

    Args:
        page: iterable of package names, as yielded by ``get_pages``.

    Yields:
        A ListedOrigin for each package whose metadata could be fetched.
    """
    assert self.lister_obj.id is not None

    for pkgname in page:
        package_info_url = self.PACKAGE_INFO_URL_PATTERN.format(
            base_url=self.url, pkgname=pkgname
        )
        try:
            response = self.page_request(url=package_info_url, params={})
        except HTTPError:
            logger.warning(
                "Failed to fetch metadata for package %s, skipping it from listing.",
                pkgname,
            )
            continue

        # Tolerate a missing or empty "versions" entry instead of raising
        # KeyError / ValueError and aborting the whole listing.
        package_versions = response.json().get("versions") or []

        # Compare parsed datetimes rather than raw strings: lexicographic
        # order of timestamps is only chronological when every value uses
        # the same format, precision and UTC offset.
        last_update = None
        for package_version in package_versions:
            published = package_version.get("published")
            if not published:
                continue
            try:
                published_at = iso8601.parse_date(published)
            except iso8601.ParseError:
                # One malformed date must not prevent listing the package.
                continue
            if last_update is None or published_at > last_update:
                last_update = published_at

        origin_url = self.ORIGIN_URL_PATTERN.format(
            base_url=self.url, pkgname=pkgname
        )

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=origin_url,
            last_update=last_update,
        )