Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/packagist/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
import iso8601 | import iso8601 | ||||
import requests | import requests | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | |||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | ||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type of one listing page: a list of strings (get_pages yields the
# "packageNames" entry of the API response as a page).
PackagistPageType = List[str]
@dataclass | @dataclass | ||||
Show All 31 Lines | class PackagistLister(Lister[PackagistListerState, PackagistPageType]): | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url=self.PACKAGIST_PACKAGES_LIST_URL, | url=self.PACKAGIST_PACKAGES_LIST_URL, | ||||
instance="packagist", | instance="packagist", | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.session = requests.Session() | self.session.headers.update({"Accept": "application/json"}) | ||||
self.session.headers.update( | |||||
{"Accept": "application/json", "User-Agent": USER_AGENT} | |||||
) | |||||
self.listing_date = datetime.now().astimezone(tz=timezone.utc) | self.listing_date = datetime.now().astimezone(tz=timezone.utc) | ||||
def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
    """Build a :class:`PackagistListerState` from its serialized dict form.

    Args:
        d: mapping of state field names to values. A ``last_listing_date``
            entry that is not ``None`` is parsed with
            ``iso8601.parse_date`` before the state is built.

    Returns:
        The state constructed from the (converted) entries of ``d``.
    """
    # Work on a shallow copy so the caller's dict is left untouched: the
    # parsed date must not leak back into the serialized representation.
    state_kwargs = dict(d)
    last_listing_date = state_kwargs.get("last_listing_date")
    if last_listing_date is not None:
        state_kwargs["last_listing_date"] = iso8601.parse_date(last_listing_date)
    return PackagistListerState(**state_kwargs)
def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]:
    """Serialize the lister state into a plain dict.

    The result always has a ``last_listing_date`` key, holding the ISO
    formatted date when the state has one and ``None`` otherwise.
    """
    listing_date = state.last_listing_date
    serialized: Optional[str] = (
        None if listing_date is None else listing_date.isoformat()
    )
    return {"last_listing_date": serialized}
def api_request(self, url: str) -> Any:
    """Fetch ``url`` through ``self.http_request`` and decode the response.

    Returns the JSON-decoded body when the status code is 200, and an
    empty dict for any other status code that reaches this point.
    """
    response = self.http_request(url)
    if response.status_code != 200:
        # response is empty when status code is 304
        return {}
    return response.json()
def get_pages(self) -> Iterator[PackagistPageType]:
    """
    Yield one page: the ``packageNames`` entry returned by the Packagist
    packages list URL.
    """
    packages_index = self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)
    yield packages_index["packageNames"]
Show All 22 Lines | def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: | ||||
) | ) | ||||
if not metadata.get("packages", {}): | if not metadata.get("packages", {}): | ||||
# package metadata not updated since last listing | # package metadata not updated since last listing | ||||
continue | continue | ||||
if package_name not in metadata["packages"]: | if package_name not in metadata["packages"]: | ||||
# missing package metadata in response | # missing package metadata in response | ||||
continue | continue | ||||
versions_info = metadata["packages"][package_name].values() | versions_info = metadata["packages"][package_name].values() | ||||
except requests.exceptions.HTTPError: | except requests.HTTPError: | ||||
# error when getting package metadata (usually 404 when a | # error when getting package metadata (usually 404 when a | ||||
# package has been removed), skip it and process next package | # package has been removed), skip it and process next package | ||||
continue | continue | ||||
origin_url = None | origin_url = None | ||||
visit_type = None | visit_type = None | ||||
last_update = None | last_update = None | ||||
Show All 39 Lines |