Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pattern.py
# Copyright (C) 2020-2021 The Software Heritage developers | # Copyright (C) 2020-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from __future__ import annotations | from __future__ import annotations | ||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
import logging | |||||
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar | from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar | ||||
from urllib.parse import urlparse | from urllib.parse import urlparse | ||||
import requests | |||||
from tenacity.before_sleep import before_sleep_log | |||||
from swh.core.config import load_from_envvar | from swh.core.config import load_from_envvar | ||||
from swh.core.utils import grouper | from swh.core.utils import grouper | ||||
from swh.scheduler import get_scheduler, model | from swh.scheduler import get_scheduler, model | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from . import USER_AGENT | |||||
from .utils import http_retry | |||||
logger = logging.getLogger(__name__) | |||||
@dataclass
class ListerStats:
    """Counters for the number of pages and origins seen by a lister.

    Two instances can be combined with ``+``; the result is a new instance
    whose counters are the field-wise sums of both operands.
    """

    # number of pages counted so far
    pages: int = 0
    # number of origins counted so far
    origins: int = 0

    def __add__(self, other: ListerStats) -> ListerStats:
        """Return a new instance holding the field-wise sum of both operands.

        Args:
            other: the stats to add to this instance.

        Returns:
            A new object of the same class as ``self`` with summed counters,
            or ``NotImplemented`` when ``other`` is not a ``ListerStats``, so
            that Python can try the reflected operation and otherwise raise
            the usual ``TypeError`` for unsupported operand types.
        """
        if not isinstance(other, ListerStats):
            # Follow the binary operator protocol instead of failing with an
            # AttributeError on the missing ``pages``/``origins`` attributes.
            return NotImplemented
        return self.__class__(self.pages + other.pages, self.origins + other.origins)
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines | ): | ||||
self.credentials = list( | self.credentials = list( | ||||
credentials.get(self.LISTER_NAME, {}).get(self.instance, []) | credentials.get(self.LISTER_NAME, {}).get(self.instance, []) | ||||
) | ) | ||||
# store the initial state of the lister | # store the initial state of the lister | ||||
self.state = self.get_state_from_scheduler() | self.state = self.get_state_from_scheduler() | ||||
self.updated = False | self.updated = False | ||||
self.session = requests.Session() | |||||
# Declaring a USER_AGENT is more sysadmin-friendly for the forges we list | |||||
self.session.headers.update({"User-Agent": USER_AGENT}) | |||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
    """Send an HTTP request through the lister's shared session.

    Args:
        url: the URL to query.
        method: the HTTP method handed to the session (``GET`` by default).
        **kwargs: extra keyword arguments forwarded unchanged to
            ``self.session.request``; the ``params`` entry, if any, is also
            reported in the debug log.

    Returns:
        The response object produced by the session.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` when the server
            answers with an error status.

    NOTE(review): the retry policy comes from the ``http_retry`` decorator,
    which is defined outside this block -- check it for the exact conditions
    under which a failed call is attempted again.
    """
    query_params = kwargs.get("params")
    logger.debug("Fetching URL %s with params %s", url, query_params)

    reply = self.session.request(method, url, **kwargs)

    # Only 200 and 304 are considered normal answers; anything else is
    # reported along with the response body to ease troubleshooting.
    status_is_expected = reply.status_code in (200, 304)
    if not status_is_expected:
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            reply.status_code,
            reply.url,
            reply.content,
        )

    # No-op for successful statuses, raises on HTTP error statuses.
    reply.raise_for_status()
    return reply
def run(self) -> ListerStats: | def run(self) -> ListerStats: | ||||
"""Run the lister. | """Run the lister. | ||||
Returns: | Returns: | ||||
A counter with the number of pages and origins seen for this run | A counter with the number of pages and origins seen for this run | ||||
of the lister. | of the lister. | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 161 Lines • Show Last 20 Lines |