Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pattern.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from __future__ import annotations | |||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar | from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar | ||||
from urllib.parse import urlparse | |||||
from swh.core.config import load_from_envvar | from swh.core.config import load_from_envvar | ||||
from swh.core.utils import grouper | from swh.core.utils import grouper | ||||
from swh.scheduler import get_scheduler, model | from swh.scheduler import get_scheduler, model | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
@dataclass
class ListerStats:
    """Pair of integer counters (``pages`` and ``origins``) that can be summed.

    Instances support ``a + b`` (returns a new instance) and ``a += b``
    (updates ``a`` in place); :meth:`dict` exposes the counters as a plain
    dictionary.
    """

    # Both counters start at zero so an empty instance is a valid accumulator.
    pages: int = 0
    origins: int = 0

    def __add__(self, other: "ListerStats") -> "ListerStats":
        """Return a new instance holding the field-wise sum of both operands."""
        return self.__class__(self.pages + other.pages, self.origins + other.origins)

    def __iadd__(self, other: "ListerStats") -> "ListerStats":
        """Add the counters of ``other`` into this instance and return ``self``."""
        self.pages += other.pages
        self.origins += other.origins
        # Augmented assignment rebinds the left-hand name to whatever this
        # method returns; without an explicit return, ``stats += x`` would
        # leave ``stats`` set to None.
        return self

    def dict(self) -> Dict[str, int]:
        """Return the counters as ``{"pages": ..., "origins": ...}``."""
        return {"pages": self.pages, "origins": self.origins}
StateType = TypeVar("StateType") | StateType = TypeVar("StateType") | ||||
Show All 28 Lines | class Lister(Generic[StateType, PageType]): | ||||
The state of the lister is serialized and deserialized from the dict stored in the | The state of the lister is serialized and deserialized from the dict stored in the | ||||
scheduler backend, using the :meth:`state_from_dict` and :meth:`state_to_dict` | scheduler backend, using the :meth:`state_from_dict` and :meth:`state_to_dict` | ||||
methods. | methods. | ||||
Args: | Args: | ||||
scheduler: the instance of the Scheduler being used to register the | scheduler: the instance of the Scheduler being used to register the | ||||
origins listed by this lister | origins listed by this lister | ||||
url: a URL representing this lister, e.g. the API's base URL | url: a URL representing this lister, e.g. the API's base URL | ||||
instance: the instance name used, in conjunction with :attr:`LISTER_NAME`, to | instance: the instance name, to uniquely identify this lister instance, | ||||
uniquely identify this lister instance. | if not provided the URL network location will be used | ||||
credentials: dictionary of credentials for all listers. The first level | credentials: dictionary of credentials for all listers. The first level | ||||
identifies the :attr:`LISTER_NAME`, the second level the lister | identifies the :attr:`LISTER_NAME`, the second level the lister | ||||
:attr:`instance`. The final level is a list of dicts containing the | :attr:`instance`. The final level is a list of dicts containing the | ||||
expected credentials for the given instance of that lister. | expected credentials for the given instance of that lister. | ||||
Generic types: | Generic types: | ||||
- *StateType*: concrete lister type; should usually be a :class:`dataclass` for | - *StateType*: concrete lister type; should usually be a :class:`dataclass` for | ||||
stricter typing | stricter typing | ||||
- *PageType*: type of scrape results; can usually be a :class:`requests.Response`, | - *PageType*: type of scrape results; can usually be a :class:`requests.Response`, | ||||
or a :class:`dict` | or a :class:`dict` | ||||
""" | """ | ||||
LISTER_NAME: str = "" | LISTER_NAME: str = "" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||
url: str, | url: str, | ||||
instance: str, | instance: Optional[str] = None, | ||||
credentials: CredentialsType = None, | credentials: CredentialsType = None, | ||||
): | ): | ||||
if not self.LISTER_NAME: | if not self.LISTER_NAME: | ||||
raise ValueError("Must set the LISTER_NAME attribute on Lister classes") | raise ValueError("Must set the LISTER_NAME attribute on Lister classes") | ||||
self.url = url | self.url = url | ||||
if instance is not None: | |||||
self.instance = instance | self.instance = instance | ||||
else: | |||||
self.instance = urlparse(url).netloc | |||||
self.scheduler = scheduler | self.scheduler = scheduler | ||||
if not credentials: | if not credentials: | ||||
credentials = {} | credentials = {} | ||||
self.credentials = list( | self.credentials = list( | ||||
credentials.get(self.LISTER_NAME, {}).get(self.instance, []) | credentials.get(self.LISTER_NAME, {}).get(self.instance, []) | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 174 Lines • Show Last 20 Lines |