Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pypi/lister.py
# Copyright (C) 2018-2019 The Software Heritage developers | # Copyright (C) 2018-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
from typing import Iterator, List | from typing import Iterator, List, Optional | ||||
import requests | import requests | ||||
import xmltodict | import xmltodict | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
PackageListPage = List[str] | PackageListPage = List[str] | ||||
class PyPILister(StatelessLister[PackageListPage]): | class PyPILister(StatelessLister[PackageListPage]): | ||||
"""List origins from PyPI. | """List origins from PyPI. | ||||
""" | """ | ||||
LISTER_NAME = "pypi" | LISTER_NAME = "pypi" | ||||
INSTANCE = "pypi" # As of today only the main pypi.org is used | INSTANCE = "pypi" # As of today only the main pypi.org is used | ||||
PACKAGE_LIST_URL = "https://pypi.org/simple/" | PACKAGE_LIST_URL = "https://pypi.org/simple/" | ||||
PACKAGE_URL = "https://pypi.org/project/{package_name}/" | PACKAGE_URL = "https://pypi.org/project/{package_name}/" | ||||
def __init__(
    self,
    scheduler: SchedulerInterface,
    credentials: Optional[CredentialsType] = None,
):
    """Initialise the PyPI lister and the HTTP session it uses.

    Args:
        scheduler: scheduler interface forwarded to the base lister.
        credentials: optional credentials forwarded to the base lister.
            Defaults to ``None`` so callers that pass only ``scheduler``
            keep working.
    """
    # ``credentials`` is passed exactly once: repeating a keyword argument
    # in a call is a SyntaxError.
    super().__init__(
        scheduler=scheduler,
        url=self.PACKAGE_LIST_URL,
        instance=self.INSTANCE,
        credentials=credentials,
    )

    # Session whose default headers carry the Accept type and the
    # project-wide User-Agent.
    self.session = requests.Session()
    self.session.headers.update(
        {"Accept": "application/html", "User-Agent": USER_AGENT}
    )
def get_pages(self) -> Iterator[PackageListPage]: | def get_pages(self) -> Iterator[PackageListPage]: | ||||
Show All 25 Lines |