Changeset View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
from urllib.parse import urljoin, urlparse | from urllib.parse import urljoin, urlparse | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
import requests | import requests | ||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||
from swh.lister.pattern import StatelessLister | from swh.lister.pattern import CredentialsType, StatelessLister | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Repositories = List[Dict[str, Any]] | Repositories = List[Dict[str, Any]] | ||||
Show All 9 Lines | class CGitLister(StatelessLister[Repositories]): | ||||
If several "Clone" urls are provided, prefer the http/https one, if | If several "Clone" urls are provided, prefer the http/https one, if | ||||
any, otherwise fallback to the first one. | any, otherwise fallback to the first one. | ||||
""" | """ | ||||
LISTER_NAME = "cgit" | LISTER_NAME = "cgit" | ||||
def __init__( | def __init__( | ||||
self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None | self, | ||||
scheduler: SchedulerInterface, | |||||
url: str, | |||||
instance: Optional[str] = None, | |||||
credentials: Optional[CredentialsType] = None, | |||||
): | ): | ||||
"""Lister class for CGit repositories. | """Lister class for CGit repositories. | ||||
Args: | Args: | ||||
url (str): main URL of the CGit instance, i.e. url of the index | url (str): main URL of the CGit instance, i.e. url of the index | ||||
of published git repositories on this instance. | of published git repositories on this instance. | ||||
instance (str): Name of cgit instance. Defaults to url's hostname | instance (str): Name of cgit instance. Defaults to url's hostname | ||||
if unset. | if unset. | ||||
""" | """ | ||||
if not instance: | if not instance: | ||||
instance = urlparse(url).hostname | instance = urlparse(url).hostname | ||||
assert instance is not None # Make mypy happy | assert instance is not None # Make mypy happy | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, credentials=None, url=url, instance=instance, | scheduler=scheduler, url=url, instance=instance, credentials=credentials, | ||||
) | ) | ||||
self.session = requests.Session() | self.session = requests.Session() | ||||
self.session.headers.update( | self.session.headers.update( | ||||
{"Accept": "application/html", "User-Agent": USER_AGENT} | {"Accept": "application/html", "User-Agent": USER_AGENT} | ||||
) | ) | ||||
def _get_and_parse(self, url: str) -> BeautifulSoup: | def _get_and_parse(self, url: str) -> BeautifulSoup: | ||||
▲ Show 20 Lines • Show All 117 Lines • Show Last 20 Lines |