Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2022 The Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
from urllib.parse import urljoin, urlparse | from urllib.parse import urljoin, urlparse | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
import requests | |||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||
from tenacity.before_sleep import before_sleep_log | |||||
from swh.lister import USER_AGENT | |||||
from swh.lister.pattern import CredentialsType, StatelessLister | from swh.lister.pattern import CredentialsType, StatelessLister | ||||
from swh.lister.utils import http_retry | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Repositories = List[Dict[str, Any]] | Repositories = List[Dict[str, Any]] | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | ): | ||||
""" | """ | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url=url, | url=url, | ||||
instance=instance, | instance=instance, | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.session = requests.Session() | self.session.headers.update({"Accept": "application/html"}) | ||||
self.session.headers.update( | |||||
{"Accept": "application/html", "User-Agent": USER_AGENT} | |||||
) | |||||
self.base_git_url = base_git_url | self.base_git_url = base_git_url | ||||
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) | |||||
def _get_and_parse(self, url: str) -> BeautifulSoup: | def _get_and_parse(self, url: str) -> BeautifulSoup: | ||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup""" | """Get the given url and parse the retrieved HTML using BeautifulSoup""" | ||||
response = self.session.get(url) | response = self.http_request(url) | ||||
response.raise_for_status() | |||||
return BeautifulSoup(response.text, features="html.parser") | return BeautifulSoup(response.text, features="html.parser") | ||||
def get_pages(self) -> Iterator[Repositories]: | def get_pages(self) -> Iterator[Repositories]: | ||||
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
The last_update date is retrieved on the list of repo page to avoid | The last_update date is retrieved on the list of repo page to avoid | ||||
to compute it on the repository details which only give a date per branch | to compute it on the repository details which only give a date per branch | ||||
""" | """ | ||||
next_page: Optional[str] = self.url | next_page: Optional[str] = self.url | ||||
▲ Show 20 Lines • Show All 140 Lines • Show Last 20 Lines |