Changeset View
Standalone View
swh/lister/gitea/lister.py
# Copyright (C) 2018-2020 The Software Heritage developers | # Copyright (C) 2018-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import re | import logging | ||||
from typing import Any, Dict, List, MutableMapping, Optional, Tuple | from typing import Any, Dict, Iterator, List, Optional | ||||
from urllib.parse import urljoin | |||||
from requests import Response | |||||
import iso8601 | |||||
import requests | |||||
from tenacity.before_sleep import before_sleep_log | |||||
from urllib3.util import parse_url | from urllib3.util import parse_url | ||||
from ..core.page_by_page_lister import PageByPageHttpLister | from swh.lister.utils import throttling_retry | ||||
from .models import GiteaModel | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | |||||
from .. import USER_AGENT | |||||
from ..pattern import CredentialsType, StatelessLister | |||||
logger = logging.getLogger(__name__) | |||||
RepoListPage = List[Dict[str, Any]] | |||||
class GiteaLister(PageByPageHttpLister): | class GiteaLister(StatelessLister[RepoListPage]): | ||||
# Template path expecting an integer that represents the page id | """List origins from Gitea. | ||||
PATH_TEMPLATE = "repos/search?page=%d&sort=id" | |||||
anlambert: A link to the Gitea API documentation would be more useful here: https://try.gitea.io/api/swagger | |||||
DEFAULT_URL = "https://try.gitea.io/api/v1/" | Gitea API documentation: https://try.gitea.io/api/swagger | ||||
MODEL = GiteaModel | |||||
The API does pagination and provides navigation URLs through the 'Link' header. | |||||
The default value for page size is the maximum value observed on the instances | |||||
accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/.""" | |||||
LISTER_NAME = "gitea" | LISTER_NAME = "gitea" | ||||
REPO_LIST_PATH = "repos/search" | |||||
def __init__( | def __init__( | ||||
self, url=None, instance=None, override_config=None, order="asc", limit=3 | self, | ||||
scheduler: SchedulerInterface, | |||||
url: str, | |||||
Done Inline Actions I would rather make the `url` and `instance` parameters mandatory here and set default values of "https://try.gitea.io/api/v1/" and "gitea". anlambert: I would rather make the `url` and `instance` parameters mandatory here and set default values of… | |||||
Done Inline ActionsWhat do you mean by mandatory and have a default value? The idea was that for such services that have many instances, we must pass the URL, but the instance name can be inferred if not passed. tenma: What do you mean by mandatory and have a default value?
The idea was that for such services… | |||||
Done Inline ActionsRight, I did not see the instance = parse_url(url).host instruction below, forget that comment anlambert: Right, I did not see the ` instance = parse_url(url).host` instruction below, forget that… | |||||
instance: Optional[str] = None, | |||||
api_token: Optional[str] = None, | |||||
Done Inline Actions Maybe add a comment saying that it is actually the maximum page size the Gitea API can return? I only tested with try.gitea and codeberg, so this might not be true for other instances, but I truly doubt it. anlambert: maybe add a comment saying that it is actually the maximum page size the Gitea API can return… | |||||
page_size: int = 50, | |||||
credentials: CredentialsType = None, | |||||
): | ): | ||||
super().__init__(url=url, override_config=override_config) | |||||
if instance is None: | if instance is None: | ||||
instance = parse_url(self.url).host | instance = parse_url(url).host | ||||
self.instance = instance | |||||
self.PATH_TEMPLATE = "%s&order=%s&limit=%s" % ( | super().__init__( | ||||
self.PATH_TEMPLATE, | scheduler=scheduler, credentials=credentials, url=url, instance=instance, | ||||
order, | |||||
limit, | |||||
) | ) | ||||
def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: | self.query_params = { | ||||
return { | "sort": "id", | ||||
"instance": self.instance, | "order": "asc", | ||||
"uid": f'{self.instance}/{repo["id"]}', | "limit": page_size, | ||||
"name": repo["name"], | "page": 1, | ||||
"full_name": repo["full_name"], | |||||
"html_url": repo["html_url"], | |||||
"origin_url": repo["clone_url"], | |||||
"origin_type": "git", | |||||
} | } | ||||
def get_next_target_from_response(self, response: Response) -> Optional[int]: | self.session = requests.Session() | ||||
"""Determine the next page identifier. | self.session.headers.update( | ||||
{"Accept": "application/json", "User-Agent": USER_AGENT,} | |||||
) | |||||
""" | if api_token is None and len(self.credentials) > 0: | ||||
logger.warning( | |||||
"Gitea lister support only API token authentication " | |||||
" as of now. Will use the first password as token." | |||||
) | |||||
api_token = self.credentials[0]["password"] | |||||
Done Inline ActionsThat warning is not really useful. I guess you could pick a random token in the list instead. anlambert: That warning is not really useful. I guess you could pick a random token in the list instead. | |||||
Done Inline ActionsUpdated. The warning is more about only supporting tokens than the fact that only one is supported. (Not the same case as the other lister.) tenma: Updated. The warning is more about only supporting tokens than the fact that only one is… | |||||
Done Inline Actions
That's implementation details and not really relevant to log, please remove it.
I do not agree, we could have multiple credentials stored in configuration so a random
Yes, do not do that. You could log the username associated with the token if available in the conf instead (as in the github lister). So the code could be rewritten as:

    if len(self.credentials) > 0:
        cred = random.choice(self.credentials)
        username = cred.get("username")
        logger.warning("Using authentication token from user %s", username or "???")
        self.set_credentials(username, cred["password"])
    else:
        logger.warning("No authentication tokens set in configuration, using anonymous mode")

I will make the same changes to the Bitbucket lister. anlambert: > Updated. The warning is more about only supporting tokens than the fact that only one is… | |||||
Done Inline Actions
anlambert: > I will make the same changes to the Bitbucket lister.
D4938 | |||||
Done Inline ActionsOK it is good like this. tenma: OK it is good like this. | |||||
if api_token: | |||||
self.session.headers["Authorization"] = "Token %s" % api_token | |||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) | |||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: | |||||
Done Inline ActionsYou should remove that method and merge its content in the constructor. anlambert: You should remove that method and merge its content in the constructor. | |||||
logger.info("Fetching URL %s with params %s", url, params) | |||||
response = self.session.get(url, params=params) | |||||
if response.status_code != 200: | |||||
logger.warning( | |||||
"Unexpected HTTP status code %s on %s: %s", | |||||
response.status_code, | |||||
response.url, | |||||
response.content, | |||||
) | |||||
response.raise_for_status() | |||||
return response | |||||
@classmethod | |||||
def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage: | |||||
fields_filter = ["id", "clone_url", "updated_at"] | |||||
return [{k: r[k] for k in fields_filter} for r in body["data"]] | |||||
def get_pages(self) -> Iterator[RepoListPage]: | |||||
# base with trailing slash, path without leading slash for urljoin | |||||
url: str = urljoin(self.url, self.REPO_LIST_PATH) | |||||
response = self.page_request(url, self.query_params) | |||||
while True: | |||||
page_results = self.results_simplified(response.json()) | |||||
yield page_results | |||||
assert len(response.links) > 0, "API changed: no Link header found" | |||||
if "next" in response.links: | if "next" in response.links: | ||||
next_url = response.links["next"]["url"] | url = response.links["next"]["url"] | ||||
return self.get_page_from_url(next_url) | else: | ||||
return None | # last page | ||||
break | |||||
def get_page_from_url(self, url: str) -> int: | |||||
page_re = re.compile(r"^.*/search\?.*page=(\d+)") | |||||
return int(page_re.match(url).group(1)) # type: ignore | |||||
def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: | |||||
repos = response.json()["data"] | |||||
return [self.get_model_from_repo(repo) for repo in repos] | |||||
def get_pages_information( | response = self.page_request(url, {}) | ||||
self, | |||||
) -> Tuple[Optional[int], Optional[int], Optional[int]]: | def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]: | ||||
"""Determine pages information. | """Convert a page of Gitea repositories into a list of ListedOrigins. | ||||
""" | """ | ||||
response = self.transport_head(identifier=1) # type: ignore | assert self.lister_obj.id is not None | ||||
if not response.ok: | |||||
raise ValueError( | |||||
"Problem during information fetch: %s" % response.status_code | |||||
) | |||||
h = response.headers | |||||
return ( | |||||
self._get_int(h, "x-total-count"), | |||||
int(self.get_page_from_url(response.links["last"]["url"])), | |||||
self._get_int(h, "x-per-page"), | |||||
) | |||||
def _get_int(self, headers: MutableMapping[str, Any], key: str) -> Optional[int]: | for repo in page: | ||||
_val = headers.get(key) | last_update = iso8601.parse_date(repo["updated_at"]) | ||||
if _val: | |||||
return int(_val) | |||||
return None | |||||
def run(self, min_bound=1, max_bound=None, check_existence=False): | yield ListedOrigin( | ||||
return super().run(min_bound, max_bound, check_existence) | lister_id=self.lister_obj.id, | ||||
url=repo["clone_url"], | |||||
visit_type="git", | |||||
last_update=last_update, | |||||
) |
A link to the Gitea API documentation would be more useful here https://try.gitea.io/api/swagger.