diff --git a/swh/lister/gitea/lister.py b/swh/lister/gitea/lister.py --- a/swh/lister/gitea/lister.py +++ b/swh/lister/gitea/lister.py @@ -4,27 +4,13 @@ # See top-level LICENSE file for more information import logging -import random -from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import urljoin -import iso8601 -import requests -from tenacity.before_sleep import before_sleep_log - -from swh.lister.utils import throttling_retry -from swh.scheduler.interface import SchedulerInterface -from swh.scheduler.model import ListedOrigin - -from .. import USER_AGENT -from ..pattern import CredentialsType, StatelessLister +from ..gogs.lister import GogsLister logger = logging.getLogger(__name__) -RepoListPage = List[Dict[str, Any]] - -class GiteaLister(StatelessLister[RepoListPage]): +class GiteaLister(GogsLister): """List origins from Gitea. Gitea API documentation: https://try.gitea.io/api/swagger @@ -35,108 +21,7 @@ LISTER_NAME = "gitea" - REPO_LIST_PATH = "repos/search" - - def __init__( - self, - scheduler: SchedulerInterface, - url: str, - instance: Optional[str] = None, - api_token: Optional[str] = None, - page_size: int = 50, - credentials: CredentialsType = None, - ): - super().__init__( - scheduler=scheduler, - credentials=credentials, - url=url, - instance=instance, - ) - - self.query_params = { - "sort": "id", - "order": "asc", - "limit": page_size, - "page": 1, - } - - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "User-Agent": USER_AGENT, - } + def on_anonymous_mode(self): + logger.warning( + "No authentication token set in configuration, using anonymous mode" ) - - if api_token is None: - if len(self.credentials) > 0: - cred = random.choice(self.credentials) - username = cred.get("username") - api_token = cred["password"] - logger.warning( - "Using authentication token from user %s", username or "???" - ) - else: - logger.warning( - "No authentication token set in configuration, using anonymous mode" - ) - - if api_token: - self.session.headers["Authorization"] = "Token %s" % api_token - - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) - def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: - - logger.info("Fetching URL %s with params %s", url, params) - - response = self.session.get(url, params=params) - - if response.status_code != 200: - logger.warning( - "Unexpected HTTP status code %s on %s: %s", - response.status_code, - response.url, - response.content, - ) - response.raise_for_status() - - return response - - @classmethod - def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage: - fields_filter = ["id", "clone_url", "updated_at"] - return [{k: r[k] for k in fields_filter} for r in body["data"]] - - def get_pages(self) -> Iterator[RepoListPage]: - # base with trailing slash, path without leading slash for urljoin - url: str = urljoin(self.url, self.REPO_LIST_PATH) - - response = self.page_request(url, self.query_params) - - while True: - page_results = self.results_simplified(response.json()) - - yield page_results - - assert len(response.links) > 0, "API changed: no Link header found" - if "next" in response.links: - url = response.links["next"]["url"] - else: - # last page - break - - response = self.page_request(url, {}) - - def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]: - """Convert a page of Gitea repositories into a list of ListedOrigins.""" - assert self.lister_obj.id is not None - - for repo in page: - last_update = iso8601.parse_date(repo["updated_at"]) - - yield ListedOrigin( - lister_id=self.lister_obj.id, - url=repo["clone_url"], - visit_type="git", - last_update=last_update, - ) diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py --- a/swh/lister/gitea/tests/test_lister.py +++ b/swh/lister/gitea/tests/test_lister.py @@ -10,33 +10,40 @@ import pytest import requests -from swh.lister.gitea.lister import GiteaLister, RepoListPage +from swh.lister.gitea.lister import GiteaLister +from swh.lister.gogs.lister import GogsListerPage from swh.scheduler.model import ListedOrigin TRYGITEA_URL = "https://try.gitea.io/api/v1/" -TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=1" -TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=2" +TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?limit=3&page=1" +TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?limit=3&page=2" @pytest.fixture -def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]: +def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]: text = Path(datadir, "https_try.gitea.io", "repos_page1").read_text() headers = { "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=TRYGITEA_P2_URL) } - page_result = GiteaLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + page_data = json.loads(text) + page_result = GogsListerPage( + repos=GiteaLister.extract_repos(page_data), next_link=TRYGITEA_P2_URL + ) + origin_urls = [r["clone_url"] for r in page_data["data"]] return text, headers, page_result, origin_urls @pytest.fixture -def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]: +def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]: text = Path(datadir, "https_try.gitea.io", "repos_page2").read_text() headers = { "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=TRYGITEA_P1_URL) } - page_result = GiteaLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + page_data = json.loads(text) + page_result = GogsListerPage( + repos=GiteaLister.extract_repos(page_data), next_link=None + ) + origin_urls = [r["clone_url"] for r in page_data["data"]] return text, headers, page_result, origin_urls @@ -93,7 +100,9 @@ check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) - assert lister.get_state_from_scheduler() is None + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == TRYGITEA_P2_URL + assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"] def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1): diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -96,7 +96,8 @@ "Using authentication credentials from user %s", username or "???" ) else: - raise ValueError("No credentials or API token provided") + # Raises an error on Gogs, or a warning on Gitea + self.on_anonymous_mode() self.max_page_limit = 2 @@ -109,6 +110,9 @@ } ) + def on_anonymous_mode(self): + raise ValueError("No credentials or API token provided") + def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState: return GogsListerState(**d)