diff --git a/swh/lister/gitea/__init__.py b/swh/lister/gitea/__init__.py --- a/swh/lister/gitea/__init__.py +++ b/swh/lister/gitea/__init__.py @@ -5,10 +5,9 @@ def register(): from .lister import GiteaLister - from .models import GiteaModel return { - "models": [GiteaModel], + "models": [], "lister": GiteaLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/gitea/lister.py b/swh/lister/gitea/lister.py --- a/swh/lister/gitea/lister.py +++ b/swh/lister/gitea/lister.py @@ -1,89 +1,135 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import re -from typing import Any, Dict, List, MutableMapping, Optional, Tuple +import logging +from typing import Any, Dict, Iterator, List, Optional +from urllib.parse import urljoin -from requests import Response +import iso8601 +import requests +from tenacity.before_sleep import before_sleep_log from urllib3.util import parse_url -from ..core.page_by_page_lister import PageByPageHttpLister -from .models import GiteaModel +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin +from .. import USER_AGENT +from ..pattern import CredentialsType, StatelessLister + +logger = logging.getLogger(__name__) + +RepoListPage = List[Dict[str, Any]] + + +class GiteaLister(StatelessLister[RepoListPage]): + """List origins from Gitea. + + Gitea API documentation: https://try.gitea.io/api/swagger + + The API does pagination and provides navigation URLs through the 'Link' header. + The default value for page size is the maximum value observed on the instances + accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/.""" -class GiteaLister(PageByPageHttpLister): - # Template path expecting an integer that represents the page id - PATH_TEMPLATE = "repos/search?page=%d&sort=id" - DEFAULT_URL = "https://try.gitea.io/api/v1/" - MODEL = GiteaModel LISTER_NAME = "gitea" + REPO_LIST_PATH = "repos/search" + def __init__( - self, url=None, instance=None, override_config=None, order="asc", limit=3 + self, + scheduler: SchedulerInterface, + url: str, + instance: Optional[str] = None, + api_token: Optional[str] = None, + page_size: int = 50, + credentials: CredentialsType = None, ): - super().__init__(url=url, override_config=override_config) if instance is None: - instance = parse_url(self.url).host - self.instance = instance - self.PATH_TEMPLATE = "%s&order=%s&limit=%s" % ( - self.PATH_TEMPLATE, - order, - limit, + instance = parse_url(url).host + + super().__init__( + scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) - def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: - return { - "instance": self.instance, - "uid": f'{self.instance}/{repo["id"]}', - "name": repo["name"], - "full_name": repo["full_name"], - "html_url": repo["html_url"], - "origin_url": repo["clone_url"], - "origin_type": "git", + self.query_params = { + "sort": "id", + "order": "asc", + "limit": page_size, + "page": 1, } - def get_next_target_from_response(self, response: Response) -> Optional[int]: - """Determine the next page identifier. + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT,} + ) - """ - if "next" in response.links: - next_url = response.links["next"]["url"] - return self.get_page_from_url(next_url) - return None + if api_token is None and len(self.credentials) > 0: + logger.warning( + "Gitea lister support only API token authentication " + " as of now. Will use the first password as token." + ) + api_token = self.credentials[0]["password"] - def get_page_from_url(self, url: str) -> int: - page_re = re.compile(r"^.*/search\?.*page=(\d+)") - return int(page_re.match(url).group(1)) # type: ignore + if api_token: + self.session.headers["Authorization"] = "Token %s" % api_token - def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: - repos = response.json()["data"] - return [self.get_model_from_repo(repo) for repo in repos] + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: - def get_pages_information( - self, - ) -> Tuple[Optional[int], Optional[int], Optional[int]]: - """Determine pages information. + logger.info("Fetching URL %s with params %s", url, params) - """ - response = self.transport_head(identifier=1) # type: ignore - if not response.ok: - raise ValueError( - "Problem during information fetch: %s" % response.status_code + response = self.session.get(url, params=params) + + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, ) - h = response.headers - return ( - self._get_int(h, "x-total-count"), - int(self.get_page_from_url(response.links["last"]["url"])), - self._get_int(h, "x-per-page"), - ) + response.raise_for_status() + + return response + + @classmethod + def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage: + fields_filter = ["id", "clone_url", "updated_at"] + return [{k: r[k] for k in fields_filter} for r in body["data"]] + + def get_pages(self) -> Iterator[RepoListPage]: + # base with trailing slash, path without leading slash for urljoin + url: str = urljoin(self.url, self.REPO_LIST_PATH) + + response = self.page_request(url, self.query_params) + + while True: + page_results = self.results_simplified(response.json()) - def _get_int(self, headers: MutableMapping[str, Any], key: str) -> Optional[int]: - _val = headers.get(key) - if _val: - return int(_val) - return None + yield page_results - def run(self, min_bound=1, max_bound=None, check_existence=False): - return super().run(min_bound, max_bound, check_existence) + assert len(response.links) > 0, "API changed: no Link header found" + if "next" in response.links: + url = response.links["next"]["url"] + else: + # last page + break + + response = self.page_request(url, {}) + + def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]: + """Convert a page of Gitea repositories into a list of ListedOrigins. + + """ + assert self.lister_obj.id is not None + + for repo in page: + last_update = iso8601.parse_date(repo["updated_at"]) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=repo["clone_url"], + visit_type="git", + last_update=last_update, + ) diff --git a/swh/lister/gitea/models.py b/swh/lister/gitea/models.py deleted file mode 100644 --- a/swh/lister/gitea/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2020 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from ..core.models import ModelBase - - -class GiteaModel(ModelBase): - """a Gitea repository from a gitea instance - - """ - - __tablename__ = "gitea_repo" - - uid = Column(String, primary_key=True) - instance = Column(String, index=True) diff --git a/swh/lister/gitea/tasks.py b/swh/lister/gitea/tasks.py --- a/swh/lister/gitea/tasks.py +++ b/swh/lister/gitea/tasks.py @@ -2,51 +2,27 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import random +from typing import Dict, Optional -from celery import group, shared_task +from celery import shared_task -from .. import utils from .lister import GiteaLister -NBPAGES = 10 - -@shared_task(name=__name__ + ".IncrementalGiteaLister") -def list_gitea_incremental(**lister_args): - """Incremental update of a Gitea instance""" - lister_args["order"] = "desc" - lister = GiteaLister(**lister_args) - total_pages = lister.get_pages_information()[1] - # stopping as soon as existing origins for that instance are detected - return lister.run(min_bound=1, max_bound=total_pages, check_existence=True) - - -@shared_task(name=__name__ + ".RangeGiteaLister") -def _range_gitea_lister(start, end, **lister_args): - lister = GiteaLister(**lister_args) - return lister.run(min_bound=start, max_bound=end) - - -@shared_task(name=__name__ + ".FullGiteaRelister", bind=True) -def list_gitea_full(self, **lister_args): +@shared_task(name=__name__ + ".FullGiteaRelister") +def list_gitea_full( + url: str, + instance: Optional[str] = None, + api_token: Optional[str] = None, + page_size: Optional[int] = None, +) -> Dict[str, int]: """Full update of a Gitea instance""" - lister = GiteaLister(**lister_args) - _, total_pages, _ = lister.get_pages_information() - ranges = list(utils.split_range(total_pages, NBPAGES)) - random.shuffle(ranges) - promise = group( - _range_gitea_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges - )() - self.log.debug("%s OK (spawned %s subtasks)" % (self.name, len(ranges))) - try: - promise.save() - except (NotImplementedError, AttributeError): - self.log.info("Unable to call save_group with current result backend.") - # FIXME: what to do in terms of return here? - return promise.id + lister = GiteaLister.from_configfile( + url=url, instance=instance, api_token=api_token, page_size=page_size + ) + return lister.run().dict() @shared_task(name=__name__ + ".ping") -def _ping(): +def _ping() -> str: return "OK" diff --git a/swh/lister/gitea/tests/data/https_try.gitea.io/api_empty_response.json b/swh/lister/gitea/tests/data/https_try.gitea.io/api_empty_response.json deleted file mode 100644 --- a/swh/lister/gitea/tests/data/https_try.gitea.io/api_empty_response.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "ok": true, - "data": [] -} \ No newline at end of file diff --git a/swh/lister/gitea/tests/data/https_try.gitea.io/api_v1_repos_search,page=1,sort=id,order=asc,limit=3 b/swh/lister/gitea/tests/data/https_try.gitea.io/api_v1_repos_search,page=1,sort=id,order=asc,limit=3 deleted file mode 100644 --- a/swh/lister/gitea/tests/data/https_try.gitea.io/api_v1_repos_search,page=1,sort=id,order=asc,limit=3 +++ /dev/null @@ -1,182 +0,0 @@ -{ - "ok": true, - "data": [ - { - "id": 5017, - "owner": { - "id": 1609, - "login": "JonasFranzDEV", - "full_name": "", - "email": "info@jonasfranz.software", - "avatar_url": "https://try.gitea.io/user/avatar/JonasFranzDEV/-1", - "language": "de-DE", - "is_admin": false, - "last_login": "2019-10-19T10:58:29Z", - "created": "2017-06-25T17:43:19Z", - "username": "JonasFranzDEV" - }, - "name": "drone-gitea-release", - "full_name": "JonasFranzDEV/drone-gitea-release", - "description": "", - "empty": false, - "private": false, - "fork": false, - "template": false, - "parent": null, - "mirror": false, - "size": 380, - "html_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release", - "ssh_url": "git@try.gitea.io:JonasFranzDEV/drone-gitea-release.git", - "clone_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release.git", - "original_url": "", - "website": "", - "stars_count": 0, - "forks_count": 0, - "watchers_count": 1, - "open_issues_count": 1, - "open_pr_counter": 0, - "release_counter": 2, - "default_branch": "master", - "archived": false, - "created_at": "2018-03-30T19:34:44Z", - "updated_at": "2018-05-29T20:09:40Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, - "has_issues": true, - "internal_tracker": { - "enable_time_tracker": true, - "allow_only_contributors_to_track_time": true, - "enable_issue_dependencies": true - }, - "has_wiki": true, - "has_pull_requests": true, - "ignore_whitespace_conflicts": false, - "allow_merge_commits": false, - "allow_rebase": false, - "allow_rebase_explicit": true, - "allow_squash_merge": false, - "avatar_url": "" - }, - { - "id": 5018, - "owner": { - "id": 4495, - "login": "nick.korsakov", - "full_name": "", - "email": "nick@korsakov.email", - "avatar_url": "https://try.gitea.io/user/avatar/nick.korsakov/-1", - "language": "ru-RU", - "is_admin": false, - "last_login": "2020-02-15T10:29:10Z", - "created": "2018-03-31T15:00:07Z", - "username": "nick.korsakov" - }, - "name": "one", - "full_name": "nick.korsakov/one", - "description": "", - "empty": true, - "private": false, - "fork": false, - "template": false, - "parent": null, - "mirror": false, - "size": 0, - "html_url": "https://try.gitea.io/nick.korsakov/one", - "ssh_url": "git@try.gitea.io:nick.korsakov/one.git", - "clone_url": "https://try.gitea.io/nick.korsakov/one.git", - "original_url": "", - "website": "", - "stars_count": 0, - "forks_count": 0, - "watchers_count": 1, - "open_issues_count": 0, - "open_pr_counter": 0, - "release_counter": 0, - "default_branch": "master", - "archived": false, - "created_at": "2018-03-31T15:00:33Z", - "updated_at": "2018-03-31T15:00:33Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, - "has_issues": true, - "internal_tracker": { - "enable_time_tracker": true, - "allow_only_contributors_to_track_time": true, - "enable_issue_dependencies": true - }, - "has_wiki": true, - "has_pull_requests": true, - "ignore_whitespace_conflicts": false, - "allow_merge_commits": false, - "allow_rebase": false, - "allow_rebase_explicit": true, - "allow_squash_merge": false, - "avatar_url": "" - }, - { - "id": 5030, - "owner": { - "id": 1623, - "login": "xingshijun", - "full_name": "", - "email": "934302794@qq.com", - "avatar_url": "https://try.gitea.io/user/avatar/xingshijun/-1", - "language": "zh-CN", - "is_admin": false, - "last_login": "2019-06-15T12:28:43Z", - "created": "2017-06-28T02:19:23Z", - "username": "xingshijun" - }, - "name": "lfzl", - "full_name": "xingshijun/lfzl", - "description": "", - "empty": false, - "private": false, - "fork": false, - "template": false, - "parent": null, - "mirror": false, - "size": 10990, - "html_url": "https://try.gitea.io/xingshijun/lfzl", - "ssh_url": "git@try.gitea.io:xingshijun/lfzl.git", - "clone_url": "https://try.gitea.io/xingshijun/lfzl.git", - "original_url": "", - "website": "", - "stars_count": 0, - "forks_count": 0, - "watchers_count": 1, - "open_issues_count": 0, - "open_pr_counter": 0, - "release_counter": 0, - "default_branch": "master", - "archived": false, - "created_at": "2018-04-02T08:34:08Z", - "updated_at": "2019-11-21T10:23:36Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, - "has_issues": true, - "internal_tracker": { - "enable_time_tracker": true, - "allow_only_contributors_to_track_time": true, - "enable_issue_dependencies": true - }, - "has_wiki": true, - "has_pull_requests": true, - "ignore_whitespace_conflicts": false, - "allow_merge_commits": false, - "allow_rebase": false, - "allow_rebase_explicit": true, - "allow_squash_merge": false, - "avatar_url": "" - } - ] -} \ No newline at end of file diff --git a/swh/lister/gitea/tests/data/https_try.gitea.io/api_response.json b/swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 rename from swh/lister/gitea/tests/data/https_try.gitea.io/api_response.json rename to swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 --- a/swh/lister/gitea/tests/data/https_try.gitea.io/api_response.json +++ b/swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 @@ -1,182 +1,195 @@ { - "ok": true, "data": [ { + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": true, + "allow_squash_merge": false, + "archived": false, + "avatar_url": "", + "clone_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release.git", + "created_at": "2018-03-30T19:34:44Z", + "default_branch": "master", + "description": "", + "empty": false, + "fork": false, + "forks_count": 1, + "full_name": "JonasFranzDEV/drone-gitea-release", + "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release", "id": 5017, + "ignore_whitespace_conflicts": false, + "internal": false, + "internal_tracker": { + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true, + "enable_time_tracker": true + }, + "mirror": false, + "mirror_interval": "", + "name": "drone-gitea-release", + "open_issues_count": 1, + "open_pr_counter": 0, + "original_url": "", "owner": { - "id": 1609, - "login": "JonasFranzDEV", - "full_name": "", - "email": "info@jonasfranz.software", "avatar_url": "https://try.gitea.io/user/avatar/JonasFranzDEV/-1", - "language": "de-DE", - "is_admin": false, - "last_login": "2019-10-19T10:58:29Z", "created": "2017-06-25T17:43:19Z", + "email": "info@jonasfranz.software", + "full_name": "", + "id": 1609, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "JonasFranzDEV", "username": "JonasFranzDEV" }, - "name": "drone-gitea-release", - "full_name": "JonasFranzDEV/drone-gitea-release", - "description": "", - "empty": false, - "private": false, - "fork": false, - "template": false, "parent": null, - "mirror": false, + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 2, "size": 380, - "html_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release", "ssh_url": "git@try.gitea.io:JonasFranzDEV/drone-gitea-release.git", - "clone_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release.git", - "original_url": "", - "website": "", "stars_count": 0, - "forks_count": 0, - "watchers_count": 1, - "open_issues_count": 1, - "open_pr_counter": 0, - "release_counter": 2, - "default_branch": "master", - "archived": false, - "created_at": "2018-03-30T19:34:44Z", + "template": false, "updated_at": "2018-05-29T20:09:40Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, - "has_issues": true, - "internal_tracker": { - "enable_time_tracker": true, - "allow_only_contributors_to_track_time": true, - "enable_issue_dependencies": true - }, - "has_wiki": true, - "has_pull_requests": true, - "ignore_whitespace_conflicts": false, + "watchers_count": 1, + "website": "" + }, + { "allow_merge_commits": false, "allow_rebase": false, "allow_rebase_explicit": true, "allow_squash_merge": false, - "avatar_url": "" - }, - { - "id": 5018, - "owner": { - "id": 4495, - "login": "nick.korsakov", - "full_name": "", - "email": "nick@korsakov.email", - "avatar_url": "https://try.gitea.io/user/avatar/nick.korsakov/-1", - "language": "ru-RU", - "is_admin": false, - "last_login": "2020-02-15T10:29:10Z", - "created": "2018-03-31T15:00:07Z", - "username": "nick.korsakov" - }, - "name": "one", - "full_name": "nick.korsakov/one", + "archived": false, + "avatar_url": "", + "clone_url": "https://try.gitea.io/xingshijun/lfzl.git", + "created_at": "2018-04-02T08:34:08Z", + "default_branch": "master", "description": "", - "empty": true, - "private": false, + "empty": false, "fork": false, - "template": false, - "parent": null, - "mirror": false, - "size": 0, - "html_url": "https://try.gitea.io/nick.korsakov/one", - "ssh_url": "git@try.gitea.io:nick.korsakov/one.git", - "clone_url": "https://try.gitea.io/nick.korsakov/one.git", - "original_url": "", - "website": "", - "stars_count": 0, "forks_count": 0, - "watchers_count": 1, - "open_issues_count": 0, - "open_pr_counter": 0, - "release_counter": 0, - "default_branch": "master", - "archived": false, - "created_at": "2018-03-31T15:00:33Z", - "updated_at": "2018-03-31T15:00:33Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, + "full_name": "xingshijun/lfzl", "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/xingshijun/lfzl", + "id": 5030, + "ignore_whitespace_conflicts": false, + "internal": false, "internal_tracker": { - "enable_time_tracker": true, "allow_only_contributors_to_track_time": true, - "enable_issue_dependencies": true + "enable_issue_dependencies": true, + "enable_time_tracker": true }, - "has_wiki": true, - "has_pull_requests": true, - "ignore_whitespace_conflicts": false, - "allow_merge_commits": false, - "allow_rebase": false, - "allow_rebase_explicit": true, - "allow_squash_merge": false, - "avatar_url": "" - }, - { - "id": 5030, + "mirror": false, + "mirror_interval": "", + "name": "lfzl", + "open_issues_count": 0, + "open_pr_counter": 0, + "original_url": "", "owner": { - "id": 1623, - "login": "xingshijun", - "full_name": "", - "email": "934302794@qq.com", "avatar_url": "https://try.gitea.io/user/avatar/xingshijun/-1", - "language": "zh-CN", - "is_admin": false, - "last_login": "2019-06-15T12:28:43Z", "created": "2017-06-28T02:19:23Z", + "email": "934302794@qq.com", + "full_name": "", + "id": 1623, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "xingshijun", "username": "xingshijun" }, - "name": "lfzl", - "full_name": "xingshijun/lfzl", - "description": "", - "empty": false, - "private": false, - "fork": false, - "template": false, "parent": null, - "mirror": false, - "size": 10990, - "html_url": "https://try.gitea.io/xingshijun/lfzl", + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 0, + "size": 10997, "ssh_url": "git@try.gitea.io:xingshijun/lfzl.git", - "clone_url": "https://try.gitea.io/xingshijun/lfzl.git", - "original_url": "", - "website": "", "stars_count": 0, - "forks_count": 0, + "template": false, + "updated_at": "2020-04-16T08:39:18Z", "watchers_count": 1, - "open_issues_count": 0, - "open_pr_counter": 0, - "release_counter": 0, - "default_branch": "master", + "website": "" + }, + { + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": true, + "allow_squash_merge": false, "archived": false, - "created_at": "2018-04-02T08:34:08Z", - "updated_at": "2019-11-21T10:23:36Z", - "permissions": { - "admin": false, - "push": false, - "pull": true - }, + "avatar_url": "", + "clone_url": "https://try.gitea.io/ulm0/negroni.git", + "created_at": "2018-04-02T17:30:26Z", + "default_branch": "master", + "description": "Idiomatic HTTP Middleware for Golang", + "empty": false, + "fork": false, + "forks_count": 1, + "full_name": "ulm0/negroni", "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/ulm0/negroni", + "id": 5034, + "ignore_whitespace_conflicts": false, + "internal": false, "internal_tracker": { - "enable_time_tracker": true, "allow_only_contributors_to_track_time": true, - "enable_issue_dependencies": true + "enable_issue_dependencies": true, + "enable_time_tracker": true }, - "has_wiki": true, - "has_pull_requests": true, - "ignore_whitespace_conflicts": false, - "allow_merge_commits": false, - "allow_rebase": false, - "allow_rebase_explicit": true, - "allow_squash_merge": false, - "avatar_url": "" + "mirror": true, + "mirror_interval": "8h0m0s", + "name": "negroni", + "open_issues_count": 0, + "open_pr_counter": 0, + "original_url": "", + "owner": { + "avatar_url": "https://try.gitea.io/user/avatar/ulm0/-1", + "created": "2017-07-09T18:58:34Z", + "email": "ulm0@innersea.xyz", + "full_name": "Mauricio Ugaz", + "id": 1706, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "ulm0", + "username": "ulm0" + }, + "parent": null, + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 7, + "size": 17739, + "ssh_url": "git@try.gitea.io:ulm0/negroni.git", + "stars_count": 0, + "template": false, + "updated_at": "2020-11-14T17:50:56Z", + "watchers_count": 1, + "website": "" } - ] + ], + "ok": true, + "links": { + "next": "https://try.gitea.io/api/v1/repos/search?limit=3&order=asc&page=2&sort=id", + "last": "https://try.gitea.io/api/v1/repos/search?limit=3&order=asc&page=2282&sort=id" + } } \ No newline at end of file diff --git a/swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 b/swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 new file mode 100644 --- /dev/null +++ b/swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 @@ -0,0 +1,252 @@ +{ + "data": [ + { + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": true, + "allow_squash_merge": false, + "archived": false, + "avatar_url": "", + "clone_url": "https://try.gitea.io/ulm0/mux.git", + "created_at": "2018-04-02T17:35:13Z", + "default_branch": "master", + "description": "A powerful URL router and dispatcher for golang.", + "empty": false, + "fork": false, + "forks_count": 1, + "full_name": "ulm0/mux", + "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/ulm0/mux", + "id": 5035, + "ignore_whitespace_conflicts": false, + "internal": false, + "internal_tracker": { + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true, + "enable_time_tracker": true + }, + "mirror": true, + "mirror_interval": "8h0m0s", + "name": "mux", + "open_issues_count": 0, + "open_pr_counter": 0, + "original_url": "", + "owner": { + "avatar_url": "https://try.gitea.io/user/avatar/ulm0/-1", + "created": "2017-07-09T18:58:34Z", + "email": "ulm0@innersea.xyz", + "full_name": "Mauricio Ugaz", + "id": 1706, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "ulm0", + "username": "ulm0" + }, + "parent": null, + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 14, + "size": 2512, + "ssh_url": "git@try.gitea.io:ulm0/mux.git", + "stars_count": 0, + "template": false, + "updated_at": "2020-09-12T19:20:56Z", + "watchers_count": 1, + "website": "http://www.gorillatoolkit.org/pkg/mux" + }, + { + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": true, + "allow_squash_merge": false, + "archived": false, + "avatar_url": "", + "clone_url": "https://try.gitea.io/ligh0721/negroni.git", + "created_at": "2018-04-03T10:41:41Z", + "default_branch": "master", + "description": "Idiomatic HTTP Middleware for Golang", + "empty": false, + "fork": true, + "forks_count": 0, + "full_name": "ligh0721/negroni", + "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/ligh0721/negroni", + "id": 5045, + "ignore_whitespace_conflicts": false, + "internal": false, + "internal_tracker": { + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true, + "enable_time_tracker": true + }, + "mirror": false, + "mirror_interval": "", + "name": "negroni", + "open_issues_count": 0, + "open_pr_counter": 0, + "original_url": "", + "owner": { + "avatar_url": "https://try.gitea.io/user/avatar/ligh0721/-1", + "created": "2018-04-03T10:37:01Z", + "email": "lightning_0721@163.com", + "full_name": "", + "id": 4534, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "ligh0721", + "username": "ligh0721" + }, + "parent": { + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": true, + "allow_squash_merge": false, + "archived": false, + "avatar_url": "", + "clone_url": "https://try.gitea.io/ulm0/negroni.git", + "created_at": "2018-04-02T17:30:26Z", + "default_branch": "master", + "description": "Idiomatic HTTP Middleware for Golang", + "empty": false, + "fork": false, + "forks_count": 1, + "full_name": "ulm0/negroni", + "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/ulm0/negroni", + "id": 5034, + "ignore_whitespace_conflicts": false, + "internal": false, + "internal_tracker": { + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true, + "enable_time_tracker": true + }, + "mirror": true, + "mirror_interval": "8h0m0s", + "name": "negroni", + "open_issues_count": 0, + "open_pr_counter": 0, + "original_url": "", + "owner": { + "avatar_url": "https://try.gitea.io/user/avatar/ulm0/-1", + "created": "2017-07-09T18:58:34Z", + "email": "ulm0@innersea.xyz", + "full_name": "Mauricio Ugaz", + "id": 1706, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "ulm0", + "username": "ulm0" + }, + "parent": null, + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 7, + "size": 17739, + "ssh_url": "git@try.gitea.io:ulm0/negroni.git", + "stars_count": 0, + "template": false, + "updated_at": "2020-11-14T17:50:56Z", + "watchers_count": 1, + "website": "" + }, + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 3, + "size": 344, + "ssh_url": "git@try.gitea.io:ligh0721/negroni.git", + "stars_count": 0, + "template": false, + "updated_at": "2018-04-03T10:41:41Z", + "watchers_count": 1, + "website": "" + }, + { + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": true, + "allow_squash_merge": false, + "archived": false, + "avatar_url": "", + "clone_url": "https://try.gitea.io/user12312341324124/Tiny.git", + "created_at": "2018-04-03T13:08:29Z", + "default_branch": "master", + "description": "", + "empty": false, + "fork": false, + "forks_count": 1, + "full_name": "user12312341324124/Tiny", + "has_issues": true, + "has_projects": false, + "has_pull_requests": true, + "has_wiki": true, + "html_url": "https://try.gitea.io/user12312341324124/Tiny", + "id": 5046, + "ignore_whitespace_conflicts": false, + "internal": false, + "internal_tracker": { + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true, + "enable_time_tracker": true + }, + "mirror": false, + "mirror_interval": "", + "name": "Tiny", + "open_issues_count": 1, + "open_pr_counter": 0, + "original_url": "", + "owner": { + "avatar_url": "https://try.gitea.io/user/avatar/user12312341324124/-1", + "created": "2018-04-03T13:07:45Z", + "email": "z333676@mvrht.net", + "full_name": "", + "id": 4536, + "is_admin": false, + "language": "", + "last_login": "0001-01-01T00:00:00Z", + "login": "user12312341324124", + "username": "user12312341324124" + }, + "parent": null, + "permissions": { + "admin": false, + "pull": true, + "push": false + }, + "private": false, + "release_counter": 0, + "size": 110, + "ssh_url": "git@try.gitea.io:user12312341324124/Tiny.git", + "stars_count": 0, + "template": false, + "updated_at": "2018-04-03T13:08:29Z", + "watchers_count": 1, + "website": "" + } + ], + "ok": true +} \ No newline at end of file diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py --- a/swh/lister/gitea/tests/test_lister.py +++ b/swh/lister/gitea/tests/test_lister.py @@ -3,56 +3,149 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging -import re -import unittest - -from swh.lister.core.tests.test_lister import HttpListerTesterBase -from swh.lister.gitea.lister import GiteaLister - -logger = logging.getLogger(__name__) - - -class GiteaListerTester(HttpListerTesterBase, unittest.TestCase): - Lister = GiteaLister - test_re = re.compile(r"^.*/projects.*page=(\d+).*") - lister_subdir = "gitea" - good_api_response_file = "data/https_try.gitea.io/api_response.json" - bad_api_response_file = "data/https_try.gitea.io/api_empty_response.json" - first_index = 1 - last_index = 2 - entries_per_page = 3 - convert_type = int - - def response_headers(self, request): - headers = {} - if self.request_index(request) == self.first_index: - headers.update( - { - "Link": ";" - ' rel="next"' % self.last_index - } - ) - - return headers - - -def test_lister_gitea(lister_gitea, requests_mock_datadir): - lister_gitea.run() - r = lister_gitea.scheduler.search_tasks(task_type="load-git") - assert len(r) == 3 - - for row in r: - assert row["type"] == "load-git" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 - - # kwargs - kwargs = row["arguments"]["kwargs"] - url = kwargs["url"] - assert url.startswith("https://try.gitea.io") - - assert row["policy"] == "recurring" - assert row["priority"] is None +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import pytest +import requests + +from swh.lister.gitea.lister import GiteaLister, RepoListPage +from swh.scheduler.model import ListedOrigin + +TRYGITEA_URL = "https://try.gitea.io/api/v1/" +TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=1" +TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=2" + + +@pytest.fixture +def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]: + text = Path(datadir, "https_try.gitea.io", "repos_page1").read_text() + headers = { + "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=TRYGITEA_P2_URL) + } + page_result = GiteaLister.results_simplified(json.loads(text)) + origin_urls = [r["clone_url"] for r in page_result] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]: + text = Path(datadir, "https_try.gitea.io", "repos_page2").read_text() + headers = { + "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=TRYGITEA_P1_URL) + } + page_result = GiteaLister.results_simplified(json.loads(text)) + origin_urls = [r["clone_url"] for r in page_result] + return text, headers, page_result, origin_urls + + +def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): + """Asserts that the two collections have the same origin URLs. + + Does not test last_update.""" + + sorted_lister_urls = list(sorted(lister_urls)) + sorted_scheduler_origins = list(sorted(scheduler_origins)) + + assert len(sorted_lister_urls) == len(sorted_scheduler_origins) + + for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): + assert l_url == s_origin.url + + +def test_gitea_full_listing( + swh_scheduler, requests_mock, mocker, trygitea_p1, trygitea_p2 +): + """Covers full listing of multiple pages, rate-limit, page size (required for test), + checking page results and listed origins, statelessness.""" + + kwargs = dict(url=TRYGITEA_URL, instance="try_gitea", page_size=3) + lister = GiteaLister(scheduler=swh_scheduler, **kwargs) + + lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page") + + p1_text, p1_headers, p1_result, p1_origin_urls = trygitea_p1 + p2_text, p2_headers, p2_result, p2_origin_urls = trygitea_p2 + + requests_mock.get(TRYGITEA_P1_URL, text=p1_text, headers=p1_headers) + requests_mock.get( + TRYGITEA_P2_URL, + [ + {"status_code": requests.codes.too_many_requests}, + {"text": p2_text, "headers": p2_headers}, + ], + ) + + # end test setup + + stats = lister.run() + + # start test checks + + assert stats.pages == 2 + assert stats.origins == 6 + + calls = [mocker.call(p1_result), mocker.call(p2_result)] + lister.get_origins_from_page.assert_has_calls(calls) + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins + + check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) + + assert lister.get_state_from_scheduler() is None + + +def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1): + """Covers token authentication, token from credentials, + instance inference from URL.""" + + api_token = "teapot" + instance = "try.gitea.io" + creds = {"gitea": {instance: [{"username": "u", "password": api_token}]}} + + kwargs1 = dict(url=TRYGITEA_URL, api_token=api_token) + lister = GiteaLister(scheduler=swh_scheduler, **kwargs1) + + # test API token + assert "Authorization" in lister.session.headers + assert lister.session.headers["Authorization"].lower() == "token %s" % api_token + + kwargs2 = dict(url=TRYGITEA_URL, credentials=creds) + lister = GiteaLister(scheduler=swh_scheduler, **kwargs2) + + # test API token from credentials + assert "Authorization" in lister.session.headers + assert lister.session.headers["Authorization"].lower() == "token %s" % api_token + + # test instance inference from URL + assert lister.instance + assert "gitea" in lister.instance # infer something related to that + + # setup requests mocking + p1_text, p1_headers, _, _ = trygitea_p1 + p1_headers["Link"] = p1_headers["Link"].replace("next", "") # only 1 page + + base_url = TRYGITEA_URL + lister.REPO_LIST_PATH + requests_mock.get(base_url, text=p1_text, headers=p1_headers) + + # now check the lister runs without error + stats = lister.run() + + assert stats.pages == 1 + + +@pytest.mark.parametrize("http_code", [400, 500, 502]) +def test_gitea_list_http_error(swh_scheduler, requests_mock, http_code): + """Test handling of some HTTP errors commonly encountered""" + + lister = GiteaLister(scheduler=swh_scheduler, url=TRYGITEA_URL, page_size=3) + + base_url = TRYGITEA_URL + lister.REPO_LIST_PATH + requests_mock.get(base_url, status_code=http_code) + + with pytest.raises(requests.HTTPError): + lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins + assert len(scheduler_origins) == 0 diff --git a/swh/lister/gitea/tests/test_tasks.py b/swh/lister/gitea/tests/test_tasks.py --- a/swh/lister/gitea/tests/test_tasks.py +++ b/swh/lister/gitea/tests/test_tasks.py @@ -3,13 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from time import sleep -from unittest.mock import call, patch +from unittest.mock import patch -from celery.result import GroupResult - -from swh.lister.gitea.tasks import NBPAGES -from swh.lister.utils import split_range +from swh.lister.pattern import ListerStats def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): @@ -21,125 +17,43 @@ @patch("swh.lister.gitea.tasks.GiteaLister") -def test_incremental(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked GiteaLister - lister.return_value = lister - lister.run.return_value = None - lister.get_pages_information.return_value = (None, 10, None) +def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + kwargs = dict(url="https://try.gitea.io/api/v1") res = swh_scheduler_celery_app.send_task( - "swh.lister.gitea.tasks.IncrementalGiteaLister" + "swh.lister.gitea.tasks.FullGiteaRelister", kwargs=kwargs, ) assert res res.wait() assert res.successful() - lister.assert_called_once_with(order="desc") - lister.db_last_index.assert_not_called() - lister.get_pages_information.assert_called_once_with() - lister.run.assert_called_once_with(min_bound=1, max_bound=10, check_existence=True) - - -@patch("swh.lister.gitea.tasks.GiteaLister") -def test_range(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked GiteaLister - lister.return_value = lister - lister.run.return_value = None - - res = swh_scheduler_celery_app.send_task( - "swh.lister.gitea.tasks.RangeGiteaLister", kwargs=dict(start=12, end=42) - ) - assert res - res.wait() - assert res.successful() - - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() - lister.run.assert_called_once_with(min_bound=12, max_bound=42) - - -@patch("swh.lister.gitea.tasks.GiteaLister") -def test_relister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - total_pages = 85 - # setup the mocked GiteaLister - lister.return_value = lister - lister.run.return_value = None - lister.get_pages_information.return_value = (None, total_pages, None) - - res = swh_scheduler_celery_app.send_task("swh.lister.gitea.tasks.FullGiteaRelister") - assert res - - res.wait() - assert res.successful() - - # retrieve the GroupResult for this task and wait for all the subtasks - # to complete - promise_id = res.result - assert promise_id - promise = GroupResult.restore(promise_id, app=swh_scheduler_celery_app) - for i in range(5): - if promise.ready(): - break - sleep(1) - - lister.assert_called_with() - - # one by the FullGiteaRelister task - # + 9 for the RangeGiteaLister subtasks - assert lister.call_count == 10 - - lister.db_last_index.assert_not_called() - lister.db_partition_indices.assert_not_called() - lister.get_pages_information.assert_called_once_with() + actual_kwargs = dict(**kwargs, instance=None, api_token=None, page_size=None) - # lister.run should have been called once per partition interval - for min_bound, max_bound in split_range(total_pages, NBPAGES): - assert ( - call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list - ) + lister.from_configfile.assert_called_once_with(**actual_kwargs) + lister.run.assert_called_once_with() @patch("swh.lister.gitea.tasks.GiteaLister") -def test_relister_instance( +def test_full_listing_params( lister, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - total_pages = 85 - # setup the mocked GiteaLister - lister.return_value = lister - lister.run.return_value = None - lister.get_pages_information.return_value = (None, total_pages, None) - + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://0xacab.org/api/v4", + instance="0xacab", + api_token="test", + page_size=50, + ) res = swh_scheduler_celery_app.send_task( - "swh.lister.gitea.tasks.FullGiteaRelister", - kwargs=dict(url="https://0xacab.org/api/v4"), + "swh.lister.gitea.tasks.FullGiteaRelister", kwargs=kwargs, ) assert res - res.wait() assert res.successful() - # retrieve the GroupResult for this task and wait for all the subtasks - # to complete - promise_id = res.result - assert promise_id - promise = GroupResult.restore(promise_id, app=swh_scheduler_celery_app) - for i in range(5): - if promise.ready(): - break - sleep(1) - - lister.assert_called_with(url="https://0xacab.org/api/v4") - - # one by the FullGiteaRelister task - # + 9 for the RangeGiteaLister subtasks - assert lister.call_count == 10 - - lister.db_last_index.assert_not_called() - lister.db_partition_indices.assert_not_called() - lister.get_pages_information.assert_called_once_with() - - # lister.run should have been called once per partition interval - for min_bound, max_bound in split_range(total_pages, NBPAGES): - assert ( - call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list - ) + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -15,6 +15,7 @@ "url": "https://forge.softwareheritage.org/api/diffusion.repository.search", "api_token": "bogus", }, + "gitea": {"url": "https://try.gitea.io/api/v1/",}, }