diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py --- a/swh/lister/gitlab/__init__.py +++ b/swh/lister/gitlab/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import GitLabLister - from .models import GitLabModel return { - "models": [GitLabModel], "lister": GitLabLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,97 +1,202 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import time -from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Union +from dataclasses import asdict, dataclass +import logging +from typing import Any, Dict, Iterator, Optional, Tuple +from urllib.parse import parse_qs, urlparse -from requests import Response +import requests +from requests.exceptions import HTTPError +from requests.status_codes import codes +from tenacity.before_sleep import before_sleep_log from urllib3.util import parse_url -from ..core.page_by_page_lister import PageByPageHttpLister -from .models import GitLabModel +from swh.lister import USER_AGENT +from swh.lister.pattern import CredentialsType, Lister +from swh.lister.utils import retry_attempt, throttling_retry +from swh.scheduler.model import ListedOrigin +logger = logging.getLogger(__name__) + + +@dataclass +class GitLabListerState: + """State of the GitLabLister""" + + last_seen_next_link: Optional[str] = None + """Last link header 
(not visited yet) during an incremental pass + + """ + + +Repository = Dict[str, Any] + + +@dataclass +class PageResult: + """Result from a query to a gitlab project api page.""" + + repositories: Optional[Tuple[Repository, ...]] = None + next_page: Optional[str] = None + + +def _if_rate_limited(retry_state) -> bool: + """Custom tenacity retry predicate for handling HTTP responses with status code 403 + with specific ratelimit header. + + """ + attempt = retry_attempt(retry_state) + if attempt.failed: + exc = attempt.exception() + return ( + isinstance(exc, HTTPError) + and exc.response.status_code == codes.forbidden + and int(exc.response.headers.get("RateLimit-Remaining", "0")) == 0 + ) + return False + + +def _parse_page_id(url: Optional[str]) -> Optional[int]: + """Given a URL, extract and return the value of its 'page' query parameter, or None. + + """ + if not url: + return None + # link: https://${project-api}/?...&page=2x... + query_data = parse_qs(urlparse(url).query) + page = query_data.get("page") + if page and len(page) > 0: + return int(page[0]) + return None + + +class GitLabLister(Lister[GitLabListerState, PageResult]): + """List origins for a gitlab instance. + + In incremental mode, the lister lists repositories starting with the + `last_seen_next_link` stored in the scheduler backend (default: full listing).
+ + """ -class GitLabLister(PageByPageHttpLister): - # Template path expecting an integer that represents the page id - PATH_TEMPLATE = "/projects?page=%d&order_by=id" - DEFAULT_URL = "https://gitlab.com/api/v4/" - MODEL = GitLabModel LISTER_NAME = "gitlab" def __init__( - self, url=None, instance=None, override_config=None, sort="asc", per_page=20 + self, + scheduler, + url=None, + instance=None, + credentials: CredentialsType = None, + incremental: bool = False, ): - super().__init__(url=url, override_config=override_config) if instance is None: - instance = parse_url(self.url).host - self.instance = instance - self.PATH_TEMPLATE = "%s&sort=%s&per_page=%s" % ( - self.PATH_TEMPLATE, - sort, - per_page, + instance = parse_url(url).host + super().__init__( + scheduler=scheduler, + credentials=None, # anonymous for now + url=url, + instance=instance, ) + self.incremental = incremental - def uid(self, repo: Dict[str, Any]) -> str: - return "%s/%s" % (self.instance, repo["path_with_namespace"]) - - def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: - return { - "instance": self.instance, - "uid": self.uid(repo), - "name": repo["name"], - "full_name": repo["path_with_namespace"], - "html_url": repo["web_url"], - "origin_url": repo["http_url_to_repo"], - "origin_type": "git", - } - - def transport_quota_check( - self, response: Response - ) -> Tuple[bool, Union[int, float]]: - """Deal with rate limit if any. 
+ self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) - """ - # not all gitlab instance have rate limit - if "RateLimit-Remaining" in response.headers: - reqs_remaining = int(response.headers["RateLimit-Remaining"]) - if response.status_code == 403 and reqs_remaining == 0: - reset_at = int(response.headers["RateLimit-Reset"]) - delay = min(reset_at - time.time(), 3600) - return True, delay - return False, 0 - - def _get_int(self, headers: MutableMapping[str, Any], key: str) -> Optional[int]: - _val = headers.get(key) - if _val: - return int(_val) - return None + def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: + return GitLabListerState(**d) + + def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: + return asdict(state) + + @throttling_retry( + retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) + ) + def get_page_result(self, url: str) -> PageResult: + logger.debug("Fetching URL %s", url) + response = self.session.get(url) + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + response.raise_for_status() + repositories: Tuple[Repository, ...] 
= tuple(response.json()) + if hasattr(response, "links") and response.links.get("next"): + next_page = response.links["next"]["url"] + else: + next_page = None + + return PageResult(repositories, next_page) + + def get_pages(self) -> Iterator[PageResult]: + next_page: Optional[str] + if self.incremental and self.state and self.state.last_seen_next_link: + next_page = self.state.last_seen_next_link + else: + next_page = f"{self.url}projects?page=1&order_by=id&sort=asc&per_page=20" + + while next_page: + page_result = self.get_page_result(next_page) + yield page_result + next_page = page_result.next_page + + def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: + assert self.lister_obj.id is not None + + repositories = page_result.repositories if page_result.repositories else [] + for repo in repositories: + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=repo["http_url_to_repo"], + visit_type="git", + # TODO: Support "last_activity_at" as update information + # last_update=repo["last_activity_at"], + ) - def get_next_target_from_response(self, response: Response) -> Optional[int]: - """Determine the next page identifier. + def commit_page(self, page_result: PageResult) -> None: + """Update currently stored state using the latest listed "next" page if relevant. - """ - return self._get_int(response.headers, "x-next-page") + Relevancy is determined by the next_page link whose 'page' id must be strictly + greater than the currently stored one. - def get_pages_information( - self, - ) -> Tuple[Optional[int], Optional[int], Optional[int]]: - """Determine pages information.
+ Note: this is a noop for full listing mode """ - response = self.transport_head(identifier=1) # type: ignore - if not response.ok: - raise ValueError( - "Problem during information fetch: %s" % response.status_code - ) - h = response.headers - return ( - self._get_int(h, "x-total"), - self._get_int(h, "x-total-pages"), - self._get_int(h, "x-per-page"), - ) + if self.incremental: + # link: https://${project-api}/?...&page=2x... + next_page = page_result.next_page - def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: - repos = response.json() - return [self.get_model_from_repo(repo) for repo in repos] + if next_page: + page_id = _parse_page_id(next_page) + previous_next_page = self.state.last_seen_next_link + previous_page_id = _parse_page_id(previous_next_page) + if previous_next_page is None or ( + previous_page_id and page_id and previous_page_id < page_id + ): + self.state.last_seen_next_link = next_page + + def finalize(self) -> None: + """finalize the lister state when relevant (see `fn:commit_page` for details) + + Note: this is a noop for full listing mode + + """ + next_page = self.state.last_seen_next_link + if self.incremental and next_page: + # link: https://${project-api}/?...&page=2x... 
+ next_page_id = _parse_page_id(next_page) + scheduler_state = self.get_state_from_scheduler() + previous_next_page_id = _parse_page_id(scheduler_state.last_seen_next_link) + + if (not previous_next_page_id and next_page_id) or ( + previous_next_page_id + and next_page_id + and previous_next_page_id < next_page_id + ): + self.updated = True diff --git a/swh/lister/gitlab/models.py b/swh/lister/gitlab/models.py deleted file mode 100644 --- a/swh/lister/gitlab/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2018 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from ..core.models import ModelBase - - -class GitLabModel(ModelBase): - """a Gitlab repository from a gitlab instance - - """ - - __tablename__ = "gitlab_repo" - - uid = Column(String, primary_key=True) - instance = Column(String, index=True) diff --git a/swh/lister/gitlab/tests/conftest.py b/swh/lister/gitlab/tests/conftest.py deleted file mode 100644 --- a/swh/lister/gitlab/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "gitlab" - - -@pytest.fixture -def lister_gitlab(swh_lister): - for task_type in [ - { - "type": "load-git", - "description": "Load git repository", - "backend_name": "swh.loader.git.tasks.UpdateGitRepository", - "default_interval": "1 day", - }, - ]: - swh_lister.scheduler.create_task_type(task_type) - - return swh_lister diff --git a/swh/lister/gitlab/tests/data/https_gitlab.com/api_empty_response.json b/swh/lister/gitlab/tests/data/https_gitlab.com/api_empty_response.json deleted file mode 100644 ---
a/swh/lister/gitlab/tests/data/https_gitlab.com/api_empty_response.json +++ /dev/null @@ -1 +0,0 @@ -[] diff --git a/swh/lister/gitlab/tests/data/https_gitlab.com/api_response.json b/swh/lister/gitlab/tests/data/https_gitlab.com/api_response.json deleted file mode 120000 --- a/swh/lister/gitlab/tests/data/https_gitlab.com/api_response.json +++ /dev/null @@ -1 +0,0 @@ -api_v4__projects,page=0,order_by=id,sort=asc,per_page=20 \ No newline at end of file diff --git a/swh/lister/gitlab/tests/data/https_gitlab.com/api_v4__projects,page=0,order_by=id,sort=asc,per_page=20 b/swh/lister/gitlab/tests/data/https_gitlab.com/api_v4_projects,page=1,order_by=id,sort=asc,per_page=20 rename from swh/lister/gitlab/tests/data/https_gitlab.com/api_v4__projects,page=0,order_by=id,sort=asc,per_page=20 rename to swh/lister/gitlab/tests/data/https_gitlab.com/api_v4_projects,page=1,order_by=id,sort=asc,per_page=20 diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -1,70 +1,74 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime, timedelta import logging -import re -import unittest import pytest -from swh.lister.core.tests.test_lister import HttpListerTesterBase -from swh.lister.gitlab.lister import GitLabLister +from swh.lister.gitlab.lister import GitLabLister, _parse_page_id +from swh.lister.pattern import ListerStats logger = logging.getLogger(__name__) -class GitLabListerTester(HttpListerTesterBase, unittest.TestCase): - Lister = GitLabLister - test_re = re.compile(r"^.*/projects.*page=(\d+).*") - lister_subdir = "gitlab" - good_api_response_file = 
"data/gitlab.com/api_response.json" - bad_api_response_file = "data/gitlab.com/api_empty_response.json" - first_index = 1 - entries_per_page = 10 - convert_type = int - - def response_headers(self, request): - headers = {"RateLimit-Remaining": "1"} - if self.request_index(request) == self.first_index: - headers.update( - {"x-next-page": "3",} - ) - - return headers - - def mock_rate_quota(self, n, request, context): - self.rate_limit += 1 - context.status_code = 403 - context.headers["RateLimit-Remaining"] = "0" - one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) - context.headers["RateLimit-Reset"] = str(one_second) - return '{"error":"dummy"}' - - @pytest.fixture -def lister_under_test(): - return "gitlab" +def lister_gitlab(swh_scheduler): + url = "https://gitlab.com/api/v4/" + return GitLabLister(swh_scheduler, url=url) + + +# class GitLabListerTester(HttpListerTesterBase, unittest.TestCase): +# Lister = GitLabLister +# test_re = re.compile(r"^.*/projects.*page=(\d+).*") +# lister_subdir = "gitlab" +# good_api_response_file = "data/gitlab.com/api_response.json" +# bad_api_response_file = "data/gitlab.com/api_empty_response.json" +# first_index = 1 +# entries_per_page = 10 +# convert_type = int + +# def response_headers(self, request): +# headers = {"RateLimit-Remaining": "1"} +# if self.request_index(request) == self.first_index: +# headers.update( +# {"x-next-page": "3",} +# ) + +# return headers + +# def mock_rate_quota(self, n, request, context): +# self.rate_limit += 1 +# context.status_code = 403 +# context.headers["RateLimit-Remaining"] = "0" +# one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) +# context.headers["RateLimit-Reset"] = str(one_second) +# return '{"error":"dummy"}' def test_lister_gitlab(lister_gitlab, requests_mock_datadir): - lister_gitlab.run() - - r = lister_gitlab.scheduler.search_tasks(task_type="load-git") - assert len(r) == 10 - - for row in r: - assert row["type"] == "load-git" - # 
arguments check - args = row["arguments"]["args"] - assert len(args) == 0 - - # kwargs - kwargs = row["arguments"]["kwargs"] - url = kwargs["url"] - assert url.startswith("https://gitlab.com") - - assert row["policy"] == "recurring" - assert row["priority"] is None + listed_result = lister_gitlab.run() + assert listed_result == ListerStats(pages=1, origins=10) + + scheduler_origins = lister_gitlab.scheduler.get_listed_origins( + lister_gitlab.lister_obj.id + ).origins + assert len(scheduler_origins) == 10 + + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith("https://gitlab.com") + + +@pytest.mark.parametrize( + "url,expected_result", + [ + (None, None), + ("http://dummy/?query=1", None), + ("http://dummy/?foo=bar&page=1&some=result", 1), + ("http://dummy/?foo=bar&page=&some=result", None), + ], +) +def test__parse_page_id(url, expected_result): + assert _parse_page_id(url) == expected_result diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -48,7 +48,6 @@ db_url = init_db().url() listers = { - "gitlab": "https://other.gitlab.uni/api/v4/", "cgit": "https://some.where/cgit", }