diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index cb510a4..6e69a4b 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,202 +1,207 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging from typing import Any, Dict, Iterator, Optional, Tuple from urllib.parse import parse_qs, urlparse import requests from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log from urllib3.util import parse_url from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister from swh.lister.utils import retry_attempt, throttling_retry from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) @dataclass class GitLabListerState: """State of the GitLabLister""" last_seen_next_link: Optional[str] = None """Last link header (not visited yet) during an incremental pass """ Repository = Dict[str, Any] @dataclass class PageResult: """Result from a query to a gitlab project api page.""" repositories: Optional[Tuple[Repository, ...]] = None next_page: Optional[str] = None def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate for handling HTTP responses with status code 403 with specific ratelimit header. """ attempt = retry_attempt(retry_state) if attempt.failed: exc = attempt.exception() return ( isinstance(exc, HTTPError) and exc.response.status_code == codes.forbidden and int(exc.response.headers.get("RateLimit-Remaining", "0")) == 0 ) return False def _parse_page_id(url: Optional[str]) -> Optional[int]: """Given an url, extract a return the 'page' query parameter associated value or None. 
""" if not url: return None # link: https://${project-api}/?...&page=2x... query_data = parse_qs(urlparse(url).query) page = query_data.get("page") if page and len(page) > 0: return int(page[0]) return None class GitLabLister(Lister[GitLabListerState, PageResult]): """List origins for a gitlab instance. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_next_link` stored in the scheduler backend. """ LISTER_NAME = "gitlab" def __init__( self, scheduler, url=None, instance=None, credentials: CredentialsType = None, incremental: bool = False, ): if instance is None: instance = parse_url(url).host super().__init__( scheduler=scheduler, credentials=None, # anonymous for now url=url, instance=instance, ) self.incremental = incremental + self.last_page: Optional[str] = None self.session = requests.Session() self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: return GitLabListerState(**d) def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: logger.debug("Fetching URL %s", url) response = self.session.get(url) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() repositories: Tuple[Repository, ...] 
= tuple(response.json()) if hasattr(response, "links") and response.links.get("next"): next_page = response.links["next"]["url"] else: next_page = None return PageResult(repositories, next_page) def get_pages(self) -> Iterator[PageResult]: next_page: Optional[str] - if self.incremental and self.state is not None: + if self.incremental and self.state and self.state.last_seen_next_link: next_page = self.state.last_seen_next_link else: next_page = f"{self.url}projects?page=1&order_by=id&sort=asc&per_page=20" while next_page: + self.last_page = next_page page_result = self.get_page_result(next_page) yield page_result next_page = page_result.next_page def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None repositories = page_result.repositories if page_result.repositories else [] for repo in repositories: yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["http_url_to_repo"], visit_type="git", # TODO: Support "last_activity_at" as update information # last_update=repo["last_activity_at"], ) def commit_page(self, page_result: PageResult) -> None: """Update currently stored state using the latest listed "next" page if relevant. Relevancy is determined by the next_page link whose 'page' id must be strictly superior to the currently stored one. Note: this is a noop for full listing mode """ if self.incremental: # link: https://${project-api}/?...&page=2x... 
next_page = page_result.next_page + if not next_page and self.last_page: + next_page = self.last_page if next_page: page_id = _parse_page_id(next_page) previous_next_page = self.state.last_seen_next_link previous_page_id = _parse_page_id(previous_next_page) + if previous_next_page is None or ( previous_page_id and page_id and previous_page_id < page_id ): self.state.last_seen_next_link = next_page def finalize(self) -> None: """finalize the lister state when relevant (see `fn:commit_page` for details) Note: this is a noop for full listing mode """ next_page = self.state.last_seen_next_link if self.incremental and next_page: # link: https://${project-api}/?...&page=2x... next_page_id = _parse_page_id(next_page) scheduler_state = self.get_state_from_scheduler() previous_next_page_id = _parse_page_id(scheduler_state.last_seen_next_link) - if ( + if (not previous_next_page_id and next_page_id) or ( previous_next_page_id and next_page_id and previous_next_page_id < next_page_id ): self.updated = True diff --git a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json new file mode 100644 index 0000000..cbcff78 --- /dev/null +++ b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json @@ -0,0 +1,22 @@ +[ + { + "avatar_url": null, + "created_at": "2020-11-18T18:26:08.538Z", + "default_branch": "master", + "description": "PID WRapper for urdfdom and urdfdom_headers project, provided by ROS but independent from ROS.", + "forks_count": 0, + "http_url_to_repo": "https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom.git", + "id": 4363, + "last_activity_at": "2020-11-19T08:56:18.573Z", + "name": "urdfdom", + "name_with_namespace": "rob-miscellaneous / wrappers / urdfdom", + "namespace": {}, + "path": "urdfdom", + "path_with_namespace": "rob-miscellaneous/wrappers/urdfdom", + "readme_url": 
"https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom/-/blob/master/README.md", + "ssh_url_to_repo": "git@gite.lirmm.fr:rob-miscellaneous/wrappers/urdfdom.git", + "star_count": 0, + "tag_list": [], + "web_url": "https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom" + } +] diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py index 9155291..164a712 100644 --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -1,115 +1,176 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging from pathlib import Path from typing import Dict, List import pytest from swh.lister import USER_AGENT from swh.lister.gitlab.lister import GitLabLister, _parse_page_id from swh.lister.pattern import ListerStats logger = logging.getLogger(__name__) def api_url(instance: str) -> str: return f"https://{instance}/api/v4/" def url_page(api_url: str, page_id: int) -> str: return f"{api_url}projects?page={page_id}&order_by=id&sort=asc&per_page=20" def _match_request(request): return request.headers.get("User-Agent") == USER_AGENT def test_lister_gitlab(datadir, swh_scheduler, requests_mock): """Gitlab lister supports full listing """ instance = "gitlab.com" url = api_url(instance) response = gitlab_page_response(datadir, instance, 1) requests_mock.get( url_page(url, 1), [{"json": response}], additional_matcher=_match_request, ) lister_gitlab = GitLabLister( swh_scheduler, url=api_url(instance), instance=instance ) listed_result = lister_gitlab.run() expected_nb_origins = len(response) assert listed_result == ListerStats(pages=1, origins=expected_nb_origins) scheduler_origins = lister_gitlab.scheduler.get_listed_origins( lister_gitlab.lister_obj.id ).origins assert len(scheduler_origins) == 
expected_nb_origins for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(f"https://{instance}") def gitlab_page_response(datadir, instance: str, page_id: int) -> List[Dict]: """Return list of repositories (out of test dataset)""" datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_id}.json") return json.loads(datapath.read_text()) if datapath.exists else [] def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir): """Gitlab lister supports pagination """ instance = "gite.lirmm.fr" url = api_url(instance) response1 = gitlab_page_response(datadir, instance, 1) response2 = gitlab_page_response(datadir, instance, 2) requests_mock.get( url_page(url, 1), [{"json": response1, "headers": {"Link": f"<{url_page(url, 2)}>; rel=next"}}], additional_matcher=_match_request, ) requests_mock.get( url_page(url, 2), [{"json": response2}], additional_matcher=_match_request, ) lister = GitLabLister(swh_scheduler, url=url) listed_result = lister.run() expected_nb_origins = len(response1) + len(response2) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).origins assert len(scheduler_origins) == expected_nb_origins for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(f"https://{instance}") +def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir): + """Gitlab lister supports incremental listing, resuming from the last seen next link + + """ + instance = "gite.lirmm.fr" + url = api_url(instance) + + url_page1 = url_page(url, 1) + response1 = gitlab_page_response(datadir, instance, 1) + url_page2 = url_page(url, 2) + response2 = gitlab_page_response(datadir, instance, 2) + url_page3 = url_page(url, 3) + response3 = gitlab_page_response(datadir, instance, 3) + + requests_mock.get( + url_page1, + [{"json": response1, "headers": {"Link": f"<{url_page2}>; 
rel=next"}}], + additional_matcher=_match_request, + ) + requests_mock.get( + url_page2, [{"json": response2}], additional_matcher=_match_request, + ) + + lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) + listed_result = lister.run() + + expected_nb_origins = len(response1) + len(response2) + assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) + assert lister.state.last_seen_next_link == url_page2 + + lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) + requests_mock.reset() + # Lister will start back at the last stop + requests_mock.get( + url_page2, + [{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}], + additional_matcher=_match_request, + ) + requests_mock.get( + url_page3, [{"json": response3}], additional_matcher=_match_request, + ) + + listed_result2 = lister2.run() + + assert listed_result2 == ListerStats( + pages=2, origins=len(response2) + len(response3) + ) + assert lister2.state.last_seen_next_link == url_page3 + + assert lister.lister_obj.id == lister2.lister_obj.id + scheduler_origins = lister2.scheduler.get_listed_origins( + lister2.lister_obj.id + ).origins + + assert len(scheduler_origins) == len(response1) + len(response2) + len(response3) + + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith(f"https://{instance}") + + @pytest.mark.parametrize( "url,expected_result", [ (None, None), ("http://dummy/?query=1", None), ("http://dummy/?foo=bar&page=1&some=result", 1), ("http://dummy/?foo=bar&page=&some=result", None), ], ) def test__parse_page_id(url, expected_result): assert _parse_page_id(url) == expected_result