diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py
--- a/swh/lister/gogs/lister.py
+++ b/swh/lister/gogs/lister.py
@@ -3,10 +3,11 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from dataclasses import asdict, dataclass
 import logging
 import random
 from typing import Any, Dict, Iterator, List, Optional
-from urllib.parse import urljoin
+from urllib.parse import parse_qs, urljoin, urlparse
 
 import iso8601
 import requests
@@ -17,15 +18,38 @@
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
-# Aliasing page results returned by `GogsLister.get_pages` method
-GogsListerPage = List[Dict[str, Any]]
+Repo = Dict[str, Any]
 
 
-class GogsLister(StatelessLister[GogsListerPage]):
+@dataclass
+class GogsListerPage:
+    """One page of the repository listing plus the link to the page after it."""
+
+    repos: Optional[List[Repo]] = None
+    next_link: Optional[str] = None
+
+
+@dataclass
+class GogsListerState:
+    """Lister state; records how far the paginated listing has progressed."""
+
+    last_seen_next_link: Optional[str] = None
+    """Last link header (not visited yet) during an incremental pass."""
+
+
+def _parse_page_id(url: Optional[str]) -> int:
+    """Parse the page id from a Gogs page url (0 if url is None)."""
+    if url is None:
+        return 0
+
+    return int(parse_qs(urlparse(url).query)["page"][0])
+
+
+class GogsLister(Lister[GogsListerState, GogsListerPage]):
 
     """List origins from the Gogs
 
@@ -61,7 +85,6 @@
 
         self.query_params = {
             "limit": page_size,
-            "page": 1,
         }
 
         self.api_token = api_token
@@ -88,6 +111,14 @@
             }
         )
 
+    def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState:
+        """Rebuild the lister state from its dict form."""
+        return GogsListerState(**d)
+
+    def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
+        """Convert the lister state to a plain dict."""
+        return asdict(state)
+
     @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
     def page_request(self, url, params) -> requests.Response:
 
@@ -107,38 +138,65 @@
         return response
 
     @classmethod
-    def results_simplified(cls, body: Dict[str, GogsListerPage]) -> GogsListerPage:
+    def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]:
+        """Reduce each repo in the API response body to the fields we use."""
         fields_filter = ["id", "clone_url", "updated_at"]
         return [{k: r[k] for k in fields_filter} for r in body["data"]]
 
     def get_pages(self) -> Iterator[GogsListerPage]:
-        # base with trailing slash, path without leading slash for urljoin
-        url = urljoin(self.url, self.REPO_LIST_PATH)
-        response = self.page_request(url, self.query_params)
+        """Yield pages of repositories, resuming from the saved state if any."""
+        page_id = 1
+        if self.state.last_seen_next_link is not None:
+            page_id = _parse_page_id(self.state.last_seen_next_link)
 
-        while True:
-            page_results = self.results_simplified(response.json())
+        # base with trailing slash, path without leading slash for urljoin
+        next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH)
+        response = self.page_request(next_link, {**self.query_params, "page": page_id})
 
-            yield page_results
+        while next_link is not None:
+            repos = self.extract_repos(response.json())
 
             assert len(response.links) > 0, "API changed: no Link header found"
             if "next" in response.links:
-                url = response.links["next"]["url"]
+                next_link = response.links["next"]["url"]
             else:
-                break
+                next_link = None
+
+            yield GogsListerPage(repos=repos, next_link=next_link)
 
-            response = self.page_request(url, {})
+            if next_link is not None:
+                response = self.page_request(next_link, {})
 
     def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]:
         """Convert a page of Gogs repositories into a list of ListedOrigins"""
         assert self.lister_obj.id is not None
+        if page.repos is None:
+            return  # no repos on this page; end the generator without yielding
 
-        for repo in page:
-            last_update = iso8601.parse_date(repo["updated_at"])
+        for r in page.repos:
+            last_update = iso8601.parse_date(r["updated_at"])
 
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 visit_type=self.VISIT_TYPE,
-                url=repo["clone_url"],
+                url=r["clone_url"],
                 last_update=last_update,
             )
+
+    def commit_page(self, page: GogsListerPage) -> None:
+        """Advance the saved next link when this page goes further than the state."""
+        last_seen_next_link = page.next_link
+
+        if _parse_page_id(last_seen_next_link) > _parse_page_id(
+            self.state.last_seen_next_link
+        ):
+            self.state.last_seen_next_link = last_seen_next_link
+
+    def finalize(self) -> None:
+        """Mark the state as updated if it advanced past what the scheduler holds."""
+        scheduler_state = self.get_state_from_scheduler()
+
+        if _parse_page_id(self.state.last_seen_next_link) > _parse_page_id(
+            scheduler_state.last_seen_next_link
+        ):
+            self.updated = True
diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py
--- a/swh/lister/gogs/tests/test_lister.py
+++ b/swh/lister/gogs/tests/test_lister.py
@@ -11,7 +11,7 @@
 import pytest
 from requests import HTTPError
 
-from swh.lister.gogs.lister import GogsLister
+from swh.lister.gogs.lister import GogsLister, GogsListerPage
 from swh.scheduler.model import ListedOrigin
 
 TRY_GOGS_URL = "https://try.gogs.io/api/v1/"
@@ -27,8 +27,10 @@
     headers = {
         "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=try_gogs_page(2))
     }
-    page_result = GogsLister.results_simplified(json.loads(text))
-    origin_urls = [r["clone_url"] for r in page_result]
+    page_result = GogsListerPage(
+        repos=GogsLister.extract_repos(json.loads(text)), next_link=try_gogs_page(2)
+    )
+    origin_urls = [r["clone_url"] for r in page_result.repos]
     return text, headers, page_result, origin_urls
 
 
@@ -38,19 +40,22 @@
     headers = {
         "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1))
     }
-    page_result = GogsLister.results_simplified(json.loads(text))
-    origin_urls = [r["clone_url"] for r in page_result]
+    page_result = GogsListerPage(
+        repos=GogsLister.extract_repos(json.loads(text)), next_link=None
+    )
+    origin_urls = [r["clone_url"] for r in page_result.repos]
     return text, headers, page_result, origin_urls
 
 
 @pytest.fixture
-def trygogs_empty_page():
+def trygogs_empty_p2():
     origins_urls = []
-    page_result = {"data": [], "ok": True}
+    body = {"data": [], "ok": True}
     headers = {
         "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1))
     }
-    text = json.dumps(page_result)
+    page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None)
+    text = json.dumps(body)
     return text, headers, page_result, origins_urls
 
 
@@ -69,7 +74,7 @@
 
 
 def test_gogs_full_listing(
-    swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_empty_page
+    swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2
 ):
     kwargs = dict(
         url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret"
@@ -80,11 +85,9 @@
 
     p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1
     p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2
-    p3_text, p3_headers, _, _ = trygogs_empty_page
 
     requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers)
     requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers)
-    requests_mock.get(try_gogs_page(3), text=p3_text, headers=p3_headers)
 
     stats = lister.run()
 
@@ -98,51 +101,42 @@
 
     check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
 
-    assert lister.get_state_from_scheduler() is None
+    assert lister.get_state_from_scheduler().last_seen_next_link == try_gogs_page(2)
 
 
-def test_gogs_auth_instance(
-    swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_page
-):
+def test_gogs_auth_instance(swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_p2):
     """Covers token authentication, token from credentials,
     instance inference from URL."""
 
     api_token = "secret"
-    instance = "try.gogs.io"
-    creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
+    instance = "try_gogs"
 
-    kwargs1 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance)
-    lister = GogsLister(scheduler=swh_scheduler, **kwargs1)
+    # Test lister initialization without api_token or credentials:
+    with pytest.raises(ValueError, match="No credentials or API token provided"):
+        kwargs1 = dict(url=TRY_GOGS_URL, instance=instance)
+        GogsLister(scheduler=swh_scheduler, **kwargs1)
 
-    # test API token
-    assert "Authorization" in lister.session.headers
+    # Test lister initialization using api_token:
+    kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance)
+    lister = GogsLister(scheduler=swh_scheduler, **kwargs2)
     assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
 
-    with pytest.raises(ValueError, match="No credentials or API token provided"):
-        kwargs2 = dict(url=TRY_GOGS_URL, instance=instance)
-        GogsLister(scheduler=swh_scheduler, **kwargs2)
-
+    # Test lister initialization with credentials:
+    creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
     kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3)
     lister = GogsLister(scheduler=swh_scheduler, **kwargs3)
-
-    # test API token from credentials
-    assert "Authorization" in lister.session.headers
     assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
-
-    # test instance inference from URL
-    assert lister.instance
-    assert "gogs" in lister.instance
+    assert lister.instance == "try_gogs"
 
     # setup requests mocking
     p1_text, p1_headers, _, _ = trygogs_p1
-    p2_text, p2_headers, _, _ = trygogs_empty_page
+    p2_text, p2_headers, _, _ = trygogs_empty_p2
 
-    base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH
-    requests_mock.get(base_url, text=p1_text, headers=p1_headers)
+    requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers)
     requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers)
 
-    # now check the lister runs without error
-    stats = lister.run()
+    # lister should run without any error and extract the origins
+    stats = lister.run()
 
     assert stats.pages == 2
     assert stats.origins == 3