diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py index f9da09d..be88333 100644 --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -1,144 +1,203 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import urljoin +from urllib.parse import parse_qs, urljoin, urlparse import iso8601 import requests from tenacity.before_sleep import before_sleep_log from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) -# Aliasing page results returned by `GogsLister.get_pages` method -GogsListerPage = List[Dict[str, Any]] +Repo = Dict[str, Any] -class GogsLister(StatelessLister[GogsListerPage]): +@dataclass +class GogsListerPage: + repos: Optional[List[Repo]] = None + next_link: Optional[str] = None + + +@dataclass +class GogsListerState: + last_seen_next_link: Optional[str] = None + """Last link header (could be already visited) during an incremental pass.""" + last_seen_repo_id: Optional[int] = None + """Last repo id seen during an incremental pass.""" + + +def _parse_page_id(url: Optional[str]) -> int: + """Parse the page id from a Gogs page url.""" + if url is None: + return 0 + + return int(parse_qs(urlparse(url).query)["page"][0]) + + +class GogsLister(Lister[GogsListerState, GogsListerPage]): """List origins from the Gogs Gogs API documentation: https://github.com/gogs/docs-api The API is protected behind authentication so credentials/API tokens are mandatory. It supports pagination and provides next page URL through the 'next' value of the 'Link' header. The default value for page size ('limit') is 10 but the maximum allowed value is 50. """ LISTER_NAME = "gogs" VISIT_TYPE = "git" REPO_LIST_PATH = "repos/search" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, api_token: Optional[str] = None, page_size: int = 50, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.query_params = { "limit": page_size, - "page": 1, } self.api_token = api_token if self.api_token is None: if len(self.credentials) > 0: cred = random.choice(self.credentials) username = cred.get("username") self.api_token = cred["password"] logger.warning( "Using authentication credentials from user %s", username or "???" ) else: raise ValueError("No credentials or API token provided") self.max_page_limit = 2 self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, "Authorization": f"token {self.api_token}", } ) + def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState: + return GogsListerState(**d) + + def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: + return asdict(state) + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url, params) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response @classmethod - def results_simplified(cls, body: Dict[str, GogsListerPage]) -> GogsListerPage: + def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: fields_filter = ["id", "clone_url", "updated_at"] return [{k: r[k] for k in fields_filter} for r in body["data"]] def get_pages(self) -> Iterator[GogsListerPage]: - # base with trailing slash, path without leading slash for urljoin - url = urljoin(self.url, self.REPO_LIST_PATH) - response = self.page_request(url, self.query_params) + page_id = 1 + if self.state.last_seen_next_link is not None: + page_id = _parse_page_id(self.state.last_seen_next_link) - while True: - page_results = self.results_simplified(response.json()) + # base with trailing slash, path without leading slash for urljoin + next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) + response = self.page_request(next_link, {**self.query_params, "page": page_id}) - yield page_results + while next_link is not None: + repos = self.extract_repos(response.json()) assert len(response.links) > 0, "API changed: no Link header found" if "next" in response.links: - url = response.links["next"]["url"] + next_link = response.links["next"]["url"] else: - break + next_link = None # Happens for the last page + + yield GogsListerPage(repos=repos, next_link=next_link) - response = self.page_request(url, {}) + if next_link is not None: + response = self.page_request(next_link, {}) def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: """Convert a page of Gogs repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None + assert page.repos is not None - for repo in page: - last_update = iso8601.parse_date(repo["updated_at"]) + for r in page.repos: + last_update = iso8601.parse_date(r["updated_at"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=repo["clone_url"], + url=r["clone_url"], last_update=last_update, ) + + def commit_page(self, page: GogsListerPage) -> None: + last_seen_next_link = page.next_link + + page_id = _parse_page_id(last_seen_next_link) + state_page_id = _parse_page_id(self.state.last_seen_next_link) + + if page_id > state_page_id: + self.state.last_seen_next_link = last_seen_next_link + + if (page.repos is not None) and len(page.repos) > 0: + self.state.last_seen_repo_id = page.repos[-1]["id"] + + def finalize(self) -> None: + scheduler_state = self.get_state_from_scheduler() + + state_page_id = _parse_page_id(self.state.last_seen_next_link) + scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link) + + state_last_repo_id = self.state.last_seen_repo_id or 0 + scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0 + + if (state_page_id >= scheduler_page_id) and ( + state_last_repo_id > scheduler_last_repo_id + ): + self.updated = True # Marked updated only if it finds new repos diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 new file mode 100644 index 0000000..5a1ac12 --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 @@ -0,0 +1,168 @@ +{ + "data": [ + { + "id": 340, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "zork/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1436672, + "html_url": "https://try.gogs.io/zork/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:51:12Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 350, + "owner": { + "id": 599, + "username": "perekre", + "login": "perekre", + "full_name": "", + "email": "perekre@nincsmail.com", + "avatar_url": "https://secure.gravatar.com/avatar/0e2666adf16f8a958a56141a2d94565c?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "perekre/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": true, + "parent": { + "id": 340, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "zork/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1436672, + "html_url": "https://try.gogs.io/zork/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:51:12Z", + "updated_at": "2022-03-26T07:28:38Z", + "permissions": { + "admin": false, + "push": false, + "pull": true + } + }, + "empty": false, + "mirror": false, + "size": 1437696, + "html_url": "https://try.gogs.io/perekre/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:perekre/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/perekre/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-04T10:40:46Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 369, + "owner": { + "id": 108, + "username": "yinheli", + "login": "yinheli", + "full_name": "", + "email": "me@yinheli.com", + "avatar_url": "https://secure.gravatar.com/avatar/dedb067ecae8155b87428ac7920dd0ae?d=identicon" + }, + "name": "digits", + "full_name": "yinheli/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": true, + "parent": { + "id": 339, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "digits", + "full_name": "zork/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/zork/digits", + "ssh_url": "git@try.gogs.io:zork/digits.git", + "clone_url": "https://try.gogs.io/zork/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:47:56Z", + "updated_at": "2022-03-26T07:28:38Z", + "permissions": { + "admin": false, + "push": false, + "pull": true + } + }, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/yinheli/digits", + "ssh_url": "git@try.gogs.io:yinheli/digits.git", + "clone_url": "https://try.gogs.io/yinheli/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-06T01:31:17Z", + "updated_at": "2022-03-26T07:28:38Z" + } + ], + "ok": true +} diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 new file mode 100644 index 0000000..ad2c034 --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 @@ -0,0 +1 @@ +{"data":[{"id":380,"owner":{"id":653,"username":"gdr","login":"gdr","full_name":"","email":"gdr@gdr.name","avatar_url":"https://secure.gravatar.com/avatar/237e2bf0a3687301ed4ef3c65e56c672?d=identicon"},"name":"ttrss-af_nofacebook","full_name":"gdr/ttrss-af_nofacebook","description":"Tiny Tiny RSS plugin for removing Facebook's l.php links","private":false,"fork":false,"parent":null,"empty":false,"mirror":true,"size":4096,"html_url":"https://try.gogs.io/gdr/ttrss-af_nofacebook","ssh_url":"git@try.gogs.io:gdr/ttrss-af_nofacebook.git","clone_url":"https://try.gogs.io/gdr/ttrss-af_nofacebook.git","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"default_branch":"master","created_at":"2015-03-06T11:39:30Z","updated_at":"2022-03-26T07:28:38Z"},{"id":399,"owner":{"id":683,"username":"brejoc","login":"brejoc","full_name":"","email":"brejoc@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/80674ca691e4a325d8bff1977a1d881d?d=identicon"},"name":"gosgp","full_name":"brejoc/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":false,"parent":null,"empty":false,"mirror":false,"size":35840,"html_url":"https://try.gogs.io/brejoc/gosgp","ssh_url":"git@try.gogs.io:brejoc/gosgp.git","clone_url":"https://try.gogs.io/brejoc/gosgp.git","website":"","stars_count":0,"forks_count":2,"watchers_count":1,"open_issues_count":1,"default_branch":"master","created_at":"2015-03-09T12:15:24Z","updated_at":"2022-03-26T07:28:38Z"},{"id":403,"owner":{"id":687,"username":"mirdhyn","login":"mirdhyn","full_name":"","email":"mirdhyn@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/047818f3fffe0df833958ea40a25fd5c?d=identicon"},"name":"gosgp","full_name":"mirdhyn/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":true,"parent":{"id":399,"owner":{"id":683,"username":"brejoc","login":"brejoc","full_name":"","email":"brejoc@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/80674ca691e4a325d8bff1977a1d881d?d=identicon"},"name":"gosgp","full_name":"brejoc/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":false,"parent":null,"empty":false,"mirror":false,"size":35840,"html_url":"https://try.gogs.io/brejoc/gosgp","ssh_url":"git@try.gogs.io:brejoc/gosgp.git","clone_url":"https://try.gogs.io/brejoc/gosgp.git","website":"","stars_count":0,"forks_count":2,"watchers_count":1,"open_issues_count":1,"default_branch":"master","created_at":"2015-03-09T12:15:24Z","updated_at":"2022-03-26T07:28:38Z","permissions":{"admin":false,"push":false,"pull":true}},"empty":false,"mirror":false,"size":48128,"html_url":"https://try.gogs.io/mirdhyn/gosgp","ssh_url":"git@try.gogs.io:mirdhyn/gosgp.git","clone_url":"https://try.gogs.io/mirdhyn/gosgp.git","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"default_branch":"master","created_at":"2015-03-09T21:34:21Z","updated_at":"2022-03-26T07:28:38Z"}],"ok":true} \ No newline at end of file diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py index 62326b5..b9f48a9 100644 --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -1,163 +1,329 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import List from unittest.mock import Mock import pytest from requests import HTTPError -from swh.lister.gogs.lister import GogsLister +from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id from swh.scheduler.model import ListedOrigin TRY_GOGS_URL = "https://try.gogs.io/api/v1/" def try_gogs_page(n: int): - return TRY_GOGS_URL + f"repos/search?page={n}&limit=3" + return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3" + + +P1 = try_gogs_page(1) +P2 = try_gogs_page(2) +P3 = try_gogs_page(3) +P4 = try_gogs_page(4) @pytest.fixture def trygogs_p1(datadir): text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text() - headers = { - "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=try_gogs_page(2)) - } - page_result = GogsLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + headers = {"Link": f'<{P2}>; rel="next"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P2 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] return text, headers, page_result, origin_urls @pytest.fixture def trygogs_p2(datadir): text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text() - headers = { - "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) - } - page_result = GogsLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + headers = {"Link": f'<{P3}>; rel="next",<{P1}>; rel="prev"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p3(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text() + headers = {"Link": f'<{P4}>; rel="next",<{P2}>; rel="prev"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p4(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page4").read_text() + headers = {"Link": f'<{P3}>; rel="prev"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p3_last(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text() + headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=None + ) + origin_urls = [r["clone_url"] for r in page_result.repos] return text, headers, page_result, origin_urls @pytest.fixture -def trygogs_empty_page(): +def trygogs_p3_empty(): origins_urls = [] - page_result = {"data": [], "ok": True} - headers = { - "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) - } - text = json.dumps(page_result) + body = {"data": [], "ok": True} + headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'} + page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None) + text = json.dumps(body) return text, headers, page_result, origins_urls def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs. Does not test last_update.""" sorted_lister_urls = list(sorted(lister_urls)) sorted_scheduler_origins = list(sorted(scheduler_origins)) assert len(sorted_lister_urls) == len(sorted_scheduler_origins) for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): assert l_url == s_origin.url def test_gogs_full_listing( - swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_empty_page + swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3_last ): kwargs = dict( url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" ) lister = GogsLister(scheduler=swh_scheduler, **kwargs) lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 - p3_text, p3_headers, _, _ = trygogs_empty_page + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last - requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers) - requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers) - requests_mock.get(try_gogs_page(3), text=p3_text, headers=p3_headers) + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) stats = lister.run() - assert stats.pages == 2 - assert stats.origins == 6 + assert stats.pages == 3 + assert stats.origins == 9 - calls = [mocker.call(p1_result), mocker.call(p2_result)] - lister.get_origins_from_page.assert_has_calls(calls) + calls = map(mocker.call, [p1_result, p2_result, p3_result]) + lister.get_origins_from_page.assert_has_calls(list(calls)) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins + ) - assert lister.get_state_from_scheduler() is None + assert ( + lister.get_state_from_scheduler().last_seen_next_link == P3 + ) # P3 didn't provide any next link so it remains the last_seen_next_link def test_gogs_auth_instance( - swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_page + swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_p3_empty ): """Covers token authentication, token from credentials, instance inference from URL.""" api_token = "secret" - instance = "try.gogs.io" - creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} + instance = "try_gogs" - kwargs1 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) - lister = GogsLister(scheduler=swh_scheduler, **kwargs1) + # Test lister initialization without api_token or credentials: + with pytest.raises(ValueError, match="No credentials or API token provided"): + kwargs1 = dict(url=TRY_GOGS_URL, instance=instance) + GogsLister(scheduler=swh_scheduler, **kwargs1) - # test API token - assert "Authorization" in lister.session.headers + # Test lister initialization using api_token: + kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) + lister = GogsLister(scheduler=swh_scheduler, **kwargs2) assert lister.session.headers["Authorization"].lower() == "token %s" % api_token - with pytest.raises(ValueError, match="No credentials or API token provided"): - kwargs2 = dict(url=TRY_GOGS_URL, instance=instance) - GogsLister(scheduler=swh_scheduler, **kwargs2) - + # Test lister initialization with credentials and run it: + creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3) lister = GogsLister(scheduler=swh_scheduler, **kwargs3) - - # test API token from credentials - assert "Authorization" in lister.session.headers assert lister.session.headers["Authorization"].lower() == "token %s" % api_token - - # test instance inference from URL - assert lister.instance - assert "gogs" in lister.instance + assert lister.instance == "try_gogs" # setup requests mocking p1_text, p1_headers, _, _ = trygogs_p1 - p2_text, p2_headers, _, _ = trygogs_empty_page + p2_text, p2_headers, _, _ = trygogs_p2 + p3_text, p3_headers, _, _ = trygogs_p3_empty - base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH - requests_mock.get(base_url, text=p1_text, headers=p1_headers) - requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers) - # now check the lister runs without error - stats = lister.run() + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) - assert stats.pages == 2 - assert stats.origins == 3 + # lister should run without any error and extract the origins + stats = lister.run() + assert stats.pages == 3 + assert stats.origins == 6 @pytest.mark.parametrize("http_code", [400, 500, 502]) -def test_gogs_list_http_error(swh_scheduler, requests_mock, http_code): +def test_gogs_list_http_error( + swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last +): """Test handling of some HTTP errors commonly encountered""" lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") + p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 + p3_text, p3_headers, _, _ = trygogs_p3_last + base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH - requests_mock.get(base_url, status_code=http_code) + requests_mock.get( + base_url, + [ + {"text": p1_text, "headers": p1_headers, "status_code": 200}, + {"status_code": http_code}, + {"text": p3_text, "headers": p3_headers, "status_code": 200}, + ], + ) with pytest.raises(HTTPError): lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 0 + check_listed_origins( + p1_origin_urls, scheduler_origins + ) # Only the first page is listed + + +def test_gogs_incremental_lister( + swh_scheduler, + requests_mock, + mocker, + trygogs_p1, + trygogs_p2, + trygogs_p3, + trygogs_p3_last, + trygogs_p3_empty, + trygogs_p4, +): + kwargs = dict( + url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" + ) + lister = GogsLister(scheduler=swh_scheduler, **kwargs) + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + # First listing attempt: P1 and P2 return 3 origins each + # while P3 (current last page) is empty. + + p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 + p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_empty + + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + attempt1_stats = lister.run() + assert attempt1_stats.pages == 3 + assert attempt1_stats.origins == 6 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"] + assert lister.updated + + check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) + + lister.updated = False # Reset the flag + + # Second listing attempt: P3 isn't empty anymore. + # The lister should restart from last state and hence revisit P3. + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + lister.session.get = mocker.spy(lister.session, "get") + + attempt2_stats = lister.run() + + assert attempt2_stats.pages == 1 + assert attempt2_stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + page_id = _parse_page_id(lister_state.last_seen_next_link) + query_params = lister.query_params + query_params["page"] = page_id + + lister.session.get.assert_called_once_with( + TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params + ) + + # All the 9 origins (3 pages) should be passed on to the scheduler: + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins + ) + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] + assert lister.updated + + lister.updated = False # Reset the flag + + # Third listing attempt: No new origins + # The lister should revisit last seen page (P3) + attempt3_stats = lister.run() + + assert attempt3_stats.pages == 1 + assert attempt3_stats.origins == 3 + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] + assert lister.updated is False # No new origins so state isn't updated. + + # Fourth listing attempt: Page 4 is introduced and returns 3 new origins + # The lister should revisit last seen page (P3) as well as P4. + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 # new P3 points to P4 + p4_text, p4_headers, p4_result, p4_origin_urls = trygogs_p4 + + requests_mock.get(P3, text=p3_text, headers=p3_headers) + requests_mock.get(P4, text=p4_text, headers=p4_headers) + + attempt4_stats = lister.run() + + assert attempt4_stats.pages == 2 + assert attempt4_stats.origins == 6 + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P4 + assert lister_state.last_seen_repo_id == p4_result.repos[-1]["id"] + assert lister.updated + + # All the 12 origins (4 pages) should be passed on to the scheduler: + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls + p4_origin_urls, + scheduler_origins, + )