diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -3,10 +3,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import urljoin +from urllib.parse import parse_qs, urljoin, urlparse import iso8601 import requests @@ -17,15 +18,36 @@ from swh.scheduler.model import ListedOrigin from .. import USER_AGENT -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) -# Aliasing page results returned by `GogsLister.get_pages` method -GogsListerPage = List[Dict[str, Any]] +Repo = Dict[str, Any] -class GogsLister(StatelessLister[GogsListerPage]): +@dataclass +class GogsListerPage: + repos: Optional[List[Repo]] = None + next_link: Optional[str] = None + + +@dataclass +class GogsListerState: + last_seen_next_link: Optional[str] = None + """Last link header (could be already visited) during an incremental pass.""" + last_seen_repo_id: Optional[int] = None + """Last repo id seen during an incremental pass.""" + + +def _parse_page_id(url: Optional[str]) -> int: + """Parse the page id from a Gogs page url.""" + if url is None: + return 0 + + return int(parse_qs(urlparse(url).query)["page"][0]) + + +class GogsLister(Lister[GogsListerState, GogsListerPage]): """List origins from the Gogs @@ -61,7 +83,6 @@ self.query_params = { "limit": page_size, - "page": 1, } self.api_token = api_token @@ -88,6 +109,12 @@ } ) + def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState: + return GogsListerState(**d) + + def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: + return asdict(state) + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url, params) -> requests.Response: @@ -107,38 +134,70 @@ return response @classmethod - def results_simplified(cls, body: Dict[str, GogsListerPage]) -> GogsListerPage: + def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: fields_filter = ["id", "clone_url", "updated_at"] return [{k: r[k] for k in fields_filter} for r in body["data"]] def get_pages(self) -> Iterator[GogsListerPage]: - # base with trailing slash, path without leading slash for urljoin - url = urljoin(self.url, self.REPO_LIST_PATH) - response = self.page_request(url, self.query_params) + page_id = 1 + if self.state.last_seen_next_link is not None: + page_id = _parse_page_id(self.state.last_seen_next_link) - while True: - page_results = self.results_simplified(response.json()) + # base with trailing slash, path without leading slash for urljoin + next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) + response = self.page_request(next_link, {**self.query_params, "page": page_id}) - yield page_results + while next_link is not None: + repos = self.extract_repos(response.json()) assert len(response.links) > 0, "API changed: no Link header found" if "next" in response.links: - url = response.links["next"]["url"] + next_link = response.links["next"]["url"] else: - break + next_link = None # Happens for the last page + + yield GogsListerPage(repos=repos, next_link=next_link) - response = self.page_request(url, {}) + if next_link is not None: + response = self.page_request(next_link, {}) def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: """Convert a page of Gogs repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None + assert page.repos is not None - for repo in page: - last_update = iso8601.parse_date(repo["updated_at"]) + for r in page.repos: + last_update = iso8601.parse_date(r["updated_at"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=repo["clone_url"], + url=r["clone_url"], last_update=last_update, ) + + def commit_page(self, page: GogsListerPage) -> None: + last_seen_next_link = page.next_link + + page_id = _parse_page_id(last_seen_next_link) + state_page_id = _parse_page_id(self.state.last_seen_next_link) + + if page_id > state_page_id: + self.state.last_seen_next_link = last_seen_next_link + + if (page.repos is not None) and len(page.repos) > 0: + self.state.last_seen_repo_id = page.repos[-1]["id"] + + def finalize(self) -> None: + scheduler_state = self.get_state_from_scheduler() + + state_page_id = _parse_page_id(self.state.last_seen_next_link) + scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link) + + state_last_repo_id = self.state.last_seen_repo_id or 0 + scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0 + + if (state_page_id >= scheduler_page_id) and ( + state_last_repo_id > scheduler_last_repo_id + ): + self.updated = True # Marked updated only if it finds new repos diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 new file mode 100644 --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 @@ -0,0 +1,168 @@ +{ + "data": [ + { + "id": 340, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "zork/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1436672, + "html_url": "https://try.gogs.io/zork/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:51:12Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 350, + "owner": { + "id": 599, + "username": "perekre", + "login": "perekre", + "full_name": "", + "email": "perekre@nincsmail.com", + "avatar_url": "https://secure.gravatar.com/avatar/0e2666adf16f8a958a56141a2d94565c?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "perekre/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": true, + "parent": { + "id": 340, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "zork/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1436672, + "html_url": "https://try.gogs.io/zork/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:51:12Z", + "updated_at": "2022-03-26T07:28:38Z", + "permissions": { + "admin": false, + "push": false, + "pull": true + } + }, + "empty": false, + "mirror": false, + "size": 1437696, + "html_url": "https://try.gogs.io/perekre/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:perekre/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/perekre/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-04T10:40:46Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 369, + "owner": { + "id": 108, + "username": "yinheli", + "login": "yinheli", + "full_name": "", + "email": "me@yinheli.com", + "avatar_url": "https://secure.gravatar.com/avatar/dedb067ecae8155b87428ac7920dd0ae?d=identicon" + }, + "name": "digits", + "full_name": "yinheli/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": true, + "parent": { + "id": 339, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "digits", + "full_name": "zork/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/zork/digits", + "ssh_url": "git@try.gogs.io:zork/digits.git", + "clone_url": "https://try.gogs.io/zork/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:47:56Z", + "updated_at": "2022-03-26T07:28:38Z", + "permissions": { + "admin": false, + "push": false, + "pull": true + } + }, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/yinheli/digits", + "ssh_url": "git@try.gogs.io:yinheli/digits.git", + "clone_url": "https://try.gogs.io/yinheli/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-06T01:31:17Z", + "updated_at": "2022-03-26T07:28:38Z" + } + ], + "ok": true +} diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -11,46 +11,61 @@ import pytest from requests import HTTPError -from swh.lister.gogs.lister import GogsLister +from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id from swh.scheduler.model import ListedOrigin TRY_GOGS_URL = "https://try.gogs.io/api/v1/" def try_gogs_page(n: int): - return TRY_GOGS_URL + f"repos/search?page={n}&limit=3" + return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3" + + +P1 = try_gogs_page(1) +P2 = try_gogs_page(2) +P3 = try_gogs_page(3) @pytest.fixture def trygogs_p1(datadir): text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text() - headers = { - "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=try_gogs_page(2)) - } - page_result = GogsLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + headers = {"Link": '<{p2}>; rel="next",<{p3}>; rel="last"'.format(p2=P2, p3=P3)} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P2 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] return text, headers, page_result, origin_urls @pytest.fixture def trygogs_p2(datadir): text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text() - headers = { - "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) - } - page_result = GogsLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + headers = {"Link": '<{p3}>; rel="next",<{p1}>; rel="prev"'.format(p1=P1, p3=P3)} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] return text, headers, page_result, origin_urls @pytest.fixture -def trygogs_empty_page(): +def trygogs_p3(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text() + headers = {"Link": '<{p2}>; rel="prev",<{p1}>; rel="first"'.format(p1=P1, p2=P2)} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=None + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_empty_p3(): origins_urls = [] - page_result = {"data": [], "ok": True} - headers = { - "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) - } - text = json.dumps(page_result) + body = {"data": [], "ok": True} + headers = {"Link": '<{p2}>; rel="prev",<{p1}>; rel="first"'.format(p1=P1, p2=P2)} + page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None) + text = json.dumps(body) return text, headers, page_result, origins_urls @@ -69,7 +84,7 @@ def test_gogs_full_listing( - swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_empty_page + swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3 ): kwargs = dict( url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" @@ -80,84 +95,181 @@ p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 - p3_text, p3_headers, _, _ = trygogs_empty_page + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 - requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers) - requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers) - requests_mock.get(try_gogs_page(3), text=p3_text, headers=p3_headers) + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) stats = lister.run() - assert stats.pages == 2 - assert stats.origins == 6 + assert stats.pages == 3 + assert stats.origins == 9 - calls = [mocker.call(p1_result), mocker.call(p2_result)] - lister.get_origins_from_page.assert_has_calls(calls) + calls = map(mocker.call, [p1_result, p2_result, p3_result]) + lister.get_origins_from_page.assert_has_calls(list(calls)) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins + ) - assert lister.get_state_from_scheduler() is None + assert ( + lister.get_state_from_scheduler().last_seen_next_link == P3 + ) # P3 didn't provide any next link so it remains the last_seen_next_link def test_gogs_auth_instance( - swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_page + swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_empty_p3 ): """Covers token authentication, token from credentials, instance inference from URL.""" api_token = "secret" - instance = "try.gogs.io" - creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} + instance = "try_gogs" - kwargs1 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) - lister = GogsLister(scheduler=swh_scheduler, **kwargs1) + # Test lister initialization without api_token or credentials: + with pytest.raises(ValueError, match="No credentials or API token provided"): + kwargs1 = dict(url=TRY_GOGS_URL, instance=instance) + GogsLister(scheduler=swh_scheduler, **kwargs1) - # test API token - assert "Authorization" in lister.session.headers + # Test lister initialization using api_token: + kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) + lister = GogsLister(scheduler=swh_scheduler, **kwargs2) assert lister.session.headers["Authorization"].lower() == "token %s" % api_token - with pytest.raises(ValueError, match="No credentials or API token provided"): - kwargs2 = dict(url=TRY_GOGS_URL, instance=instance) - GogsLister(scheduler=swh_scheduler, **kwargs2) - + # Test lister initialization with credentials and run it: + creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3) lister = GogsLister(scheduler=swh_scheduler, **kwargs3) - - # test API token from credentials - assert "Authorization" in lister.session.headers assert lister.session.headers["Authorization"].lower() == "token %s" % api_token - - # test instance inference from URL - assert lister.instance - assert "gogs" in lister.instance + assert lister.instance == "try_gogs" # setup requests mocking p1_text, p1_headers, _, _ = trygogs_p1 - p2_text, p2_headers, _, _ = trygogs_empty_page + p2_text, p2_headers, _, _ = trygogs_p2 + p3_text, p3_headers, _, _ = trygogs_empty_p3 - base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH - requests_mock.get(base_url, text=p1_text, headers=p1_headers) - requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers) - # now check the lister runs without error - stats = lister.run() + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) - assert stats.pages == 2 - assert stats.origins == 3 + # lister should run without any error and extract the origins + stats = lister.run() + assert stats.pages == 3 + assert stats.origins == 6 @pytest.mark.parametrize("http_code", [400, 500, 502]) -def test_gogs_list_http_error(swh_scheduler, requests_mock, http_code): +def test_gogs_list_http_error( + swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3 +): """Test handling of some HTTP errors commonly encountered""" lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") + p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 + p3_text, p3_headers, _, _ = trygogs_p3 + base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH - requests_mock.get(base_url, status_code=http_code) + requests_mock.get( + base_url, + [ + {"text": p1_text, "headers": p1_headers, "status_code": 200}, + {"status_code": http_code}, + {"text": p3_text, "headers": p3_headers, "status_code": 200}, + ], + ) with pytest.raises(HTTPError): lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 0 + check_listed_origins( + p1_origin_urls, scheduler_origins + ) # Only the first page is listed + + +def test_gogs_incremental_lister_last_page_empty_and_later_filled( + swh_scheduler, + requests_mock, + mocker, + trygogs_p1, + trygogs_p2, + trygogs_p3, + trygogs_empty_p3, +): + kwargs = dict( + url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" + ) + lister = GogsLister(scheduler=swh_scheduler, **kwargs) + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 + p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_empty_p3 + + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + attempt1_stats = lister.run() + assert attempt1_stats.pages == 3 + assert attempt1_stats.origins == 6 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"] + assert lister.updated + + check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) + + lister.updated = False # Reset the flag + + # Second listing attempt should restart from last state: + # Assume that page 3 content got updated + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + lister.session.get = mocker.spy(lister.session, "get") + + attempt2_stats = lister.run() + + assert attempt2_stats.pages == 1 + assert attempt2_stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + page_id = _parse_page_id(lister_state.last_seen_next_link) + query_params = lister.query_params + query_params["page"] = page_id + + lister.session.get.assert_called_once_with( + TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params + ) + + # All the 9 origins (3 pages) should be passed on to the scheduler: + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins + ) + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] + assert lister.updated + + lister.updated = False # Reset the flag + + # Third listing attempt: No new origins but it should revisit page 3 + attempt3_stats = lister.run() + + assert attempt3_stats.pages == 1 + assert attempt3_stats.origins == 3 + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] + assert lister.updated is False # No new origins so state isn't updated.