diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -11,9 +11,11 @@
 from bs4 import BeautifulSoup
 import requests
 from requests.exceptions import HTTPError
+from tenacity.before_sleep import before_sleep_log
 
 from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.lister.utils import throttling_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -74,6 +76,7 @@
         )
         self.base_git_url = base_git_url
 
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
     def _get_and_parse(self, url: str) -> BeautifulSoup:
         """Get the given url and parse the retrieved HTML using BeautifulSoup"""
         response = self.session.get(url)
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
--- a/swh/lister/cgit/tests/test_lister.py
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -3,6 +3,7 @@
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timedelta, timezone
+import os
 from typing import List
 
 import pytest
@@ -229,3 +230,34 @@
         assert (
             listed_origin.url.startswith(url) is False
         ), f"url should be mapped to {base_git_url}"
+
+
+def test_lister_cgit_get_pages_with_pages_and_retry(
+    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
+):
+    """List all pages although one of them answers 429 twice before a 200."""
+    url = "https://git.tizen/cgit/"
+    page_path = os.path.join(datadir, "https_git.tizen/cgit,ofs=50")
+
+    with open(page_path, "rb") as page:
+        page_content = page.read()
+
+    # the page at offset 50 is served only after two 429 answers
+    requests_mock.get(
+        f"{url}?ofs=50",
+        [
+            {"content": None, "status_code": 429},
+            {"content": None, "status_code": 429},
+            {"content": page_content, "status_code": 200},
+        ],
+    )
+
+    lister_cgit = CGitLister(swh_scheduler, url=url)
+    # replace the retry's sleep by a mock so the test does not wait
+    mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep")
+
+    repos: List[List[str]] = list(lister_cgit.get_pages())
+    flattened_repos = [repo for page_repos in repos for repo in page_repos]
+    # we should have 16 repos (listed on 3 pages)
+    assert len(repos) == 3
+    assert len(flattened_repos) == 16