diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -11,7 +11,8 @@ import iso8601 from launchpadlib.launchpad import Launchpad from lazr.restfulclient.errors import RestfulError -from lazr.restfulclient.resource import Collection +from lazr.restfulclient.resource import Collection, Resource +from tenacity.before_sleep import before_sleep_log from swh.lister.utils import retry_if_exception, throttling_retry from swh.scheduler.interface import SchedulerInterface @@ -99,10 +100,13 @@ d[attribute_name] = date_last_modified.isoformat() return d - @throttling_retry(retry=retry_if_restful_error) + @throttling_retry( + retry=retry_if_restful_error, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) def _page_request( self, launchpad, vcs_type: str, date_last_modified: Optional[datetime] - ) -> Optional[Collection]: + ) -> Collection: """Querying the page of results for a given vcs_type since the date_last_modified. If some issues occurs, this will deal with the retrying policy. @@ -135,13 +139,19 @@ launchpad, vcs_type, self.date_last_modified[vcs_type] ) except RestfulError as e: - logger.warning("Listing %s origins raised %s", vcs_type, e) + logger.warning("Listing %s origins raised %s, skipping", vcs_type, e) result = None if not result: continue yield vcs_type, result - @throttling_retry(retry=retry_if_restful_error) + @throttling_retry( + retry=retry_if_restful_error, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def get_next_repo(self, repos_it: Iterator[Resource]) -> Resource: + return next(repos_it) + def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: """ Iterate on all git repositories and yield ListedOrigin instances. @@ -149,31 +159,37 @@ assert self.lister_obj.id is not None vcs_type, repos = page + repos_it = iter(repos) + try: + while True: + repo = self.get_next_repo(repos_it) + origin_url = origin(vcs_type, repo) - for repo in repos: - origin_url = origin(vcs_type, repo) - - # filter out origins with invalid URL - if not origin_url.startswith("https://"): - continue + # filter out origins with invalid URL + if not origin_url.startswith("https://"): + continue - last_update = repo.date_last_modified + last_update = repo.date_last_modified - self.date_last_modified[vcs_type] = last_update + self.date_last_modified[vcs_type] = last_update - logger.debug( - "Found origin %s with type %s last updated on %s", - origin_url, - vcs_type, - last_update, - ) + logger.debug( + "Found origin %s with type %s last updated on %s", + origin_url, + vcs_type, + last_update, + ) - yield ListedOrigin( - lister_id=self.lister_obj.id, - visit_type=vcs_type, - url=origin_url, - last_update=last_update, - ) + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=vcs_type, + url=origin_url, + last_update=last_update, + ) + except RestfulError as e: + logger.warning("Listing %s origins raised %s, skipping", vcs_type, e) + except StopIteration: + pass def finalize(self) -> None: git_date_last_modified = self.date_last_modified["git"] diff --git a/swh/lister/launchpad/tests/test_lister.py b/swh/lister/launchpad/tests/test_lister.py --- a/swh/lister/launchpad/tests/test_lister.py +++ b/swh/lister/launchpad/tests/test_lister.py @@ -26,18 +26,23 @@ class _Collection: entries: List[_Repo] = [] - def __init__(self, file): - self.entries = [_Repo(r) for r in file] + def __init__(self, repos): + self.repos = repos + self.it = iter(self.repos) + + def __next__(self): + return next(self.it) def __getitem__(self, key): - return self.entries[key] + return self.repos[key] def __len__(self): - return len(self.entries) + return len(self.repos) def _launchpad_response(datadir, datafile): - return _Collection(json.loads(Path(datadir, datafile).read_text())) + repos = json.loads(Path(datadir, datafile).read_text()) + return _Collection([_Repo(r) for r in repos]) @pytest.fixture @@ -194,7 +199,9 @@ def test_launchpad_lister_invalid_url_filtering( swh_scheduler, mocker, ): - invalid_origin = [_Repo({"git_https_url": "tag:launchpad.net:2008:redacted",})] + invalid_origin = _Collection( + [_Repo({"git_https_url": "tag:launchpad.net:2008:redacted",})] + ) _mock_launchpad(mocker, invalid_origin) lister = LaunchpadLister(scheduler=swh_scheduler) stats = lister.run() @@ -213,7 +220,7 @@ "date_last_modified": "2021-01-14 21:05:31.231406+00:00", } ) - origins = [origin, origin] + origins = _Collection([origin, origin]) _mock_launchpad(mocker, origins) lister = LaunchpadLister(scheduler=swh_scheduler) stats = lister.run()