# File: swh/lister/launchpad/lister.py
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import dataclass
from datetime import datetime
import logging
from typing import Any, Dict, Iterator, Optional, Tuple

import iso8601
from launchpadlib.launchpad import Launchpad
from lazr.restfulclient.errors import RestfulError
from lazr.restfulclient.resource import Collection

from swh.lister.utils import retry_if_exception, throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

VcsType = str
LaunchpadPageType = Tuple[VcsType, Collection]

# VCS types handled by the lister, in listing order.
_VCS_TYPES: Tuple[VcsType, ...] = ("git", "bzr")


@dataclass
class LaunchpadListerState:
    """State of Launchpad lister"""

    git_date_last_modified: Optional[datetime] = None
    """modification date of last updated git repository since last listing"""
    bzr_date_last_modified: Optional[datetime] = None
    """modification date of last updated bzr repository since last listing"""


def origin(vcs_type: str, repo: Any) -> str:
    """Determine the origin url out of a repository with a given vcs_type"""
    return repo.git_https_url if vcs_type == "git" else repo.web_link


def retry_if_restful_error(retry_state):
    """Retry predicate matching calls that failed with a ``RestfulError``.

    Delegates to :func:`swh.lister.utils.retry_if_exception` with a check that
    the raised exception is an instance of ``RestfulError``.
    """
    return retry_if_exception(retry_state, lambda e: isinstance(e, RestfulError))


class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
    """
    List repositories from Launchpad (git or bzr).

    Args:
        scheduler: instance of SchedulerInterface
        incremental: defines if incremental listing should be used, in that case
            only modified or new repositories since last incremental listing
            operation will be returned
    """

    LISTER_NAME = "launchpad"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        incremental: bool = False,
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url="https://launchpad.net/",
            instance="launchpad",
            credentials=credentials,
        )
        self.incremental = incremental
        # Most recent modification date seen so far, per vcs type.
        self.date_last_modified: Dict[str, Optional[datetime]] = {
            "git": None,
            "bzr": None,
        }

    def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState:
        """Build a lister state out of its serialized form.

        ISO 8601 strings found under the ``<vcs>_date_last_modified`` keys are
        parsed into datetimes. The input dict is left untouched.
        """
        # Work on a copy so the caller's dict is not modified as a side effect.
        state_kwargs = dict(d)
        for vcs_type in _VCS_TYPES:
            key = f"{vcs_type}_date_last_modified"
            date_last_modified = state_kwargs.get(key)
            if date_last_modified is not None:
                state_kwargs[key] = iso8601.parse_date(date_last_modified)
        return LaunchpadListerState(**state_kwargs)

    def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]:
        """Serialize a lister state, dates being converted to ISO 8601 strings
        (or None when unset)."""
        d: Dict[str, Optional[str]] = {}
        for vcs_type in _VCS_TYPES:
            attribute_name = f"{vcs_type}_date_last_modified"
            date_last_modified = getattr(state, attribute_name, None)
            d[attribute_name] = (
                date_last_modified.isoformat()
                if date_last_modified is not None
                else None
            )
        return d

    @throttling_retry(retry=retry_if_restful_error)
    def _page_request(
        self, launchpad, vcs_type: str, date_last_modified: Optional[datetime]
    ) -> Optional[Collection]:
        """Query the page of results for a given vcs_type, restricted to
        repositories modified since date_last_modified.

        The call is wrapped by ``throttling_retry``: a ``RestfulError`` raised
        by the request is handled by that retrying policy.

        Raises:
            KeyError: if vcs_type is neither "git" nor "bzr"
        """
        get_vcs_fns = {
            "git": launchpad.git_repositories.getRepositories,
            "bzr": launchpad.branches.getBranches,
        }

        return get_vcs_fns[vcs_type](
            order_by="most neglected first",
            modified_since_date=date_last_modified,
        )

    def get_pages(self) -> Iterator[LaunchpadPageType]:
        """
        Yields an iterator on all git/bzr repositories hosted on Launchpad sorted
        by last modification date in ascending order.

        A vcs type whose page request raises a ``RestfulError``, or whose page
        is empty, is skipped.
        """
        launchpad = Launchpad.login_anonymously(
            "softwareheritage", "production", version="devel"
        )
        if self.incremental:
            # Resume from the dates recorded by the previous incremental run.
            self.date_last_modified = {
                "git": self.state.git_date_last_modified,
                "bzr": self.state.bzr_date_last_modified,
            }
        for vcs_type in _VCS_TYPES:
            try:
                result = self._page_request(
                    launchpad, vcs_type, self.date_last_modified[vcs_type]
                )
            except RestfulError as e:
                logger.warning("Listing %s origins raised %s", vcs_type, e)
                continue
            if not result:
                continue
            yield vcs_type, result

    def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all git or bzr repositories of a page and yield ListedOrigin
        instances.

        A ``RestfulError`` raised while iterating over the page is logged as a
        warning and ends the listing of that page; origins already yielded are
        kept.
        """
        assert self.lister_obj.id is not None

        vcs_type, repos = page
        assert vcs_type in {"git", "bzr"}

        prev_origin_url: Optional[str] = None

        # This method is a generator: a retry decorator around it would only
        # wrap the creation of the generator object, never the iteration
        # itself, so errors raised while iterating are handled explicitly here.
        try:
            for repo in repos:
                origin_url = origin(vcs_type, repo)

                # filter out origins with invalid URL or origin previously listed
                # (last modified repository will be listed twice by launchpadlib)
                if (
                    not origin_url.startswith("https://")
                    or origin_url == prev_origin_url
                ):
                    continue

                last_update = repo.date_last_modified

                self.date_last_modified[vcs_type] = last_update

                logger.debug(
                    "Found origin %s with type %s last updated on %s",
                    origin_url,
                    vcs_type,
                    last_update,
                )

                prev_origin_url = origin_url

                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=vcs_type,
                    url=origin_url,
                    last_update=last_update,
                )
        except RestfulError as e:
            logger.warning("Listing %s origins raised %s", vcs_type, e)

    def finalize(self) -> None:
        """Record the listing outcome.

        Does nothing when no modification date is known for any vcs type.
        Otherwise, in incremental mode, each state date is replaced by the date
        seen during this run when the state date is unset or older; the lister
        is then flagged as updated.
        """
        if all(self.date_last_modified[vcs_type] is None for vcs_type in _VCS_TYPES):
            return

        if self.incremental:
            for vcs_type in _VCS_TYPES:
                attribute_name = f"{vcs_type}_date_last_modified"
                listed_date = self.date_last_modified[vcs_type]
                state_date = getattr(self.state, attribute_name)
                if state_date is None or (
                    listed_date is not None and listed_date > state_date
                ):
                    setattr(self.state, attribute_name, listed_date)

        self.updated = True


# File: swh/lister/launchpad/tests/test_lister.py
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public
# License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime
import json
from pathlib import Path
from typing import List

from lazr.restfulclient.errors import RestfulError
import pytest

from ..lister import LaunchpadLister, origin


class _Repo:
    """Minimal stand-in for a Launchpad repository entry, built from a dict."""

    def __init__(self, d: dict):
        for key, value in d.items():
            if key == "date_last_modified":
                # Dates are stored as ISO strings in the test data.
                value = datetime.fromisoformat(value)
            setattr(self, key, value)


class _Collection:
    """Minimal stand-in for a Launchpad collection: a sized, indexable list of
    _Repo (iteration relies on __getitem__)."""

    # Annotation only: a class-level ``= []`` default would be one list shared
    # by every instance.
    entries: List[_Repo]

    def __init__(self, file):
        self.entries = [_Repo(r) for r in file]

    def __getitem__(self, key):
        return self.entries[key]

    def __len__(self):
        return len(self.entries)


def _launchpad_response(datadir, datafile):
    """Load a JSON data file as a _Collection."""
    return _Collection(json.loads(Path(datadir, datafile).read_text()))


@pytest.fixture
def launchpad_response1(datadir):
    return _launchpad_response(datadir, "launchpad_response1.json")


@pytest.fixture
def launchpad_response2(datadir):
    return _launchpad_response(datadir, "launchpad_response2.json")


@pytest.fixture
def launchpad_bzr_response(datadir):
    return _launchpad_response(datadir, "launchpad_bzr_response.json")


def _set_mock_response(mock_fn, response):
    """Make mock_fn raise response when it is an exception, return it otherwise."""
    if isinstance(response, Exception):
        mock_fn.side_effect = response
    else:
        mock_fn.return_value = response


def _mock_launchpad(mocker, launchpad_response, launchpad_bzr_response=None):
    """Patch the Launchpad client used by the lister.

    Each response may be a page of results or an exception instance to raise.
    When no bzr response is given, the bzr listing returns an empty page.

    Returns:
        the (getRepositories, getBranches) mocks
    """
    mock_launchpad = mocker.patch("swh.lister.launchpad.lister.Launchpad")
    mock_getRepositories = mock_launchpad.git_repositories.getRepositories
    _set_mock_response(mock_getRepositories, launchpad_response)
    mock_getBranches = mock_launchpad.branches.getBranches
    _set_mock_response(
        mock_getBranches,
        [] if launchpad_bzr_response is None else launchpad_bzr_response,
    )
    mock_launchpad.login_anonymously.return_value = mock_launchpad

    return mock_getRepositories, mock_getBranches


def _check_listed_origins(scheduler_origins, launchpad_response, vcs_type="git"):
    """Check each repository of the response was listed exactly once, with the
    expected last update date and visit type."""
    for repo in launchpad_response:
        filtered_origins = [
            o for o in scheduler_origins if o.url == origin(vcs_type, repo)
        ]

        assert len(filtered_origins) == 1
        assert filtered_origins[0].last_update == repo.date_last_modified
        assert filtered_origins[0].visit_type == vcs_type


def test_lister_from_configfile(swh_scheduler_config, mocker):
    load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
    load_from_envvar.return_value = {
        "scheduler": {"cls": "local", **swh_scheduler_config},
        "credentials": {},
    }
    lister = LaunchpadLister.from_configfile()
    assert lister.scheduler is not None
    assert lister.credentials is not None


def test_launchpad_full_lister(
    swh_scheduler, mocker, launchpad_response1, launchpad_bzr_response
):
    mock_getRepositories, mock_getBranches = _mock_launchpad(
        mocker, launchpad_response1, launchpad_bzr_response
    )
    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert not lister.incremental
    assert lister.updated
    assert stats.pages == 1 + 1, "Expects 1 page for git origins, another for bzr ones"
    assert stats.origins == len(launchpad_response1) + len(launchpad_bzr_response)

    mock_getRepositories.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )
    mock_getBranches.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(scheduler_origins) == len(launchpad_response1) + len(
        launchpad_bzr_response
    )

    _check_listed_origins(scheduler_origins, launchpad_response1)
    _check_listed_origins(scheduler_origins, launchpad_bzr_response, vcs_type="bzr")


def test_launchpad_incremental_lister(
    swh_scheduler,
    mocker,
    launchpad_response1,
    launchpad_response2,
    launchpad_bzr_response,
):
    mock_getRepositories, mock_getBranches = _mock_launchpad(
        mocker, launchpad_response1, launchpad_bzr_response
    )
    lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True)
    stats = lister.run()

    assert lister.incremental
    assert lister.updated
    assert stats.pages == 1 + 1, "Expects 1 page for git origins, another for bzr ones"
    len_first_runs = len(launchpad_response1) + len(launchpad_bzr_response)
    assert stats.origins == len_first_runs

    mock_getRepositories.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )
    mock_getBranches.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )

    lister_state = lister.get_state_from_scheduler()
    assert (
        lister_state.git_date_last_modified
        == launchpad_response1[-1].date_last_modified
    )
    assert (
        lister_state.bzr_date_last_modified
        == launchpad_bzr_response[-1].date_last_modified
    )

    # Second run: new git page, no bzr response (empty page).
    mock_getRepositories, _ = _mock_launchpad(mocker, launchpad_response2)
    lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True)
    stats = lister.run()

    assert lister.incremental
    assert lister.updated
    assert stats.pages == 1, "Empty bzr page response is ignored"
    assert stats.origins == len(launchpad_response2)

    mock_getRepositories.assert_called_once_with(
        order_by="most neglected first",
        modified_since_date=lister_state.git_date_last_modified,
    )

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(scheduler_origins) == len_first_runs + len(launchpad_response2)

    _check_listed_origins(scheduler_origins, launchpad_response1)
    _check_listed_origins(scheduler_origins, launchpad_bzr_response, vcs_type="bzr")
    _check_listed_origins(scheduler_origins, launchpad_response2)


def test_launchpad_lister_invalid_url_filtering(
    swh_scheduler,
    mocker,
):
    invalid_origin = [_Repo({"git_https_url": "tag:launchpad.net:2008:redacted"})]
    _mock_launchpad(mocker, invalid_origin)
    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert not lister.updated
    assert stats.pages == 1, "Empty pages are ignored(only 1 git page of results)"
    assert stats.origins == 0


def test_launchpad_lister_duplicated_origin(
    swh_scheduler,
    mocker,
):
    origin = _Repo(
        {
            "git_https_url": "https://git.launchpad.net/test",
            "date_last_modified": "2021-01-14 21:05:31.231406+00:00",
        }
    )
    origins = [origin, origin]
    _mock_launchpad(mocker, origins)
    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert lister.updated
    assert stats.pages == 1, "Empty bzr page are ignored (only 1 git page of results)"
    assert stats.origins == 1


def test_launchpad_lister_raise_during_listing(
    swh_scheduler, mocker, launchpad_response1, launchpad_bzr_response
):
    lister = LaunchpadLister(scheduler=swh_scheduler)
    # Exponential retries take a long time, so stub time.sleep
    mocker.patch.object(lister._page_request.retry, "sleep")

    _mock_launchpad(
        mocker,
        RestfulError("Refuse to list git page"),  # breaks git page listing
        launchpad_bzr_response,
    )

    stats = lister.run()

    assert lister.updated
    assert stats.pages == 1
    assert stats.origins == len(launchpad_bzr_response)

    _mock_launchpad(
        mocker,
        launchpad_response1,
        RestfulError("Refuse to list bzr"),  # breaks bzr page listing
    )

    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert lister.updated
    assert stats.pages == 1
    assert stats.origins == len(launchpad_response1)