diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py
index 3a5004f..4c66471 100644
--- a/swh/lister/launchpad/lister.py
+++ b/swh/lister/launchpad/lister.py
@@ -1,127 +1,132 @@
 # Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from dataclasses import asdict, dataclass
 from datetime import datetime
 import logging
 from typing import Any, Dict, Iterator, Optional
 
 import iso8601
 from launchpadlib.launchpad import Launchpad
 from lazr.restfulclient.resource import Collection
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
 LaunchpadPageType = Iterator[Collection]
 
 
 @dataclass
 class LaunchpadListerState:
     """State of Launchpad lister"""
 
     date_last_modified: Optional[datetime] = None
     """modification date of last updated repository since last listing"""
 
 
 class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
     """
     List git repositories from Launchpad.
 
     Args:
         scheduler: instance of SchedulerInterface
         incremental: defines if incremental listing should be used, in that case
             only modified or new repositories since last incremental listing operation
             will be returned
     """
 
     LISTER_NAME = "launchpad"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         incremental: bool = False,
         credentials: CredentialsType = None,
     ):
         super().__init__(
             scheduler=scheduler,
             url="https://launchpad.net/",
             instance="launchpad",
             credentials=credentials,
         )
         self.incremental = incremental
         self.date_last_modified = None
 
     def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState:
         date_last_modified = d.get("date_last_modified")
         if date_last_modified is not None:
             d["date_last_modified"] = iso8601.parse_date(date_last_modified)
         return LaunchpadListerState(**d)
 
     def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]:
         d = asdict(state)
         date_last_modified = d.get("date_last_modified")
         if date_last_modified is not None:
             d["date_last_modified"] = date_last_modified.isoformat()
         return d
 
     def get_pages(self) -> Iterator[LaunchpadPageType]:
         """
         Yields an iterator on all git repositories hosted on Launchpad sorted
         by last modification date in ascending order.
         """
         launchpad = Launchpad.login_anonymously(
             "softwareheritage", "production", version="devel"
         )
         date_last_modified = None
         if self.incremental:
             date_last_modified = self.state.date_last_modified
         get_repos = launchpad.git_repositories.getRepositories
         yield get_repos(
             order_by="most neglected first", modified_since_date=date_last_modified
         )
 
     def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
         """
         Iterate on all git repositories and yield ListedOrigin instances.
         """
         assert self.lister_obj.id is not None
 
+        prev_origin_url = None
+
         for repo in page:
 
             origin_url = repo.git_https_url
 
-            # filter out origins with invalid URL
-            if not origin_url.startswith("https://"):
+            # filter out origins with invalid URL or origin previously listed
+            # (last modified repository will be listed twice by launchpadlib)
+            if not origin_url.startswith("https://") or origin_url == prev_origin_url:
                 continue
 
             last_update = repo.date_last_modified
 
             self.date_last_modified = last_update
 
             logger.debug("Found origin %s last updated on %s", origin_url, last_update)
 
+            prev_origin_url = origin_url
+
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 visit_type="git",
                 url=origin_url,
                 last_update=last_update,
             )
 
     def finalize(self) -> None:
         if self.date_last_modified is None:
             return
 
         if self.incremental and (
             self.state.date_last_modified is None
             or self.date_last_modified > self.state.date_last_modified
         ):
             self.state.date_last_modified = self.date_last_modified
 
         self.updated = True
diff --git a/swh/lister/launchpad/tests/test_lister.py b/swh/lister/launchpad/tests/test_lister.py
index d36f026..836fcec 100644
--- a/swh/lister/launchpad/tests/test_lister.py
+++ b/swh/lister/launchpad/tests/test_lister.py
@@ -1,156 +1,175 @@
 # Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime
 import json
 from pathlib import Path
 from typing import List
 
 import pytest
 
 from ..lister import LaunchpadLister
 
 
 class _Repo:
     def __init__(self, d: dict):
         for key in d.keys():
             if key == "date_last_modified":
                 setattr(self, key, datetime.fromisoformat(d[key]))
             else:
                 setattr(self, key, d[key])
 
 
 class _Collection:
     entries: List[_Repo] = []
 
     def __init__(self, file):
         self.entries = [_Repo(r) for r in file]
 
     def __getitem__(self, key):
         return self.entries[key]
 
     def __len__(self):
         return len(self.entries)
 
 
 def _launchpad_response(datadir, datafile):
     return _Collection(json.loads(Path(datadir, datafile).read_text()))
 
 
 @pytest.fixture
 def launchpad_response1(datadir):
     return _launchpad_response(datadir, "launchpad_response1.json")
 
 
 @pytest.fixture
 def launchpad_response2(datadir):
     return _launchpad_response(datadir, "launchpad_response2.json")
 
 
 def _mock_getRepositories(mocker, launchpad_response):
     mock_launchpad = mocker.patch("swh.lister.launchpad.lister.Launchpad")
     mock_getRepositories = mock_launchpad.git_repositories.getRepositories
     mock_getRepositories.return_value = launchpad_response
     mock_launchpad.login_anonymously.return_value = mock_launchpad
 
     return mock_getRepositories
 
 
 def _check_listed_origins(scheduler_origins, launchpad_response):
     for origin in launchpad_response:
 
         filtered_origins = [
             o for o in scheduler_origins if o.url == origin.git_https_url
         ]
 
         assert len(filtered_origins) == 1
         assert filtered_origins[0].last_update == origin.date_last_modified
 
 
 def test_lister_from_configfile(swh_scheduler_config, mocker):
     load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
     load_from_envvar.return_value = {
         "scheduler": {"cls": "local", **swh_scheduler_config},
         "credentials": {},
     }
     lister = LaunchpadLister.from_configfile()
     assert lister.scheduler is not None
     assert lister.credentials is not None
 
 
 def test_launchpad_full_lister(swh_scheduler, mocker, launchpad_response1):
     mock_getRepositories = _mock_getRepositories(mocker, launchpad_response1)
     lister = LaunchpadLister(scheduler=swh_scheduler)
     stats = lister.run()
 
     assert not lister.incremental
     assert lister.updated
     assert stats.pages == 1
     assert stats.origins == len(launchpad_response1)
 
     mock_getRepositories.assert_called_once_with(
         order_by="most neglected first", modified_since_date=None
     )
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     assert len(scheduler_origins) == len(launchpad_response1)
 
     _check_listed_origins(scheduler_origins, launchpad_response1)
 
 
 def test_launchpad_incremental_lister(
     swh_scheduler, mocker, launchpad_response1, launchpad_response2
 ):
     mock_getRepositories = _mock_getRepositories(mocker, launchpad_response1)
     lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True)
     stats = lister.run()
 
     assert lister.incremental
     assert lister.updated
     assert stats.pages == 1
     assert stats.origins == len(launchpad_response1)
 
     mock_getRepositories.assert_called_once_with(
         order_by="most neglected first", modified_since_date=None
     )
 
     lister_state = lister.get_state_from_scheduler()
 
     assert lister_state.date_last_modified == launchpad_response1[-1].date_last_modified
 
     mock_getRepositories = _mock_getRepositories(mocker, launchpad_response2)
     lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True)
     stats = lister.run()
 
     assert lister.incremental
     assert lister.updated
     assert stats.pages == 1
     assert stats.origins == len(launchpad_response2)
 
     mock_getRepositories.assert_called_once_with(
         order_by="most neglected first",
         modified_since_date=lister_state.date_last_modified,
     )
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     assert len(scheduler_origins) == len(launchpad_response1) + len(launchpad_response2)
 
     _check_listed_origins(scheduler_origins, launchpad_response1)
     _check_listed_origins(scheduler_origins, launchpad_response2)
 
 
 def test_launchpad_lister_invalid_url_filtering(
     swh_scheduler, mocker,
 ):
     invalid_origin = [_Repo({"git_https_url": "tag:launchpad.net:2008:redacted",})]
     _mock_getRepositories(mocker, invalid_origin)
     lister = LaunchpadLister(scheduler=swh_scheduler)
     stats = lister.run()
 
     assert not lister.updated
     assert stats.pages == 1
     assert stats.origins == 0
+
+
+def test_launchpad_lister_duplicated_origin(
+    swh_scheduler, mocker,
+):
+    origin = _Repo(
+        {
+            "git_https_url": "https://git.launchpad.net/test",
+            "date_last_modified": "2021-01-14 21:05:31.231406+00:00",
+        }
+    )
+    origins = [origin, origin]
+    _mock_getRepositories(mocker, origins)
+    lister = LaunchpadLister(scheduler=swh_scheduler)
+    stats = lister.run()
+
+    assert lister.updated
+    assert stats.pages == 1
+    assert stats.origins == 1