@dataclass
class LaunchpadListerState:
    """State of Launchpad lister"""

    git_date_last_modified: Optional[datetime] = None
    """modification date of last updated git repository since last listing"""
    bzr_date_last_modified: Optional[datetime] = None
    """modification date of last updated bzr repository since last listing"""


def origin(vcs_type: str, repo: Any) -> str:
    """Determine the origin url out of a repository with a given vcs_type.

    Args:
        vcs_type: type of the repository, either ``"git"`` or ``"bzr"``
        repo: repository entry; its ``git_https_url`` attribute is read for git
            repositories and its ``web_link`` attribute for bzr ones

    Returns:
        The url to use as origin for that repository.

    Raises:
        ValueError: when ``vcs_type`` is neither ``"git"`` nor ``"bzr"``

    """
    if vcs_type == "git":
        return repo.git_https_url
    if vcs_type == "bzr":
        return repo.web_link
    # Fail loudly instead of silently handing out a bzr-style url for a vcs
    # type this module does not know about (typo, new unsupported vcs, ...).
    raise ValueError(f"Unsupported vcs type: {vcs_type!r}")
class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
    """
    List repositories from Launchpad (git or bzr).

    Args:
        scheduler: instance of SchedulerInterface
        incremental: defines if incremental listing should be used, in that case
            only modified or new repositories since last incremental listing
            operation will be returned
    """

    LISTER_NAME = "launchpad"

    # Version control systems handled by the lister, in listing order.
    VCS_TYPES = ("git", "bzr")

    def __init__(
        self,
        scheduler: SchedulerInterface,
        incremental: bool = False,
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url="https://launchpad.net/",
            instance="launchpad",
            credentials=credentials,
        )
        self.incremental = incremental
        # Per vcs type, modification date of the last repository listed so far.
        self.date_last_modified: Dict[str, Optional[datetime]] = {
            "git": None,
            "bzr": None,
        }

    def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState:
        """Build the lister state out of its serialized form.

        Args:
            d: serialized state, mapping ``<vcs_type>_date_last_modified`` keys to
                ISO 8601 strings (or None). It is left untouched.

        Returns:
            The state holding one parsed modification date per vcs type.

        """
        # States serialized before bzr support only tracked git repositories,
        # under the "date_last_modified" key: reuse that value as the git date
        # instead of failing on an unexpected dataclass field.
        legacy_date = d.get("date_last_modified")

        state_kwargs: Dict[str, Optional[datetime]] = {}
        for vcs_type in self.VCS_TYPES:
            key = f"{vcs_type}_date_last_modified"
            date_last_modified = d.get(key)
            if date_last_modified is None and vcs_type == "git":
                date_last_modified = legacy_date
            state_kwargs[key] = (
                iso8601.parse_date(date_last_modified)
                if date_last_modified is not None
                else None
            )

        return LaunchpadListerState(**state_kwargs)

    def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]:
        """Serialize the lister state.

        Args:
            state: state to serialize

        Returns:
            A dict with one ``<vcs_type>_date_last_modified`` key per vcs type,
            whose value is the ISO 8601 formatted date or None when unset.

        """
        d: Dict[str, Optional[str]] = {}
        for vcs_type in self.VCS_TYPES:
            attribute_name = f"{vcs_type}_date_last_modified"
            date_last_modified = getattr(state, attribute_name, None)
            d[attribute_name] = (
                date_last_modified.isoformat()
                if date_last_modified is not None
                else None
            )
        return d

    def get_pages(self) -> Iterator[LaunchpadPageType]:
        """
        Yields one (vcs_type, repositories) pair per vcs type, first for git then
        for bzr. Repositories are requested from Launchpad with the
        "most neglected first" ordering; in incremental mode only the ones
        modified since the date recorded in the lister state are requested.
        """
        launchpad = Launchpad.login_anonymously(
            "softwareheritage", "production", version="devel"
        )
        if self.incremental:
            self.date_last_modified = {
                "git": self.state.git_date_last_modified,
                "bzr": self.state.bzr_date_last_modified,
            }
        for vcs_type, get_vcs_fn in [
            ("git", launchpad.git_repositories.getRepositories),
            ("bzr", launchpad.branches.getBranches),
        ]:
            yield vcs_type, get_vcs_fn(
                order_by="most neglected first",
                modified_since_date=self.date_last_modified[vcs_type],
            )

    def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all repositories of a (vcs_type, repositories) page and yield
        ListedOrigin instances whose visit type is that vcs type.
        """
        assert self.lister_obj.id is not None

        vcs_type, repos = page

        assert vcs_type in {"git", "bzr"}

        # A page only holds repositories of one vcs type, so remembering a single
        # url is enough to detect consecutive duplicates.
        prev_origin_url: Optional[str] = None

        for repo in repos:
            origin_url = origin(vcs_type, repo)

            # filter out origins with invalid URL or origin previously listed
            # (last modified repository will be listed twice by launchpadlib)
            if not origin_url.startswith("https://") or origin_url == prev_origin_url:
                continue

            last_update = repo.date_last_modified

            self.date_last_modified[vcs_type] = last_update

            logger.debug(
                "Found origin %s with type %s last updated on %s",
                origin_url,
                vcs_type,
                last_update,
            )

            prev_origin_url = origin_url

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=vcs_type,
                url=origin_url,
                last_update=last_update,
            )

    def finalize(self) -> None:
        """Record the listing outcome once all pages got processed.

        Nothing happens when no modification date is known for any vcs type.
        Otherwise the lister is flagged as updated and, in incremental mode, each
        date of the state is replaced when it is unset or older than the date of
        the last repository listed for that vcs type.
        """
        if all(
            self.date_last_modified[vcs_type] is None for vcs_type in self.VCS_TYPES
        ):
            return

        if self.incremental:
            for vcs_type in self.VCS_TYPES:
                attribute_name = f"{vcs_type}_date_last_modified"
                listed_date = self.date_last_modified[vcs_type]
                state_date = getattr(self.state, attribute_name)
                if state_date is None or (
                    listed_date is not None and listed_date > state_date
                ):
                    setattr(self.state, attribute_name, listed_date)

        self.updated = True
"https://api.launchpad.net/1.0/fourbar/all_milestones", + "bug_supervisor_link": null, + "qualifies_for_free_hosting": true, + "reviewer_whiteboard": "tag:launchpad.net:2008:redacted", + "is_permitted": "tag:launchpad.net:2008:redacted", + "project_reviewed": "tag:launchpad.net:2008:redacted", + "license_approved": "tag:launchpad.net:2008:redacted", + "private": false, + "display_name": "fourBar", + "icon_link": "https://api.launchpad.net/1.0/fourbar/icon", + "logo_link": "https://api.launchpad.net/1.0/fourbar/logo", + "name": "fourbar", + "owner_link": "https://api.launchpad.net/1.0/~sorivenul", + "project_group_link": null, + "title": "fourBar", + "registrant_link": "https://api.launchpad.net/1.0/~sorivenul", + "driver_link": null, + "summary": "fourBar is a minimal application launcher for POSIX systems. It launches four commonly used applications (terminal, file browser, editor, and web browser by default). It is written in Python/Tkinter. Documentation on simple customization is included. ", + "description": "If you wish to help with the development of fourBar, download a branch, test, report bugs and propose features. 
There is still work to be done.", + "date_created": "2008-11-03T07:03:00.872230+00:00", + "homepage_url": null, + "wiki_url": null, + "screenshots_url": null, + "download_url": "http://downloads.sourceforge.net/fourbar/fourbar-1.0.0.tar.gz?modtime=1224102066&big_mirror=0", + "programming_language": "Python", + "sourceforge_project": "fourBar", + "freshmeat_project": null, + "brand_link": "https://api.launchpad.net/1.0/fourbar/brand", + "private_bugs": false, + "licenses": [ + "GNU GPL v3" + ], + "license_info": null, + "bug_tracker_link": null, + "date_next_suggest_packaging": null, + "series_collection_link": "https://api.launchpad.net/1.0/fourbar/series", + "development_focus_link": "https://api.launchpad.net/1.0/fourbar/trunk", + "releases_collection_link": "https://api.launchpad.net/1.0/fourbar/releases", + "translation_focus_link": null, + "commercial_subscription_link": null, + "commercial_subscription_is_due": false, + "remote_product": "242408&1119369", + "security_contact": null, + "vcs": "Bazaar", + "http_etag": "\"e3685b989bd2609f9a84bd2d90bef380c6f3c92b-13a47c4e8b4688c8fc042bf7eede3a2f4c14a9d6\"", + "date_last_modified":"2016-05-19T16:05:23.706734+00:00" + }, + { + "self_link": "https://api.launchpad.net/1.0/gekkoware", + "web_link": "https://launchpad.net/gekkoware", + "resource_type_link": "https://api.launchpad.net/1.0/#project", + "official_answers": false, + "official_blueprints": false, + "official_codehosting": false, + "official_bugs": false, + "information_type": "Public", + "active": true, + "bug_reporting_guidelines": null, + "bug_reported_acknowledgement": null, + "official_bug_tags": [], + "recipes_collection_link": "https://api.launchpad.net/1.0/gekkoware/recipes", + "active_milestones_collection_link": "https://api.launchpad.net/1.0/gekkoware/active_milestones", + "all_milestones_collection_link": "https://api.launchpad.net/1.0/gekkoware/all_milestones", + "bug_supervisor_link": null, + "qualifies_for_free_hosting": true, + 
"reviewer_whiteboard": "tag:launchpad.net:2008:redacted", + "is_permitted": "tag:launchpad.net:2008:redacted", + "project_reviewed": "tag:launchpad.net:2008:redacted", + "license_approved": "tag:launchpad.net:2008:redacted", + "private": false, + "display_name": "gekkoware", + "icon_link": "https://api.launchpad.net/1.0/gekkoware/icon", + "logo_link": "https://api.launchpad.net/1.0/gekkoware/logo", + "name": "gekkoware", + "owner_link": "https://api.launchpad.net/1.0/~compermisos", + "project_group_link": null, + "title": "gekkoware", + "registrant_link": "https://api.launchpad.net/1.0/~compermisos", + "driver_link": null, + "summary": "A port of gekko to ubuntu", + "description": null, + "date_created": "2007-10-21T03:02:22.186775+00:00", + "homepage_url": "http://gekkoware.org", + "wiki_url": null, + "screenshots_url": null, + "download_url": null, + "programming_language": "php", + "sourceforge_project": "gekkoware", + "freshmeat_project": null, + "brand_link": "https://api.launchpad.net/1.0/gekkoware/brand", + "private_bugs": false, + "licenses": [ + "GNU GPL v2" + ], + "license_info": null, + "bug_tracker_link": null, + "date_next_suggest_packaging": null, + "series_collection_link": "https://api.launchpad.net/1.0/gekkoware/series", + "development_focus_link": "https://api.launchpad.net/1.0/gekkoware/trunk", + "releases_collection_link": "https://api.launchpad.net/1.0/gekkoware/releases", + "translation_focus_link": null, + "commercial_subscription_link": null, + "commercial_subscription_is_due": false, + "remote_product": "117004&676653", + "security_contact": null, + "vcs": "Bazaar", + "http_etag": "\"b9802efcebb5afdd87c8ee10f8473040340bcead-159127be59c12e7cbb161eee4cae2ade72353c0d\"", + "date_last_modified":"2017-03-15T16:03:22.706432+00:00" + } +] diff --git a/swh/lister/launchpad/tests/test_lister.py b/swh/lister/launchpad/tests/test_lister.py index 836fcec..7473b6c 100644 --- a/swh/lister/launchpad/tests/test_lister.py +++ 
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime
import json
from pathlib import Path
from typing import List

import pytest

from ..lister import LaunchpadLister, origin


class _Repo:
    """Stand-in for a launchpad entry, exposing the keys of a dict as attributes."""

    def __init__(self, d: dict):
        for key, value in d.items():
            if key == "date_last_modified":
                # the lister deals with datetime instances, not with strings
                setattr(self, key, datetime.fromisoformat(value))
            else:
                setattr(self, key, value)


class _Collection:
    """Stand-in for a launchpad collection built out of a list of dicts."""

    entries: List[_Repo]

    def __init__(self, file):
        self.entries = [_Repo(r) for r in file]

    def __getitem__(self, key):
        return self.entries[key]

    def __len__(self):
        return len(self.entries)


def _launchpad_response(datadir, datafile):
    """Load the json file ``datafile`` of ``datadir`` as a collection."""
    return _Collection(json.loads(Path(datadir, datafile).read_text()))


@pytest.fixture
def launchpad_response1(datadir):
    return _launchpad_response(datadir, "launchpad_response1.json")


@pytest.fixture
def launchpad_response2(datadir):
    return _launchpad_response(datadir, "launchpad_response2.json")


@pytest.fixture
def launchpad_bzr_response(datadir):
    return _launchpad_response(datadir, "launchpad_bzr_response.json")


def _mock_launchpad(mocker, launchpad_response, launchpad_bzr_response=None):
    """Patch the Launchpad client used by the lister.

    Returns the mocks of the git and bzr listing functions, in that order. The
    bzr one returns an empty list when no bzr response is provided.
    """
    mock_launchpad = mocker.patch("swh.lister.launchpad.lister.Launchpad")
    mock_getRepositories = mock_launchpad.git_repositories.getRepositories
    mock_getRepositories.return_value = launchpad_response
    mock_getBranches = mock_launchpad.branches.getBranches
    mock_getBranches.return_value = (
        [] if launchpad_bzr_response is None else launchpad_bzr_response
    )
    mock_launchpad.login_anonymously.return_value = mock_launchpad

    return mock_getRepositories, mock_getBranches


def _check_listed_origins(scheduler_origins, launchpad_response, vcs_type="git"):
    """Check each repository of the response got listed exactly once."""
    for repo in launchpad_response:

        filtered_origins = [
            o for o in scheduler_origins if o.url == origin(vcs_type, repo)
        ]

        assert len(filtered_origins) == 1

        assert filtered_origins[0].last_update == repo.date_last_modified
        assert filtered_origins[0].visit_type == vcs_type


def test_lister_from_configfile(swh_scheduler_config, mocker):
    load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
    load_from_envvar.return_value = {
        "scheduler": {"cls": "local", **swh_scheduler_config},
        "credentials": {},
    }
    lister = LaunchpadLister.from_configfile()
    assert lister.scheduler is not None
    assert lister.credentials is not None


def test_launchpad_full_lister(
    swh_scheduler, mocker, launchpad_response1, launchpad_bzr_response
):
    mock_getRepositories, mock_getBranches = _mock_launchpad(
        mocker, launchpad_response1, launchpad_bzr_response
    )
    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert not lister.incremental
    assert lister.updated
    assert stats.pages == 1 + 1, "Expects 1 page for git origins, another for bzr ones"
    assert stats.origins == len(launchpad_response1) + len(launchpad_bzr_response)

    mock_getRepositories.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )
    mock_getBranches.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(scheduler_origins) == len(launchpad_response1) + len(
        launchpad_bzr_response
    )

    _check_listed_origins(scheduler_origins, launchpad_response1)
    _check_listed_origins(scheduler_origins, launchpad_bzr_response, vcs_type="bzr")


def test_launchpad_incremental_lister(
    swh_scheduler,
    mocker,
    launchpad_response1,
    launchpad_response2,
    launchpad_bzr_response,
):
    mock_getRepositories, mock_getBranches = _mock_launchpad(
        mocker, launchpad_response1, launchpad_bzr_response
    )
    lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True)
    stats = lister.run()

    assert lister.incremental
    assert lister.updated
    assert stats.pages == 1 + 1, "Expects 1 page for git origins, another for bzr ones"
    len_first_runs = len(launchpad_response1) + len(launchpad_bzr_response)
    assert stats.origins == len_first_runs

    mock_getRepositories.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )
    mock_getBranches.assert_called_once_with(
        order_by="most neglected first", modified_since_date=None
    )

    lister_state = lister.get_state_from_scheduler()
    assert (
        lister_state.git_date_last_modified
        == launchpad_response1[-1].date_last_modified
    )
    assert (
        lister_state.bzr_date_last_modified
        == launchpad_bzr_response[-1].date_last_modified
    )

    mock_getRepositories, mock_getBranches = _mock_launchpad(
        mocker, launchpad_response2
    )
    lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True)
    stats = lister.run()

    assert lister.incremental
    assert lister.updated
    assert stats.pages == 2, "Empty bzr response still accounts for 1 page"
    assert stats.origins == len(launchpad_response2)

    mock_getRepositories.assert_called_once_with(
        order_by="most neglected first",
        modified_since_date=lister_state.git_date_last_modified,
    )
    # The bzr listing must resume from the date recorded by the first run too.
    mock_getBranches.assert_called_once_with(
        order_by="most neglected first",
        modified_since_date=lister_state.bzr_date_last_modified,
    )

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(scheduler_origins) == len_first_runs + len(launchpad_response2)

    _check_listed_origins(scheduler_origins, launchpad_response1)
    _check_listed_origins(scheduler_origins, launchpad_bzr_response, vcs_type="bzr")
    _check_listed_origins(scheduler_origins, launchpad_response2)


def test_launchpad_lister_invalid_url_filtering(
    swh_scheduler, mocker,
):
    invalid_origin = [_Repo({"git_https_url": "tag:launchpad.net:2008:redacted",})]
    _mock_launchpad(mocker, invalid_origin)
    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert not lister.updated
    assert stats.pages == 1 + 1, "Empty pages are still accounted for (1 git, 1 bzr)"
    assert stats.origins == 0


def test_launchpad_lister_duplicated_origin(
    swh_scheduler, mocker,
):
    # named "repo" so that the "origin" helper imported above is not shadowed
    repo = _Repo(
        {
            "git_https_url": "https://git.launchpad.net/test",
            "date_last_modified": "2021-01-14 21:05:31.231406+00:00",
        }
    )
    origins = [repo, repo]
    _mock_launchpad(mocker, origins)
    lister = LaunchpadLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert lister.updated
    assert stats.pages == 1 + 1, "Empty bzr page is still accounted for (1 git, 1 bzr)"
    assert stats.origins == 1