diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,3 +3,4 @@
 requests_mock
 testing.postgresql
 sqlalchemy-stubs
+launchpadlib
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@
 iso8601
 beautifulsoup4
 pytz
+launchpadlib
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,7 @@
         lister.packagist=swh.lister.packagist:register
         lister.phabricator=swh.lister.phabricator:register
         lister.pypi=swh.lister.pypi:register
+        lister.launchpad=swh.lister.launchpad:register
     ''',
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/swh/lister/launchpad/__init__.py b/swh/lister/launchpad/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Return the model, lister class and task modules of this lister."""
+    # imports are deferred until register() is actually called
+    from .models import LaunchpadModel
+    from .lister import LaunchpadLister
+
+    return {'models': [LaunchpadModel],
+            'lister': LaunchpadLister,
+            'task_modules': ['%s.tasks' % __name__],
+            }
diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/lister.py
@@ -0,0 +1,161 @@
+# Copyright (C) 2017-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime, timedelta
+from itertools import count
+from typing import Dict, List, Optional, Tuple, Union
+
+from launchpadlib.launchpad import Launchpad  # type: ignore
+from lazr.restfulclient.resource import Collection, Entry  # type: ignore
+from sqlalchemy import func
+
+from swh.lister.core.lister_base import ListerBase
+
+from .models import LaunchpadModel
+
+
+class LaunchpadLister(ListerBase):
+    """Lister for the git repositories served by Launchpad.
+
+    Batches are requested through launchpadlib with
+    ``order_by='most neglected first'`` and a ``modified_since_date``
+    threshold that :meth:`run` moves forward after each batch.
+    """
+    MODEL = LaunchpadModel
+    LISTER_NAME = 'launchpad'
+    instance = 'launchpad'
+    launchpad: Launchpad
+    # number of batches between two intermediate db commits in run()
+    flush_packet_db = 20
+
+    def __init__(self, override_config=None):
+        super().__init__(override_config=override_config)
+        # anonymous login: no credentials are passed to launchpadlib
+        self.launchpad = Launchpad.login_anonymously(
+            'softwareheritage', 'production', version='devel')
+
+    def get_model_from_repo(self, repo: Entry
+                            ) -> Dict[str, Union[str, datetime]]:
+        """Map a launchpadlib repository entry to the model fields."""
+        return {
+            'uid': repo.unique_name,
+            'name': repo.name,
+            'full_name': repo.name,
+            'origin_url': repo.git_https_url,
+            'html_url': repo.web_link,
+            'origin_type': 'git',
+            'date_last_modified': repo.date_last_modified,
+        }
+
+    def lib_response_simplified(self, response: Collection
+                                ) -> List[Dict[str, Union[str, datetime]]]:
+        """Convert the first ``len(response.entries)`` repositories of
+        ``response`` into model dicts.
+        """
+        return [self.get_model_from_repo(repo)
+                for repo in response[:len(response.entries)]]
+
+    def get_git_repos(self, threshold: Optional[datetime]) -> Collection:
+        """Request the repositories modified since ``threshold``.
+
+        Args:
+            threshold: value sent as ``modified_since_date``
+        """
+        get_repos = self.launchpad.git_repositories.getRepositories
+
+        return get_repos(order_by='most neglected first',
+                         modified_since_date=threshold)
+
+    def db_last_threshold(self) -> Optional[datetime]:
+        """Return the greatest ``date_last_modified`` recorded in the
+        local db, or None when there is none.
+        """
+        row = self.db_session.query(
+            func.max(self.MODEL.date_last_modified)).first()
+        return row[0] if row else None
+
+    def ingest_data(self, identifier: Optional[datetime], checks=False
+                    ) -> Tuple[Collection, dict]:
+        """The core data fetch sequence. Request launchpadlib endpoint.
+        Simplify and filter response list of repositories. Inject repo
+        information into local db. Queue loader tasks for linked
+        repositories.
+
+        Args:
+            identifier: date threshold handed to :meth:`get_git_repos`
+            checks (bool): Additional checks required
+
+        Returns:
+            The launchpadlib response and the result of the db injection
+            (an empty dict when the additional checks leave no model).
+        """
+        response = self.get_git_repos(identifier)
+        models_list = self.lib_response_simplified(response)
+        models_list = self.filter_before_inject(models_list)
+        if checks:
+            models_list = self.do_additional_checks(models_list)
+            if not models_list:
+                return response, {}
+        # inject into local db
+        injected = self.inject_repo_data_into_db(models_list)
+        # queue workers
+        self.schedule_missing_tasks(models_list, injected)
+        return response, injected
+
+    def run(self, max_bound: Optional[datetime] = None):
+        """Main entry function. Ingest batches of repositories one after
+        the other, moving the date threshold forward after each batch,
+        until a batch comes back empty or the threshold stops moving.
+
+        Args:
+            max_bound: optional date to start at
+
+        Returns:
+            dict: ``{'status': 'eventful'}`` when at least one batch was
+            ingested, ``{'status': 'uneventful'}`` otherwise.
+        """
+        status = 'uneventful'
+
+        def ingest_git_repos():
+            threshold = max_bound
+            for i in count(1):
+                response, injected_repos = self.ingest_data(threshold)
+                if not response and not injected_repos:
+                    return
+
+                # batch is empty
+                if len(response.entries) == 0:
+                    return
+
+                first: datetime = response[0].date_last_modified
+                last: datetime = response[len(
+                    response.entries)-1].date_last_modified
+
+                # restart 15 seconds before the last entry of the batch
+                next_date = last - timedelta(seconds=15)
+
+                # the batch spans no more than that margin: restart from
+                # the middle of the batch instead
+                if next_date <= first:
+                    delta = last - first
+                    next_date = last - delta/2
+
+                yield i
+
+                # An unchanged threshold would repeat the very same
+                # request: stop rather than loop forever on this batch.
+                if next_date == threshold:
+                    return
+                threshold = next_date
+
+        for i in ingest_git_repos():
+            if (i % self.flush_packet_db) == 0:
+                self.db_session.commit()
+                self.db_session = self.mk_session()
+            status = 'eventful'
+
+        self.db_session.commit()
+        self.db_session = self.mk_session()
+        return {'status': status}
diff --git a/swh/lister/launchpad/models.py b/swh/lister/launchpad/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2017-2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, DateTime, String
+
+from swh.lister.core.models import ModelBase
+
+
+class LaunchpadModel(ModelBase):
+    """a Launchpad repository"""
+    __tablename__ = 'launchpad_repo'
+
+    uid = Column(String, primary_key=True)
+    # full timestamp (not a bare date): the lister feeds the greatest
+    # stored value back to Launchpad as its incremental threshold
+    date_last_modified = Column(DateTime(timezone=True), index=True)
diff --git a/swh/lister/launchpad/tasks.py b/swh/lister/launchpad/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/tasks.py
@@ -0,0 +1,38 @@
+# Copyright (C) 2017-2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from .lister import LaunchpadLister
+
+
+@shared_task(name=__name__ + '.IncrementalLaunchpadLister')
+def launchpad_lister_incremental(threshold, **lister_args):
+    """Incremental update: run the lister with ``threshold`` as bound."""
+    lister = LaunchpadLister(**lister_args)
+    return lister.run(max_bound=threshold)
+
+
+@shared_task(name=__name__ + '.FullLaunchpadLister', bind=True)
+def list_launchpad_full(self, **lister_args):
+    """Full update of Launchpad: incremental update with no threshold."""
+    self.log.debug('%s OK, spawned full task' % (self.name))
+    return launchpad_lister_incremental(threshold=None, **lister_args)
+
+
+@shared_task(name=__name__ + '.NewLaunchpadLister', bind=True)
+def list_launchpad_new(self, **lister_args):
+    """Update new entries of Launchpad, using the lister's
+    ``db_last_threshold()`` as threshold.
+    """
+    lister = LaunchpadLister(**lister_args)
+    threshold = lister.db_last_threshold()
+    self.log.debug('%s OK, spawned new task' % (self.name))
+    return launchpad_lister_incremental(threshold=threshold, **lister_args)
+
+
+@shared_task(name=__name__ + '.ping')
+def _ping():
+    """Trivial task returning 'OK'."""
+    return 'OK'
diff --git a/swh/lister/launchpad/tests/__init__.py b/swh/lister/launchpad/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/launchpad/tests/conftest.py b/swh/lister/launchpad/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/launchpad/tests/test_lister.py b/swh/lister/launchpad/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/tests/test_lister.py
@@ -0,0 +1,34 @@
+import logging
+
+from datetime import datetime
+from ..lister import LaunchpadLister
+
+logger = logging.getLogger(__name__)
+
+
+def test_launchpad_lister(swh_listers, requests_mock_datadir):
+    """Ingest the mocked Launchpad data and check the scheduled tasks."""
+    lister: LaunchpadLister = swh_listers['launchpad']
+
+    lister.ingest_data(None)
+    lister.run(max_bound=datetime.now())
+
+    r = lister.scheduler.search_tasks(task_type='load-git')
+    assert len(r) == 75
+
+    for row in r:
+        assert row['type'] == 'load-git'
+        # arguments check
+        args = row['arguments']['args']
+        assert len(args) == 0
+
+        # kwargs
+        kwargs = row['arguments']['kwargs']
+        assert set(kwargs.keys()) == {'url'}
+
+        url = kwargs['url']
+        assert url.startswith('https://git.launchpad.net')
+
+        assert row['policy'] == 'recurring'
+        assert row['priority'] is None
+        assert row['retries_left'] == 0
diff --git a/swh/lister/launchpad/tests/test_tasks.py b/swh/lister/launchpad/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/launchpad/tests/test_tasks.py
@@ -0,0 +1,49 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    """The ping task answers 'OK'."""
+    res = swh_app.send_task(
+        'swh.lister.launchpad.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.launchpad.tasks.LaunchpadLister')
+def test_new(lister, swh_app, celery_session_worker):
+    """The 'new' task reads the db threshold, then runs the lister."""
+    # setup the mocked LaunchpadLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.launchpad.tasks.NewLaunchpadLister'
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    assert lister.call_count == 2
+    lister.db_last_threshold.assert_called_once()
+    lister.run.assert_called_once()
+
+
+@patch('swh.lister.launchpad.tasks.LaunchpadLister')
+def test_full(lister, swh_app, celery_session_worker):
+    """The 'full' task runs the lister without any threshold."""
+    # setup the mocked LaunchpadLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.launchpad.tasks.FullLaunchpadLister'
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once()
+    lister.db_last_threshold.assert_not_called()
+    lister.run.assert_called_once_with(max_bound=None)