diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` +- `swh.lister.rcran` Dependencies ------------ @@ -177,6 +178,18 @@ incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` +## lister-rcran + +Once configured, you can execute an RCRAN lister using the following instructions in a `python3` script: + +```lang=python +import logging +from swh.lister.rcran.tasks import rcran_lister + +logging.basicConfig(level=logging.DEBUG) +rcran_lister() +``` + Licensing --------- diff --git a/swh/lister/cli.py b/swh/lister/cli.py --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi', - 'npm', 'phabricator'] + 'npm', 'phabricator', 'rcran'] @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @@ -115,6 +115,11 @@ api_token='', override_config=override_conf) + elif lister == 'rcran': + from .rcran.models import ModelBase + from .rcran.lister import RCRANLister + _lister = RCRANLister(override_config=override_conf) + else: raise ValueError( 'Invalid lister %s: only supported listers are %s' % diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -12,4 +12,5 @@ 'swh.lister.npm.tasks', 'swh.lister.pypi.tasks', 'swh.lister.phabricator.tasks', + 'swh.lister.rcran.tasks', ] diff --git a/swh/lister/rcran/__init__.py b/swh/lister/rcran/__init__.py new file mode 100644 diff --git a/swh/lister/rcran/list_all_the_packages.R b/swh/lister/rcran/list_all_the_packages.R new file mode 100755 --- /dev/null +++ b/swh/lister/rcran/list_all_the_packages.R @@ -0,0 +1,5 @@ +#!/usr/bin/Rscript + +db <- tools::CRAN_package_db() +dbjson <- jsonlite::toJSON(db) +write(dbjson, stdout()) \ No newline at end of file diff --git 
a/swh/lister/rcran/lister.py b/swh/lister/rcran/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rcran/lister.py
@@ -0,0 +1,179 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import json
+import logging
+import os
+import subprocess
+
+from swh.lister.rcran.models import RCRANModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+logger = logging.getLogger(__name__)
+
+# The R script ships in the same directory as this module. Resolve it from
+# the module location so that running it does not depend on the current
+# working directory of the process.
+R_SCRIPT_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), 'list_all_the_packages.R')
+
+
+class RCRANLister(SimpleLister):
+    """Lister for the R packages referenced by CRAN.
+
+    The package index is retrieved in one go by running an R script, so
+    `ingest_data` is overridden and does not use the `transport_*` hooks.
+
+    """
+    MODEL = RCRANModel
+    LISTER_NAME = 'rcran'
+
+    def __init__(self, override_config=None):
+        SimpleLister.__init__(self, override_config=override_config)
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        Args:
+            origin_type (str): used to build the task type,
+                'load-<origin_type>'
+            origin_url (str): url of the origin to load
+            **kwargs: 'name', 'version' and 'description' are read and
+                forwarded to the task; a missing key is passed as None
+
+        Returns:
+            The result of swh.scheduler.utils.create_task_dict
+
+        """
+        _type = 'load-%s' % origin_type
+        _policy = 'recurring'
+        project_name = kwargs.get('name')
+        project_version = kwargs.get('version')
+        project_metadata = kwargs.get('description')
+        return utils.create_task_dict(
+            _type, _policy, project_name, origin_url, project_version,
+            project_metadata=project_metadata)
+
+    def r_script_request(self):
+        """Run the R script which uses the R inbuilt API to list all the
+           R packages, and parse its JSON output.
+
+        The script is executed directly, without a shell, and only its
+        standard output is parsed: what R writes on stderr is not mixed
+        into the JSON document.
+
+        Returns:
+            The decoded JSON response
+
+        Raises:
+            OSError: the script cannot be executed
+            subprocess.CalledProcessError: the script exited with a
+                non-zero status
+            ValueError: the script output is not valid JSON
+
+        """
+        response = subprocess.check_output(
+            [R_SCRIPT_PATH], universal_newlines=True)
+        return json.loads(response)
+
+    def list_packages(self, response):
+        """List the actual rcran origins from the response.
+
+        No-op: `ingest_data` of this class does not call it.
+
+        """
+        pass
+
+    def _compute_url(self, repo):
+        """Returns the package tarball URL
+
+        Args:
+            repo (dict): package entry; 'Package' and 'Version' are read
+
+        """
+        return (
+            'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' %
+            (repo["Package"], repo["Version"])
+        )
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model
+
+        Args:
+            repo (dict): package entry; 'Package', 'Title', 'Version'
+                and 'Description' are read
+
+        Returns:
+            dict of model fields for that package
+
+        """
+        project_url = self._compute_url(repo)
+        return {
+            'uid': repo["Package"],
+            'name': repo["Package"],
+            'full_name': repo["Title"],
+            'version': repo["Version"],
+            'html_url': project_url,
+            'origin_url': project_url,
+            'origin_type': 'rcran',
+            'description': repo["Description"]
+        }
+
+    def transport_response_simplified(self, response):
+        """Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+    def ingest_data(self, identifier, checks=False):
+        """Rework the base ingest_data.
+           Request server endpoint which gives all in one go.
+
+           Simplify and filter response list of repositories.  Inject
+           repo information into local db. Queue loader tasks for
+           linked repositories.
+
+        Args:
+            identifier: Resource identifier (unused)
+            checks (bool): Additional checks required (unused)
+
+        Returns:
+            A (response, injected) pair: the decoded script output and
+            the list of what inject_repo_data_into_db returned for each
+            batch (empty list when the response is empty)
+
+        """
+        response = self.r_script_request()
+        if not response:
+            return response, []
+        models_list = self.transport_response_simplified(response)
+        models_list = self.filter_before_inject(models_list)
+        all_injected = []
+        # work by batches, with one db commit per batch
+        for models in utils.grouper(models_list, n=10000):
+            models = list(models)
+            logger.debug('models: %s', len(models))
+            # inject into local db
+            injected = self.inject_repo_data_into_db(models)
+            # queue workers
+            self.create_missing_origins_and_tasks(models, injected)
+            all_injected.append(injected)
+            # flush
+            self.db_session.commit()
+            self.db_session = self.mk_session()
+
+        return response, all_injected
+
+    def transport_quota_check(self):
+        pass
+
+    def transport_request(self):
+        pass
+
+    def transport_response_to_string(self):
+        pass
diff --git a/swh/lister/rcran/models.py b/swh/lister/rcran/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rcran/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from swh.lister.core.models import ModelBase
+
+
+class RCRANModel(ModelBase):
+    """a RCRAN repository representation
+
+    """
+    __tablename__ = 'rcran_repo'
+
+    uid = Column(String, primary_key=True)
+    version = Column(String)
diff --git a/swh/lister/rcran/tasks.py b/swh/lister/rcran/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rcran/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from swh.lister.rcran.lister import RCRANLister
+
+
+@app.task(name=__name__ + '.RCRANListerTask')
+def rcran_lister(**lister_args):
+    """Instantiate a RCRANLister with lister_args and run it"""
+    RCRANLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    """Trivial task, returns 'OK'"""
+    return 'OK'
diff --git a/swh/lister/rcran/tests/__init__.py b/swh/lister/rcran/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/rcran/tests/conftest.py b/swh/lister/rcran/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rcran/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/rcran/tests/test_tasks.py b/swh/lister/rcran/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rcran/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    res = swh_app.send_task(
+        'swh.lister.rcran.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.rcran.tasks.RCRANLister')
+def test_lister(lister, swh_app, celery_session_worker):
+    # setup the mocked RCRANLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.rcran.tasks.RCRANListerTask')
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()