diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,4 +4,5 @@ include requirements-swh.txt include requirements-test.txt include version.txt +include swh/lister/cran/list_all_packages.R recursive-include swh/lister/*/tests/ *.json diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` +- `swh.lister.cran` Dependencies ------------ @@ -188,6 +189,18 @@ logging.basicConfig(level=logging.DEBUG) gnu_lister() +``` + +## lister-cran + +Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: + +```lang=python +import logging +from swh.lister.cran.tasks import cran_lister + +logging.basicConfig(level=logging.DEBUG) +cran_lister() ``` Licensing diff --git a/swh/lister/cli.py b/swh/lister/cli.py --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi', - 'npm', 'phabricator', 'gnu'] + 'npm', 'phabricator', 'gnu', 'cran'] @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @@ -120,6 +120,11 @@ from .gnu.lister import GNULister _lister = GNULister(override_config=override_conf) + elif lister == 'cran': + from .cran.models import ModelBase + from .cran.lister import CRANLister + _lister = CRANLister(override_config=override_conf) + + else: raise ValueError( 'Invalid lister %s: only supported listers are %s' % diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -6,6 +6,7 @@ def celery_includes(): return [ 'swh.lister.bitbucket.tasks', + 'swh.lister.cran.tasks', 'swh.lister.debian.tasks', 'swh.lister.github.tasks', 'swh.lister.gitlab.tasks', diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py new file mode 100644 diff --git a/swh/lister/cran/list_all_packages.R 
b/swh/lister/cran/list_all_packages.R
new file mode 100755
--- /dev/null
+++ b/swh/lister/cran/list_all_packages.R
@@ -0,0 +1,9 @@
+#!/usr/bin/Rscript
+
+# This R script calls the built-in API to get the list of all the
+# R packages and their description, then converts the API response
+# to a JSON string and prints it
+
+db <- tools::CRAN_package_db()[, c("Package", "Version", "Title", "Description")]
+dbjson <- jsonlite::toJSON(db)
+print(dbjson)
\ No newline at end of file
diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/lister.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import subprocess
+import json
+import logging
+import pkg_resources
+
+from swh.lister.cran.models import CRANModel
+
+from swh.scheduler.utils import create_task_dict
+from swh.core import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+
+logger = logging.getLogger(__name__)
+
+
+class CRANLister(SimpleLister):
+    """Lister for CRAN packages.
+
+    The package list comes from the ``list_all_packages.R`` script
+    shipped in the ``swh.lister.cran`` package.
+    """
+    MODEL = CRANModel
+    LISTER_NAME = 'cran'
+    instance = 'cran'
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        Args:
+            origin_type (str): suffix of the 'load-<origin_type>' task type
+            origin_url (str): url handed over to the task
+            **kwargs: 'name', 'version' and 'description' are read; a
+                missing key is passed on as None
+
+        Returns:
+            the value returned by create_task_dict
+
+        """
+        return create_task_dict(
+            'load-%s' % origin_type, 'recurring',
+            kwargs.get('name'), origin_url, kwargs.get('version'),
+            project_metadata=kwargs.get('description'))
+
+    def r_script_request(self):
+        """Run the R script listing every CRAN package and parse its output.
+
+        The script prints a JSON array on its standard output; that
+        output is decoded and returned as is.
+
+        Returns:
+            List of dictionaries, one per package, for example::
+
+                [
+                    {'Package': 'A3',
+                     'Version': '1.0.0',
+                     'Title': 'Accurate, Adaptable, and Accessible ...',
+                     'Description': 'Supplies tools for tabulating ...'},
+                    {'Package': 'abbyyR',
+                     'Version': '0.5.4',
+                     'Title': 'Access to Abbyy Optical Character ...',
+                     'Description': 'Get text from images of text ...'},
+                    ...
+                ]
+
+        Raises:
+            subprocess.CalledProcessError: the script exited with a
+                non-zero status
+            ValueError: the script output cannot be decoded as JSON
+
+        """
+        file_path = pkg_resources.resource_filename('swh.lister.cran',
+                                                    'list_all_packages.R')
+        # check=True: a failed run must not be mistaken for an empty or
+        # malformed listing further down in json.loads
+        response = subprocess.run(file_path, stdout=subprocess.PIPE,
+                                  shell=False, check=True)
+        return json.loads(response.stdout)
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model
+
+        Args:
+            repo (dict): one entry of the R script output; the 'Package',
+                'Version', 'Title' and 'Description' keys are read
+
+        Returns:
+            dict whose 'html_url' and 'origin_url' are both the url of
+            the source tarball built from the package name and version
+
+        """
+        project_url = 'https://cran.r-project.org/src/contrib' \
+                      '/%(Package)s_%(Version)s.tar.gz' % repo
+        return {
+            'uid': repo["Package"],
+            'name': repo["Package"],
+            'full_name': repo["Title"],
+            'version': repo["Version"],
+            'html_url': project_url,
+            'origin_url': project_url,
+            'origin_type': 'cran',
+            'description': repo["Description"]
+        }
+
+    def transport_response_simplified(self, response):
+        """Transform response to list for model manipulation
+
+        Args:
+            response (list): package dicts as returned by r_script_request
+
+        Returns:
+            list of model dicts, one per package
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+    def ingest_data(self, identifier, checks=False):
+        """Rework the base ingest_data.
+
+        Run the R script which gives all packages in one go. Simplify
+        and filter the resulting list of repositories. Inject repo
+        information into local db. Queue loader tasks for linked
+        repositories.
+
+        Args:
+            identifier: Resource identifier (unused)
+            checks (bool): Additional checks required (unused)
+
+        Returns:
+            tuple (response, all_injected): the decoded script output and
+            one injection result per group yielded by utils.grouper; the
+            second element is an empty list when the response is empty
+
+        """
+        response = self.r_script_request()
+        if not response:
+            return response, []
+        models_list = self.transport_response_simplified(response)
+        models_list = self.filter_before_inject(models_list)
+        all_injected = []
+        for models in utils.grouper(models_list, n=10000):
+            models = list(models)
+            logger.debug('models: %s', len(models))
+            # inject into local db
+            injected = self.inject_repo_data_into_db(models)
+            # queue workers
+            self.create_missing_origins_and_tasks(models, injected)
+            all_injected.append(injected)
+            # flush: commit this group, then continue on a new session
+            self.db_session.commit()
+            self.db_session = self.mk_session()
+
+        return response, all_injected
diff --git a/swh/lister/cran/models.py b/swh/lister/cran/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/models.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from swh.lister.core.models import ModelBase
+
+
+class CRANModel(ModelBase):
+    """a CRAN repository representation
+
+    Rows are stored in the 'cran_repo' table.
+
+    """
+    __tablename__ = 'cran_repo'
+
+    # primary key; CRANLister fills it with the package name
+    uid = Column(String, primary_key=True)
+    # CRANLister fills it with the 'Version' field of the R script output
+    version = Column(String)
diff --git a/swh/lister/cran/tasks.py b/swh/lister/cran/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from swh.lister.cran.lister import CRANLister
+
+
+@app.task(name=__name__ + '.CRANListerTask')
+def cran_lister(**lister_args):
+    """Instantiate a CRANLister with lister_args and run it."""
+    CRANLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    """Return the constant 'OK'."""
+    return 'OK'
diff --git a/swh/lister/cran/tests/__init__.py 
b/swh/lister/cran/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cran/tests/conftest.py b/swh/lister/cran/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/cran/tests/test_tasks.py b/swh/lister/cran/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/tests/test_tasks.py
@@ -0,0 +1,29 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    """The ping task of the cran lister answers 'OK'."""
+    task_name = 'swh.lister.cran.tasks.ping'
+    async_result = swh_app.send_task(task_name)
+    assert async_result
+    async_result.wait()
+    assert async_result.successful()
+    assert async_result.result == 'OK'
+
+
+@patch('swh.lister.cran.tasks.CRANLister')
+def test_lister(lister, swh_app, celery_session_worker):
+    """CRANListerTask instantiates CRANLister with no argument and runs it."""
+    # the mocked class hands itself back as the lister instance
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    task_name = 'swh.lister.cran.tasks.CRANListerTask'
+    async_result = swh_app.send_task(task_name)
+    assert async_result
+    async_result.wait()
+    assert async_result.successful()
+
+    lister.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()