Page MenuHomeSoftware Heritage

D1492.diff
No OneTemporary

D1492.diff

diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,4 +4,5 @@
include requirements-swh.txt
include requirements-test.txt
include version.txt
+include swh/lister/cran/list_all_packages.R
recursive-include swh/lister/*/tests/ *.json
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@
- `swh.lister.pypi`
- `swh.lister.npm`
- `swh.lister.phabricator`
+- `swh.lister.cran`
Dependencies
------------
@@ -188,6 +189,17 @@
logging.basicConfig(level=logging.DEBUG)
gnu_lister()
+
+## lister-cran
+
+Once configured, you can execute the CRAN lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.cran.tasks import cran_lister
+
+logging.basicConfig(level=logging.DEBUG)
+cran_lister()
```
Licensing
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,7 @@
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator', 'gnu']
+ 'npm', 'phabricator', 'gnu', 'cran']
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@@ -120,6 +120,11 @@
from .gnu.lister import GNULister
_lister = GNULister(override_config=override_conf)
+ elif lister == 'cran':
+ from .cran.models import ModelBase
+ from .cran.lister import CRANLister
+ _lister = CRANLister(override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -6,6 +6,7 @@
def celery_includes():
return [
'swh.lister.bitbucket.tasks',
+ 'swh.lister.cran.tasks',
'swh.lister.debian.tasks',
'swh.lister.github.tasks',
'swh.lister.gitlab.tasks',
diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py
new file mode 100644
diff --git a/swh/lister/cran/list_all_packages.R b/swh/lister/cran/list_all_packages.R
new file mode 100755
--- /dev/null
+++ b/swh/lister/cran/list_all_packages.R
@@ -0,0 +1,9 @@
+#!/usr/bin/Rscript
+
+# This R script calls the built-in API to get the list of
+# all the R packages and their descriptions, then converts the API
+# response to a JSON string and prints it
+
+db <- tools::CRAN_package_db()[, c("Package", "Version", "Title", "Description")]
+dbjson <- jsonlite::toJSON(db)
+print(dbjson)
\ No newline at end of file
diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/lister.py
@@ -0,0 +1,119 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import subprocess
+import json
+import logging
+import pkg_resources
+
+from swh.lister.cran.models import CRANModel
+
+from swh.scheduler.utils import create_task_dict
+from swh.core import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+
+class CRANLister(SimpleLister):
+ MODEL = CRANModel
+ LISTER_NAME = 'cran'
+ instance = 'cran'
+
+ def task_dict(self, origin_type, origin_url, **kwargs):
+ """Return task format dict
+
+ This is overridden from the lister_base as more information is
+ needed for the ingestion task creation.
+ """
+ return create_task_dict(
+ 'load-%s' % origin_type, 'recurring',
+ kwargs.get('name'), origin_url, kwargs.get('version'),
+ project_metadata=kwargs.get('description'))
+
+ def r_script_request(self):
+ """Runs R script which uses inbuilt API to return a json
+ response containing data about all the R packages
+
+ Returns:
+ List of dictionaries
+ example
+ [
+ {'Package': 'A3',
+ 'Version': '1.0.0',
+ 'Title':
+ 'Accurate, Adaptable, and Accessible Error Metrics for
+ Predictive\nModels',
+ 'Description':
+ 'Supplies tools for tabulating and analyzing the results
+ of predictive models. The methods employed are ... '
+ },
+ {'Package': 'abbyyR',
+ 'Version': '0.5.4',
+ 'Title':
+ 'Access to Abbyy Optical Character Recognition (OCR) API',
+ 'Description': 'Get text from images of text using Abbyy
+ Cloud Optical Character\n ...'
+ }
+ ...
+ ]
+ """
+ file_path = pkg_resources.resource_filename('swh.lister.cran',
+ 'list_all_packages.R')
+ response = subprocess.run(file_path, stdout=subprocess.PIPE,
+ shell=False)
+ return json.loads(response.stdout)
+
+ def get_model_from_repo(self, repo):
+ """Transform from repository representation to model
+
+ """
+ project_url = 'https://cran.r-project.org/src/contrib' \
+ '/%(Package)s_%(Version)s.tar.gz' % repo
+ return {
+ 'uid': repo["Package"],
+ 'name': repo["Package"],
+ 'full_name': repo["Title"],
+ 'version': repo["Version"],
+ 'html_url': project_url,
+ 'origin_url': project_url,
+ 'origin_type': 'cran',
+ 'description': repo["Description"]
+ }
+
+ def transport_response_simplified(self, response):
+ """Transform response to list for model manipulation
+
+ """
+ return [self.get_model_from_repo(repo) for repo in response]
+
+ def ingest_data(self, identifier, checks=False):
+ """Rework the base ingest_data.
+ Run the R script, which returns the full package list in one go.
+
+ Simplify and filter response list of repositories. Inject
+ repo information into local db. Queue loader tasks for
+ linked repositories.
+
+ Args:
+ identifier: Resource identifier (unused)
+ checks (bool): Additional checks required (unused)
+
+ """
+ response = self.r_script_request()
+ if not response:
+ return response, []
+ models_list = self.transport_response_simplified(response)
+ models_list = self.filter_before_inject(models_list)
+ all_injected = []
+ for models in utils.grouper(models_list, n=10000):
+ models = list(models)
+ logging.debug('models: %s' % len(models))
+ # inject into local db
+ injected = self.inject_repo_data_into_db(models)
+ # queue workers
+ self.create_missing_origins_and_tasks(models, injected)
+ all_injected.append(injected)
+ # flush
+ self.db_session.commit()
+ self.db_session = self.mk_session()
+
+ return response, all_injected
diff --git a/swh/lister/cran/models.py b/swh/lister/cran/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from swh.lister.core.models import ModelBase
+
+
+class CRANModel(ModelBase):
+ """a CRAN repository representation
+
+ """
+ __tablename__ = 'cran_repo'
+
+ uid = Column(String, primary_key=True)
+ version = Column(String)
diff --git a/swh/lister/cran/tasks.py b/swh/lister/cran/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from swh.lister.cran.lister import CRANLister
+
+
+@app.task(name=__name__ + '.CRANListerTask')
+def cran_lister(**lister_args):
+ CRANLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ return 'OK'
diff --git a/swh/lister/cran/tests/__init__.py b/swh/lister/cran/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cran/tests/conftest.py b/swh/lister/cran/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/cran/tests/test_tasks.py b/swh/lister/cran/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cran/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+ res = swh_app.send_task(
+ 'swh.lister.cran.tasks.ping')
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == 'OK'
+
+
+@patch('swh.lister.cran.tasks.CRANLister')
+def test_lister(lister, swh_app, celery_session_worker):
+ # setup the mocked CRANLister
+ lister.return_value = lister
+ lister.run.return_value = None
+
+ res = swh_app.send_task(
+ 'swh.lister.cran.tasks.CRANListerTask')
+ assert res
+ res.wait()
+ assert res.successful()
+
+ lister.assert_called_once_with()
+ lister.db_last_index.assert_not_called()
+ lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Dec 19 2024, 9:54 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219569

Event Timeline