diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,4 +6,4 @@ include version.txt include swh/lister/cran/list_all_packages.R recursive-include swh/lister/*/tests/ *.json *.html *.txt *.* * -recursive-include swh/lister/cgit/tests/data/ *.* * +recursive-include swh/lister/*/tests/data/ *.* * diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py --- a/swh/lister/core/simple_lister.py +++ b/swh/lister/core/simple_lister.py @@ -1,4 +1,5 @@ -# Copyright (C) 2018 the Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -1,5 +1,11 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + from swh.scheduler.tests.conftest import * # noqa +import logging import pytest from sqlalchemy import create_engine @@ -8,6 +14,9 @@ from swh.lister.core.models import initialize +logger = logging.getLogger(__name__) + + @pytest.fixture def swh_listers(request, postgresql_proc, postgresql, swh_scheduler): db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format( diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,4 +1,5 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later 
version # See top-level LICENSE file for more information @@ -7,7 +8,6 @@ import pkg_resources import subprocess -from collections import defaultdict from typing import List, Dict from swh.lister.cran.models import CRANModel @@ -19,77 +19,127 @@ logger = logging.getLogger(__name__) +def read_cran_data() -> List[Dict]: + """Execute the R script listing all CRAN packages and parse its output. + + Returns: + the list of package dicts decoded from the script's JSON output + + Raises: + subprocess.CalledProcessError: if the script exits with a non-zero + status + + """ + filepath = pkg_resources.resource_filename('swh.lister.cran', + 'list_all_packages.R') + logger.debug('script list-all-packages.R path: %s', filepath) + # check=True: fail here rather than hand empty or partial output to + # json.loads when the R script exits with an error + response = subprocess.run( + filepath, stdout=subprocess.PIPE, shell=False, check=True) + return json.loads(response.stdout) + + +def compute_package_url(repo: Dict) -> str: + """Compute the package url from the repo dict. + + Args: + repo: dict with keys 'Package' and 'Version' + + Returns: + the package url + + Raises: + KeyError: if 'Package' or 'Version' is missing from repo + + """ + return 'https://cran.r-project.org/src/contrib' \ + '/%(Package)s_%(Version)s.tar.gz' % repo + + class CRANLister(SimpleLister): MODEL = CRANModel LISTER_NAME = 'cran' instance = 'cran' - descriptions = defaultdict(dict) def task_dict(self, origin_type, origin_url, **kwargs): - """Return task format dict + """Return task format dict. This creates tasks with args and kwargs set, for + example: + + .. code-block:: python + + args: ['package', 'https://cran.r-project.org/...', 'version'] + kwargs: {} - This is overridden from the lister_base as more information is - needed for the ingestion task creation. """ + policy = kwargs.get('policy', 'oneshot') + package = kwargs.get('name') + version = kwargs.get('version') return create_task_dict( 'load-%s' % origin_type, - kwargs.get('policy', 'recurring'), - kwargs.get('name'), origin_url, kwargs.get('version'), - project_metadata=self.descriptions[kwargs.get('name')]) + policy, package, origin_url, version, + retries_left=3, + ) + + def safely_issue_request(self, identifier): + """Bypass the implementation. It's now the `list_packages` which + returns data. 
+ + As an implementation detail, we cannot simply change the base + SimpleLister yet, as other implementations still use it. This shall be + part of another refactoring pass. - def safely_issue_request(self, identifier: str) -> List[Dict]: + """ + return None + + def list_packages(self, *args) -> List[Dict]: """Runs R script which uses inbuilt API to return a json response - containing data about all the R packages. + containing data about the R packages. Returns: - List of Dict about r packages. - - Sample: - [ - { - 'Package': 'A3', - 'Version': '1.0.0', - 'Title': - 'Accurate, Adaptable, and Accessible Error Metrics for - Predictive\nModels', - 'Description': - 'Supplies tools for tabulating and analyzing the results - of predictive models. The methods employed are ... ' - }, - { - 'Package': 'abbyyR', - 'Version': '0.5.4', - 'Title': - 'Access to Abbyy Optical Character Recognition (OCR) API', - 'Description': 'Get text from images of text using Abbyy - Cloud Optical Character\n ...' - }, - ... - ] + List of Dict about r packages. For example: + + .. code-block:: python + + [ + { + 'Package': 'A3', + 'Version': '1.0.0', + 'Title': + 'Accurate, Adaptable, and Accessible Error Metrics + for Predictive\nModels', + 'Description': + 'Supplies tools for tabulating and analyzing the + results of predictive models. The methods employed + are ... ' + }, + { + 'Package': 'abbyyR', + 'Version': '0.5.4', + 'Title': + 'Access to Abbyy OCR (OCR) API', + 'Description': 'Get text from images of text using + Abbyy Cloud Optical Character\n ...' + }, + ... 
+ ] """ - filepath = pkg_resources.resource_filename('swh.lister.cran', - 'list_all_packages.R') - logger.debug('script list-all-packages.R path: %s', filepath) - response = subprocess.run( - filepath, stdout=subprocess.PIPE, shell=False) - data = json.loads(response.stdout) - logger.debug('r-script-request: %s', data) - return data - - def get_model_from_repo(self, repo): + return read_cran_data() + + def get_model_from_repo(self, repo: Dict) -> Dict: """Transform from repository representation to model """ - self.descriptions[repo["Package"]] = repo['Description'] - project_url = 'https://cran.r-project.org/src/contrib' \ - '/%(Package)s_%(Version)s.tar.gz' % repo + logger.debug('repo: %s', repo) + project_url = compute_package_url(repo) + package = repo['Package'] return { - 'uid': repo["Package"], - 'name': repo["Package"], - 'full_name': repo["Title"], - 'version': repo["Version"], + 'uid': package, + 'name': package, + 'full_name': repo['Title'], + 'version': repo['Version'], 'html_url': project_url, 'origin_url': project_url, - 'origin_type': 'cran', + 'origin_type': 'tar', } diff --git a/swh/lister/cran/tests/data/list-r-packages.json b/swh/lister/cran/tests/data/list-r-packages.json new file mode 100644 --- /dev/null +++ b/swh/lister/cran/tests/data/list-r-packages.json @@ -0,0 +1,39 @@ +[ + { + "Package": "SeleMix", + "Version": "1.0.1", + "Title": "Selective Editing via Mixture Models", + "Description": "Detection of outliers and influential errors using a latent variable model. " + }, + { + "Package": "plink", + "Version": "1.5-1", + "Title": "IRT Separate Calibration Linking Methods", + "Description": "Item response theory based methods are used to compute\n linking constants and conduct chain linking of unidimensional\n or multidimensional tests for multiple groups under a common\n item design. 
The unidimensional methods include the Mean/Mean,\n Mean/Sigma, Haebara, and Stocking-Lord methods for dichotomous\n (1PL, 2PL and 3PL) and/or polytomous (graded response, partial\n credit/generalized partial credit, nominal, and multiple-choice\n model) items. The multidimensional methods include the least\n squares method and extensions of the Haebara and Stocking-Lord\n method using single or multiple dilation parameters for\n multidimensional extensions of all the unidimensional\n dichotomous and polytomous item response models. The package\n also includes functions for importing item and/or ability\n parameters from common IRT software, conducting IRT true score\n and observed score equating, and plotting item response\n curves/surfaces, vector plots, information plots, and comparison \n plots for examining parameter drift." + }, + { + "Package": "justifier", + "Version": "0.1.0", + "Title": "Human and Machine-Readable Justifications and Justified\nDecisions Based on 'YAML'", + "Description": "Leverages the 'yum' package to\n implement a 'YAML' ('YAML Ain't Markup Language', a human\n friendly standard for data serialization; see )\n standard for documenting justifications, such as for decisions\n taken during the planning, execution and analysis of a study\n or during the development of a behavior change intervention\n as illustrated by Marques & Peters (2019)\n . These justifications are both\n human- and machine-readable, facilitating efficient extraction\n and organisation." 
+ }, + { + "Package": "Records", + "Version": "1.0", + "Title": "Record Values and Record Times", + "Description": "Functions for generating k-record values and k-record\n times" + }, + { + "Package": "scRNAtools", + "Version": "1.0", + "Title": "Single Cell RNA Sequencing Data Analysis Tools", + "Description": "We integrated the common analysis methods utilized in single cell RNA sequencing data, which included cluster method, principal components analysis (PCA), the filter of differentially expressed genes, pathway enrichment analysis and correlated analysis methods." + }, + + { + "Package": "Deriv", + "Version": "3.9.0", + "Title": "Symbolic Differentiation", + "Description": "R-based solution for symbolic differentiation. It admits\n user-defined function as well as function substitution\n in arguments of functions to be differentiated. Some symbolic\n simplification is part of the work." + } +] diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,13 +1,65 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import pytest + +from os import path from unittest.mock import patch -from swh.lister.cran.lister import CRANLister - - -def test_task_dict(): - lister = CRANLister() - lister.descriptions['test_pack'] = 'Test Description' - with patch('swh.lister.cran.lister.create_task_dict') as mock_create_tasks: - lister.task_dict(origin_type='cran', origin_url='https://abc', - name='test_pack') - mock_create_tasks.assert_called_once_with( - 'load-cran', 'recurring', 'test_pack', 'https://abc', None, - project_metadata='Test Description') + +from swh.lister.cran.lister import compute_package_url + + +def 
test_cran_compute_package_url(): + url = compute_package_url({'Package': 'something', 'Version': '0.0.1'}) + + assert url == 'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % ( + 'something', + '0.0.1', + ) + + +def test_cran_compute_package_url_failure(): + for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]: + with pytest.raises(KeyError): + compute_package_url(incomplete_repo) + + +@patch('swh.lister.cran.lister.read_cran_data') +def test_cran_lister_cran(mock_cran, datadir, swh_listers): + lister = swh_listers['cran'] + + with open(path.join(datadir, 'list-r-packages.json')) as f: + data = json.load(f) + + mock_cran.return_value = data + assert len(data) == 6 + + lister.run() + + r = lister.scheduler.search_tasks(task_type='load-tar') + assert len(r) == 6 + + for row in r: + assert row['type'] == 'load-tar' + # positional task arguments: [package, url, version] + args = row['arguments']['args'] + assert len(args) == 3 + # ['SeleMix', + # 'https://cran.r-project.org/src/contrib/SeleMix_1.0.1.tar.gz', + # '1.0.1'] + + package = args[0] + url = args[1] + version = args[2] + + assert url == compute_package_url( + {'Package': package, 'Version': version}) + + # no keyword arguments are expected + kwargs = row['arguments']['kwargs'] + assert kwargs == {} + + assert row['policy'] == 'oneshot'