diff --git a/MANIFEST.in b/MANIFEST.in index 7266142..62515f4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,9 @@ include Makefile include README include requirements.txt include requirements-swh.txt include requirements-test.txt include version.txt include swh/lister/cran/list_all_packages.R recursive-include swh/lister/*/tests/ *.json *.html *.txt *.* * -recursive-include swh/lister/cgit/tests/data/ *.* * +recursive-include swh/lister/*/tests/data/ *.* * diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py index c31f4b7..c63fd32 100644 --- a/swh/lister/core/simple_lister.py +++ b/swh/lister/core/simple_lister.py @@ -1,80 +1,81 @@ -# Copyright (C) 2018 the Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.core import utils from .lister_base import ListerBase class SimpleLister(ListerBase): """Lister* intermediate class for any service that follows the simple, 'list in oneshot information' pattern. - Client sends a request to list repositories in oneshot - Client receives structured (json/xml/etc) response with information and stores those in db """ def list_packages(self, *args): """Listing packages method. """ pass def ingest_data(self, identifier, checks=False): """Rework the base ingest_data. Request server endpoint which gives all in one go. Simplify and filter response list of repositories. Inject repo information into local db. Queue loader tasks for linked repositories. Args: identifier: Resource identifier (unused) checks (bool): Additional checks required (unused) """ response = self.safely_issue_request(identifier) response = self.list_packages(response) if not response: return response, [] models_list = self.transport_response_simplified(response) models_list = self.filter_before_inject(models_list) all_injected = [] for models in utils.grouper(models_list, n=1000): models = list(models) logging.debug('models: %s' % len(models)) # inject into local db injected = self.inject_repo_data_into_db(models) # queue workers self.schedule_missing_tasks(models, injected) all_injected.append(injected) # flush self.db_session.commit() self.db_session = self.mk_session() return response, all_injected def transport_response_simplified(self, response): """Transform response to list for model manipulation """ return [self.get_model_from_repo(repo_name) for repo_name in response] def run(self): """Query the server which answers in one query. Stores the information, dropping actual redundant information we already have. Returns: nothing """ dump_not_used_identifier = 0 response, injected_repos = self.ingest_data(dump_not_used_identifier) if not response and not injected_repos: logging.info('No response from api server, stopping') diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py index 34eefa4..0d852f4 100644 --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -1,28 +1,37 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + from swh.scheduler.tests.conftest import * # noqa +import logging import pytest from sqlalchemy import create_engine from swh.lister import get_lister, SUPPORTED_LISTERS from swh.lister.core.models import initialize +logger = logging.getLogger(__name__) + + @pytest.fixture def swh_listers(request, postgresql_proc, postgresql, swh_scheduler): db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format( host=postgresql_proc.host, port=postgresql_proc.port, user='postgres', dbname='tests') listers = {} # Prepare schema for all listers for lister_name in SUPPORTED_LISTERS: lister = get_lister(lister_name, db_url=db_url) lister.scheduler = swh_scheduler # inject scheduler fixture listers[lister_name] = lister initialize(create_engine(db_url), drop_tables=True) return listers diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 414e833..77539f4 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,95 +1,132 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import pkg_resources import subprocess -from collections import defaultdict -from typing import List, Dict +from typing import List, Mapping from swh.lister.cran.models import CRANModel from swh.lister.core.simple_lister import SimpleLister from swh.scheduler.utils import create_task_dict logger = logging.getLogger(__name__) +def read_cran_data() -> List[Mapping[str, str]]: + """Execute r script to read cran listing. + + """ + filepath = pkg_resources.resource_filename('swh.lister.cran', + 'list_all_packages.R') + logger.debug('script list-all-packages.R path: %s', filepath) + response = subprocess.run( + filepath, stdout=subprocess.PIPE, shell=False, encoding='utf-8') + return json.loads(response.stdout) + + +def compute_package_url(repo: Mapping[str, str]) -> str: + """Compute the package url from the repo dict. + + Args: + repo: dict with key 'Package', 'Version' + + Returns: + the package url + + """ + return 'https://cran.r-project.org/src/contrib' \ + '/%(Package)s_%(Version)s.tar.gz' % repo + + class CRANLister(SimpleLister): MODEL = CRANModel LISTER_NAME = 'cran' instance = 'cran' - descriptions = defaultdict(dict) def task_dict(self, origin_type, origin_url, **kwargs): - """Return task format dict + """Return task format dict. This creates tasks with args and kwargs + set, for example:: + + args: ['package', 'https://cran.r-project.org/...', 'version'] + kwargs: {} - This is overridden from the lister_base as more information is - needed for the ingestion task creation. """ + policy = kwargs.get('policy', 'oneshot') + package = kwargs.get('name') + version = kwargs.get('version') return create_task_dict( 'load-%s' % origin_type, - kwargs.get('policy', 'recurring'), - kwargs.get('name'), origin_url, kwargs.get('version'), - project_metadata=self.descriptions[kwargs.get('name')]) + policy, package, origin_url, version, + retries_left=3, + ) + + def safely_issue_request(self, identifier): + """Bypass the implementation. It's now the `list_packages` which + returns data. + + As an implementation detail, we cannot change simply the base + SimpleLister yet as other implementation still uses it. This shall be + part of another refactoring pass. - def safely_issue_request(self, identifier: str) -> List[Dict]: + """ + return None + + def list_packages(self, *args) -> List[Mapping[str, str]]: """Runs R script which uses inbuilt API to return a json response - containing data about all the R packages. + containing data about the R packages. Returns: - List of Dict about r packages. - - Sample: - [ - { - 'Package': 'A3', - 'Version': '1.0.0', - 'Title': - 'Accurate, Adaptable, and Accessible Error Metrics for - Predictive\nModels', - 'Description': - 'Supplies tools for tabulating and analyzing the results - of predictive models. The methods employed are ... ' - }, - { - 'Package': 'abbyyR', - 'Version': '0.5.4', - 'Title': - 'Access to Abbyy Optical Character Recognition (OCR) API', - 'Description': 'Get text from images of text using Abbyy - Cloud Optical Character\n ...' - }, - ... - ] + List of Dict about r packages. For example: + + .. code-block:: python + + [ + { + 'Package': 'A3', + 'Version': '1.0.0', + 'Title': + 'Accurate, Adaptable, and Accessible Error Metrics + for Predictive\nModels', + 'Description': + 'Supplies tools for tabulating and analyzing the + results of predictive models. The methods employed + are ... ' + }, + { + 'Package': 'abbyyR', + 'Version': '0.5.4', + 'Title': + 'Access to Abbyy OCR (OCR) API', + 'Description': 'Get text from images of text using + Abbyy Cloud Optical Character\n ...' + }, + ... + ] """ - filepath = pkg_resources.resource_filename('swh.lister.cran', - 'list_all_packages.R') - logger.debug('script list-all-packages.R path: %s', filepath) - response = subprocess.run( - filepath, stdout=subprocess.PIPE, shell=False) - data = json.loads(response.stdout) - logger.debug('r-script-request: %s', data) - return data - - def get_model_from_repo(self, repo): + return read_cran_data() + + def get_model_from_repo( + self, repo: Mapping[str, str]) -> Mapping[str, str]: """Transform from repository representation to model """ - self.descriptions[repo["Package"]] = repo['Description'] - project_url = 'https://cran.r-project.org/src/contrib' \ - '/%(Package)s_%(Version)s.tar.gz' % repo + logger.debug('repo: %s', repo) + project_url = compute_package_url(repo) + package = repo['Package'] return { - 'uid': repo["Package"], - 'name': repo["Package"], - 'full_name': repo["Title"], - 'version': repo["Version"], + 'uid': package, + 'name': package, + 'full_name': repo['Title'], + 'version': repo['Version'], 'html_url': project_url, 'origin_url': project_url, - 'origin_type': 'cran', + 'origin_type': 'tar', } diff --git a/swh/lister/cran/tests/data/list-r-packages.json b/swh/lister/cran/tests/data/list-r-packages.json new file mode 100644 index 0000000..7835c37 --- /dev/null +++ b/swh/lister/cran/tests/data/list-r-packages.json @@ -0,0 +1,39 @@ +[ + { + "Package": "SeleMix", + "Version": "1.0.1", + "Title": "Selective Editing via Mixture Models", + "Description": "Detection of outliers and influential errors using a latent variable model. " + }, + { + "Package": "plink", + "Version": "1.5-1", + "Title": "IRT Separate Calibration Linking Methods", + "Description": "Item response theory based methods are used to compute\n linking constants and conduct chain linking of unidimensional\n or multidimensional tests for multiple groups under a common\n item design. The unidimensional methods include the Mean/Mean,\n Mean/Sigma, Haebara, and Stocking-Lord methods for dichotomous\n (1PL, 2PL and 3PL) and/or polytomous (graded response, partial\n credit/generalized partial credit, nominal, and multiple-choice\n model) items. The multidimensional methods include the least\n squares method and extensions of the Haebara and Stocking-Lord\n method using single or multiple dilation parameters for\n multidimensional extensions of all the unidimensional\n dichotomous and polytomous item response models. The package\n also includes functions for importing item and/or ability\n parameters from common IRT software, conducting IRT true score\n and observed score equating, and plotting item response\n curves/surfaces, vector plots, information plots, and comparison \n plots for examining parameter drift." + }, + { + "Package": "justifier", + "Version": "0.1.0", + "Title": "Human and Machine-Readable Justifications and Justified\nDecisions Based on 'YAML'", + "Description": "Leverages the 'yum' package to\n implement a 'YAML' ('YAML Ain't Markup Language', a human\n friendly standard for data serialization; see )\n standard for documenting justifications, such as for decisions\n taken during the planning, execution and analysis of a study\n or during the development of a behavior change intervention\n as illustrated by Marques & Peters (2019)\n . These justifications are both\n human- and machine-readable, facilitating efficient extraction\n and organisation." + }, + { + "Package": "Records", + "Version": "1.0", + "Title": "Record Values and Record Times", + "Description": "Functions for generating k-record values and k-record\n times" + }, + { + "Package": "scRNAtools", + "Version": "1.0", + "Title": "Single Cell RNA Sequencing Data Analysis Tools", + "Description": "We integrated the common analysis methods utilized in single cell RNA sequencing data, which included cluster method, principal components analysis (PCA), the filter of differentially expressed genes, pathway enrichment analysis and correlated analysis methods." + }, + + { + "Package": "Deriv", + "Version": "3.9.0", + "Title": "Symbolic Differentiation", + "Description": "R-based solution for symbolic differentiation. It admits\n user-defined function as well as function substitution\n in arguments of functions to be differentiated. Some symbolic\n simplification is part of the work." + } +] diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index 31552e1..688a6de 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,13 +1,65 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import pytest + +from os import path from unittest.mock import patch -from swh.lister.cran.lister import CRANLister - - -def test_task_dict(): - lister = CRANLister() - lister.descriptions['test_pack'] = 'Test Description' - with patch('swh.lister.cran.lister.create_task_dict') as mock_create_tasks: - lister.task_dict(origin_type='cran', origin_url='https://abc', - name='test_pack') - mock_create_tasks.assert_called_once_with( - 'load-cran', 'recurring', 'test_pack', 'https://abc', None, - project_metadata='Test Description') + +from swh.lister.cran.lister import compute_package_url + + +def test_cran_compute_package_url(): + url = compute_package_url({'Package': 'something', 'Version': '0.0.1'}) + + assert url == 'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % ( + 'something', + '0.0.1', + ) + + +def test_cran_compute_package_url_failure(): + for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]: + with pytest.raises(KeyError): + compute_package_url(incomplete_repo) + + +@patch('swh.lister.cran.lister.read_cran_data') +def test_cran_lister_cran(mock_cran, datadir, swh_listers): + lister = swh_listers['cran'] + + with open(path.join(datadir, 'list-r-packages.json')) as f: + data = json.loads(f.read()) + + mock_cran.return_value = data + assert len(data) == 6 + + lister.run() + + r = lister.scheduler.search_tasks(task_type='load-tar') + assert len(r) == 6 + + for row in r: + assert row['type'] == 'load-tar' + # arguments check + args = row['arguments']['args'] + assert len(args) == 3 + # ['SeleMix', + # 'https://cran.r-project.org/src/contrib/SeleMix_1.0.1.tar.gz', + # '1.0.1'] + + package = args[0] + url = args[1] + version = args[2] + + assert url == compute_package_url( + {'Package': package, 'Version': version}) + + # kwargs + kwargs = row['arguments']['kwargs'] + assert kwargs == {} + + assert row['policy'] == 'oneshot'