diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 0e71da8..d176ca5 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,134 +1,148 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import pkg_resources import subprocess -from typing import List, Mapping +from typing import List, Mapping, Tuple from swh.lister.cran.models import CRANModel from swh.lister.core.simple_lister import SimpleLister from swh.scheduler.utils import create_task_dict logger = logging.getLogger(__name__) +CRAN_MIRROR = 'https://cran.r-project.org' + + class CRANLister(SimpleLister): MODEL = CRANModel LISTER_NAME = 'cran' instance = 'cran' - def task_dict(self, origin_type, origin_url, **kwargs): + def task_dict(self, origin_type, origin_url, version=None, html_url=None, + policy=None, **kwargs): """Return task format dict. This creates tasks with args and kwargs set, for example:: args: [] kwargs: { - 'url': 'https://cran.r-project.org/...', - 'version': '0.0.1' + 'url': 'https://cran.r-project.org/Packages/...', + 'artifacts': [{ + 'url': 'https://cran.r-project.org/...', + 'version': '0.0.1', + }] } """ - policy = kwargs.get('policy', 'oneshot') - version = kwargs.get('version') + if not policy: + policy = 'oneshot' + artifact_url = html_url assert origin_type == 'tar' return create_task_dict( 'load-cran', policy, - url=origin_url, version=version, retries_left=3 + url=origin_url, artifacts=[{ + 'url': artifact_url, + 'version': version + }], retries_left=3 ) def safely_issue_request(self, identifier): """Bypass the implementation. It's now the `list_packages` which returns data. As an implementation detail, we cannot change simply the base SimpleLister yet as other implementation still uses it. This shall be part of another refactoring pass. """ return None def list_packages(self, response) -> List[Mapping[str, str]]: """Runs R script which uses inbuilt API to return a json response containing data about the R packages. Returns: List of Dict about r packages. For example: .. code-block:: python [ { 'Package': 'A3', 'Version': '1.0.0', 'Title': 'Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModels', 'Description': 'Supplies tools for tabulating and analyzing the results of predictive models. The methods employed are ... ' }, { 'Package': 'abbyyR', 'Version': '0.5.4', 'Title': 'Access to Abbyy OCR (OCR) API', 'Description': 'Get text from images of text using Abbyy Cloud Optical Character\n ...' }, ... ] """ return read_cran_data() def get_model_from_repo( self, repo: Mapping[str, str]) -> Mapping[str, str]: """Transform from repository representation to model """ logger.debug('repo: %s', repo) - project_url = compute_package_url(repo) + origin_url, artifact_url = compute_origin_urls(repo) package = repo['Package'] version = repo['Version'] return { 'uid': f'{package}-{version}', 'name': package, 'full_name': repo['Title'], 'version': version, - 'html_url': project_url, - 'origin_url': project_url, + 'html_url': artifact_url, + 'origin_url': origin_url, 'origin_type': 'tar', } def read_cran_data() -> List[Mapping[str, str]]: """Execute r script to read cran listing. """ filepath = pkg_resources.resource_filename('swh.lister.cran', 'list_all_packages.R') logger.debug('script list-all-packages.R path: %s', filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode('utf-8')) -def compute_package_url(repo: Mapping[str, str]) -> str: +def compute_origin_urls(repo: Mapping[str, str]) -> Tuple[str, str]: """Compute the package url from the repo dict. Args: repo: dict with key 'Package', 'Version' Returns: - the package url + the tuple project url, artifact url """ - return 'https://cran.r-project.org/src/contrib' \ - '/{Package}_{Version}.tar.gz'.format(**repo) + package = repo['Package'] + version = repo['Version'] + origin_url = f'{CRAN_MIRROR}/package={package}' + artifact_url = f'{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz' + return origin_url, artifact_url diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index fbbc30c..3b6847c 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,64 +1,72 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import pytest from os import path from unittest.mock import patch -from swh.lister.cran.lister import compute_package_url +from swh.lister.cran.lister import compute_origin_urls, CRAN_MIRROR -def test_cran_compute_package_url(): - url = compute_package_url({'Package': 'something', 'Version': '0.0.1'}) +def test_cran_compute_origin_urls(): + pack = 'something' + vers = '0.0.1' + origin_url, artifact_url = compute_origin_urls({ + 'Package': pack, + 'Version': vers, + }) - assert url == 'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % ( - 'something', - '0.0.1', - ) + assert origin_url == f'{CRAN_MIRROR}/package={pack}' + assert artifact_url == f'{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz' -def test_cran_compute_package_url_failure(): +def test_cran_compute_origin_urls_failure(): for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]: with pytest.raises(KeyError): - compute_package_url(incomplete_repo) + compute_origin_urls(incomplete_repo) @patch('swh.lister.cran.lister.read_cran_data') def test_cran_lister_cran(mock_cran, datadir, lister_cran): lister = lister_cran with open(path.join(datadir, 'list-r-packages.json')) as f: data = json.loads(f.read()) mock_cran.return_value = data assert len(data) == 6 lister.run() r = lister.scheduler.search_tasks(task_type='load-cran') assert len(r) == 6 for row in r: assert row['type'] == 'load-cran' # arguments check args = row['arguments']['args'] assert len(args) == 0 # kwargs kwargs = row['arguments']['kwargs'] assert len(kwargs) == 2 - assert set(kwargs.keys()) == {'url', 'version'} + assert set(kwargs.keys()) == {'url', 'artifacts'} + + artifacts = kwargs['artifacts'] + assert len(artifacts) == 1 + + assert set(artifacts[0].keys()) == {'url', 'version'} assert row['policy'] == 'oneshot' assert row['retries_left'] == 3 origin_url = kwargs['url'] record = lister.db_session \ .query(lister.MODEL) \ .filter(origin_url == origin_url).first() assert record assert record.uid == f'{record.name}-{record.version}'