diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 3a46acf..fdb83e0 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,131 +1,132 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import pkg_resources import subprocess from typing import List, Mapping from swh.lister.cran.models import CRANModel from swh.lister.core.simple_lister import SimpleLister from swh.scheduler.utils import create_task_dict logger = logging.getLogger(__name__) def read_cran_data() -> List[Mapping[str, str]]: """Execute r script to read cran listing. """ filepath = pkg_resources.resource_filename('swh.lister.cran', 'list_all_packages.R') logger.debug('script list-all-packages.R path: %s', filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode('utf-8')) def compute_package_url(repo: Mapping[str, str]) -> str: """Compute the package url from the repo dict. Args: repo: dict with key 'Package', 'Version' Returns: the package url """ return 'https://cran.r-project.org/src/contrib' \ '/{Package}_{Version}.tar.gz'.format(**repo) class CRANLister(SimpleLister): MODEL = CRANModel LISTER_NAME = 'cran' instance = 'cran' def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict. This creates tasks with args and kwargs set, for example:: args: ['package', 'https://cran.r-project.org/...', 'version'] kwargs: {} """ policy = kwargs.get('policy', 'oneshot') package = kwargs.get('name') version = kwargs.get('version') + assert origin_type == 'tar' return create_task_dict( 'load-%s' % origin_type, policy, package, origin_url, version, retries_left=3, ) def safely_issue_request(self, identifier): """Bypass the implementation. 
It's now the `list_packages` which returns data. As an implementation detail, we cannot change simply the base SimpleLister yet as other implementation still uses it. This shall be part of another refactoring pass. """ return None def list_packages(self, response) -> List[Mapping[str, str]]: """Runs R script which uses inbuilt API to return a json response containing data about the R packages. Returns: List of Dict about r packages. For example: .. code-block:: python [ { 'Package': 'A3', 'Version': '1.0.0', 'Title': 'Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModels', 'Description': 'Supplies tools for tabulating and analyzing the results of predictive models. The methods employed are ... ' }, { 'Package': 'abbyyR', 'Version': '0.5.4', 'Title': 'Access to Abbyy OCR (OCR) API', 'Description': 'Get text from images of text using Abbyy Cloud Optical Character\n ...' }, ... ] """ return read_cran_data() def get_model_from_repo( self, repo: Mapping[str, str]) -> Mapping[str, str]: """Transform from repository representation to model """ logger.debug('repo: %s', repo) project_url = compute_package_url(repo) package = repo['Package'] return { 'uid': package, 'name': package, 'full_name': repo['Title'], 'version': repo['Version'], 'html_url': project_url, 'origin_url': project_url, 'origin_type': 'tar', } diff --git a/swh/lister/cran/tests/conftest.py b/swh/lister/cran/tests/conftest.py index 507fef9..34741f2 100644 --- a/swh/lister/cran/tests/conftest.py +++ b/swh/lister/cran/tests/conftest.py @@ -1 +1,23 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + from swh.lister.core.tests.conftest import * # noqa + + +@pytest.fixture +def lister_cran(swh_listers): + lister = swh_listers['cran'] + + # Add the load-tar task type in the 
scheduler backend + lister.scheduler.create_task_type({ + 'type': 'load-tar', + 'description': 'Load archive files', + 'backend_name': 'swh.loader.package.tasks.LoadArchive', + 'default_interval': '1 day', + }) + + return lister diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index 688a6de..338579f 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,65 +1,63 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import pytest from os import path from unittest.mock import patch from swh.lister.cran.lister import compute_package_url def test_cran_compute_package_url(): url = compute_package_url({'Package': 'something', 'Version': '0.0.1'}) assert url == 'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % ( 'something', '0.0.1', ) def test_cran_compute_package_url_failure(): for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]: with pytest.raises(KeyError): compute_package_url(incomplete_repo) @patch('swh.lister.cran.lister.read_cran_data') -def test_cran_lister_cran(mock_cran, datadir, swh_listers): - lister = swh_listers['cran'] - +def test_cran_lister_cran(mock_cran, datadir, lister_cran): with open(path.join(datadir, 'list-r-packages.json')) as f: data = json.loads(f.read()) mock_cran.return_value = data assert len(data) == 6 - lister.run() + lister_cran.run() - r = lister.scheduler.search_tasks(task_type='load-tar') + r = lister_cran.scheduler.search_tasks(task_type='load-tar') assert len(r) == 6 for row in r: assert row['type'] == 'load-tar' # arguments check args = row['arguments']['args'] assert len(args) == 3 # ['SeleMix', # 'https://cran.r-project.org/src/contrib/SeleMix_1.0.1.tar.gz', # '1.0.1'] package = args[0] url = args[1] version = args[2] 
assert url == compute_package_url( {'Package': package, 'Version': version}) # kwargs kwargs = row['arguments']['kwargs'] assert kwargs == {} assert row['policy'] == 'oneshot' diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 0e99a58..a3e1714 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,107 +1,108 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister from swh.lister.gnu.models import GNUModel from swh.lister.gnu.tree import GNUTree logger = logging.getLogger(__name__) class GNULister(SimpleLister): MODEL = GNUModel LISTER_NAME = 'gnu' instance = 'gnu' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz') def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. This creates tasks with args and kwargs set, for example: .. code-block:: python args: kwargs: { 'url': 'https://ftp.gnu.org/gnu/3dldf/', 'artifacts': [{ 'url': 'https://...', 'time': '2003-12-09T21:43:20+00:00', 'length': 128, 'version': '1.0.1', 'filename': 'something-1.0.1.tar.gz', }, ... ] } """ artifacts = self.gnu_tree.artifacts[origin_url] + assert origin_type == 'tar' return utils.create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'oneshot'), url=origin_url, artifacts=artifacts, retries_left=3, ) def safely_issue_request(self, identifier): """Bypass the implementation. It's now the GNUTree which deals with querying the gnu mirror. As an implementation detail, we cannot change simply the base SimpleLister as other implementation still uses it. This shall be part of another refactoring pass. 
""" return None def list_packages(self, response): """List the actual gnu origins (package name) with their name, url and associated tarballs. Args: response: Unused Returns: List of packages name, url, last modification time .. code-block:: python [ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', 'time_modified': '2003-12-09T20:43:20+00:00'}, {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', 'time_modified': '2016-12-06T02:37:10+00:00'}, ... ] """ return list(self.gnu_tree.projects.values()) def get_model_from_repo(self, repo): """Transform from repository representation to model """ return { 'uid': repo['url'], 'name': repo['name'], 'full_name': repo['name'], 'html_url': repo['url'], 'origin_url': repo['url'], 'time_last_updated': repo['time_modified'], 'origin_type': 'tar', } diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py index 507fef9..a766127 100644 --- a/swh/lister/gnu/tests/conftest.py +++ b/swh/lister/gnu/tests/conftest.py @@ -1 +1,23 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + from swh.lister.core.tests.conftest import * # noqa + + +@pytest.fixture +def lister_gnu(swh_listers): + lister = swh_listers['gnu'] + + # Add the load-tar task type in the scheduler backend + lister.scheduler.create_task_type({ + 'type': 'load-tar', + 'description': 'Load archive files', + 'backend_name': 'swh.loader.package.tasks.LoadArchive', + 'default_interval': '1 day', + }) + + return lister diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py index 327c73d..9176ae2 100644 --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -1,52 +1,50 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging logger = logging.getLogger(__name__) -def test_gnu_lister(swh_listers, requests_mock_datadir): - lister = swh_listers['gnu'] +def test_gnu_lister(lister_gnu, requests_mock_datadir): + lister_gnu.run() - lister.run() - - r = lister.scheduler.search_tasks(task_type='load-tar') + r = lister_gnu.scheduler.search_tasks(task_type='load-tar') assert len(r) == 383 for row in r: assert row['type'] == 'load-tar' # arguments check args = row['arguments']['args'] assert len(args) == 0 # kwargs kwargs = row['arguments']['kwargs'] assert set(kwargs.keys()) == {'url', 'artifacts'} url = kwargs['url'] assert url.startswith('https://ftp.gnu.org') url_suffix = url.split('https://ftp.gnu.org')[1] assert 'gnu' in url_suffix or 'old-gnu' in url_suffix artifacts = kwargs['artifacts'] # check the artifact's structure artifact = artifacts[0] assert set(artifact.keys()) == { 'url', 'length', 'time', 'filename', 'version' } for artifact in artifacts: logger.debug(artifact) # 'time' is an isoformat string now for key in ['url', 'time', 'filename', 'version']: assert isinstance(artifact[key], str) assert isinstance(artifact['length'], int) assert row['policy'] == 'oneshot' assert row['priority'] is None assert row['retries_left'] == 3 diff --git a/swh/lister/npm/tests/conftest.py b/swh/lister/npm/tests/conftest.py index 507fef9..a7f2433 100644 --- a/swh/lister/npm/tests/conftest.py +++ b/swh/lister/npm/tests/conftest.py @@ -1 +1,23 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + from swh.lister.core.tests.conftest import * # noqa + + +@pytest.fixture +def lister_npm(swh_listers): + lister = swh_listers['npm'] + 
+ # Add the load-npm task type in the scheduler backend + lister.scheduler.create_task_type({ + 'type': 'load-npm', + 'description': 'Load npm package', + 'backend_name': 'swh.loader.package.tasks.LoadNpm', + 'default_interval': '1 day', + }) + + return lister diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py index 281631a..33e0ea7 100644 --- a/swh/lister/npm/tests/test_lister.py +++ b/swh/lister/npm/tests/test_lister.py @@ -1,102 +1,100 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import re import requests_mock import unittest from swh.lister.core.tests.test_lister import HttpListerTesterBase from swh.lister.npm.lister import NpmLister, NpmIncrementalLister from typing import Any, List logger = logging.getLogger(__name__) class NpmListerTester(HttpListerTesterBase, unittest.TestCase): Lister = NpmLister test_re = re.compile(r'^.*/_all_docs\?startkey="(.+)".*') lister_subdir = 'npm' good_api_response_file = 'data/replicate.npmjs.com/api_response.json' bad_api_response_file = 'data/api_empty_response.json' first_index = 'jquery' entries_per_page = 100 @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): # disable this test from HttpListerTesterBase as # it can not succeed for the npm lister due to the # overriding of the string_pattern_check method pass class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase): Lister = NpmIncrementalLister test_re = re.compile(r'^.*/_changes\?since=([0-9]+).*') lister_subdir = 'npm' good_api_response_file = 'data/api_inc_response.json' bad_api_response_file = 'data/api_inc_empty_response.json' first_index = '6920642' entries_per_page = 100 @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): # disable this test from 
HttpListerTesterBase as # it can not succeed for the npm lister due to the # overriding of the string_pattern_check method pass def check_tasks(tasks: List[Any]): """Ensure scheduled tasks are in the expected format. """ for row in tasks: logger.debug('row: %s', row) assert row['type'] == 'load-npm' # arguments check args = row['arguments']['args'] assert len(args) == 2 package = args[0] url = args[1] assert url == 'https://www.npmjs.com/package/%s' % package # kwargs kwargs = row['arguments']['kwargs'] meta_url = kwargs['package_metadata_url'] assert meta_url == 'https://replicate.npmjs.com/%s' % package assert row['policy'] == 'recurring' assert row['priority'] is None -def test_lister_npm_basic_listing(swh_listers, requests_mock_datadir): - lister = swh_listers['npm'] +def test_lister_npm_basic_listing(lister_npm, requests_mock_datadir): + lister_npm.run() - lister.run() - - tasks = lister.scheduler.search_tasks(task_type='load-npm') + tasks = lister_npm.scheduler.search_tasks(task_type='load-npm') assert len(tasks) == 100 check_tasks(tasks) -def test_lister_npm_listing_pagination(swh_listers, requests_mock_datadir): - lister = swh_listers['npm'] +def test_lister_npm_listing_pagination(lister_npm, requests_mock_datadir): + lister = lister_npm # Patch per page pagination lister.per_page = 10 + 1 lister.PATH_TEMPLATE = lister.PATH_TEMPLATE.replace( '&limit=1001', '&limit=%s' % lister.per_page) lister.run() tasks = lister.scheduler.search_tasks(task_type='load-npm') assert len(tasks) == 2 * 10 # only 2 files with 10 results each check_tasks(tasks) diff --git a/swh/lister/pypi/tests/conftest.py b/swh/lister/pypi/tests/conftest.py index 507fef9..50a4239 100644 --- a/swh/lister/pypi/tests/conftest.py +++ b/swh/lister/pypi/tests/conftest.py @@ -1 +1,23 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level 
LICENSE file for more information + +import pytest + from swh.lister.core.tests.conftest import * # noqa + + +@pytest.fixture +def lister_pypi(swh_listers): + lister = swh_listers['pypi'] + + # Add the load-pypi task type in the scheduler backend + lister.scheduler.create_task_type({ + 'type': 'load-pypi', + 'description': 'Load PyPI package', + 'backend_name': 'swh.loader.package.tasks.LoadPyPI', + 'default_interval': '1 day', + }) + + return lister diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py index bcc3043..a67dd22 100644 --- a/swh/lister/pypi/tests/test_lister.py +++ b/swh/lister/pypi/tests/test_lister.py @@ -1,31 +1,29 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def test_pypi_lister(swh_listers, requests_mock_datadir): - lister = swh_listers['pypi'] +def test_pypi_lister(lister_pypi, requests_mock_datadir): + lister_pypi.run() - lister.run() - - r = lister.scheduler.search_tasks(task_type='load-pypi') + r = lister_pypi.scheduler.search_tasks(task_type='load-pypi') assert len(r) == 4 for row in r: assert row['type'] == 'load-pypi' # arguments check args = row['arguments']['args'] assert len(args) == 2 project = args[0] url = args[1] assert url == 'https://pypi.org/project/%s/' % project # kwargs kwargs = row['arguments']['kwargs'] meta_url = kwargs['project_metadata_url'] assert meta_url == 'https://pypi.org/pypi/%s/json' % project assert row['policy'] == 'recurring' assert row['priority'] is None