diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py --- a/swh/lister/cran/__init__.py +++ b/swh/lister/cran/__init__.py @@ -5,10 +5,9 @@ def register(): from .lister import CRANLister - from .models import CRANModel return { - "models": [CRANModel], + "models": [], "lister": CRANLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/cran/list_all_packages.R b/swh/lister/cran/list_all_packages.R --- a/swh/lister/cran/list_all_packages.R +++ b/swh/lister/cran/list_all_packages.R @@ -4,6 +4,6 @@ # all the packages of R and their description, then convert the API # response to JSON string and print it -db <- tools::CRAN_package_db()[, c("Package", "Version", "Title", "Description")] +db <- tools::CRAN_package_db()[, c("Package", "Version")] dbjson <- jsonlite::toJSON(db) print(dbjson) \ No newline at end of file diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,131 +1,87 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import subprocess -from typing import List, Mapping, Tuple +from typing import Dict, Iterator, List, Tuple import pkg_resources -from swh.lister.core.simple_lister import SimpleLister -from swh.lister.cran.models import CRANModel -from swh.scheduler.utils import create_task_dict +from swh.lister.pattern import StatelessLister +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) - CRAN_MIRROR = "https://cran.r-project.org" +PageType = List[Dict[str, str]] -class CRANLister(SimpleLister): - MODEL = CRANModel - LISTER_NAME = "cran" - instance = "cran" - def task_dict( - self, - origin_type, - origin_url, - version=None, - html_url=None, - policy=None, - **kwargs, - ): - """Return task format dict. This creates tasks with args and kwargs - set, for example:: - - args: [] - kwargs: { - 'url': 'https://cran.r-project.org/Packages/...', - 'artifacts': [{ - 'url': 'https://cran.r-project.org/...', - 'version': '0.0.1', - }] - } +class CRANLister(StatelessLister[PageType]): + """ + List all packages hosted on The Comprehensive R Archive Network. + """ - """ - if not policy: - policy = "oneshot" - artifact_url = html_url - assert origin_type == "tar" - return create_task_dict( - "load-cran", - policy, - url=origin_url, - artifacts=[{"url": artifact_url, "version": version}], - retries_left=3, - ) - - def safely_issue_request(self, identifier): - """Bypass the implementation. It's now the `list_packages` which - returns data. - - As an implementation detail, we cannot change simply the base - SimpleLister yet as other implementation still uses it. This shall be - part of another refactoring pass. + LISTER_NAME = "CRAN" - """ - return None - - def list_packages(self, response) -> List[Mapping[str, str]]: - """Runs R script which uses inbuilt API to return a json response - containing data about the R packages. - - Returns: - List of Dict about R packages. For example:: - - [ - { - 'Package': 'A3', - 'Version': '1.0.0', - 'Title': 'A3 package', - 'Description': ... - }, - { - 'Package': 'abbyyR', - 'Version': '0.5.4', - 'Title': 'Access to Abbyy OCR (OCR) API', - 'Description': ...' - }, - ... - ] + def __init__( + self, scheduler: SchedulerInterface, + ): + super().__init__(scheduler, url=CRAN_MIRROR, instance="cran") + def get_pages(self) -> Iterator[PageType]: """ - return read_cran_data() - - def get_model_from_repo(self, repo: Mapping[str, str]) -> Mapping[str, str]: - """Transform from repository representation to model - + Yields a single page containing all CRAN packages info. """ - logger.debug("repo: %s", repo) - origin_url, artifact_url = compute_origin_urls(repo) - package = repo["Package"] - version = repo["Version"] - return { - "uid": f"{package}-{version}", - "name": package, - "full_name": repo["Title"], - "version": version, - "html_url": artifact_url, - "origin_url": origin_url, - "origin_type": "tar", - } - - -def read_cran_data() -> List[Mapping[str, str]]: - """Execute r script to read cran listing. + yield read_cran_data() + + def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: + assert self.lister_obj.id is not None + for package_info in page: + origin_url, artifact_url = compute_origin_urls(package_info) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="tar", + extra_loader_arguments={ + "artifacts": [ + {"url": artifact_url, "version": package_info["Version"]} + ] + }, + ) + + +def read_cran_data() -> List[Dict[str, str]]: + """ + Runs R script which uses inbuilt API to return a json response + containing data about the R packages. + Returns: + List of Dict about R packages. For example:: + + [ + { + 'Package': 'A3', + 'Version': '1.0.0' + }, + { + 'Package': 'abbyyR', + 'Version': '0.5.4' + }, + ... + ] """ filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R") - logger.debug("script list-all-packages.R path: %s", filepath) + logger.debug("Executing R script %s", filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode("utf-8")) -def compute_origin_urls(repo: Mapping[str, str]) -> Tuple[str, str]: +def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]: """Compute the package url from the repo dict. Args: @@ -135,8 +91,8 @@ the tuple project url, artifact url """ - package = repo["Package"] - version = repo["Version"] + package = package_info["Package"] + version = package_info["Version"] origin_url = f"{CRAN_MIRROR}/package={package}" artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz" return origin_url, artifact_url diff --git a/swh/lister/cran/models.py b/swh/lister/cran/models.py deleted file mode 100644 --- a/swh/lister/cran/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from swh.lister.core.models import ModelBase - - -class CRANModel(ModelBase): - """a CRAN repository representation - - """ - - __tablename__ = "cran_repo" - - uid = Column(String, primary_key=True) - version = Column(String) diff --git a/swh/lister/cran/tasks.py b/swh/lister/cran/tasks.py --- a/swh/lister/cran/tasks.py +++ b/swh/lister/cran/tasks.py @@ -10,7 +10,7 @@ @shared_task(name=__name__ + ".CRANListerTask") def list_cran(**lister_args): """Lister task for the CRAN registry""" - return CRANLister(**lister_args).run() + return CRANLister.from_configfile(**lister_args).run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/cran/tests/conftest.py b/swh/lister/cran/tests/conftest.py --- a/swh/lister/cran/tests/conftest.py +++ b/swh/lister/cran/tests/conftest.py @@ -9,17 +9,3 @@ @pytest.fixture def lister_under_test(): return "cran" - - -@pytest.fixture -def lister_cran(swh_lister): - swh_lister.scheduler.create_task_type( - { - "type": "load-cran", - "description": "Load a CRAN package", - "backend_name": "swh.loader.package.cran.tasks.LoaderCRAN", - "default_interval": "1 day", - } - ) - - return swh_lister diff --git a/swh/lister/cran/tests/data/list-r-packages.json b/swh/lister/cran/tests/data/list-r-packages.json --- a/swh/lister/cran/tests/data/list-r-packages.json +++ b/swh/lister/cran/tests/data/list-r-packages.json @@ -1,39 +1,28 @@ [ + { "Package": "SeleMix", - "Version": "1.0.1", - "Title": "Selective Editing via Mixture Models", - "Description": "Detection of outliers and influential errors using a latent variable model. " + "Version": "1.0.1" }, { "Package": "plink", - "Version": "1.5-1", - "Title": "IRT Separate Calibration Linking Methods", - "Description": "Item response theory based methods are used to compute\n linking constants and conduct chain linking of unidimensional\n or multidimensional tests for multiple groups under a common\n item design. The unidimensional methods include the Mean/Mean,\n Mean/Sigma, Haebara, and Stocking-Lord methods for dichotomous\n (1PL, 2PL and 3PL) and/or polytomous (graded response, partial\n credit/generalized partial credit, nominal, and multiple-choice\n model) items. The multidimensional methods include the least\n squares method and extensions of the Haebara and Stocking-Lord\n method using single or multiple dilation parameters for\n multidimensional extensions of all the unidimensional\n dichotomous and polytomous item response models. The package\n also includes functions for importing item and/or ability\n parameters from common IRT software, conducting IRT true score\n and observed score equating, and plotting item response\n curves/surfaces, vector plots, information plots, and comparison \n plots for examining parameter drift." + "Version": "1.5-1" }, { "Package": "justifier", - "Version": "0.1.0", - "Title": "Human and Machine-Readable Justifications and Justified\nDecisions Based on 'YAML'", - "Description": "Leverages the 'yum' package to\n implement a 'YAML' ('YAML Ain't Markup Language', a human\n friendly standard for data serialization; see )\n standard for documenting justifications, such as for decisions\n taken during the planning, execution and analysis of a study\n or during the development of a behavior change intervention\n as illustrated by Marques & Peters (2019)\n . These justifications are both\n human- and machine-readable, facilitating efficient extraction\n and organisation." + "Version": "0.1.0" }, { "Package": "Records", - "Version": "1.0", - "Title": "Record Values and Record Times", - "Description": "Functions for generating k-record values and k-record\n times" + "Version": "1.0" }, { "Package": "scRNAtools", - "Version": "1.0", - "Title": "Single Cell RNA Sequencing Data Analysis Tools", - "Description": "We integrated the common analysis methods utilized in single cell RNA sequencing data, which included cluster method, principal components analysis (PCA), the filter of differentially expressed genes, pathway enrichment analysis and correlated analysis methods." + "Version": "1.0" }, - { "Package": "Deriv", - "Version": "3.9.0", - "Title": "Symbolic Differentiation", - "Description": "R-based solution for symbolic differentiation. It admits\n user-defined function as well as function substitution\n in arguments of functions to be differentiated. Some symbolic\n simplification is part of the work." + "Version": "3.9.0" } -] + +] \ No newline at end of file diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,15 +1,14 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from os import path -from unittest.mock import patch import pytest -from swh.lister.cran.lister import CRAN_MIRROR, compute_origin_urls +from swh.lister.cran.lister import CRAN_MIRROR, CRANLister, compute_origin_urls def test_cran_compute_origin_urls(): @@ -27,43 +26,32 @@ compute_origin_urls(incomplete_repo) -@patch("swh.lister.cran.lister.read_cran_data") -def test_cran_lister_cran(mock_cran, datadir, lister_cran): +def test_cran_lister_cran(datadir, swh_scheduler, mocker): with open(path.join(datadir, "list-r-packages.json")) as f: - data = json.loads(f.read()) + cran_data = json.loads(f.read()) - mock_cran.return_value = data - assert len(data) == 6 + lister = CRANLister(swh_scheduler) - lister_cran.run() + mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") - r = lister_cran.scheduler.search_tasks(task_type="load-cran") - assert len(r) == 6 + mock_cran.return_value = cran_data - for row in r: - assert row["type"] == "load-cran" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 + stats = lister.run() - # kwargs - kwargs = row["arguments"]["kwargs"] - assert len(kwargs) == 2 - assert set(kwargs.keys()) == {"url", "artifacts"} + assert stats.pages == 1 + assert stats.origins == len(cran_data) - artifacts = kwargs["artifacts"] - assert len(artifacts) == 1 + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert set(artifacts[0].keys()) == {"url", "version"} + assert len(scheduler_origins) == len(cran_data) - assert row["policy"] == "oneshot" - assert row["retries_left"] == 3 + for package_info in cran_data: + origin_url, artifact_url = compute_origin_urls(package_info) - origin_url = kwargs["url"] - record = ( - lister_cran.db_session.query(lister_cran.MODEL) - .filter(origin_url == origin_url) - .first() - ) - assert record - assert record.uid == f"{record.name}-{record.version}" + filtered_origins = [o for o in scheduler_origins if o.url == origin_url] + + assert len(filtered_origins) == 1 + + assert filtered_origins[0].extra_loader_arguments == { + "artifacts": [{"url": artifact_url, "version": package_info["Version"]}] + } diff --git a/swh/lister/cran/tests/test_tasks.py b/swh/lister/cran/tests/test_tasks.py --- a/swh/lister/cran/tests/test_tasks.py +++ b/swh/lister/cran/tests/test_tasks.py @@ -1,9 +1,9 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +from swh.lister.pattern import ListerStats def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): @@ -14,17 +14,18 @@ assert res.result == "OK" -@patch("swh.lister.cran.tasks.CRANLister") -def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): +def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): # setup the mocked CRANLister - lister.return_value = lister - lister.run.return_value = None + lister = mocker.patch("swh.lister.cran.tasks.CRANLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=17042) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task("swh.lister.cran.tasks.CRANListerTask") assert res res.wait() assert res.successful() + assert res.result == stats.dict() - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with() lister.run.assert_called_once_with()