diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,17 +1,23 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import subprocess + +import os import json import logging import pkg_resources +import subprocess + from collections import defaultdict +from typing import List, Dict from swh.lister.cran.models import CRANModel -from swh.scheduler.utils import create_task_dict -from swh.core import utils from swh.lister.core.simple_lister import SimpleLister +from swh.scheduler.utils import create_task_dict + + +logger = logging.getLogger(__name__) class CRANLister(SimpleLister): @@ -32,15 +38,17 @@ kwargs.get('name'), origin_url, kwargs.get('version'), project_metadata=self.descriptions[kwargs.get('name')]) - def r_script_request(self): - """Runs R script which uses inbuilt API to return a json - response containing data about all the R packages + def safely_issue_request(self, identifier: str) -> List[Dict]: + """Runs R script which uses inbuilt API to return a json response + containing data about all the R packages. Returns: - List of dictionaries - example + List of Dict about r packages. + + Sample: [ - {'Package': 'A3', + { + 'Package': 'A3', 'Version': '1.0.0', 'Title': 'Accurate, Adaptable, and Accessible Error Metrics for @@ -48,22 +56,27 @@ 'Description': 'Supplies tools for tabulating and analyzing the results of predictive models. The methods employed are ... ' - } - {'Package': 'abbyyR', + }, + { + 'Package': 'abbyyR', 'Version': '0.5.4', 'Title': 'Access to Abbyy Optical Character Recognition (OCR) API', 'Description': 'Get text from images of text using Abbyy - Cloud Optical Character\n ...' - } + Cloud Optical Character\n ...' + }, ... ] + """ - file_path = pkg_resources.resource_filename('swh.lister.cran', - 'list_all_packages.R') - response = subprocess.run(file_path, stdout=subprocess.PIPE, - shell=False) - return json.loads(response.stdout) + filepath = pkg_resources.resource_filename('swh.lister.cran', + 'list_all_packages.R') + logger.debug('script list-all-packages.R path: %s', filepath) + response = subprocess.run( + filepath, stdout=subprocess.PIPE, shell=False) + data = json.loads(response.stdout) + logger.debug('r-script-request: %s', data) + return data def get_model_from_repo(self, repo): """Transform from repository representation to model @@ -87,36 +100,3 @@ """ return [self.get_model_from_repo(repo) for repo in response] - - def ingest_data(self, identifier, checks=False): - """Rework the base ingest_data. - Request server endpoint which gives all in one go. - - Simplify and filter response list of repositories. Inject - repo information into local db. Queue loader tasks for - linked repositories. - - Args: - identifier: Resource identifier (unused) - checks (bool): Additional checks required (unused) - - """ - response = self.r_script_request() - if not response: - return response, [] - models_list = self.transport_response_simplified(response) - models_list = self.filter_before_inject(models_list) - all_injected = [] - for models in utils.grouper(models_list, n=10000): - models = list(models) - logging.debug('models: %s' % len(models)) - # inject into local db - injected = self.inject_repo_data_into_db(models) - # queue workers - self.create_missing_origins_and_tasks(models, injected) - all_injected.append(injected) - # flush - self.db_session.commit() - self.db_session = self.mk_session() - - return response, all_injected