Changeset View
Standalone View
swh/lister/rcran/lister.py
- This file was added.
# Copyright (C) 2019 the Software Heritage developers | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import subprocess | |||||
import json | |||||
import logging | |||||
from swh.lister.rcran.models import RCRANModel | |||||
from swh.scheduler import utils | |||||
from swh.lister.core.simple_lister import SimpleLister | |||||
class RCRANLister(SimpleLister): | |||||
MODEL = RCRANModel | |||||
LISTER_NAME = 'rcran' | |||||
def __init__(self, override_config=None): | |||||
SimpleLister.__init__(self, override_config=override_config) | |||||
douardda: why implement the overloaded __init__ if it does nothing more than calling super()?
| |||||
def task_dict(self, origin_type, origin_url, **kwargs): | |||||
"""(Override) Return task format dict | |||||
This is overridden from the lister_base as more information is | |||||
needed for the ingestion task creation. | |||||
""" | |||||
_type = 'load-%s' % origin_type | |||||
Done Inline ActionsFYI, we are currently renaming the scheduler task names to something more rational (see T1508). _type = 'load-%s' % origin_type anlambert: FYI, we are currently renaming the scheduler task names to something more rational (see T1508). | |||||
Done Inline ActionsThanks for updating me about the change @anlambert. I made the recommended changes. nahimilega: Thanks for updating me about the change @anlambert. I made the recommended changes. | |||||
_policy = 'recurring' | |||||
project_name = kwargs.get('name') | |||||
project_version = kwargs.get('version') | |||||
project_metadata = kwargs.get('description') | |||||
return utils.create_task_dict( | |||||
_type, _policy, project_name, origin_url, project_version, | |||||
project_metadata=project_metadata) | |||||
Done Inline Actionsas I said in D1497 I don't like that kwargs.get is used to extract mandatory values. Also don't see the need for those project_xxx variables. This method should be a one statement only. douardda: as I said in D1497 I don't like that kwargs.get is used to extract mandatory values.
Also… | |||||
def r_script_request(self): | |||||
"""(Override) Runs r script which uses inbuilt API to return a json | |||||
response containing data about all the R packages | |||||
Returns: | |||||
JSON response | |||||
""" | |||||
response = subprocess.getoutput("./list_all_the_packages.R") | |||||
Done Inline ActionsThis will not work. You assume you are running the command from the package directory. And it is a rather bad idea to call the script directly (rather than calling the Rscript tool with the path to the script as argument). Yo need to figure out where this script will be shipped once the python package is installed and ensure you get it's proper path at execution time. I'd also rather use check_output (without a shell) here. No need for a shell in the path. douardda: This will not work. You assume you are running the command from the package directory. And it… | |||||
return json.loads(response) | |||||
def list_packages(self, response): | |||||
"""(Override) List the actual rcran origins from the response. | |||||
""" | |||||
pass | |||||
Done Inline Actions? douardda: ? | |||||
def _compute_urls(self, repo): | |||||
"""Returns a tuple (project_url, project_metadata_url) | |||||
""" | |||||
return ( | |||||
'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % | |||||
(repo["Package"], repo["Version"]) | |||||
) | |||||
Done Inline Actionswhy do you need a method for this one liner? Also, use named-based formatting here: return 'https://cran.r-project.org/src/contrib/%(Package)s_%(Version)s.tar.gz' % repo douardda: why do you need a method for this one liner?
Plus I don't like having URLs hardwritten in the… | |||||
def get_model_from_repo(self, repo): | |||||
"""(Override) Transform from repository representation to model | |||||
""" | |||||
project_url = self._compute_urls(repo) | |||||
return { | |||||
'uid': repo["Package"], | |||||
'name': repo["Package"], | |||||
'full_name': repo["Title"], | |||||
'version': repo["Version"], | |||||
'html_url': project_url, | |||||
'origin_url': project_url, | |||||
'origin_type': 'rcran', | |||||
'description': repo["Description"] | |||||
} | |||||
def transport_response_simplified(self, response): | |||||
"""(Override) Transform response to list for model manipulation | |||||
""" | |||||
return [self.get_model_from_repo(repo_name) for repo_name in response] | |||||
def ingest_data(self, identifier, checks=False): | |||||
"""(Override)Rework the base ingest_data. | |||||
Request server endpoint which gives all in one go. | |||||
Simplify and filter response list of repositories. Inject | |||||
repo information into local db. Queue loader tasks for | |||||
linked repositories. | |||||
Args: | |||||
identifier: Resource identifier (unused) | |||||
checks (bool): Additional checks required (unused) | |||||
""" | |||||
response = self.r_script_request() | |||||
if not response: | |||||
return response, [] | |||||
models_list = self.transport_response_simplified(response) | |||||
models_list = self.filter_before_inject(models_list) | |||||
all_injected = [] | |||||
for models in utils.grouper(models_list, n=10000): | |||||
models = list(models) | |||||
logging.debug('models: %s' % len(models)) | |||||
# inject into local db | |||||
injected = self.inject_repo_data_into_db(models) | |||||
# queue workers | |||||
self.create_missing_origins_and_tasks(models, injected) | |||||
all_injected.append(injected) | |||||
# flush | |||||
self.db_session.commit() | |||||
self.db_session = self.mk_session() | |||||
return response, all_injected | |||||
def transport_quota_check(self): | |||||
pass | |||||
def transport_request(self): | |||||
pass | |||||
def transport_response_to_string(self): | |||||
pass | |||||
Done Inline Actionsweird. why are these (NOP-like) methods needed? douardda: weird. why are these (NOP-like) methods needed? | |||||
Done Inline ActionsSimpleLister class is inherited, and these methods are @abc.abstractmethod hence, they are needed to be present or else code will throw an error. However, the response is not achieved from conventional methods. Thus, there is no need for these functions. nahimilega: SimpleLister class is inherited, and these methods are @abc.abstractmethod hence, they are… | |||||
Not Done Inline ActionsYeah right... this code (I mean the base classes provided for Lister implementation) really need some serious refactoring... douardda: Yeah right... this code (I mean the base classes provided for Lister implementation) really… |
why implement the overloaded init if it does nothing more than calling super()?