diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py
index c63fd32..80e2040 100644
--- a/swh/lister/core/simple_lister.py
+++ b/swh/lister/core/simple_lister.py
@@ -1,81 +1,82 @@
 # Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import logging

-from swh.core import utils
+from typing import Any, List
+from swh.core import utils

 from .lister_base import ListerBase


 class SimpleLister(ListerBase):
     """Lister* intermediate class for any service that follows the simple,
        'list in oneshot information' pattern.

     - Client sends a request to list repositories in oneshot

     - Client receives structured (json/xml/etc) response with
       information and stores those in db

     """
-    def list_packages(self, *args):
+    def list_packages(self, response: Any) -> List[Any]:
         """Listing packages method.

         """
         pass

     def ingest_data(self, identifier, checks=False):
         """Rework the base ingest_data.
            Request server endpoint which gives all in one go.

            Simplify and filter response list of repositories.  Inject
            repo information into local db. Queue loader tasks for
            linked repositories.

         Args:
             identifier: Resource identifier (unused)
             checks (bool): Additional checks required (unused)

         """
         response = self.safely_issue_request(identifier)
         response = self.list_packages(response)
         if not response:
             return response, []
         models_list = self.transport_response_simplified(response)
         models_list = self.filter_before_inject(models_list)
         all_injected = []
         for models in utils.grouper(models_list, n=1000):
             models = list(models)
-            logging.debug('models: %s' % len(models))
+            logging.debug('models: %s', len(models))
             # inject into local db
             injected = self.inject_repo_data_into_db(models)
             # queue workers
             self.schedule_missing_tasks(models, injected)
             all_injected.append(injected)
             # flush
             self.db_session.commit()
             self.db_session = self.mk_session()

         return response, all_injected

     def transport_response_simplified(self, response):
         """Transform response to list for model manipulation

         """
         return [self.get_model_from_repo(repo_name) for repo_name in response]

     def run(self):
         """Query the server which answers in one query.  Stores the
            information, dropping actual redundant information we
            already have.

         Returns:
             nothing

         """
         dump_not_used_identifier = 0
         response, injected_repos = self.ingest_data(dump_not_used_identifier)
         if not response and not injected_repos:
             logging.info('No response from api server, stopping')
diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
index 5c8e6a2..9edd35a 100644
--- a/swh/lister/cran/lister.py
+++ b/swh/lister/cran/lister.py
@@ -1,131 +1,131 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json
 import logging
 import pkg_resources
 import subprocess

 from typing import List, Mapping

 from swh.lister.cran.models import CRANModel
 from swh.lister.core.simple_lister import SimpleLister
 from swh.scheduler.utils import create_task_dict


 logger = logging.getLogger(__name__)


 def read_cran_data() -> List[Mapping[str, str]]:
     """Execute r script to read cran listing.

     """
     filepath = pkg_resources.resource_filename('swh.lister.cran',
                                                'list_all_packages.R')
     logger.debug('script list-all-packages.R path: %s', filepath)
     response = subprocess.run(
         filepath, stdout=subprocess.PIPE, shell=False)
     return json.loads(response.stdout.decode('utf-8'))


 def compute_package_url(repo: Mapping[str, str]) -> str:
     """Compute the package url from the repo dict.

     Args:
         repo: dict with key 'Package', 'Version'

     Returns:
         the package url

     """
     return 'https://cran.r-project.org/src/contrib' \
-           '/%(Package)s_%(Version)s.tar.gz'.format(repo)
+           '/%(Package)s_%(Version)s.tar.gz' % repo


 class CRANLister(SimpleLister):
     MODEL = CRANModel
     LISTER_NAME = 'cran'
     instance = 'cran'

     def task_dict(self, origin_type, origin_url, **kwargs):
         """Return task format dict.

         This creates tasks with args and kwargs set, for example::

             args: ['package', 'https://cran.r-project.org/...', 'version']
             kwargs: {}

         """
         policy = kwargs.get('policy', 'oneshot')
         package = kwargs.get('name')
         version = kwargs.get('version')
         return create_task_dict(
             'load-%s' % origin_type,
             policy, package, origin_url, version,
             retries_left=3,
         )

     def safely_issue_request(self, identifier):
         """Bypass the implementation. It's now the `list_packages` which
         returns data.

         As an implementation detail, we cannot change simply the base
         SimpleLister yet as other implementation still uses it. This shall be
         part of another refactoring pass.

         """
         return None

-    def list_packages(self, *args) -> List[Mapping[str, str]]:
+    def list_packages(self, response) -> List[Mapping[str, str]]:
         """Runs R script which uses inbuilt API to return a json response
            containing data about the R packages.

         Returns:
             List of Dict about r packages. For example:

             .. code-block:: python

                 [
                     {
                         'Package': 'A3',
                         'Version': '1.0.0',
                         'Title':
                             'Accurate, Adaptable, and Accessible Error Metrics
                              for Predictive\nModels',
                         'Description':
                             'Supplies tools for tabulating and analyzing the
                              results of predictive models. The methods employed
                              are ... '
                     },
                     {
                         'Package': 'abbyyR',
                         'Version': '0.5.4',
                         'Title':
                             'Access to Abbyy OCR (OCR) API',
                         'Description': 'Get text from images of text using
                                         Abbyy Cloud Optical Character\n ...'
                     },
                     ...

                 ]

         """
         return read_cran_data()

     def get_model_from_repo(
             self, repo: Mapping[str, str]) -> Mapping[str, str]:
         """Transform from repository representation to model

         """
         logger.debug('repo: %s', repo)
         project_url = compute_package_url(repo)
         package = repo['Package']
         return {
             'uid': package,
             'name': package,
             'full_name': repo['Title'],
             'version': repo['Version'],
             'html_url': project_url,
             'origin_url': project_url,
             'origin_type': 'tar',
         }
diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py
index 67af7b5..5601976 100644
--- a/swh/lister/debian/__init__.py
+++ b/swh/lister/debian/__init__.py
@@ -1,58 +1,58 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-from typing import Any, Mapping, Optional
+from typing import Any, List, Mapping, Optional


 def debian_init(db_engine, lister=None,
                 override_conf: Optional[Mapping[str, Any]] = None,
-                distributions: Optional[str] = ['stretch', 'buster'],
-                area_names: Optional[str] = ['main', 'contrib', 'non-free']):
+                distributions: List[str] = ['stretch', 'buster'],
+                area_names: List[str] = ['main', 'contrib', 'non-free']):
     """Initialize the debian data model.

     Args:
         db_engine: SQLAlchemy manipulation database object
         lister: Debian lister instance. None by default.
         override_conf: Override conf to pass to instantiate a lister.
             None by default
         distributions: Default distribution to build

     """
     from swh.storage.schemata.distribution import (
         Distribution, Area)

     if lister is None:
         from .lister import DebianLister
         lister = DebianLister(override_config=override_conf)

     if not lister.db_session\
                  .query(Distribution)\
                  .filter(Distribution.name == 'Debian')\
                  .one_or_none():

         d = Distribution(
             name='Debian',
             type='deb',
             mirror_uri='http://deb.debian.org/debian/')
         lister.db_session.add(d)

         areas = []
         for distribution_name in distributions:
             for area_name in area_names:
                 areas.append(Area(
                     name='%s/%s' % (distribution_name, area_name),
                     distribution=d,
                 ))
         lister.db_session.add_all(areas)
         lister.db_session.commit()


 def register() -> Mapping[str, Any]:
     from .lister import DebianLister
     return {'models': [DebianLister.MODEL],
             'lister': DebianLister,
             'task_modules': ['%s.tasks' % __name__],
             'init': debian_init}