diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py index 72f5b97..c31f4b7 100644 --- a/swh/lister/core/simple_lister.py +++ b/swh/lister/core/simple_lister.py @@ -1,74 +1,80 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.core import utils from .lister_base import ListerBase class SimpleLister(ListerBase): """Lister* intermediate class for any service that follows the simple, 'list in oneshot information' pattern. - Client sends a request to list repositories in oneshot - Client receives structured (json/xml/etc) response with information and stores those in db """ def list_packages(self, *args): """Listing packages method. """ pass def ingest_data(self, identifier, checks=False): """Rework the base ingest_data. Request server endpoint which gives all in one go. Simplify and filter response list of repositories. Inject repo information into local db. Queue loader tasks for linked repositories. Args: identifier: Resource identifier (unused) checks (bool): Additional checks required (unused) """ response = self.safely_issue_request(identifier) response = self.list_packages(response) if not response: return response, [] models_list = self.transport_response_simplified(response) models_list = self.filter_before_inject(models_list) all_injected = [] for models in utils.grouper(models_list, n=1000): models = list(models) logging.debug('models: %s' % len(models)) # inject into local db injected = self.inject_repo_data_into_db(models) # queue workers self.schedule_missing_tasks(models, injected) all_injected.append(injected) # flush self.db_session.commit() self.db_session = self.mk_session() return response, all_injected + def transport_response_simplified(self, response): + """Transform response to list for model manipulation + + """ + return [self.get_model_from_repo(repo_name) for repo_name in response] + def run(self): """Query the server which answers in one query. Stores the information, dropping actual redundant information we already have. Returns: nothing """ dump_not_used_identifier = 0 response, injected_repos = self.ingest_data(dump_not_used_identifier) if not response and not injected_repos: logging.info('No response from api server, stopping') diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 2c21852..414e833 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,101 +1,95 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import pkg_resources import subprocess from collections import defaultdict from typing import List, Dict from swh.lister.cran.models import CRANModel from swh.lister.core.simple_lister import SimpleLister from swh.scheduler.utils import create_task_dict logger = logging.getLogger(__name__) class CRANLister(SimpleLister): MODEL = CRANModel LISTER_NAME = 'cran' instance = 'cran' descriptions = defaultdict(dict) def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. """ return create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'recurring'), kwargs.get('name'), origin_url, kwargs.get('version'), project_metadata=self.descriptions[kwargs.get('name')]) def safely_issue_request(self, identifier: str) -> List[Dict]: """Runs R script which uses inbuilt API to return a json response containing data about all the R packages. Returns: List of Dict about r packages. Sample: [ { 'Package': 'A3', 'Version': '1.0.0', 'Title': 'Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModels', 'Description': 'Supplies tools for tabulating and analyzing the results of predictive models. The methods employed are ... ' }, { 'Package': 'abbyyR', 'Version': '0.5.4', 'Title': 'Access to Abbyy Optical Character Recognition (OCR) API', 'Description': 'Get text from images of text using Abbyy Cloud Optical Character\n ...' }, ... ] """ filepath = pkg_resources.resource_filename('swh.lister.cran', 'list_all_packages.R') logger.debug('script list-all-packages.R path: %s', filepath) response = subprocess.run( filepath, stdout=subprocess.PIPE, shell=False) data = json.loads(response.stdout) logger.debug('r-script-request: %s', data) return data def get_model_from_repo(self, repo): """Transform from repository representation to model """ self.descriptions[repo["Package"]] = repo['Description'] project_url = 'https://cran.r-project.org/src/contrib' \ '/%(Package)s_%(Version)s.tar.gz' % repo return { 'uid': repo["Package"], 'name': repo["Package"], 'full_name': repo["Title"], 'version': repo["Version"], 'html_url': project_url, 'origin_url': project_url, 'origin_type': 'cran', } - - def transport_response_simplified(self, response): - """Transform response to list for model manipulation - - """ - return [self.get_model_from_repo(repo) for repo in response] diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 11daa2f..8609895 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,222 +1,216 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import gzip import json import requests from pathlib import Path from collections import defaultdict from .models import GNUModel from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister class GNULister(SimpleLister): MODEL = GNUModel LISTER_NAME = 'gnu' TREE_URL = 'https://ftp.gnu.org/tree.json.gz' BASE_URL = 'https://ftp.gnu.org' instance = 'gnu' tarballs = defaultdict(dict) # Dict of key with project name value the # associated is list of tarballs of package to ingest from the gnu mirror def task_dict(self, origin_type, origin_url, **kwargs): """ Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. """ return utils.create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'oneshot'), kwargs.get('name'), origin_url, tarballs=self.tarballs[kwargs.get('name')]) def safely_issue_request(self, identifier): ''' Download and unzip tree.json.gz file and returns its content in JSON format File content in dictionary format Args: identifier: resource identifier (unused) Returns: Server response ''' response = requests.get(self.TREE_URL, allow_redirects=True) uncompressed_content = gzip.decompress(response.content) return json.loads(uncompressed_content.decode('utf-8')) def list_packages(self, response): """ List the actual gnu origins with their names,url and the list of all the tarball for a package from the response. Args: response : File structure of the website in dictionary format Returns: A list of all the packages with their names, url of their root directory and the tarballs present for the particular package. [ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', 'tarballs': [ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', 'date': '1071002600'}, {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', 'date': '1071078759'}} ] }, {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', 'tarballs': [ {'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', 'date': '1461357336'}, {'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz', 'date': '1480991830'} ] ] """ response = filter_directories(response) packages = [] for directory in response: content = directory['contents'] for repo in content: if repo['type'] == 'directory': package_url = '%s/%s/%s/' % (self.BASE_URL, directory['name'], repo['name']) package_tarballs = find_tarballs( repo['contents'], package_url) if package_tarballs != []: repo_details = { 'name': repo['name'], 'url': package_url, 'time_modified': repo['time'], } self.tarballs[repo['name']] = package_tarballs packages.append(repo_details) random.shuffle(packages) return packages def get_model_from_repo(self, repo): """Transform from repository representation to model """ return { 'uid': repo['name'], 'name': repo['name'], 'full_name': repo['name'], 'html_url': repo['url'], 'origin_url': repo['url'], 'time_last_updated': int(repo['time_modified']), 'origin_type': 'tar', } - def transport_response_simplified(self, response): - """Transform response to list for model manipulation - - """ - return [self.get_model_from_repo(repo) for repo in response] - def find_tarballs(package_file_structure, url): '''Recursively lists tarballs present in the folder and subfolders for a particular package url. Args package_file_structure: File structure of the package root directory url: URL of the corresponding package Returns List of tarball urls and their associated metadata (time, length). For example: [ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', 'time': 1071002600, 'length': 543}, {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', 'time': 1071078759, 'length': 456}, {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz', 'time': 1074278633, 'length': 251}, ... ] ''' tarballs = [] for single_file in package_file_structure: filetype = single_file['type'] filename = single_file['name'] if filetype == 'file': if file_extension_check(filename): tarballs.append({ 'archive': url + filename, 'time': int(single_file['time']), 'length': int(single_file['size']), }) # It will recursively check for tarballs in all sub-folders elif filetype == 'directory': tarballs_in_dir = find_tarballs( single_file['contents'], url + filename + '/') tarballs.extend(tarballs_in_dir) return tarballs def filter_directories(response): ''' Keep only gnu and old-gnu folders from JSON ''' final_response = [] file_system = response[0]['contents'] for directory in file_system: if directory['name'] in ('gnu', 'old-gnu'): final_response.append(directory) return final_response def file_extension_check(file_name): ''' Check for the extension of the file, if the file is of zip format of .tar.x format, where x could be anything, then returns true. Args: file_name : name of the file for which the extensions is needs to be checked. Returns: True or False example file_extension_check('abc.zip') will return True file_extension_check('abc.tar.gz') will return True file_extension_check('abc.tar.gz.sig') will return False ''' file_suffixes = Path(file_name).suffixes if len(file_suffixes) == 1 and file_suffixes[-1] == '.zip': return True elif len(file_suffixes) > 1: if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar': return True return False diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index c159ca6..1620e24 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -1,85 +1,79 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import json from .models import PackagistModel from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister from swh.lister.core.lister_transports import ListerOnePageApiTransport class PackagistLister(ListerOnePageApiTransport, SimpleLister): """List packages available in the Packagist package manager. The lister sends the request to the url present in the class variable `PAGE`, to receive a list of all the package names present in the Packagist package manager. Iterates over all the packages and constructs the metadata url of the package from the name of the package and creates a loading task. Task: Type: load-packagist Policy: recurring Args: Example: Type: load-packagist Policy: recurring Args: 'hypejunction/hypegamemechanics' 'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json' """ MODEL = PackagistModel LISTER_NAME = 'packagist' PAGE = 'https://packagist.org/packages/list.json' instance = 'packagist' def __init__(self, override_config=None): ListerOnePageApiTransport .__init__(self) SimpleLister.__init__(self, override_config=override_config) def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. """ return utils.create_task_dict('load-%s' % origin_type, kwargs.get('policy', 'recurring'), kwargs.get('name'), origin_url) def list_packages(self, response): """List the actual packagist origins from the response. """ response = json.loads(response.text) packages = [name for name in response['packageNames']] random.shuffle(packages) return packages def get_model_from_repo(self, repo_name): """Transform from repository representation to model """ url = 'https://repo.packagist.org/p/%s.json' % repo_name return { 'uid': repo_name, 'name': repo_name, 'full_name': repo_name, 'html_url': url, 'origin_url': url, 'origin_type': 'packagist', } - - def transport_response_simplified(self, response): - """Transform response to list for model manipulation - - """ - return [self.get_model_from_repo(repo_name) for repo_name in response] diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index c8e0e0d..4da6e55 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,76 +1,70 @@ # Copyright (C) 2018-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import xmltodict from .models import PyPIModel from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister from swh.lister.core.lister_transports import ListerOnePageApiTransport class PyPILister(ListerOnePageApiTransport, SimpleLister): MODEL = PyPIModel LISTER_NAME = 'pypi' PAGE = 'https://pypi.org/simple/' instance = 'pypi' # As of today only the main pypi.org is used def __init__(self, override_config=None): ListerOnePageApiTransport .__init__(self) SimpleLister.__init__(self, override_config=override_config) def task_dict(self, origin_type, origin_url, **kwargs): """(Override) Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. """ _type = 'load-%s' % origin_type _policy = kwargs.get('policy', 'recurring') project_name = kwargs.get('name') project_metadata_url = kwargs.get('html_url') return utils.create_task_dict( _type, _policy, project_name, origin_url, project_metadata_url=project_metadata_url) def list_packages(self, response): """(Override) List the actual pypi origins from the response. """ result = xmltodict.parse(response.content) _packages = [p['#text'] for p in result['html']['body']['a']] random.shuffle(_packages) return _packages def _compute_urls(self, repo_name): """Returns a tuple (project_url, project_metadata_url) """ return ( 'https://pypi.org/project/%s/' % repo_name, 'https://pypi.org/pypi/%s/json' % repo_name ) def get_model_from_repo(self, repo_name): """(Override) Transform from repository representation to model """ project_url, project_url_meta = self._compute_urls(repo_name) return { 'uid': repo_name, 'name': repo_name, 'full_name': repo_name, 'html_url': project_url_meta, 'origin_url': project_url, 'origin_type': 'pypi', } - - def transport_response_simplified(self, response): - """(Override) Transform response to list for model manipulation - - """ - return [self.get_model_from_repo(repo_name) for repo_name in response]