diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ Actual lister implementations are: +- swh-lister-bitbucket - swh-lister-debian - swh-lister-github - swh-lister-gitlab -- swh-lister-bitbucket +- swh-lister-pypi Licensing ---------- @@ -63,7 +64,7 @@ lister_db_url: postgres:///lister-github credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/github.com + cache_dir: /home/user/.cache/swh/lister/github.com Note: This expects storage (5002) and scheduler (5008) services to run locally @@ -101,7 +102,7 @@ lister_db_url: postgres:///lister-gitlab credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/gitlab + cache_dir: /home/user/.cache/swh/lister/gitlab Note: This expects storage (5002) and scheduler (5008) services to run locally @@ -144,7 +145,7 @@ lister_db_url: postgres:///lister-debian credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/debian + cache_dir: /home/user/.cache/swh/lister/debian Note: This expects storage (5002) and scheduler (5008) services to run locally @@ -160,3 +161,42 @@ DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325 ... + + +## lister-pypi + +### preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/pypi/ +3. create configuration file ~/.config/swh/lister-pypi.yml +4. 
Bootstrap the db instance schema + + $ createdb lister-pypi + $ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \ + --lister pypi \ + --create-tables \ + --with-data + + Note: This bootstraps a minimum data set needed for the pypi + lister to run (for development) + +### Configuration file sample + + $ cat ~/.config/swh/lister-pypi.yml + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-pypi + credentials: [] + cache_responses: True + cache_dir: /home/user/.cache/swh/lister/pypi + +Note: This expects storage (5002) and scheduler (5008) services to run locally + +### Run + + $ python3 + Python 3.6.6 (default, Jun 27 2018, 14:44:17) + [GCC 8.1.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task() + >>> diff --git a/swh/lister/cli.py b/swh/lister/cli.py --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -6,7 +6,7 @@ import click -SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian'] +SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi'] @click.command() @@ -71,6 +71,11 @@ lister.db_session.add_all(areas) lister.db_session.commit() + elif lister == 'pypi': + from .pypi.models import ModelBase + from .pypi.lister import PyPiLister + _lister = PyPiLister(override_config=override_conf) + else: raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS) diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,6 +7,7 @@ from datetime import datetime from email.utils import parsedate 
from pprint import pformat
+from xmlrpc import client
 
 import requests
 import xmltodict
@@ -20,6 +21,90 @@
 from .lister_base import FetchError
 
 
+class ListerXMLRPCTransport(abc.ABC):
+    """Use the xmlrpc library for making Lister endpoint requests.
+
+    To be used in conjunction with SWHListerBase or a subclass of it.
+
+    Subclasses must define SERVER. The identifier given to the transport
+    methods is ignored: the same endpoint is used for every request.
+
+    """
+    SERVER = AbstractAttribute('string containing the server to contact for '
+                               'information')
+
+    def __init__(self):
+        self.lister_version = __version__
+
+    def get_client(self, path):
+        """Build the XML-RPC client used to query the server.
+
+        Args:
+            path (str): URI of the XML-RPC endpoint
+
+        Returns:
+            xmlrpc.client.ServerProxy bound to path
+
+        """
+        return client.ServerProxy(path)
+
+    def request_uri(self, _):
+        """Return the endpoint to contact.
+
+        The identifier is ignored: SERVER is always returned.
+
+        """
+        return self.SERVER
+
+    def request_params(self, identifier):
+        """Return the (empty) request parameters.
+
+        Cannot pass any parameters to query to the xmlrpc client so cannot
+        even pass our user-agent specifics.
+
+        """
+        return {}
+
+    def transport_quota_check(self, response):
+        """Quota check for XML-RPC responses.
+
+        No rate-limit handling is implemented for this transport, so this
+        always returns (False, 0).
+
+        """
+        return False, 0
+
+    def transport_request(self, identifier):
+        """Implements SWHListerBase.transport_request
+
+        Returns:
+            the client built by get_client for the endpoint (the client
+            itself, not the result of a remote call)
+
+        Raises:
+            FetchError: when building the client fails
+
+        """
+        path = self.request_uri(identifier)
+        try:
+            return self.get_client(path)
+        except Exception as e:
+            # chain explicitly so the original failure stays attached
+            raise FetchError(e) from e
+
+    def transport_response_to_string(self, response):
+        """Implements SWHListerBase.transport_response_to_string for XMLRPC
+        given responses.
+
+        Returns:
+            str: pretty-printed SERVER and response, separated by a '#' line
+
+        """
+        s = pformat(self.SERVER)
+        s += '\n#\n' + pformat(response)  # Note: will potentially be big
+        return s
+
+
 class SWHListerHttpTransport(abc.ABC):
     """Use the Requests library for making Lister endpoint requests.
 
@@ -151,3 +236,37 @@
             except Exception:  # not xml
                 s += pformat(response.text)
         return s
+
+
+class ListerOnePageApiTransport(SWHListerHttpTransport):
+    """Leverage requests library to retrieve basic html page and parse
+    result.
+
+    To be used in conjunction with SWHListerBase or a subclass of it.
+
+    Subclasses must define PAGE, the single url fetched by the lister.
+
+    """
+    PAGE = AbstractAttribute("The server api's unique page to retrieve and "
+                             "parse for information")
+    PATH_TEMPLATE = None  # we do not use it
+
+    def __init__(self, api_baseurl=None):
+        """Set up the http session.
+
+        Args:
+            api_baseurl: unused (request_uri returns PAGE as is)
+
+        """
+        # NOTE(review): the parent initializer is not called here, which
+        # looks deliberate since there is no base url -- confirm
+        self.session = requests.Session()
+        self.lister_version = __version__
+
+    def request_uri(self, _):
+        """Get the full request URI given the transport_request identifier.
+
+        The identifier is ignored: PAGE is always returned.
+
+        """
+        return self.PAGE
diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/core/simple_lister.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import abc
+import logging
+
+from swh.core import utils
+
+from .lister_base import SWHListerBase
+
+
+class SimpleLister(SWHListerBase):
+    """Lister* intermediate class for any service that follows the simple,
+    'list in oneshot information' pattern.
+
+    - Client sends a request to list repositories in oneshot
+
+    - Client receives structured (json/xml/etc) response with
+      information and stores those in db
+
+    """
+    @abc.abstractmethod
+    def list_packages(self, *args):
+        """Listing packages method.
+
+        Called by ingest_data with the response of the request; must
+        return the list of packages found in it.
+
+        """
+        pass
+
+    def ingest_data(self, identifier, checks=False):
+        """Rework the base ingest_data.
+        Request server endpoint which gives all in one go.
+
+        Simplify and filter response list of repositories. Inject
+        repo information into local db. Queue loader tasks for
+        linked repositories.
+
+        Args:
+            identifier: Resource identifier, only forwarded to the request
+            checks (bool): Additional checks required (unused)
+
+        Returns:
+            tuple (packages, injected): the packages listed from the
+            response and one injection result per batch written to the
+            db; injected is empty when no package was listed
+
+        """
+        response = self.safely_issue_request(identifier)
+        response = self.list_packages(response)
+        if not response:
+            return response, []
+        models_list = self.transport_response_simplified(response)
+        models_list = self.filter_before_inject(models_list)
+        all_injected = []
+        # the db session is committed after each batch of 10000 models
+        for models in utils.grouper(models_list, n=10000):
+            models = list(models)
+            # lazy %-args: only formatted when debug logging is enabled
+            logging.debug('models: %s', len(models))
+            # inject into local db
+            injected = self.inject_repo_data_into_db(models)
+            # queue workers
+            self.create_missing_origins_and_tasks(models, injected)
+            all_injected.append(injected)
+            # flush
+            self.db_session.commit()
+            self.db_session = self.mk_session()
+
+        return response, all_injected
+
+    def run(self):
+        """Query the server which answers in one query. Stores the
+        information, dropping actual redundant information we
+        already have.
+
+        Returns:
+            nothing
+
+        """
+        dump_not_used_identifier = 0
+        response, injected_repos = self.ingest_data(dump_not_used_identifier)
+        if not response and not injected_repos:
+            logging.info('No response from api server, stopping')
diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py
new file mode 100644
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pypi/lister.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import xmltodict
+
+from .models import PyPiModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class PyPiLister(ListerOnePageApiTransport, SimpleLister):
+    """List the pypi projects advertised by the 'simple' index page.
+
+    """
+    MODEL = PyPiModel
+    LISTER_NAME = 'pypi'
+    PAGE = 'https://pypi.org/simple/'
+
+    def __init__(self, override_config=None):
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """(Override) Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        Args:
+            origin_type (str): suffix of the 'origin-update-*' task type
+            origin_url (str): url of the project
+            **kwargs: 'name' and 'html_url' are read, the latter being
+                passed along as project_metadata_url
+
+        """
+        _type = 'origin-update-%s' % origin_type
+        _policy = 'recurring'
+        project_name = kwargs.get('name')
+        project_metadata_url = kwargs.get('html_url')
+        return utils.create_task_dict(
+            _type, _policy, project_name, origin_url,
+            project_metadata_url=project_metadata_url)
+
+    def list_packages(self, response):
+        """(Override) List the actual pypi origins from the response.
+
+        Args:
+            response: http response whose content is the index page
+
+        Returns:
+            list of project names, in random order; empty when the
+            page holds no link
+
+        """
+        result = xmltodict.parse(response.content)
+        body = (result['html'] or {}).get('body') or {}
+        links = body.get('a') or []
+        if isinstance(links, dict):
+            # xmltodict only builds a list for repeated elements: a lone
+            # <a> comes back as a single mapping
+            links = [links]
+        _packages = [p['#text'] for p in links]
+        random.shuffle(_packages)
+        return _packages
+
+    def _compute_urls(self, repo_name):
+        """Returns a tuple (project_url, project_metadata_url)
+
+        """
+        return (
+            'https://pypi.org/pypi/%s/' % repo_name,
+            'https://pypi.org/pypi/%s/json' % repo_name
+        )
+
+    def get_model_from_repo(self, repo_name):
+        """(Override) Transform from repository representation to model
+
+        Note that 'html_url' is set to the metadata (json) url of the
+        project, not to an html page.
+
+        """
+        project_url, project_url_meta = self._compute_urls(repo_name)
+        return {
+            'uid': repo_name,
+            'name': repo_name,
+            'full_name': repo_name,
+            'html_url': project_url_meta,
+            'origin_url': project_url,
+            'origin_type': 'pypi',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """(Override) Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo_name) for repo_name in response]
diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pypi/models.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more
information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class PyPiModel(ModelBase):
+    """Database model of a pypi project, stored in table 'pypi_repo'.
+
+    """
+    __tablename__ = 'pypi_repo'
+
+    uid = Column(String, primary_key=True)
diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/pypi/tasks.py
@@ -0,0 +1,25 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from ..core.tasks import ListerTaskBase
+from .lister import PyPiLister
+
+
+class PyPiListerTask(ListerTaskBase):
+    """Full PyPi lister (list all available origins from the api).
+
+    """
+    task_queue = 'swh_lister_pypi_refresh'
+
+    def new_lister(self):
+        """Instantiate the lister run by this task.
+
+        """
+        return PyPiLister()
+
+    def run_task(self):
+        """Build a fresh lister and run it.
+
+        """
+        self.new_lister().run()