diff --git a/README.md b/README.md index 3c3878b..274c311 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,201 @@ SWH-lister ============ The Software Heritage Lister is both a library module to permit to centralize lister behaviors, and to provide lister implementations. Actual lister implementations are: - swh-lister-debian - swh-lister-github - swh-lister-gitlab - swh-lister-bitbucket Licensing ---------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ - python3 - python3-requests - python3-sqlalchemy More details in requirements*.txt Local deployment ----------- ## lister-github ### Preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. 
Bootstrap the db instance schema $ createdb lister-github $ python3 -m swh.lister.cli --db-url postgres:///lister-github \ --lister github \ --create-tables ### Configuration file sample Minimalistic configuration: $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-github credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/github.com Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 >>> import logging >>> logging.basicConfig(level=logging.DEBUG) >>> from swh.lister.github.tasks import RangeGitHubLister; RangeGitHubLister().run(364, 365) INFO:root:listing repos starting at 364 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 ## lister-gitlab ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ 3. create configuration file ~/.config/swh/lister-gitlab.yml 4. Bootstrap the db instance schema $ createdb lister-gitlab $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \ --lister gitlab \ --create-tables ### Configuration file sample $ cat ~/.config/swh/lister-gitlab.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-gitlab credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/gitlab Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. 
>>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task( {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task( {'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20}) ## lister-debian ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/debian/ 3. create configuration file ~/.config/swh/lister-debian.yml 4. Bootstrap the db instance schema $ createdb lister-debian $ python3 -m swh.lister.cli --db-url postgres:///lister-debian \ --lister debian \ --create-tables \ --with-data Note: This bootstraps a minimum data set needed for the debian lister to run (for development) ### Configuration file sample $ cat ~/.config/swh/lister-debian.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-debian credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/debian Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. 
>>> import logging; logging.basicConfig(level=logging.DEBUG); from swh.lister.debian.tasks import DebianListerTask; DebianListerTask().run_task('Debian') DEBUG:root:Creating snapshot for distribution Distribution(Debian (deb) on http://deb.debian.org/debian/) on date 2018-07-27 09:22:50.461165+00:00 DEBUG:root:Processing area Area(stretch/main of Debian) DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325 ... + + +## lister-pypi + +### preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/pypi/ +3. create configuration file ~/.config/swh/lister-pypi.yml +4. Bootstrap the db instance schema + + $ createdb lister-pypi + $ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \ + --lister pypi \ + --create-tables \ + --with-data + + Note: --with-data currently inserts nothing for the pypi lister + (the cli defines no minimum data set for it) + +### Configuration file sample + + $ cat ~/.config/swh/lister-pypi.yml + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-pypi + credentials: [] + cache_responses: True + cache_dir: /home/zack/.cache/swh/lister/pypi + +Note: This expects storage (5002) and scheduler (5008) services to run locally + +### Run + + $ python3 + Python 3.6.6 (default, Jun 27 2018, 14:44:17) + [GCC 8.1.0] on linux + Type "help", "copyright", "credits" or "license" for more information. 
+ >>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task() + >>> diff --git a/swh/lister/cli.py b/swh/lister/cli.py index ee7ff09..4997fe6 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,88 +1,93 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click -SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian'] +SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi'] @click.command() @click.option( '--db-url', '-d', default='postgres:///lister-gitlab.com', help='SQLAlchemy DB URL; see ' '') # noqa @click.option('--lister', required=1, type=click.Choice(SUPPORTED_LISTERS), help='Lister to act upon') @click.option('--create-tables', is_flag=True, default=False, help='create tables') @click.option('--drop-tables', is_flag=True, default=False, help='Drop tables') @click.option('--with-data', is_flag=True, default=False, help='Insert minimum required data') def cli(db_url, lister, create_tables, drop_tables, with_data): """Initialize db model according to lister. 
""" override_conf = {'lister_db_url': db_url} insert_minimum_data = None if lister == 'github': from .github.models import IndexingModelBase as ModelBase from .github.lister import GitHubLister _lister = GitHubLister(api_baseurl='https://api.github.com', override_config=override_conf) elif lister == 'bitbucket': from .bitbucket.models import IndexingModelBase as ModelBase from .bitbucket.lister import BitBucketLister _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0', override_config=override_conf) elif lister == 'gitlab': from .gitlab.models import ModelBase from .gitlab.lister import GitLabLister _lister = GitLabLister(api_baseurl='https://gitlab.com/api/v4/', override_config=override_conf) elif lister == 'debian': from .debian.lister import DebianLister ModelBase = DebianLister.MODEL _lister = DebianLister() def insert_minimum_data(lister): from swh.storage.schemata.distribution import Distribution, Area d = Distribution( name='Debian', type='deb', mirror_uri='http://deb.debian.org/debian/') lister.db_session.add(d) areas = [] for distribution_name in ['stretch']: for area_name in ['main', 'contrib', 'non-free']: areas.append(Area( name='%s/%s' % (distribution_name, area_name), distribution=d, )) lister.db_session.add_all(areas) lister.db_session.commit() + elif lister == 'pypi': + from .pypi.models import ModelBase + from .pypi.lister import PyPiLister + _lister = PyPiLister(override_config=override_conf) + else: raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS) if drop_tables: ModelBase.metadata.drop_all(_lister.db_engine) if create_tables: ModelBase.metadata.create_all(_lister.db_engine) if with_data and insert_minimum_data: insert_minimum_data(_lister) if __name__ == '__main__': cli() diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index d6e85b6..7f77449 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,153 +1,218 @@ -# Copyright 
(C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import random from datetime import datetime from email.utils import parsedate from pprint import pformat import requests import xmltodict try: from swh.lister._version import __version__ except ImportError: __version__ = 'devel' from .abstractattribute import AbstractAttribute from .lister_base import FetchError +class ListerXMLRPCTransport(abc.ABC): + """Use the xmlrpc library for making Lister endpoint requests. + + To be used in conjunction with SWHListerBase or a subclass of it. + """ + SERVER = AbstractAttribute('string containing the server to contact for ' + 'information') + + def __init__(self): + self.lister_version = __version__ + + def get_client(self, path): + """Initialize client to query for result + + """ + from xmlrpc import client + return client.ServerProxy(path) + + def list_packages(self, client): + """Listing method + + """ + pass + + def request_uri(self, _): + """Same uri called once + + """ + return self.SERVER + + def request_params(self, identifier): + """Cannot pass any parameters to query to the xmlrpc client so cannot + even pass our user-agent specifics. + + """ + return {} + + def transport_quota_check(self, response): + """No rate limit dealing explained. + + """ + return False, 0 + + def transport_request(self, identifier): + """Implements SWHListerBase.transport_request for HTTP using Requests. + + """ + path = self.request_uri(identifier) + # params = self.request_params(identifier) # we cannot use this... + + try: + _client = self.get_client(path) + return self.list_packages(_client) + except Exception as e: + raise FetchError(e) + + def transport_response_to_string(self, response): + """Implements SWHListerBase.transport_response_to_string for XMLRPC + given responses. 
+ """ + s = pformat(self.SERVER) + s += '\n#\n' + pformat(response) + return s + + class SWHListerHttpTransport(abc.ABC): """Use the Requests library for making Lister endpoint requests. To be used in conjunction with SWHListerBase or a subclass of it. """ PATH_TEMPLATE = AbstractAttribute('string containing a python string' ' format pattern that produces the API' ' endpoint path for listing stored' ' repositories when given an index.' ' eg. "/repositories?after=%s".' 'To be implemented in the API-specific' ' class inheriting this.') EXPECTED_STATUS_CODES = (200, 429, 403, 404) def request_headers(self): """Returns dictionary of any request headers needed by the server. MAY BE OVERRIDDEN if request headers are needed. """ return { 'User-Agent': 'Software Heritage lister (%s)' % self.lister_version } def request_uri(self, identifier): """Get the full request URI given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the PATH_TEMPLATE is required. """ path = self.PATH_TEMPLATE % identifier return self.api_baseurl + path def request_params(self, identifier): """Get the full parameters passed to requests given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the request headers is needed. """ params = {} params['headers'] = self.request_headers() or {} creds = self.config['credentials'] auth = random.choice(creds) if creds else None if auth: params['auth'] = (auth['username'], auth['password']) return params def transport_quota_check(self, response): """Implements SWHListerBase.transport_quota_check with standard 429 code check for HTTP with Requests library. MAY BE OVERRIDDEN if the server notifies about rate limits in a non-standard way that doesn't use HTTP 429 and the Retry-After response header. 
( https://tools.ietf.org/html/rfc6585#section-4 ) """ if response.status_code == 429: # HTTP too many requests retry_after = response.headers.get('Retry-After', self.back_off()) try: # might be seconds return True, float(retry_after) except Exception: # might be http-date at_date = datetime(*parsedate(retry_after)[:6]) from_now = (at_date - datetime.today()).total_seconds() + 5 return True, max(0, from_now) else: # response ok self.reset_backoff() return False, 0 def __init__(self, api_baseurl=None): if not api_baseurl: raise NameError('HTTP Lister Transport requires api_baseurl.') self.api_baseurl = api_baseurl # eg. 'https://api.github.com' self.session = requests.Session() self.lister_version = __version__ def _transport_action(self, identifier, method='get'): """Permit to ask information to the api prior to actually executing query. """ path = self.request_uri(identifier) params = self.request_params(identifier) try: if method == 'head': response = self.session.head(path, **params) else: response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: raise FetchError(e) else: if response.status_code not in self.EXPECTED_STATUS_CODES: raise FetchError(response) return response def transport_head(self, identifier): """Retrieve head information on api. """ return self._transport_action(identifier, method='head') def transport_request(self, identifier): """Implements SWHListerBase.transport_request for HTTP using Requests. Retrieve get information on api. """ return self._transport_action(identifier) def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for HTTP given Requests responses. """ s = pformat(response.request.path_url) s += '\n#\n' + pformat(response.request.headers) s += '\n#\n' + pformat(response.status_code) s += '\n#\n' + pformat(response.headers) s += '\n#\n' try: # json? s += pformat(response.json()) except Exception: # not json try: # xml? 
s += pformat(xmltodict.parse(response.text)) except Exception: # not xml s += pformat(response.text) return s diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py new file mode 100644 index 0000000..42b707c --- /dev/null +++ b/swh/lister/core/simple_lister.py @@ -0,0 +1,67 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging + +from .lister_base import SWHListerBase + + +class SimpleLister(SWHListerBase): + """Lister* intermediate class for any service that follows the simple, + 'list in oneshot information' pattern. + + - Client sends a request to list repositories in oneshot + + - Client receives structured (json/xml/etc) response with + information and stores those in db + + """ + def ingest_data(self, identifier, checks=False): + """Rework the base ingest_data. + Request server endpoint which gives all in one go. + + Simplify and filter response list of repositories. Inject + repo information into local db. Queue loader tasks for + linked repositories. + + Args: + identifier: Resource identifier (unused) + checks (bool): Additional checks required (unused) + + """ + # Request (partial?) 
list of repositories info + response = self.safely_issue_request(identifier) + if not response: + return response, [] + models_list = self.transport_response_simplified(response) + models_list = self.filter_before_inject(models_list) + from swh.core import utils + all_injected = [] + for models in utils.grouper(models_list, n=1000): + models = list(models) + logging.debug('models: %s' % len(models)) + # inject into local db + injected = self.inject_repo_data_into_db(models) + # queue workers + self.create_missing_origins_and_tasks(models, injected) + all_injected.append(injected) + # flush + self.db_session.commit() + self.db_session = self.mk_session() + + return response, all_injected + + def run(self): + """Query the server which answers in one query. Stores the + information, dropping actual redundant information we + already have. + + Returns: + nothing + + """ + dump_not_used_identifier = 0 + response, injected_repos = self.ingest_data(dump_not_used_identifier) + if not response and not injected_repos: + logging.info('No response from api server, stopping') diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py new file mode 100644 index 0000000..857b951 --- /dev/null +++ b/swh/lister/pypi/lister.py @@ -0,0 +1,70 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from .models import PyPiModel + +from swh.scheduler import utils +from swh.lister.core.simple_lister import SimpleLister +from swh.lister.core.lister_transports import ListerXMLRPCTransport + + +class PyPiLister(ListerXMLRPCTransport, SimpleLister): + # Template path expecting an integer that represents the page id + MODEL = PyPiModel + LISTER_NAME = 'pypi' + SERVER = 'https://pypi.org/pypi' + + def __init__(self, override_config=None): + 
ListerXMLRPCTransport.__init__(self) + SimpleLister.__init__(self, override_config=override_config) + + def task_dict(self, origin_type, origin_url, **kwargs): + """(Override) Return task format dict + + This is overridden from the lister_base as more information is + needed for the ingestion task creation. + + """ + _type = 'origin-update-%s' % origin_type + _policy = 'recurring' + project_metadata_url = kwargs.get('html_url') + return utils.create_task_dict( + _type, _policy, origin_url, + project_metadata_url=project_metadata_url) + + def list_packages(self, client): + """(Override) List the actual pypi origins from the api. + + """ + return client.list_packages() + + def _compute_urls(self, repo_name): + """Returns a tuple (project_url, project_metadata_url) + + """ + return ( + 'https://pypi.org/pypi/%s/' % repo_name, + 'https://pypi.org/pypi/%s/json' % repo_name + ) + + def get_model_from_repo(self, repo_name): + """(Override) Transform from repository representation to model + + """ + project_url, project_url_meta = self._compute_urls(repo_name) + return { + 'uid': repo_name, + 'name': repo_name, + 'full_name': repo_name, + 'html_url': project_url_meta, + 'origin_url': project_url, + 'origin_type': 'pypi', + 'description': None, + } + + def transport_response_simplified(self, response): + """(Override) Transform response to list for model manipulation + + """ + return [self.get_model_from_repo(repo_name) for repo_name in response] diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py new file mode 100644 index 0000000..b035f4c --- /dev/null +++ b/swh/lister/pypi/models.py @@ -0,0 +1,16 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from sqlalchemy import Column, String + +from ..core.models import ModelBase + + +class PyPiModel(ModelBase): + """a PyPi repository representation + + """ + __tablename__ = 
'pypi_repo' + + uid = Column(String, primary_key=True) diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py new file mode 100644 index 0000000..d8b0e2c --- /dev/null +++ b/swh/lister/pypi/tasks.py @@ -0,0 +1,20 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from ..core.tasks import ListerTaskBase +from .lister import PyPiLister + + +class PyPiListerTask(ListerTaskBase): + """Full PyPi lister (list all available origins from the api). + + """ + task_queue = 'swh_lister_pypi_refresh' + + def new_lister(self): + return PyPiLister() + + def run_task(self): + lister = self.new_lister() + lister.run()