diff --git a/README.md b/README.md index d3d0240..0607ce4 100644 --- a/README.md +++ b/README.md @@ -1,202 +1,202 @@ SWH-lister ============ The Software Heritage Lister is both a library module to permit to centralize lister behaviors, and to provide lister implementations. Actual lister implementations are: - swh-lister-bitbucket - swh-lister-debian - swh-lister-github - swh-lister-gitlab - swh-lister-pypi Licensing ---------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ - python3 - python3-requests - python3-sqlalchemy More details in requirements*.txt Local deployment ----------- ## lister-github ### Preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. Bootstrap the db instance schema $ createdb lister-github $ python3 -m swh.lister.cli --db-url postgres:///lister-github \ --lister github \ --create-tables ### Configuration file sample Minimalistic configuration: $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-github credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/github.com + cache_dir: /home/user/.cache/swh/lister/github.com Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 >>> import logging >>> logging.basicConfig(level=logging.DEBUG) >>> from swh.lister.github.tasks import RangeGitHubLister; RangeGitHubLister().run(364, 365) INFO:root:listing repos starting at 364 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 ## lister-gitlab ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ 3. create configuration file ~/.config/swh/lister-gitlab.yml 4. Bootstrap the db instance schema $ createdb lister-gitlab $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \ --lister gitlab \ --create-tables ### Configuration file sample $ cat ~/.config/swh/lister-gitlab.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-gitlab credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/gitlab + cache_dir: /home/user/.cache/swh/lister/gitlab Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task( {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task( {'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20}) ## lister-debian ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/debian/ 3. create configuration file ~/.config/swh/lister-debian.yml 4. Bootstrap the db instance schema $ createdb lister-debian $ python3 -m swh.lister.cli --db-url postgres:///lister-debian \ --lister debian \ --create-tables \ --with-data Note: This bootstraps a minimum data set needed for the debian lister to run (for development) ### Configuration file sample $ cat ~/.config/swh/lister-debian.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-debian credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/debian + cache_dir: /home/user/.cache/swh/lister/debian Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import logging; logging.basicConfig(level=logging.DEBUG); from swh.lister.debian.tasks import DebianListerTask; DebianListerTask().run_task('Debian') DEBUG:root:Creating snapshot for distribution Distribution(Debian (deb) on http://deb.debian.org/debian/) on date 2018-07-27 09:22:50.461165+00:00 DEBUG:root:Processing area Area(stretch/main of Debian) DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325 ... -## lister-debian +## lister-pypi ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/pypi/ 3. create configuration file ~/.config/swh/lister-pypi.yml 4. Bootstrap the db instance schema $ createdb lister-pypi $ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \ --lister pypi \ --create-tables \ --with-data Note: This bootstraps a minimum data set needed for the pypi lister to run (for development) ### Configuration file sample $ cat ~/.config/swh/lister-pypi.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-pypi credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/pypi + cache_dir: /home/user/.cache/swh/lister/pypi Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task() >>> diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index a1f346c..ef59b6f 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,232 +1,232 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import random from datetime import datetime from email.utils import parsedate from pprint import pformat from xmlrpc import client import requests import xmltodict try: from swh.lister._version import __version__ except ImportError: __version__ = 'devel' from .abstractattribute import AbstractAttribute from .lister_base import FetchError class ListerXMLRPCTransport(abc.ABC): """Use the xmlrpc library for making Lister endpoint requests. To be used in conjunction with SWHListerBase or a subclass of it. """ SERVER = AbstractAttribute('string containing the server to contact for ' 'information') def __init__(self): self.lister_version = __version__ def get_client(self, path): """Initialize client to query for result """ return client.ServerProxy(path) def request_uri(self, _): """Same uri called once """ return self.SERVER def request_params(self, identifier): """Cannot pass any parameters to query to the xmlrpc client so cannot even pass our user-agent specifics. """ return {} def transport_quota_check(self, response): """No rate limit dealing explained. """ return False, 0 def transport_request(self, identifier): """Implements SWHListerBase.transport_request """ path = self.request_uri(identifier) try: return self.get_client(path) except Exception as e: raise FetchError(e) def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for XMLRPC given responses. """ s = pformat(self.SERVER) s += '\n#\n' + pformat(response) # Note: will potentially be big return s class SWHListerHttpTransport(abc.ABC): """Use the Requests library for making Lister endpoint requests. To be used in conjunction with SWHListerBase or a subclass of it. """ PATH_TEMPLATE = AbstractAttribute('string containing a python string' ' format pattern that produces the API' ' endpoint path for listing stored' ' repositories when given an index.' ' eg. "/repositories?after=%s".' 'To be implemented in the API-specific' ' class inheriting this.') EXPECTED_STATUS_CODES = (200, 429, 403, 404) def request_headers(self): """Returns dictionary of any request headers needed by the server. MAY BE OVERRIDDEN if request headers are needed. """ return { 'User-Agent': 'Software Heritage lister (%s)' % self.lister_version } def request_uri(self, identifier): """Get the full request URI given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the PATH_TEMPLATE is required. """ path = self.PATH_TEMPLATE % identifier return self.api_baseurl + path def request_params(self, identifier): """Get the full parameters passed to requests given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the request headers is needed. """ params = {} params['headers'] = self.request_headers() or {} creds = self.config['credentials'] auth = random.choice(creds) if creds else None if auth: params['auth'] = (auth['username'], auth['password']) return params def transport_quota_check(self, response): """Implements SWHListerBase.transport_quota_check with standard 429 code check for HTTP with Requests library. MAY BE OVERRIDDEN if the server notifies about rate limits in a non-standard way that doesn't use HTTP 429 and the Retry-After response header. ( https://tools.ietf.org/html/rfc6585#section-4 ) """ if response.status_code == 429: # HTTP too many requests retry_after = response.headers.get('Retry-After', self.back_off()) try: # might be seconds return True, float(retry_after) except Exception: # might be http-date at_date = datetime(*parsedate(retry_after)[:6]) from_now = (at_date - datetime.today()).total_seconds() + 5 return True, max(0, from_now) else: # response ok self.reset_backoff() return False, 0 def __init__(self, api_baseurl=None): if not api_baseurl: raise NameError('HTTP Lister Transport requires api_baseurl.') self.api_baseurl = api_baseurl # eg. 'https://api.github.com' self.session = requests.Session() self.lister_version = __version__ def _transport_action(self, identifier, method='get'): """Permit to ask information to the api prior to actually executing query. """ path = self.request_uri(identifier) params = self.request_params(identifier) try: if method == 'head': response = self.session.head(path, **params) else: response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: raise FetchError(e) else: if response.status_code not in self.EXPECTED_STATUS_CODES: raise FetchError(response) return response def transport_head(self, identifier): """Retrieve head information on api. """ return self._transport_action(identifier, method='head') def transport_request(self, identifier): """Implements SWHListerBase.transport_request for HTTP using Requests. Retrieve get information on api. """ return self._transport_action(identifier) def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for HTTP given Requests responses. """ s = pformat(response.request.path_url) s += '\n#\n' + pformat(response.request.headers) s += '\n#\n' + pformat(response.status_code) s += '\n#\n' + pformat(response.headers) s += '\n#\n' try: # json? s += pformat(response.json()) except Exception: # not json try: # xml? s += pformat(xmltodict.parse(response.text)) except Exception: # not xml s += pformat(response.text) return s class ListerOnePageApiTransport(SWHListerHttpTransport): - """Use the request library for retrieving a basic html page and parse - the result. + """Leverage requests library to retrieve basic html page and parse + result. To be used in conjunction with SWHListerBase or a subclass of it. """ PAGE = AbstractAttribute("The server api's unique page to retrieve and " "parse for information") PATH_TEMPLATE = None # we do not use it def __init__(self, api_baseurl=None): self.session = requests.Session() self.lister_version = __version__ def request_uri(self, _): """Get the full request URI given the transport_request identifier. """ return self.PAGE diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 4f7c55e..6d04d78 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,77 +1,76 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import xmltodict from .models import PyPiModel from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister from swh.lister.core.lister_transports import ListerOnePageApiTransport class PyPiLister(ListerOnePageApiTransport, SimpleLister): - # Template path expecting an integer that represents the page id MODEL = PyPiModel LISTER_NAME = 'pypi' PAGE = 'https://pypi.org/simple/' def __init__(self, override_config=None): ListerOnePageApiTransport .__init__(self) SimpleLister.__init__(self, override_config=override_config) def task_dict(self, origin_type, origin_url, **kwargs): """(Override) Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. """ _type = 'origin-update-%s' % origin_type _policy = 'recurring' project_name = kwargs.get('name') project_metadata_url = kwargs.get('html_url') return utils.create_task_dict( _type, _policy, project_name, origin_url, project_metadata_url=project_metadata_url) def list_packages(self, response): """(Override) List the actual pypi origins from the response. """ result = xmltodict.parse(response.content) _packages = [p['#text'] for p in result['html']['body']['a']] random.shuffle(_packages) return _packages def _compute_urls(self, repo_name): """Returns a tuple (project_url, project_metadata_url) """ return ( 'https://pypi.org/pypi/%s/' % repo_name, 'https://pypi.org/pypi/%s/json' % repo_name ) def get_model_from_repo(self, repo_name): """(Override) Transform from repository representation to model """ project_url, project_url_meta = self._compute_urls(repo_name) return { 'uid': repo_name, 'name': repo_name, 'full_name': repo_name, 'html_url': project_url_meta, 'origin_url': project_url, 'origin_type': 'pypi', 'description': None, } def transport_response_simplified(self, response): """(Override) Transform response to list for model manipulation """ return [self.get_model_from_repo(repo_name) for repo_name in response]