diff --git a/PKG-INFO b/PKG-INFO index c68754c..04eec84 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,30 +1,88 @@ Metadata-Version: 2.1 Name: swh.core -Version: 0.0.63 +Version: 0.0.64 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism + - database connection + - http-based RPC client/server + + Development + ----------- + + We strongly recommend you to use a [virtualenv][1] if you want to run tests or + hack the code. + + To set up your development environment: + + ``` + (swh) user@host:~/swh-environment/swh-core$ pip install -e .[testing] + ``` + + This will install every Python package needed to run this package's tests. + + Unit tests can be executed using [pytest][2] or [tox][3]. + + ``` + (swh) user@host:~/swh-environment/swh-core$ pytest + ============================== test session starts ============================== + platform linux -- Python 3.7.3, pytest-3.10.1, py-1.8.0, pluggy-0.12.0 + hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/ddouard/src/swh-environment/swh-core/.hypothesis/examples') + rootdir: /home/ddouard/src/swh-environment/swh-core, inifile: pytest.ini + plugins: requests-mock-1.6.0, hypothesis-4.26.4, celery-4.3.0, postgresql-1.4.1 + collected 89 items + + swh/core/api/tests/test_api.py .. [ 2%] + swh/core/api/tests/test_async.py .... [ 6%] + swh/core/api/tests/test_serializers.py ..... [ 12%] + swh/core/db/tests/test_db.py .... [ 16%] + swh/core/tests/test_cli.py ...... [ 23%] + swh/core/tests/test_config.py .............. [ 39%] + swh/core/tests/test_statsd.py ........................................... [ 87%] + .... [ 92%] + swh/core/tests/test_utils.py ....... [100%] + ===================== 89 passed, 9 warnings in 6.94 seconds ===================== + ``` + + Note: this git repository uses [pre-commit][4] hooks to ensure better and more + consistent code. It should already be installed in your virtualenv (if not, + just type `pip install pre-commit`). Make sure to activate it in your local + copy of the git repository: + + ``` + (swh) user@host:~/swh-environment/swh-core$ pre-commit install + pre-commit installed at .git/hooks/pre-commit + ``` + + Please read the [developer setup manual][5] for more information on how to hack + on Software Heritage. + + [1]: https://virtualenv.pypa.io + [2]: https://docs.pytest.org + [3]: https://tox.readthedocs.io + [4]: https://pre-commit.com + [5]: https://docs.softwareheritage.org/devel/developer-setup.html Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown +Provides-Extra: testing Provides-Extra: http Provides-Extra: db -Provides-Extra: testing diff --git a/README.md b/README.md index 25aeda6..bf4ef79 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,66 @@ swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism +- database connection +- http-based RPC client/server + +Development +----------- + +We strongly recommend you to use a [virtualenv][1] if you want to run tests or +hack the code. + +To set up your development environment: + +``` +(swh) user@host:~/swh-environment/swh-core$ pip install -e .[testing] +``` + +This will install every Python package needed to run this package's tests. + +Unit tests can be executed using [pytest][2] or [tox][3]. + +``` +(swh) user@host:~/swh-environment/swh-core$ pytest +============================== test session starts ============================== +platform linux -- Python 3.7.3, pytest-3.10.1, py-1.8.0, pluggy-0.12.0 +hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/ddouard/src/swh-environment/swh-core/.hypothesis/examples') +rootdir: /home/ddouard/src/swh-environment/swh-core, inifile: pytest.ini +plugins: requests-mock-1.6.0, hypothesis-4.26.4, celery-4.3.0, postgresql-1.4.1 +collected 89 items + +swh/core/api/tests/test_api.py .. [ 2%] +swh/core/api/tests/test_async.py .... [ 6%] +swh/core/api/tests/test_serializers.py ..... [ 12%] +swh/core/db/tests/test_db.py .... [ 16%] +swh/core/tests/test_cli.py ...... [ 23%] +swh/core/tests/test_config.py .............. [ 39%] +swh/core/tests/test_statsd.py ........................................... [ 87%] +.... [ 92%] +swh/core/tests/test_utils.py ....... [100%] +===================== 89 passed, 9 warnings in 6.94 seconds ===================== +``` + +Note: this git repository uses [pre-commit][4] hooks to ensure better and more +consistent code. It should already be installed in your virtualenv (if not, +just type `pip install pre-commit`). Make sure to activate it in your local +copy of the git repository: + +``` +(swh) user@host:~/swh-environment/swh-core$ pre-commit install +pre-commit installed at .git/hooks/pre-commit +``` + +Please read the [developer setup manual][5] for more information on how to hack +on Software Heritage. + +[1]: https://virtualenv.pypa.io +[2]: https://docs.pytest.org +[3]: https://tox.readthedocs.io +[4]: https://pre-commit.com +[5]: https://docs.softwareheritage.org/devel/developer-setup.html diff --git a/requirements.txt b/requirements.txt index bd236c3..7d6c629 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ +Deprecated PyYAML systemd-python - diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index dd9de8a..66235fe --- a/setup.py +++ b/setup.py @@ -1,78 +1,79 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(*names): requirements = [] for name in names: if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' if not os.path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.core', description='Software Heritage core utilities', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DCORE/', packages=find_packages(), scripts=[], install_requires=parse_requirements(None, 'swh'), setup_requires=['vcversioner'], extras_require={ 'testing': parse_requirements('test', 'db', 'http'), 'db': parse_requirements('db'), 'http': parse_requirements('http'), }, vcversioner={}, include_package_data=True, entry_points=''' [console_scripts] swh=swh.core.cli:main swh-db-init=swh.core.cli.db:db_init [swh.cli.subcommands] + db=swh.core.cli.db:db db-init=swh.core.cli.db:db_init ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-core', }, ) diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index c68754c..04eec84 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,30 +1,88 @@ Metadata-Version: 2.1 Name: swh.core -Version: 0.0.63 +Version: 0.0.64 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism + - database connection + - http-based RPC client/server + + Development + ----------- + + We strongly recommend you to use a [virtualenv][1] if you want to run tests or + hack the code. + + To set up your development environment: + + ``` + (swh) user@host:~/swh-environment/swh-core$ pip install -e .[testing] + ``` + + This will install every Python package needed to run this package's tests. + + Unit tests can be executed using [pytest][2] or [tox][3]. + + ``` + (swh) user@host:~/swh-environment/swh-core$ pytest + ============================== test session starts ============================== + platform linux -- Python 3.7.3, pytest-3.10.1, py-1.8.0, pluggy-0.12.0 + hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/ddouard/src/swh-environment/swh-core/.hypothesis/examples') + rootdir: /home/ddouard/src/swh-environment/swh-core, inifile: pytest.ini + plugins: requests-mock-1.6.0, hypothesis-4.26.4, celery-4.3.0, postgresql-1.4.1 + collected 89 items + + swh/core/api/tests/test_api.py .. [ 2%] + swh/core/api/tests/test_async.py .... [ 6%] + swh/core/api/tests/test_serializers.py ..... [ 12%] + swh/core/db/tests/test_db.py .... [ 16%] + swh/core/tests/test_cli.py ...... [ 23%] + swh/core/tests/test_config.py .............. [ 39%] + swh/core/tests/test_statsd.py ........................................... [ 87%] + .... [ 92%] + swh/core/tests/test_utils.py ....... [100%] + ===================== 89 passed, 9 warnings in 6.94 seconds ===================== + ``` + + Note: this git repository uses [pre-commit][4] hooks to ensure better and more + consistent code. It should already be installed in your virtualenv (if not, + just type `pip install pre-commit`). Make sure to activate it in your local + copy of the git repository: + + ``` + (swh) user@host:~/swh-environment/swh-core$ pre-commit install + pre-commit installed at .git/hooks/pre-commit + ``` + + Please read the [developer setup manual][5] for more information on how to hack + on Software Heritage. + + [1]: https://virtualenv.pypa.io + [2]: https://docs.pytest.org + [3]: https://tox.readthedocs.io + [4]: https://pre-commit.com + [5]: https://docs.softwareheritage.org/devel/developer-setup.html Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown +Provides-Extra: testing Provides-Extra: http Provides-Extra: db -Provides-Extra: testing diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index e6d432b..8ec21be 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,46 +1,48 @@ MANIFEST.in Makefile README.md requirements-db.txt requirements-http.txt requirements-swh.txt requirements.txt setup.py version.txt swh/__init__.py swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/entry_points.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py swh/core/api_async.py swh/core/config.py swh/core/logger.py swh/core/statsd.py swh/core/tarball.py swh/core/utils.py swh/core/api/__init__.py swh/core/api/asynchronous.py swh/core/api/negotiation.py swh/core/api/serializers.py swh/core/api/tests/__init__.py swh/core/api/tests/server_testing.py swh/core/api/tests/test_api.py +swh/core/api/tests/test_async.py swh/core/api/tests/test_serializers.py swh/core/cli/__init__.py swh/core/cli/db.py swh/core/db/__init__.py swh/core/db/common.py swh/core/db/db_utils.py swh/core/db/tests/__init__.py swh/core/db/tests/conftest.py swh/core/db/tests/db_testing.py +swh/core/db/tests/test_cli.py swh/core/db/tests/test_db.py swh/core/sql/log-schema.sql swh/core/tests/__init__.py swh/core/tests/test_cli.py swh/core/tests/test_config.py swh/core/tests/test_statsd.py swh/core/tests/test_utils.py \ No newline at end of file diff --git a/swh.core.egg-info/entry_points.txt b/swh.core.egg-info/entry_points.txt index 06d08b2..403254e 100644 --- a/swh.core.egg-info/entry_points.txt +++ b/swh.core.egg-info/entry_points.txt @@ -1,7 +1,8 @@ [console_scripts] swh=swh.core.cli:main swh-db-init=swh.core.cli.db:db_init [swh.cli.subcommands] + db=swh.core.cli.db:db db-init=swh.core.cli.db:db_init \ No newline at end of file diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt index 131c64f..854035c 100644 --- a/swh.core.egg-info/requires.txt +++ b/swh.core.egg-info/requires.txt @@ -1,29 +1,31 @@ +Deprecated PyYAML systemd-python [db] psycopg2 [http] aiohttp arrow decorator Flask msgpack>0.5 python-dateutil requests [testing] Click pytest<4 pytest-postgresql requests-mock hypothesis>=3.11.0 +pre-commit psycopg2 aiohttp arrow decorator Flask msgpack>0.5 python-dateutil requests diff --git a/swh/core/api/__init__.py b/swh/core/api/__init__.py index 0a23f66..7df9100 100644 --- a/swh/core/api/__init__.py +++ b/swh/core/api/__init__.py @@ -1,322 +1,347 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import collections import functools import inspect import json import logging import pickle import requests import datetime +from deprecated import deprecated from flask import Flask, Request, Response, request, abort from .serializers import (decode_response, encode_data_client as encode_data, msgpack_dumps, msgpack_loads, SWHJSONDecoder) from .negotiation import (Formatter as FormatterBase, Negotiator as NegotiatorBase, negotiate as _negotiate) logger = logging.getLogger(__name__) # support for content negotiation class Negotiator(NegotiatorBase): def best_mimetype(self): return request.accept_mimetypes.best_match( self.accept_mimetypes, 'application/json') def _abort(self, status_code, err=None): return abort(status_code, err) def negotiate(formatter_cls, *args, **kwargs): return _negotiate(Negotiator, formatter_cls, *args, **kwargs) class Formatter(FormatterBase): def _make_response(self, body, content_type): return Response(body, content_type=content_type) class SWHJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() if isinstance(obj, datetime.timedelta): return str(obj) # Let the base class default method raise the TypeError return super().default(obj) class JSONFormatter(Formatter): format = 'json' mimetypes = ['application/json'] def render(self, obj): return json.dumps(obj, cls=SWHJSONEncoder) class MsgpackFormatter(Formatter): format = 'msgpack' mimetypes = ['application/x-msgpack'] def render(self, obj): return msgpack_dumps(obj) # base API classes class RemoteException(Exception): pass def remote_api_endpoint(path): def dec(f): f._endpoint_path = path return f return dec class APIError(Exception): """API Error""" def __str__(self): return ('An unexpected error occurred in the backend: {}' .format(self.args)) -class MetaSWHRemoteAPI(type): - """Metaclass for SWHRemoteAPI, which adds a method for each endpoint +class MetaRPCClient(type): + """Metaclass for RPCClient, which adds a method for each endpoint of the database it is designed to access. See for example :class:`swh.indexer.storage.api.client.RemoteStorage`""" def __new__(cls, name, bases, attributes): # For each method wrapped with @remote_api_endpoint in an API backend # (eg. :class:`swh.indexer.storage.IndexerStorage`), add a new # method in RemoteStorage, with the same documentation. # # Note that, despite the usage of decorator magic (eg. functools.wrap), # this never actually calls an IndexerStorage method. backend_class = attributes.get('backend_class', None) for base in bases: if backend_class is not None: break backend_class = getattr(base, 'backend_class', None) if backend_class: for (meth_name, meth) in backend_class.__dict__.items(): if hasattr(meth, '_endpoint_path'): cls.__add_endpoint(meth_name, meth, attributes) return super().__new__(cls, name, bases, attributes) @staticmethod def __add_endpoint(meth_name, meth, attributes): wrapped_meth = inspect.unwrap(meth) @functools.wraps(meth) # Copy signature and doc def meth_(*args, **kwargs): # Match arguments and parameters post_data = inspect.getcallargs( wrapped_meth, *args, **kwargs) # Remove arguments that should not be passed self = post_data.pop('self') post_data.pop('cur', None) post_data.pop('db', None) # Send the request. return self.post(meth._endpoint_path, post_data) attributes[meth_name] = meth_ -class SWHRemoteAPI(metaclass=MetaSWHRemoteAPI): - """Proxy to an internal SWH API +class RPCClient(metaclass=MetaRPCClient): + """Proxy to an internal SWH RPC """ backend_class = None """For each method of `backend_class` decorated with :func:`remote_api_endpoint`, a method with the same prototype and docstring will be added to this class. Calls to this new method will be translated into HTTP requests to a remote server. This backend class will never be instantiated, it only serves as a template.""" api_exception = APIError """The exception class to raise in case of communication error with the server.""" def __init__(self, url, api_exception=None, timeout=None, chunk_size=4096, **kwargs): if api_exception: self.api_exception = api_exception base_url = url if url.endswith('/') else url + '/' self.url = base_url self.session = requests.Session() + adapter = requests.adapters.HTTPAdapter( + max_retries=kwargs.get('max_retries', 3), + pool_connections=kwargs.get('pool_connections', 20), + pool_maxsize=kwargs.get('pool_maxsize', 100)) + self.session.mount(self.url, adapter) + self.timeout = timeout self.chunk_size = chunk_size def _url(self, endpoint): return '%s%s' % (self.url, endpoint) def raw_verb(self, verb, endpoint, **opts): if 'chunk_size' in opts: # if the chunk_size argument has been passed, consider the user # also wants stream=True, otherwise, what's the point. opts['stream'] = True if self.timeout and 'timeout' not in opts: opts['timeout'] = self.timeout try: return getattr(self.session, verb)( self._url(endpoint), **opts ) except requests.exceptions.ConnectionError as e: raise self.api_exception(e) def post(self, endpoint, data, **opts): if isinstance(data, (collections.Iterator, collections.Generator)): data = (encode_data(x) for x in data) else: data = encode_data(data) chunk_size = opts.pop('chunk_size', self.chunk_size) response = self.raw_verb( 'post', endpoint, data=data, headers={'content-type': 'application/x-msgpack', 'accept': 'application/x-msgpack'}, **opts) if opts.get('stream') or \ response.headers.get('transfer-encoding') == 'chunked': return response.iter_content(chunk_size) else: return self._decode_response(response) post_stream = post def get(self, endpoint, **opts): chunk_size = opts.pop('chunk_size', self.chunk_size) response = self.raw_verb( 'get', endpoint, headers={'accept': 'application/x-msgpack'}, **opts) if opts.get('stream') or \ response.headers.get('transfer-encoding') == 'chunked': return response.iter_content(chunk_size) else: return self._decode_response(response) def get_stream(self, endpoint, **opts): return self.get(endpoint, stream=True, **opts) def _decode_response(self, response): if response.status_code == 404: return None if response.status_code == 500: data = decode_response(response) if 'exception_pickled' in data: raise pickle.loads(data['exception_pickled']) else: raise RemoteException(data['exception']) # XXX: this breaks language-independence and should be # replaced by proper unserialization if response.status_code == 400: raise pickle.loads(decode_response(response)) elif response.status_code != 200: raise RemoteException( "Unexpected status code for API request: %s (%s)" % ( response.status_code, response.content, ) ) return decode_response(response) def __repr__(self): return '<{} url={}>'.format(self.__class__.__name__, self.url) class BytesRequest(Request): """Request with proper escaping of arbitrary byte sequences.""" encoding = 'utf-8' encoding_errors = 'surrogateescape' ENCODERS = { 'application/x-msgpack': msgpack_dumps, 'application/json': json.dumps, } def encode_data_server(data, content_type='application/x-msgpack'): encoded_data = ENCODERS[content_type](data) return Response( encoded_data, mimetype=content_type, ) def decode_request(request): content_type = request.mimetype data = request.get_data() if not data: return {} if content_type == 'application/x-msgpack': r = msgpack_loads(data) elif content_type == 'application/json': r = json.loads(data, cls=SWHJSONDecoder) else: raise ValueError('Wrong content type `%s` for API request' % content_type) return r def error_handler(exception, encoder): # XXX: this breaks language-independence and should be # replaced by proper serialization of errors logging.exception(exception) response = encoder(pickle.dumps(exception)) response.status_code = 400 return response -class SWHServerAPIApp(Flask): +class RPCServerApp(Flask): """For each endpoint of the given `backend_class`, tells app.route to call a function that decodes the request and sends it to the backend object provided by the factory. :param Any backend_class: The class of the backend, which will be analyzed to look for API endpoints. :param Callable[[], backend_class] backend_factory: A function with no argument that returns an instance of `backend_class`.""" request_class = BytesRequest def __init__(self, *args, backend_class=None, backend_factory=None, **kwargs): super().__init__(*args, **kwargs) if backend_class is not None: if backend_factory is None: raise TypeError('Missing argument backend_factory') for (meth_name, meth) in backend_class.__dict__.items(): if hasattr(meth, '_endpoint_path'): self.__add_endpoint(meth_name, meth, backend_factory) def __add_endpoint(self, meth_name, meth, backend_factory): from flask import request @self.route('/'+meth._endpoint_path, methods=['POST']) @functools.wraps(meth) # Copy signature and doc def _f(): # Call the actual code obj_meth = getattr(backend_factory(), meth_name) return encode_data_server(obj_meth(**decode_request(request))) + + +@deprecated(version='0.0.64', + reason='Use the RPCServerApp instead') +class SWHServerAPIApp(RPCServerApp): + pass + + +@deprecated(version='0.0.64', + reason='Use the MetaRPCClient instead') +class MetaSWHRemoteAPI(MetaRPCClient): + pass + + +@deprecated(version='0.0.64', + reason='Use the RPCClient instead') +class SWHRemoteAPI(RPCClient): + pass diff --git a/swh/core/api/asynchronous.py b/swh/core/api/asynchronous.py index 02085b1..93d5c70 100644 --- a/swh/core/api/asynchronous.py +++ b/swh/core/api/asynchronous.py @@ -1,54 +1,61 @@ import json import logging import pickle import sys import traceback import aiohttp.web +from deprecated import deprecated import multidict from .serializers import msgpack_dumps, msgpack_loads, SWHJSONDecoder def encode_data_server(data, **kwargs): return aiohttp.web.Response( body=msgpack_dumps(data), headers=multidict.MultiDict({'Content-Type': 'application/x-msgpack'}), **kwargs ) async def decode_request(request): content_type = request.headers.get('Content-Type') data = await request.read() if not data: return {} if content_type == 'application/x-msgpack': r = msgpack_loads(data) elif content_type == 'application/json': - r = json.loads(data, cls=SWHJSONDecoder) + r = json.loads(data.decode(), cls=SWHJSONDecoder) else: raise ValueError('Wrong content type `%s` for API request' % content_type) return r async def error_middleware(app, handler): async def middleware_handler(request): try: return (await handler(request)) except Exception as e: if isinstance(e, aiohttp.web.HTTPException): raise logging.exception(e) exception = traceback.format_exception(*sys.exc_info()) res = {'exception': exception, 'exception_pickled': pickle.dumps(e)} return encode_data_server(res, status=500) return middleware_handler -class SWHRemoteAPI(aiohttp.web.Application): +class RPCServerApp(aiohttp.web.Application): def __init__(self, *args, middlewares=(), **kwargs): middlewares = (error_middleware,) + middlewares super().__init__(*args, middlewares=middlewares, **kwargs) + + +@deprecated(version='0.0.64', + reason='Use the RPCServerApp instead') +class SWHRemoteAPI(RPCServerApp): + pass diff --git a/swh/core/api/tests/test_api.py b/swh/core/api/tests/test_api.py index 1b978d8..32180f8 100644 --- a/swh/core/api/tests/test_api.py +++ b/swh/core/api/tests/test_api.py @@ -1,81 +1,84 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import requests_mock from werkzeug.wrappers import BaseResponse from werkzeug.test import Client as WerkzeugTestClient from swh.core.api import ( error_handler, encode_data_server, - remote_api_endpoint, SWHRemoteAPI, SWHServerAPIApp) + remote_api_endpoint, RPCClient, RPCServerApp) class ApiTest(unittest.TestCase): def test_server(self): testcase = self nb_endpoint_calls = 0 class TestStorage: @remote_api_endpoint('test_endpoint_url') def test_endpoint(self, test_data, db=None, cur=None): nonlocal nb_endpoint_calls nb_endpoint_calls += 1 testcase.assertEqual(test_data, 'spam') return 'egg' - app = SWHServerAPIApp('testapp', - backend_class=TestStorage, - backend_factory=lambda: TestStorage()) + app = RPCServerApp('testapp', + backend_class=TestStorage, + backend_factory=lambda: TestStorage()) @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data_server) client = WerkzeugTestClient(app, BaseResponse) res = client.post('/test_endpoint_url', headers={'Content-Type': 'application/x-msgpack'}, data=b'\x81\xa9test_data\xa4spam') self.assertEqual(nb_endpoint_calls, 1) self.assertEqual(b''.join(res.response), b'\xa3egg') def test_client(self): class TestStorage: @remote_api_endpoint('test_endpoint_url') def test_endpoint(self, test_data, db=None, cur=None): pass nb_http_calls = 0 def callback(request, context): nonlocal nb_http_calls nb_http_calls += 1 self.assertEqual(request.headers['Content-Type'], 'application/x-msgpack') self.assertEqual(request.body, b'\x81\xa9test_data\xa4spam') context.headers['Content-Type'] = 'application/x-msgpack' context.content = b'\xa3egg' return b'\xa3egg' adapter = requests_mock.Adapter() adapter.register_uri('POST', 'mock://example.com/test_endpoint_url', content=callback) - class Testclient(SWHRemoteAPI): + class Testclient(RPCClient): backend_class = TestStorage def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.session.mount('mock', adapter) + # we need to mount the mock adapter on the base url to override + # RPCClient's mechanism that also mounts an HTTPAdapter + # (for configuration purpose) + self.session.mount('mock://example.com/', adapter) c = Testclient(url='mock://example.com/') res = c.test_endpoint('spam') self.assertEqual(nb_http_calls, 1) self.assertEqual(res, 'egg') diff --git a/swh/core/api/tests/test_async.py b/swh/core/api/tests/test_async.py new file mode 100644 index 0000000..883149e --- /dev/null +++ b/swh/core/api/tests/test_async.py @@ -0,0 +1,115 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import datetime +import json + +import msgpack + +import pytest + +from swh.core.api.asynchronous import RPCServerApp +from swh.core.api.asynchronous import encode_data_server, decode_request +from swh.core.api.serializers import msgpack_dumps, SWHJSONEncoder + + +pytest_plugins = ['aiohttp.pytest_plugin', 'pytester'] + + +async def root(request): + return encode_data_server('toor') + +STRUCT = {'txt': 'something stupid', + # 'date': datetime.date(2019, 6, 9), # not supported + 'datetime': datetime.datetime(2019, 6, 9, 10, 12), + 'timedelta': datetime.timedelta(days=-2, hours=3), + 'int': 42, + 'float': 3.14, + 'subdata': {'int': 42, + 'datetime': datetime.datetime(2019, 6, 10, 11, 12), + }, + 'list': [42, datetime.datetime(2019, 9, 10, 11, 12), 'ok'], + } + + +async def struct(request): + return encode_data_server(STRUCT) + + +async def echo(request): + data = await decode_request(request) + return encode_data_server(data) + + +@pytest.fixture +def app(): + app = RPCServerApp() + app.router.add_route('GET', '/', root) + app.router.add_route('GET', '/struct', struct) + app.router.add_route('POST', '/echo', echo) + return app + + +async def test_get_simple(app, aiohttp_client) -> None: + assert app is not None + + cli = await aiohttp_client(app) + resp = await cli.get('/') + assert resp.status == 200 + assert resp.headers['Content-Type'] == 'application/x-msgpack' + data = await resp.read() + value = msgpack.unpackb(data, raw=False) + assert value == 'toor' + + +async def test_get_struct(app, aiohttp_client) -> None: + """Test returned structured from a simple GET data is OK""" + cli = await aiohttp_client(app) + resp = await cli.get('/struct') + assert resp.status == 200 + assert resp.headers['Content-Type'] == 'application/x-msgpack' + assert (await decode_request(resp)) == STRUCT + + +async def test_post_struct_msgpack(app, aiohttp_client) -> None: + """Test that msgpack encoded posted struct data is returned as is""" + cli = await aiohttp_client(app) + # simple struct + resp = await cli.post( + '/echo', + headers={'Content-Type': 'application/x-msgpack'}, + data=msgpack_dumps({'toto': 42})) + assert resp.status == 200 + assert resp.headers['Content-Type'] == 'application/x-msgpack' + assert (await decode_request(resp)) == {'toto': 42} + # complex struct + resp = await cli.post( + '/echo', + headers={'Content-Type': 'application/x-msgpack'}, + data=msgpack_dumps(STRUCT)) + assert resp.status == 200 + assert resp.headers['Content-Type'] == 'application/x-msgpack' + assert (await decode_request(resp)) == STRUCT + + +async def test_post_struct_json(app, aiohttp_client) -> None: + """Test that json encoded posted struct data is returned as is""" + cli = await aiohttp_client(app) + + resp = await cli.post( + '/echo', + headers={'Content-Type': 'application/json'}, + data=json.dumps({'toto': 42}, cls=SWHJSONEncoder)) + assert resp.status == 200 + assert resp.headers['Content-Type'] == 'application/x-msgpack' + assert (await decode_request(resp)) == {'toto': 42} + + resp = await cli.post( + '/echo', + headers={'Content-Type': 'application/json'}, + data=json.dumps(STRUCT, cls=SWHJSONEncoder)) + assert resp.status == 200 + assert resp.headers['Content-Type'] == 'application/x-msgpack' + assert (await decode_request(resp)) == STRUCT diff --git a/swh/core/cli/db.py b/swh/core/cli/db.py index a60cb29..cee0234 100755 --- a/swh/core/cli/db.py +++ b/swh/core/cli/db.py @@ -1,87 +1,170 @@ #!/usr/bin/env python3 # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import glob import logging +from os import path +import subprocess import warnings + warnings.filterwarnings("ignore") # noqa prevent psycopg from telling us sh*t import click from swh.core.cli import CONTEXT_SETTINGS +from swh.core.config import read as config_read logger = logging.getLogger(__name__) +@click.group(name="db", context_settings=CONTEXT_SETTINGS) +@click.option("--config-file", "-C", default=None, + type=click.Path(exists=True, dir_okay=False), + help="Configuration file.") +@click.pass_context +def db(ctx, config_file): + """Software Heritage database generic tools. + """ + ctx.ensure_object(dict) + cfg = config_read(config_file) + ctx.obj["config"] = cfg + + +@db.command(name="init", context_settings=CONTEXT_SETTINGS) +@click.pass_context +def init(ctx): + """Initialize the database for every Software Heritage module found in the + configuration file. For every configuration section in the config file + that: + + 1. has the name of an existing swh package, + 2. has credentials for a local db access, + + it will run the initialization scripts from the swh package against the + given database. + + Example for the config file:: + + \b + storage: + cls: local + args: + db: postgresql:///?service=swh-storage + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + + the command: + + swh db -C /path/to/config.yml init + + will initialize the database for the `storage` section using initialization + scripts from the `swh.storage` package. + """ + + for modname, cfg in ctx.obj["config"].items(): + if cfg.get("cls") == "local" and cfg.get("args"): + try: + sqlfiles = get_sql_for_package(modname) + except click.BadParameter: + logger.info( + "Failed to load/find sql initialization files for %s", + modname) + + if sqlfiles: + conninfo = cfg["args"]["db"] + for sqlfile in sqlfiles: + subprocess.call_call( + [ + "psql", + "--quiet", + "--no-psqlrc", + "-v", + "ON_ERROR_STOP=1", + "-d", + conninfo, + "-f", + sqlfile, + ] + ) + + @click.command(context_settings=CONTEXT_SETTINGS) @click.argument('module', nargs=-1, required=True) @click.option('--db-name', '-d', help='Database name.', default='softwareheritage-dev', show_default=True) -def db_init(module, db_name=None): +@click.option('--create-db/--no-create-db', '-C', + help='Attempt to create the database.', + default=False) +def db_init(module, db_name, create_db): """Initialise a database for the Software Heritage . By - default, attempts to create the database first. + default, does not attempt to create the database. Example: swh db-init -d swh-test storage If you want to specify non-default postgresql connection parameters, please provide them using standard environment variables. See psql(1) man page (section ENVIRONMENTS) for details. Example: PGPORT=5434 swh db-init indexer """ # put import statements here so we can keep startup time of the main swh # command as short as possible - from os import path - import glob - from importlib import import_module - from swh.core.utils import numfile_sortkey as sortkey from swh.core.db.tests.db_testing import ( pg_createdb, pg_restore, DB_DUMP_TYPES, swh_db_version ) logger.debug('db_init %s dn_name=%s', module, db_name) dump_files = [] for modname in module: - if not modname.startswith('swh.'): - modname = 'swh.{}'.format(modname) - try: - m = import_module(modname) - except ImportError: - raise click.BadParameter( - 'Unable to load module {}'.format(modname)) - - sqldir = path.join(path.dirname(m.__file__), 'sql') - if not path.isdir(sqldir): - raise click.BadParameter( - 'Module {} does not provide a db schema ' - '(no sql/ dir)'.format(modname)) - dump_files.extend(sorted(glob.glob(path.join(sqldir, '*.sql')), - key=sortkey)) - - # Create the db (or fail silently if already existing) - pg_createdb(db_name, check=False) + dump_files.extend(get_sql_for_package(modname)) + + if create_db: + # Create the db (or fail silently if already existing) + pg_createdb(db_name, check=False) # Try to retrieve the db version if any db_version = swh_db_version(db_name) if not db_version: # Initialize the db dump_files = [(x, DB_DUMP_TYPES[path.splitext(x)[1]]) for x in dump_files] for dump, dtype in dump_files: click.secho('Loading {}'.format(dump), fg='yellow') pg_restore(db_name, dump, dtype) db_version = swh_db_version(db_name) # TODO: Ideally migrate the version from db_version to the latest # db version click.secho('DONE database is {} version {}'.format(db_name, db_version), fg='green', bold=True) + + +def get_sql_for_package(modname): + from importlib import import_module + from swh.core.utils import numfile_sortkey as sortkey + + if not modname.startswith("swh."): + modname = "swh.{}".format(modname) + try: + m = import_module(modname) + except ImportError: + raise click.BadParameter("Unable to load module {}".format(modname)) + + sqldir = path.join(path.dirname(m.__file__), "sql") + if not path.isdir(sqldir): + raise click.BadParameter( + "Module {} does not provide a db schema " + "(no sql/ dir)".format(modname)) + return list(sorted(glob.glob(path.join(sqldir, "*.sql")), key=sortkey)) diff --git a/swh/core/db/tests/db_testing.py b/swh/core/db/tests/db_testing.py index c2122e8..63cbcaf 100644 --- a/swh/core/db/tests/db_testing.py +++ b/swh/core/db/tests/db_testing.py @@ -1,315 +1,315 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import glob import subprocess import psycopg2 from swh.core.utils import numfile_sortkey as sortkey DB_DUMP_TYPES = {'.sql': 'psql', '.dump': 'pg_dump'} def swh_db_version(dbname_or_service): """Retrieve the swh version if any. In case of the db not initialized, this returns None. Otherwise, this returns the db's version. Args: dbname_or_service (str): The db's name or service Returns: Optional[Int]: Either the db's version or None """ query = 'select version from dbversion order by dbversion desc limit 1' cmd = [ 'psql', '--tuples-only', '--no-psqlrc', '--quiet', '-v', 'ON_ERROR_STOP=1', "--command=%s" % query, dbname_or_service ] try: r = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, universal_newlines=True) result = int(r.stdout.strip()) except Exception: # db not initialized result = None return result def pg_restore(dbname, dumpfile, dumptype='pg_dump'): """ Args: dbname: name of the DB to restore into - dumpfile: path fo the dump file + dumpfile: path of the dump file dumptype: one of 'pg_dump' (for binary dumps), 'psql' (for SQL dumps) """ assert dumptype in ['pg_dump', 'psql'] if dumptype == 'pg_dump': subprocess.check_call(['pg_restore', '--no-owner', '--no-privileges', '--dbname', dbname, dumpfile]) elif dumptype == 'psql': subprocess.check_call(['psql', '--quiet', '--no-psqlrc', '-v', 'ON_ERROR_STOP=1', '-f', dumpfile, dbname]) def pg_dump(dbname, dumpfile): subprocess.check_call(['pg_dump', '--no-owner', '--no-privileges', '-Fc', '-f', dumpfile, dbname]) def pg_dropdb(dbname): subprocess.check_call(['dropdb', dbname]) def pg_createdb(dbname, check=True): """Create a db. If check is True and the db already exists, this will raise an exception (original behavior). If check is False and the db already exists, this will fail silently. If the db does not exist, the db will be created. """ subprocess.run(['createdb', dbname], check=check) def db_create(dbname, dumps=None): """create the test DB and load the test data dumps into it dumps is an iterable of couples (dump_file, dump_type). context: setUpClass """ try: pg_createdb(dbname) except subprocess.CalledProcessError: # try recovering once, in case pg_dropdb(dbname) # the db already existed pg_createdb(dbname) for dump, dtype in dumps: pg_restore(dbname, dump, dtype) return dbname def db_destroy(dbname): """destroy the test DB context: tearDownClass """ pg_dropdb(dbname) def db_connect(dbname): """connect to the test DB and open a cursor context: setUp """ conn = psycopg2.connect('dbname=' + dbname) return { 'conn': conn, 'cursor': conn.cursor() } def db_close(conn): """rollback current transaction and disconnect from the test DB context: tearDown """ if not conn.closed: conn.rollback() conn.close() class DbTestConn: def __init__(self, dbname): self.dbname = dbname def __enter__(self): self.db_setup = db_connect(self.dbname) self.conn = self.db_setup['conn'] self.cursor = self.db_setup['cursor'] return self def __exit__(self, *_): db_close(self.conn) class DbTestContext: def __init__(self, name='softwareheritage-test', dumps=None): self.dbname = name self.dumps = dumps def __enter__(self): db_create(dbname=self.dbname, dumps=self.dumps) return self def __exit__(self, *_): db_destroy(self.dbname) class DbTestFixture: """Mix this in a test subject class to get DB testing support. Use the class method add_db() to add a new database to be tested. Using this will create a DbTestConn entry in the `test_db` dictionary for all the tests, indexed by the name of the database. Example: class TestDb(DbTestFixture, unittest.TestCase): @classmethod def setUpClass(cls): cls.add_db('db_name', DUMP) super().setUpClass() def setUp(self): db = self.test_db['db_name'] print('conn: {}, cursor: {}'.format(db.conn, db.cursor)) To ensure test isolation, each test method of the test case class will execute in its own connection, cursor, and transaction. Note that if you want to define setup/teardown methods, you need to explicitly call super() to ensure that the fixture setup/teardown methods are invoked. Here is an example where all setup/teardown methods are defined in a test case: class TestDb(DbTestFixture, unittest.TestCase): @classmethod def setUpClass(cls): # your add_db() calls here super().setUpClass() # your class setup code here def setUp(self): super().setUp() # your instance setup code here def tearDown(self): # your instance teardown code here super().tearDown() @classmethod def tearDownClass(cls): # your class teardown code here super().tearDownClass() """ _DB_DUMP_LIST = {} _DB_LIST = {} DB_TEST_FIXTURE_IMPORTED = True @classmethod def add_db(cls, name='softwareheritage-test', dumps=None): cls._DB_DUMP_LIST[name] = dumps @classmethod def setUpClass(cls): for name, dumps in cls._DB_DUMP_LIST.items(): cls._DB_LIST[name] = DbTestContext(name, dumps) cls._DB_LIST[name].__enter__() super().setUpClass() @classmethod def tearDownClass(cls): super().tearDownClass() for name, context in cls._DB_LIST.items(): context.__exit__() def setUp(self, *args, **kwargs): self.test_db = {} for name in self._DB_LIST.keys(): self.test_db[name] = DbTestConn(name) self.test_db[name].__enter__() super().setUp(*args, **kwargs) def tearDown(self): super().tearDown() for name in self._DB_LIST.keys(): self.test_db[name].__exit__() def reset_db_tables(self, name, excluded=None): db = self.test_db[name] conn = db.conn cursor = db.cursor cursor.execute("""SELECT table_name FROM information_schema.tables WHERE table_schema = %s""", ('public',)) tables = set(table for (table,) in cursor.fetchall()) if excluded is not None: tables -= set(excluded) for table in tables: cursor.execute('truncate table %s cascade' % table) conn.commit() class SingleDbTestFixture(DbTestFixture): """Simplified fixture like DbTest but that can only handle a single DB. Gives access to shortcuts like self.cursor and self.conn. DO NOT use this with other fixtures that need to access databases, like StorageTestFixture. The class can override the following class attributes: TEST_DB_NAME: name of the DB used for testing TEST_DB_DUMP: DB dump to be restored before running test methods; can be set to None if no restore from dump is required. If the dump file name endswith" - '.sql' it will be loaded via psql, - '.dump' it will be loaded via pg_restore. Other file extensions will be ignored. Can be a string or a list of strings; each path will be expanded using glob pattern matching. The test case class will then have the following attributes, accessible via self: dbname: name of the test database conn: psycopg2 connection object cursor: open psycopg2 cursor to the DB """ TEST_DB_NAME = 'softwareheritage-test' TEST_DB_DUMP = None @classmethod def setUpClass(cls): cls.dbname = cls.TEST_DB_NAME # XXX to kill? dump_files = cls.TEST_DB_DUMP if isinstance(dump_files, str): dump_files = [dump_files] all_dump_files = [] for files in dump_files: all_dump_files.extend( sorted(glob.glob(files), key=sortkey)) all_dump_files = [(x, DB_DUMP_TYPES[os.path.splitext(x)[1]]) for x in all_dump_files] cls.add_db(name=cls.TEST_DB_NAME, dumps=all_dump_files) super().setUpClass() def setUp(self, *args, **kwargs): super().setUp(*args, **kwargs) db = self.test_db[self.TEST_DB_NAME] self.conn = db.conn self.cursor = db.cursor diff --git a/swh/core/db/tests/test_cli.py b/swh/core/db/tests/test_cli.py new file mode 100644 index 0000000..4a07fdc --- /dev/null +++ b/swh/core/db/tests/test_cli.py @@ -0,0 +1,49 @@ +# + +from click.testing import CliRunner + +from swh.core.cli import swh as swhmain +from swh.core.cli.db import db as swhdb + + +help_msg = '''Usage: swh [OPTIONS] COMMAND [ARGS]... + + Command line interface for Software Heritage. + +Options: + -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL] + Log level (default to INFO) + -h, --help Show this message and exit. + +Commands: + db Software Heritage database generic tools. +''' + + +def test_swh_help(): + swhmain.add_command(swhdb) + runner = CliRunner() + result = runner.invoke(swhmain, ['-h']) + assert result.exit_code == 0 + assert result.output == help_msg + + +help_db_msg = '''Usage: swh db [OPTIONS] COMMAND [ARGS]... + + Software Heritage database generic tools. + +Options: + -C, --config-file FILE Configuration file. + -h, --help Show this message and exit. + +Commands: + init Initialize the database for every Software Heritage module found in... +''' + + +def test_swh_db_help(): + swhmain.add_command(swhdb) + runner = CliRunner() + result = runner.invoke(swhmain, ['db', '-h']) + assert result.exit_code == 0 + assert result.output == help_db_msg diff --git a/swh/core/tarball.py b/swh/core/tarball.py index 69d56f4..261bfe7 100644 --- a/swh/core/tarball.py +++ b/swh/core/tarball.py @@ -1,227 +1,228 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import stat import tarfile import zipfile from os.path import abspath, realpath, join, dirname from . import utils def _canonical_abspath(path): """Resolve all paths to an absolute and real one. Args: path: to resolve Returns: canonical absolute path to path """ return realpath(abspath(path)) def _badpath(path, basepath): """Determine if a path is outside basepath. Args: path: a relative or absolute path of a file or directory basepath: the basepath path must be in Returns: True if path is outside basepath, false otherwise. """ return not _canonical_abspath(join(basepath, path)).startswith(basepath) def _badlink(info, basepath): """Determine if the tarinfo member is outside basepath. Args: info: TarInfo member representing a symlink or hardlink of tar archive basepath: the basepath the info member must be in Returns: True if info is outside basepath, false otherwise. """ tippath = _canonical_abspath(join(basepath, dirname(info.name))) return _badpath(info.linkname, basepath=tippath) def is_tarball(filepath): """Given a filepath, determine if it represents an archive. Args: filepath: file to test for tarball property Returns: Bool, True if it's a tarball, False otherwise """ return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) def _uncompress_zip(tarpath, dirpath): """Uncompress zip archive safely. As per zipfile is concerned (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa Args: tarpath: path to the archive dirpath: directory to uncompress the archive to """ with zipfile.ZipFile(tarpath) as z: z.extractall(path=dirpath) +def _safemembers(tarpath, members, basepath): + """Given a list of archive members, yield the members (directory, + file, hard-link) that stays in bounds with basepath. Note + that symbolic link are authorized to point outside the + basepath though. + + Args: + tarpath: Name of the tarball + members: Archive members for such tarball + basepath: the basepath sandbox + + Yields: + Safe TarInfo member + + Raises: + ValueError when a member would be extracted outside basepath + + """ + errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) + + for finfo in members: + if finfo.isdir() and _badpath(finfo.name, basepath): + raise ValueError(errormsg % ('directory', finfo.name)) + elif finfo.isfile() and _badpath(finfo.name, basepath): + raise ValueError(errormsg % ('file', finfo.name)) + elif finfo.islnk() and _badlink(finfo, basepath): + raise ValueError(errormsg % ('hard-link', finfo.linkname)) + # Authorize symlinks to point outside basepath + # elif finfo.issym() and _badlink(finfo, basepath): + # raise ValueError(errormsg % ('symlink', finfo.linkname)) + else: + yield finfo + + def _uncompress_tar(tarpath, dirpath): """Uncompress tarpath if the tarpath is safe. Safe means, no file will be uncompressed outside of dirpath. Args: tarpath: path to the archive dirpath: directory to uncompress the archive to Raises: ValueError when a member would be extracted outside dirpath. """ - def safemembers(tarpath, members, basepath): - """Given a list of archive members, yield the members (directory, - file, hard-link) that stays in bounds with basepath. Note - that symbolic link are authorized to point outside the - basepath though. - - Args: - tarpath: Name of the tarball - members: Archive members for such tarball - basepath: the basepath sandbox - - Yields: - Safe TarInfo member - - Raises: - ValueError when a member would be extracted outside basepath - - """ - errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) - - for finfo in members: - if finfo.isdir() and _badpath(finfo.name, basepath): - raise ValueError(errormsg % ('directory', finfo.name)) - elif finfo.isfile() and _badpath(finfo.name, basepath): - raise ValueError(errormsg % ('file', finfo.name)) - elif finfo.islnk() and _badlink(finfo, basepath): - raise ValueError(errormsg % ('hard-link', finfo.linkname)) - # Authorize symlinks to point outside basepath - # elif finfo.issym() and _badlink(finfo, basepath): - # raise ValueError(errormsg % ('symlink', finfo.linkname)) - else: - yield finfo - with tarfile.open(tarpath) as t: members = t.getmembers() t.extractall(path=dirpath, - members=safemembers(tarpath, members, dirpath)) + members=_safemembers(tarpath, members, dirpath)) def uncompress(tarpath, dest): """Uncompress tarpath to dest folder if tarball is supported and safe. Safe means, no file will be uncompressed outside of dirpath. Note that this fixes permissions after successfully uncompressing the archive. Args: tarpath: path to tarball to uncompress dest: the destination folder where to uncompress the tarball Returns: The nature of the tarball, zip or tar. Raises: ValueError when: - an archive member would be extracted outside basepath - the archive is not supported """ if tarfile.is_tarfile(tarpath): _uncompress_tar(tarpath, dest) nature = 'tar' elif zipfile.is_zipfile(tarpath): _uncompress_zip(tarpath, dest) nature = 'zip' else: raise ValueError('File %s is not a supported archive.' % tarpath) # Fix permissions for dirpath, _, fnames in os.walk(dest): os.chmod(dirpath, 0o755) for fname in fnames: fpath = os.path.join(dirpath, fname) if not os.path.islink(fpath): fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR if not fpath_exec: os.chmod(fpath, 0o644) return nature def _ls(rootdir): """Generator of filepath, filename from rootdir. """ for dirpath, dirnames, fnames in os.walk(rootdir): for fname in (dirnames+fnames): fpath = os.path.join(dirpath, fname) fname = utils.commonname(rootdir, fpath) yield fpath, fname def _compress_zip(tarpath, files): """Compress dirpath's content as tarpath. """ with zipfile.ZipFile(tarpath, 'w') as z: for fpath, fname in files: z.write(fpath, arcname=fname) def _compress_tar(tarpath, files): """Compress dirpath's content as tarpath. """ with tarfile.open(tarpath, 'w:bz2') as t: for fpath, fname in files: t.add(fpath, arcname=fname, recursive=False) def compress(tarpath, nature, dirpath_or_files): """Create a tarball tarpath with nature nature. The content of the tarball is either dirpath's content (if representing a directory path) or dirpath's iterable contents. Compress the directory dirpath's content to a tarball. The tarball being dumped at tarpath. The nature of the tarball is determined by the nature argument. """ if isinstance(dirpath_or_files, str): files = _ls(dirpath_or_files) else: # iterable of 'filepath, filename' files = dirpath_or_files if nature == 'zip': _compress_zip(tarpath, files) else: _compress_tar(tarpath, files) return tarpath diff --git a/swh/core/tests/test_cli.py b/swh/core/tests/test_cli.py index 63f2538..560c543 100644 --- a/swh/core/tests/test_cli.py +++ b/swh/core/tests/test_cli.py @@ -1,109 +1,109 @@ # import logging import click from click.testing import CliRunner from swh.core.cli import swh as swhmain help_msg = '''Usage: swh [OPTIONS] COMMAND [ARGS]... Command line interface for Software Heritage. Options: -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL] Log level (default to INFO) -h, --help Show this message and exit. ''' def test_swh_help(): runner = CliRunner() result = runner.invoke(swhmain, ['-h']) assert result.exit_code == 0 - assert result.output == help_msg + assert result.output.startswith(help_msg) result = runner.invoke(swhmain, ['--help']) assert result.exit_code == 0 - assert result.output == help_msg + assert result.output.startswith(help_msg) def test_command(): @swhmain.command(name='test') @click.pass_context def swhtest(ctx): click.echo('Hello SWH!') runner = CliRunner() result = runner.invoke(swhmain, ['test']) assert result.exit_code == 0 assert result.output.strip() == 'Hello SWH!' def test_loglevel_default(caplog): @swhmain.command(name='test') @click.pass_context def swhtest(ctx): assert logging.root.level == 20 click.echo('Hello SWH!') runner = CliRunner() result = runner.invoke(swhmain, ['test']) assert result.exit_code == 0 print(result.output) assert result.output.strip() == '''Hello SWH!''' def test_loglevel_error(caplog): @swhmain.command(name='test') @click.pass_context def swhtest(ctx): assert logging.root.level == 40 click.echo('Hello SWH!') runner = CliRunner() result = runner.invoke(swhmain, ['-l', 'ERROR', 'test']) assert result.exit_code == 0 assert result.output.strip() == '''Hello SWH!''' def test_loglevel_debug(caplog): @swhmain.command(name='test') @click.pass_context def swhtest(ctx): assert logging.root.level == 10 click.echo('Hello SWH!') runner = CliRunner() result = runner.invoke(swhmain, ['-l', 'DEBUG', 'test']) assert result.exit_code == 0 assert result.output.strip() == '''Hello SWH!''' def test_aliased_command(): @swhmain.command(name='canonical-test') @click.pass_context def swhtest(ctx): 'A test command.' click.echo('Hello SWH!') swhmain.add_alias(swhtest, 'othername') runner = CliRunner() # check we have only 'canonical-test' listed in the usage help msg result = runner.invoke(swhmain, ['-h']) assert result.exit_code == 0 assert 'canonical-test A test command.' in result.output assert 'othername' not in result.output # check we can execute the cmd with 'canonical-test' result = runner.invoke(swhmain, ['canonical-test']) assert result.exit_code == 0 assert result.output.strip() == '''Hello SWH!''' # check we can also execute the cmd with the alias 'othername' result = runner.invoke(swhmain, ['othername']) assert result.exit_code == 0 assert result.output.strip() == '''Hello SWH!''' diff --git a/swh/core/tests/test_statsd.py b/swh/core/tests/test_statsd.py index 70986d7..7b5dd62 100644 --- a/swh/core/tests/test_statsd.py +++ b/swh/core/tests/test_statsd.py @@ -1,563 +1,563 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Initially imported from https://github.com/DataDog/datadogpy/ # at revision 62b3a3e89988dc18d78c282fe3ff5d1813917436 # # Copyright (c) 2015, Datadog # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Datadog nor the names of its contributors may be # used to endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # from collections import deque from contextlib import contextmanager import os import socket import time import unittest import pytest from swh.core.statsd import Statsd, TimedContextManagerDecorator @contextmanager def preserve_envvars(*envvars): """Context manager preserving the value of environment variables""" preserved = {} to_delete = object() for var in envvars: preserved[var] = os.environ.get(var, to_delete) yield for var in envvars: old = preserved[var] if old is not to_delete: os.environ[var] = old else: del os.environ[var] class FakeSocket(object): """ A fake socket for testing. """ def __init__(self): self.payloads = deque() def send(self, payload): assert type(payload) == bytes self.payloads.append(payload) def recv(self): try: return self.payloads.popleft().decode('utf-8') except IndexError: return None def close(self): pass def __repr__(self): return str(self.payloads) class BrokenSocket(FakeSocket): def send(self, payload): raise socket.error("Socket error") class SlowSocket(FakeSocket): def send(self, payload): raise socket.timeout("Socket timeout") class TestStatsd(unittest.TestCase): def setUp(self): """ Set up a default Statsd instance and mock the socket. """ # self.statsd = Statsd() self.statsd.socket = FakeSocket() def recv(self): return self.statsd.socket.recv() def test_set(self): self.statsd.set('set', 123) assert self.recv() == 'set:123|s' def test_gauge(self): self.statsd.gauge('gauge', 123.4) assert self.recv() == 'gauge:123.4|g' def test_counter(self): self.statsd.increment('page.views') self.assertEqual('page.views:1|c', self.recv()) self.statsd.increment('page.views', 11) self.assertEqual('page.views:11|c', self.recv()) self.statsd.decrement('page.views') self.assertEqual('page.views:-1|c', self.recv()) self.statsd.decrement('page.views', 12) self.assertEqual('page.views:-12|c', self.recv()) def test_histogram(self): self.statsd.histogram('histo', 123.4) self.assertEqual('histo:123.4|h', self.recv()) def test_tagged_gauge(self): self.statsd.gauge('gt', 123.4, tags={'country': 'china', 'age': 45}) self.assertEqual('gt:123.4|g|#age:45,country:china', self.recv()) def test_tagged_counter(self): self.statsd.increment('ct', tags={'country': 'españa'}) self.assertEqual('ct:1|c|#country:españa', self.recv()) def test_tagged_histogram(self): self.statsd.histogram('h', 1, tags={'test_tag': 'tag_value'}) self.assertEqual('h:1|h|#test_tag:tag_value', self.recv()) def test_sample_rate(self): self.statsd.increment('c', sample_rate=0) assert not self.recv() for i in range(10000): self.statsd.increment('sampled_counter', sample_rate=0.3) self.assert_almost_equal(3000, len(self.statsd.socket.payloads), 150) self.assertEqual('sampled_counter:1|c|@0.3', self.recv()) def test_tags_and_samples(self): for i in range(100): self.statsd.gauge('gst', 23, tags={"sampled": True}, sample_rate=0.9) self.assert_almost_equal(90, len(self.statsd.socket.payloads), 10) self.assertEqual('gst:23|g|@0.9|#sampled:True', self.recv()) def test_timing(self): self.statsd.timing('t', 123) self.assertEqual('t:123|ms', self.recv()) def test_metric_namespace(self): """ Namespace prefixes all metric names. """ self.statsd.namespace = "foo" self.statsd.gauge('gauge', 123.4) self.assertEqual('foo.gauge:123.4|g', self.recv()) - # Test Client level contant tags + # Test Client level constant tags def test_gauge_constant_tags(self): self.statsd.constant_tags = { 'bar': 'baz', } self.statsd.gauge('gauge', 123.4) assert self.recv() == 'gauge:123.4|g|#bar:baz' def test_counter_constant_tag_with_metric_level_tags(self): self.statsd.constant_tags = { 'bar': 'baz', 'foo': True, } self.statsd.increment('page.views', tags={'extra': 'extra'}) self.assertEqual( 'page.views:1|c|#bar:baz,extra:extra,foo:True', self.recv(), ) def test_gauge_constant_tags_with_metric_level_tags_twice(self): metric_level_tag = {'foo': 'bar'} self.statsd.constant_tags = {'bar': 'baz'} self.statsd.gauge('gauge', 123.4, tags=metric_level_tag) assert self.recv() == 'gauge:123.4|g|#bar:baz,foo:bar' # sending metrics multiple times with same metric-level tags # should not duplicate the tags being sent self.statsd.gauge('gauge', 123.4, tags=metric_level_tag) assert self.recv() == 'gauge:123.4|g|#bar:baz,foo:bar' def assert_almost_equal(self, a, b, delta): self.assertTrue( 0 <= abs(a - b) <= delta, "%s - %s not within %s" % (a, b, delta) ) def test_socket_error(self): self.statsd.socket = BrokenSocket() self.statsd.gauge('no error', 1) assert True, 'success' def test_socket_timeout(self): self.statsd.socket = SlowSocket() self.statsd.gauge('no error', 1) assert True, 'success' def test_timed(self): """ Measure the distribution of a function's run time. """ @self.statsd.timed('timed.test') def func(a, b, c=1, d=1): """docstring""" time.sleep(0.5) return (a, b, c, d) self.assertEqual('func', func.__name__) self.assertEqual('docstring', func.__doc__) result = func(1, 2, d=3) # Assert it handles args and kwargs correctly. self.assertEqual(result, (1, 2, 1, 3)) packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('ms', type_) self.assertEqual('timed.test', name) self.assert_almost_equal(500, float(value), 100) def test_timed_exception(self): """ Exception bubble out of the decorator and is reported to statsd as a dedicated counter. """ @self.statsd.timed('timed.test') def func(a, b, c=1, d=1): """docstring""" time.sleep(0.5) return (a / b, c, d) self.assertEqual('func', func.__name__) self.assertEqual('docstring', func.__doc__) with self.assertRaises(ZeroDivisionError): func(1, 0) packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('c', type_) self.assertEqual('timed.test_error_count', name) self.assertEqual(int(value), 1) def test_timed_no_metric(self, ): """ Test using a decorator without providing a metric. """ @self.statsd.timed() def func(a, b, c=1, d=1): """docstring""" time.sleep(0.5) return (a, b, c, d) self.assertEqual('func', func.__name__) self.assertEqual('docstring', func.__doc__) result = func(1, 2, d=3) # Assert it handles args and kwargs correctly. self.assertEqual(result, (1, 2, 1, 3)) packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('ms', type_) self.assertEqual('swh.core.tests.test_statsd.func', name) self.assert_almost_equal(500, float(value), 100) def test_timed_coroutine(self): """ Measure the distribution of a coroutine function's run time. Warning: Python >= 3.5 only. """ import asyncio @self.statsd.timed('timed.test') @asyncio.coroutine def print_foo(): """docstring""" time.sleep(0.5) print("foo") - loop = asyncio.get_event_loop() + loop = asyncio.new_event_loop() loop.run_until_complete(print_foo()) loop.close() # Assert packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('ms', type_) self.assertEqual('timed.test', name) self.assert_almost_equal(500, float(value), 100) def test_timed_context(self): """ Measure the distribution of a context's run time. """ # In milliseconds with self.statsd.timed('timed_context.test') as timer: self.assertIsInstance(timer, TimedContextManagerDecorator) time.sleep(0.5) packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('ms', type_) self.assertEqual('timed_context.test', name) self.assert_almost_equal(500, float(value), 100) self.assert_almost_equal(500, timer.elapsed, 100) def test_timed_context_exception(self): """ Exception bubbles out of the `timed` context manager and is reported to statsd as a dedicated counter. """ class ContextException(Exception): pass def func(self): with self.statsd.timed('timed_context.test'): time.sleep(0.5) raise ContextException() # Ensure the exception was raised. self.assertRaises(ContextException, func, self) # Ensure the timing was recorded. packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('c', type_) self.assertEqual('timed_context.test_error_count', name) self.assertEqual(int(value), 1) def test_timed_context_no_metric_name_exception(self): """Test that an exception occurs if using a context manager without a metric name. """ def func(self): with self.statsd.timed(): time.sleep(0.5) # Ensure the exception was raised. self.assertRaises(TypeError, func, self) # Ensure the timing was recorded. packet = self.recv() self.assertEqual(packet, None) def test_timed_start_stop_calls(self): timer = self.statsd.timed('timed_context.test') timer.start() time.sleep(0.5) timer.stop() packet = self.recv() name_value, type_ = packet.split('|') name, value = name_value.split(':') self.assertEqual('ms', type_) self.assertEqual('timed_context.test', name) self.assert_almost_equal(500, float(value), 100) def test_batched(self): self.statsd.open_buffer() self.statsd.gauge('page.views', 123) self.statsd.timing('timer', 123) self.statsd.close_buffer() self.assertEqual('page.views:123|g\ntimer:123|ms', self.recv()) def test_context_manager(self): fake_socket = FakeSocket() with Statsd() as statsd: statsd.socket = fake_socket statsd.gauge('page.views', 123) statsd.timing('timer', 123) self.assertEqual('page.views:123|g\ntimer:123|ms', fake_socket.recv()) def test_batched_buffer_autoflush(self): fake_socket = FakeSocket() with Statsd() as statsd: statsd.socket = fake_socket for i in range(51): statsd.increment('mycounter') self.assertEqual( '\n'.join(['mycounter:1|c' for i in range(50)]), fake_socket.recv(), ) self.assertEqual('mycounter:1|c', fake_socket.recv()) def test_module_level_instance(self): from swh.core.statsd import statsd self.assertTrue(isinstance(statsd, Statsd)) def test_instantiating_does_not_connect(self): local_statsd = Statsd() self.assertEqual(None, local_statsd.socket) def test_accessing_socket_opens_socket(self): local_statsd = Statsd() try: self.assertIsNotNone(local_statsd.get_socket()) finally: local_statsd.socket.close() def test_accessing_socket_multiple_times_returns_same_socket(self): local_statsd = Statsd() fresh_socket = FakeSocket() local_statsd.socket = fresh_socket self.assertEqual(fresh_socket, local_statsd.get_socket()) self.assertNotEqual(FakeSocket(), local_statsd.get_socket()) def test_tags_from_environment(self): with preserve_envvars('STATSD_TAGS'): os.environ['STATSD_TAGS'] = 'country:china,age:45' statsd = Statsd() statsd.socket = FakeSocket() statsd.gauge('gt', 123.4) self.assertEqual('gt:123.4|g|#age:45,country:china', statsd.socket.recv()) def test_tags_from_environment_and_constant(self): with preserve_envvars('STATSD_TAGS'): os.environ['STATSD_TAGS'] = 'country:china,age:45' statsd = Statsd(constant_tags={'country': 'canada'}) statsd.socket = FakeSocket() statsd.gauge('gt', 123.4) self.assertEqual('gt:123.4|g|#age:45,country:canada', statsd.socket.recv()) def test_tags_from_environment_warning(self): with preserve_envvars('STATSD_TAGS'): os.environ['STATSD_TAGS'] = 'valid:tag,invalid_tag' with pytest.warns(UserWarning) as record: statsd = Statsd() assert len(record) == 1 assert 'invalid_tag' in record[0].message.args[0] assert 'valid:tag' not in record[0].message.args[0] assert statsd.constant_tags == {'valid': 'tag'} def test_gauge_doesnt_send_none(self): self.statsd.gauge('metric', None) assert self.recv() is None def test_increment_doesnt_send_none(self): self.statsd.increment('metric', None) assert self.recv() is None def test_decrement_doesnt_send_none(self): self.statsd.decrement('metric', None) assert self.recv() is None def test_timing_doesnt_send_none(self): self.statsd.timing('metric', None) assert self.recv() is None def test_histogram_doesnt_send_none(self): self.statsd.histogram('metric', None) assert self.recv() is None def test_param_host(self): with preserve_envvars('STATSD_HOST', 'STATSD_PORT'): os.environ['STATSD_HOST'] = 'test-value' os.environ['STATSD_PORT'] = '' local_statsd = Statsd(host='actual-test-value') self.assertEqual(local_statsd.host, 'actual-test-value') self.assertEqual(local_statsd.port, 8125) def test_param_port(self): with preserve_envvars('STATSD_HOST', 'STATSD_PORT'): os.environ['STATSD_HOST'] = '' os.environ['STATSD_PORT'] = '12345' local_statsd = Statsd(port=4321) self.assertEqual(local_statsd.host, 'localhost') self.assertEqual(local_statsd.port, 4321) def test_envvar_host(self): with preserve_envvars('STATSD_HOST', 'STATSD_PORT'): os.environ['STATSD_HOST'] = 'test-value' os.environ['STATSD_PORT'] = '' local_statsd = Statsd() self.assertEqual(local_statsd.host, 'test-value') self.assertEqual(local_statsd.port, 8125) def test_envvar_port(self): with preserve_envvars('STATSD_HOST', 'STATSD_PORT'): os.environ['STATSD_HOST'] = '' os.environ['STATSD_PORT'] = '12345' local_statsd = Statsd() self.assertEqual(local_statsd.host, 'localhost') self.assertEqual(local_statsd.port, 12345) def test_namespace_added(self): local_statsd = Statsd(namespace='test-namespace') local_statsd.socket = FakeSocket() local_statsd.gauge('gauge', 123.4) assert local_statsd.socket.recv() == 'test-namespace.gauge:123.4|g' def test_contextmanager_empty(self): with self.statsd: assert True, 'success' def test_contextmanager_buffering(self): with self.statsd as s: s.gauge('gauge', 123.4) s.gauge('gauge_other', 456.78) self.assertIsNone(s.socket.recv()) self.assertEqual(self.recv(), 'gauge:123.4|g\ngauge_other:456.78|g') def test_timed_elapsed(self): with self.statsd.timed('test_timer') as t: pass self.assertGreaterEqual(t.elapsed, 0) self.assertEqual(self.recv(), 'test_timer:%s|ms' % t.elapsed) diff --git a/version.txt b/version.txt index 41dad72..34057bb 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.63-0-g3f4d966 \ No newline at end of file +v0.0.64-0-g13c18d7 \ No newline at end of file