diff --git a/MANIFEST.in b/MANIFEST.in index e7c46fc..e19b7ae 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include Makefile include requirements.txt include requirements-swh.txt include version.txt +recursive-include swh/core/sql *.sql diff --git a/PKG-INFO b/PKG-INFO index 7f11cae..dd31e6c 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,59 +1,59 @@ Metadata-Version: 2.1 Name: swh.core -Version: 0.0.43 +Version: 0.0.44 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism Defines also a celery application to run concurrency tasks Celery use ---------- ### configuration file worker.ini file which looks like: [main] task_broker = amqp://guest@localhost// task_modules = swh.loader.dir.tasks, swh.loader.tar.tasks, swh.loader.git.tasks task_queues = swh_loader_tar, swh_loader_git, swh_loader_dir task_soft_time_limit = 0 This file can be set in the following location: - ~/.swh - ~/.config/swh - /etc/softwareheritage ### run celery worker Sample command: celery worker --app=swh.core.worker \ --pool=prefork \ --autoscale=2,2 \ -Ofair \ --loglevel=info 2>&1 | tee -a swh-core-worker.log Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-test.txt b/requirements-test.txt index 145e520..4dfbdaa 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,2 +1,2 @@ -nose +pytest requests-mock diff --git a/setup.py b/setup.py index 5755b09..25fc474 100755 --- a/setup.py +++ b/setup.py @@ -1,66 +1,69 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not os.path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.core', description='Software Heritage core utilities', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DCORE/', packages=find_packages(), scripts=[], install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], 
extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, + entry_points={ + 'console_scripts': ['swh-db-init=swh.core.cli:db_init'], + }, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-core', }, ) diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index 7f11cae..dd31e6c 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,59 +1,59 @@ Metadata-Version: 2.1 Name: swh.core -Version: 0.0.43 +Version: 0.0.44 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism Defines also a celery application to run concurrency tasks Celery use ---------- ### configuration file worker.ini file which looks like: [main] task_broker = amqp://guest@localhost// task_modules = swh.loader.dir.tasks, swh.loader.tar.tasks, swh.loader.git.tasks task_queues = swh_loader_tar, swh_loader_git, swh_loader_dir task_soft_time_limit = 0 This file can be set in the following location: - ~/.swh - ~/.config/swh - /etc/softwareheritage ### run celery worker Sample command: celery worker --app=swh.core.worker \ --pool=prefork \ --autoscale=2,2 \ -Ofair \ --loglevel=info 2>&1 | tee -a swh-core-worker.log Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index e17ec53..6893a6b 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,47 +1,49 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README.md requirements-swh.txt requirements-test.txt requirements.txt setup.py tox.ini version.txt debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder -sql/log-schema.sql swh/__init__.py swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt +swh.core.egg-info/entry_points.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py swh/core/api.py swh/core/api_async.py +swh/core/cli.py swh/core/config.py swh/core/logger.py swh/core/serializers.py swh/core/tarball.py swh/core/utils.py +swh/core/sql/log-schema.sql swh/core/tests/__init__.py swh/core/tests/db_testing.py swh/core/tests/server_testing.py swh/core/tests/test_api.py swh/core/tests/test_config.py swh/core/tests/test_logger.py 
swh/core/tests/test_serializers.py swh/core/tests/test_utils.py \ No newline at end of file diff --git a/swh.core.egg-info/entry_points.txt b/swh.core.egg-info/entry_points.txt new file mode 100644 index 0000000..6b673cb --- /dev/null +++ b/swh.core.egg-info/entry_points.txt @@ -0,0 +1,3 @@ +[console_scripts] +swh-db-init = swh.core.cli:db_init + diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt index 0f40f15..bd2ef50 100644 --- a/swh.core.egg-info/requires.txt +++ b/swh.core.egg-info/requires.txt @@ -1,14 +1,14 @@ Flask PyYAML aiohttp arrow msgpack-python psycopg2 python-dateutil requests systemd-python vcversioner [testing] -nose +pytest requests-mock diff --git a/swh/core/cli.py b/swh/core/cli.py new file mode 100755 index 0000000..1da8bdb --- /dev/null +++ b/swh/core/cli.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import warnings +warnings.filterwarnings("ignore") # noqa prevent psycopg from telling us sh*t + +from os import path +import glob + +import click +from importlib import import_module + +from swh.core.utils import numfile_sortkey as sortkey +from swh.core.tests.db_testing import pg_createdb, pg_restore, DB_DUMP_TYPES + + +@click.command() +@click.argument('module', nargs=-1, required=True) +@click.option('--db-name', '-d', help='Database name.', + default='softwareheritage-dev', show_default=True) +@click.option('--no-create', '-C', is_flag=True, + help='Do not attempt to create the database', default=False) +def db_init(module, db_name=None, no_create=None): + """Create and initialise a database for the given Software Heritage module(s). + + Example: + + swh-db-init storage -d swh-test + + If you want to specify non-default PostgreSQL connection parameters, + please provide them using standard environment variables. + See the psql(1) man page (section ENVIRONMENT) for details.
+ + Example: + + PGPORT=5434 swh-db-init indexer -d swh-indexer + + """ + + dump_files = [] + + for modname in module: + if not modname.startswith('swh.'): + modname = 'swh.{}'.format(modname) + try: + m = import_module(modname) + except ImportError: + raise click.BadParameter( + 'Unable to load module {}'.format(modname)) + + sqldir = path.join(path.dirname(m.__file__), 'sql') + if not path.isdir(sqldir): + raise click.BadParameter( + 'Module {} does not provide a db schema ' + '(no sql/ dir)'.format(modname)) + dump_files.extend(sorted(glob.glob(path.join(sqldir, '*.sql')), + key=sortkey)) + if not no_create: + pg_createdb(db_name) + + dump_files = [(x, DB_DUMP_TYPES[path.splitext(x)[1]]) + for x in dump_files] + for dump, dtype in dump_files: + click.secho('Loading {}'.format(dump), fg='yellow') + pg_restore(db_name, dump, dtype) + + click.secho('DONE database is {}'.format(db_name), fg='green', bold=True) diff --git a/swh/core/logger.py b/swh/core/logger.py index 06e0847..ecced63 100644 --- a/swh/core/logger.py +++ b/swh/core/logger.py @@ -1,192 +1,192 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging import os import socket import psycopg2 from psycopg2.extras import Json from systemd.journal import JournalHandler as _JournalHandler, send try: from celery import current_task except ImportError: current_task = None EXTRA_LOGDATA_PREFIX = 'swh_' def db_level_of_py_level(lvl): """convert a log level of the logging module to a log level suitable for the logging Postgres DB """ return logging.getLevelName(lvl).lower() def get_extra_data(record, task_args=True): """Get the extra data to insert into the database from the logging record""" log_data = record.__dict__ extra_data = {k[len(EXTRA_LOGDATA_PREFIX):]: v for k, v in log_data.items() if k.startswith(EXTRA_LOGDATA_PREFIX)} args = log_data.get('args') if args: extra_data['logging_args'] = args # Retrieve Celery task info if current_task and current_task.request: extra_data['task'] = { 'id': current_task.request.id, 'name': current_task.name, } if task_args: extra_data['task'].update({ 'kwargs': current_task.request.kwargs, 'args': current_task.request.args, }) return extra_data def flatten(data, separator='_'): """Flatten the data dictionary into a flat structure""" def inner_flatten(data, prefix): if isinstance(data, dict): for key, value in data.items(): yield from inner_flatten(value, prefix + [key]) elif isinstance(data, (list, tuple)): for key, value in enumerate(data): yield from inner_flatten(value, prefix + [str(key)]) else: yield prefix, data for path, value in inner_flatten(data, []): yield separator.join(path), value def stringify(value): """Convert value to string""" if isinstance(value, datetime.datetime): return value.isoformat() return str(value) class PostgresHandler(logging.Handler): """log handler that stores messages in a Postgres DB - See swh-core/sql/log-schema.sql for the DB schema. + See swh-core/swh/core/sql/log-schema.sql for the DB schema. All logging methods can be used as usual. Additionally, arbitrary metadata can be passed to logging methods, which will be stored in the DB as a single JSONB value.
To do so, pass a dictionary to the 'extra' kwarg of any logging method; all keys in that dictionary that start with EXTRA_LOGDATA_PREFIX (currently: 'swh\_') will be extracted to form the JSONB dictionary. The prefix will be stripped and not included in the DB. Note: the logger name will be used to fill the 'module' DB column. Sample usage:: logging.basicConfig(level=logging.INFO) h = PostgresHandler('dbname=softwareheritage-log') logger = logging.getLogger() logger.addHandler(h) logger.info('not so important notice', extra={'swh_type': 'swh_logging_test', 'swh_meditation': 'guru'}) logger.warning('something weird just happened, did you see that?') """ def __init__(self, connstring): """ Create a Postgres log handler. Args: connstring: libpq connection string to the log DB """ super().__init__() self.connstring = connstring self.fqdn = socket.getfqdn() # cache FQDN value def _connect(self): return psycopg2.connect(self.connstring) def emit(self, record): msg = self.format(record) extra_data = get_extra_data(record) if 'task' in extra_data: task_args = { 'args': extra_data['task']['args'], 'kwargs': extra_data['task']['kwargs'], } try: json_args = Json(task_args).getquoted() except TypeError: task_args = { 'args': [''], 'kwargs': {}, } else: json_args_length = len(json_args) if json_args_length >= 1000: task_args = { 'args': [''], 'kwargs': {}, } extra_data['task'].update(task_args) log_entry = (db_level_of_py_level(record.levelno), msg, Json(extra_data), record.name, self.fqdn, os.getpid()) db = self._connect() with db.cursor() as cur: cur.execute('INSERT INTO log ' '(level, message, data, src_module, src_host, src_pid)' 'VALUES (%s, %s, %s, %s, %s, %s)', log_entry) db.commit() db.close() class JournalHandler(_JournalHandler): def emit(self, record): """Write `record` as a journal event. MESSAGE is taken from the message provided by the user, and PRIORITY, LOGGER, THREAD_NAME, CODE_{FILE,LINE,FUNC} fields are appended automatically. In addition, record.MESSAGE_ID will be used if present. """ try: extra_data = flatten(get_extra_data(record, task_args=False)) extra_data = { (EXTRA_LOGDATA_PREFIX + key).upper(): stringify(value) for key, value in extra_data } msg = self.format(record) pri = self.mapPriority(record.levelno) send(msg, PRIORITY=format(pri), LOGGER=record.name, THREAD_NAME=record.threadName, CODE_FILE=record.pathname, CODE_LINE=record.lineno, CODE_FUNC=record.funcName, **extra_data) except Exception: self.handleError(record) diff --git a/swh/core/serializers.py b/swh/core/serializers.py index b3c4a19..49bd7e0 100644 --- a/swh/core/serializers.py +++ b/swh/core/serializers.py @@ -1,175 +1,186 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import arrow import base64 import datetime from json import JSONDecoder, JSONEncoder import types from uuid import UUID import dateutil.parser import msgpack def encode_data_client(data): try: return msgpack_dumps(data) except OverflowError as e: raise ValueError('Limits were reached.
Please check your input.\n' + str(e)) def decode_response(response): content_type = response.headers['content-type'] if content_type.startswith('application/x-msgpack'): r = msgpack_loads(response.content) elif content_type.startswith('application/json'): r = response.json(cls=SWHJSONDecoder) else: raise ValueError('Wrong content type `%s` for API response' % content_type) return r class SWHJSONEncoder(JSONEncoder): """JSON encoder for data structures generated by Software Heritage. This JSON encoder extends the default Python JSON encoder and adds awareness for the following specific types: - bytes (get encoded as a Base85 string); - datetime.datetime (get encoded as an ISO8601 string). Non-standard types get encoded as a dictionary with two keys: - swhtype with value 'bytes' or 'datetime'; - d containing the encoded value. SWHJSONEncoder also encodes arbitrary iterables as a list (allowing serialization of generators). Caveats: Limitations in the JSONEncoder extension mechanism prevent us from "escaping" dictionaries that only contain the swhtype and d keys, and therefore arbitrary data structures can't be round-tripped through SWHJSONEncoder and SWHJSONDecoder. """ def default(self, o): if isinstance(o, bytes): return { 'swhtype': 'bytes', 'd': base64.b85encode(o).decode('ascii'), } elif isinstance(o, datetime.datetime): return { 'swhtype': 'datetime', 'd': o.isoformat(), } elif isinstance(o, UUID): return { 'swhtype': 'uuid', 'd': str(o), } elif isinstance(o, datetime.timedelta): return { 'swhtype': 'timedelta', - 'd': repr(o), + 'd': { + 'days': o.days, + 'seconds': o.seconds, + 'microseconds': o.microseconds, + }, } elif isinstance(o, arrow.Arrow): return { 'swhtype': 'arrow', 'd': o.isoformat(), } try: return super().default(o) except TypeError as e: try: iterable = iter(o) except TypeError: raise e from None else: return list(iterable) class SWHJSONDecoder(JSONDecoder): """JSON decoder for data structures encoded with SWHJSONEncoder. This JSON decoder extends the default Python JSON decoder, allowing the decoding of: - bytes (encoded as a Base85 string); - datetime.datetime (encoded as an ISO8601 string). Non-standard types must be encoded as a dictionary with exactly two keys: - swhtype with value 'bytes' or 'datetime'; - d containing the encoded value. To limit the impact of our encoding, if the swhtype key doesn't contain a known value, the dictionary is decoded as-is.
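For instance (an illustrative sketch of the dict-based encoding introduced above): json.dumps(datetime.timedelta(days=64), cls=SWHJSONEncoder) produces {"swhtype": "timedelta", "d": {"days": 64, "seconds": 0, "microseconds": 0}}, and this decoder rebuilds the value with datetime.timedelta(**d) instead of eval()-ing a repr() string.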
""" def decode_data(self, o): if isinstance(o, dict): if set(o.keys()) == {'d', 'swhtype'}: datatype = o['swhtype'] if datatype == 'bytes': return base64.b85decode(o['d']) elif datatype == 'datetime': return dateutil.parser.parse(o['d']) elif datatype == 'uuid': return UUID(o['d']) elif datatype == 'timedelta': - return eval(o['d']) + return datetime.timedelta(**o['d']) elif datatype == 'arrow': return arrow.get(o['d']) return {key: self.decode_data(value) for key, value in o.items()} if isinstance(o, list): return [self.decode_data(value) for value in o] else: return o def raw_decode(self, s, idx=0): data, index = super().raw_decode(s, idx) return self.decode_data(data), index def msgpack_dumps(data): """Write data as a msgpack stream""" def encode_types(obj): if isinstance(obj, datetime.datetime): return {b'__datetime__': True, b's': obj.isoformat()} if isinstance(obj, types.GeneratorType): return list(obj) if isinstance(obj, UUID): return {b'__uuid__': True, b's': str(obj)} if isinstance(obj, datetime.timedelta): - return {b'__timedelta__': True, b's': repr(obj)} + return { + b'__timedelta__': True, + b's': { + 'days': obj.days, + 'seconds': obj.seconds, + 'microseconds': obj.microseconds, + }, + } if isinstance(obj, arrow.Arrow): return {b'__arrow__': True, b's': obj.isoformat()} return obj return msgpack.packb(data, use_bin_type=True, default=encode_types) def msgpack_loads(data): """Read data as a msgpack stream""" def decode_types(obj): if b'__datetime__' in obj and obj[b'__datetime__']: return dateutil.parser.parse(obj[b's']) if b'__uuid__' in obj and obj[b'__uuid__']: return UUID(obj[b's']) if b'__timedelta__' in obj and obj[b'__timedelta__']: - return eval(obj[b's']) + return datetime.timedelta(**obj[b's']) if b'__arrow__' in obj and obj[b'__arrow__']: return arrow.get(obj[b's']) return obj return msgpack.unpackb(data, encoding='utf-8', object_hook=decode_types) diff --git a/sql/log-schema.sql b/swh/core/sql/log-schema.sql similarity index 100% rename from sql/log-schema.sql rename to swh/core/sql/log-schema.sql diff --git a/swh/core/tests/__init__.py b/swh/core/tests/__init__.py index e69de29..0e407c3 100644 --- a/swh/core/tests/__init__.py +++ b/swh/core/tests/__init__.py @@ -0,0 +1,5 @@ +from os import path +import swh.core + + +SQL_DIR = path.join(path.dirname(swh.core.__file__), 'sql') diff --git a/swh/core/tests/db_testing.py b/swh/core/tests/db_testing.py index 035945d..7c7971d 100644 --- a/swh/core/tests/db_testing.py +++ b/swh/core/tests/db_testing.py @@ -1,266 +1,281 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os +import glob import psycopg2 import subprocess +from swh.core.utils import numfile_sortkey as sortkey + +DB_DUMP_TYPES = {'.sql': 'psql', '.dump': 'pg_dump'} + def pg_restore(dbname, dumpfile, dumptype='pg_dump'): """ Args: dbname: name of the DB to restore into dumpfile: path fo the dump file dumptype: one of 'pg_dump' (for binary dumps), 'psql' (for SQL dumps) """ assert dumptype in ['pg_dump', 'psql'] if dumptype == 'pg_dump': subprocess.check_call(['pg_restore', '--no-owner', '--no-privileges', '--dbname', dbname, dumpfile]) elif dumptype == 'psql': subprocess.check_call(['psql', '--quiet', '--no-psqlrc', '-v', 'ON_ERROR_STOP=1', '-f', dumpfile, dbname]) def pg_dump(dbname, dumpfile): subprocess.check_call(['pg_dump', '--no-owner', 
'--no-privileges', '-Fc', '-f', dumpfile, dbname]) def pg_dropdb(dbname): subprocess.check_call(['dropdb', dbname]) def pg_createdb(dbname): subprocess.check_call(['createdb', dbname]) -def db_create(dbname, dump=None, dumptype='pg_dump'): - """create the test DB and load the test data dump into it +def db_create(dbname, dumps=None): + """create the test DB and load the test data dumps into it + + dumps is an iterable of pairs (dump_file, dump_type). context: setUpClass """ try: pg_createdb(dbname) except subprocess.CalledProcessError: # try recovering once, in case pg_dropdb(dbname) # the db already existed pg_createdb(dbname) - if dump: - pg_restore(dbname, dump, dumptype) + for dump, dtype in dumps: + pg_restore(dbname, dump, dtype) return dbname def db_destroy(dbname): """destroy the test DB context: tearDownClass """ pg_dropdb(dbname) def db_connect(dbname): """connect to the test DB and open a cursor context: setUp """ conn = psycopg2.connect('dbname=' + dbname) return { 'conn': conn, 'cursor': conn.cursor() } def db_close(conn): """rollback current transaction and disconnect from the test DB context: tearDown """ if not conn.closed: conn.rollback() conn.close() class DbTestConn: def __init__(self, dbname): self.dbname = dbname def __enter__(self): self.db_setup = db_connect(self.dbname) self.conn = self.db_setup['conn'] self.cursor = self.db_setup['cursor'] return self def __exit__(self, *_): db_close(self.conn) class DbTestContext: - def __init__(self, name='softwareheritage-test', dump=None, - dump_type='pg_dump'): + def __init__(self, name='softwareheritage-test', dumps=None): self.dbname = name - self.dump = dump - self.dump_type = dump_type + self.dumps = dumps def __enter__(self): db_create(dbname=self.dbname, - dump=self.dump, - dumptype=self.dump_type) + dumps=self.dumps) return self def __exit__(self, *_): db_destroy(self.dbname) class DbTestFixture: """Mix this in a test subject class to get DB testing support. Use the class method add_db() to add a new database to be tested. Using this will create a DbTestConn entry in the `test_db` dictionary for all the tests, indexed by the name of the database. Example: class TestDb(DbTestFixture, unittest.TestCase): @classmethod def setUpClass(cls): cls.add_db('db_name', DUMP) super().setUpClass() def setUp(self): db = self.test_db['db_name'] print('conn: {}, cursor: {}'.format(db.conn, db.cursor)) To ensure test isolation, each test method of the test case class will execute in its own connection, cursor, and transaction. Note that if you want to define setup/teardown methods, you need to explicitly call super() to ensure that the fixture setup/teardown methods are invoked.
Here is an example where all setup/teardown methods are defined in a test case: class TestDb(DbTestFixture, unittest.TestCase): @classmethod def setUpClass(cls): # your add_db() calls here super().setUpClass() # your class setup code here def setUp(self): super().setUp() # your instance setup code here def tearDown(self): # your instance teardown code here super().tearDown() @classmethod def tearDownClass(cls): # your class teardown code here super().tearDownClass() """ _DB_DUMP_LIST = {} _DB_LIST = {} DB_TEST_FIXTURE_IMPORTED = True @classmethod - def add_db(cls, name='softwareheritage-test', dump=None, - dump_type='pg_dump'): - cls._DB_DUMP_LIST[name] = (dump, dump_type) + def add_db(cls, name='softwareheritage-test', dumps=None): + cls._DB_DUMP_LIST[name] = dumps @classmethod def setUpClass(cls): - for name, (dump, dump_type) in cls._DB_DUMP_LIST.items(): - cls._DB_LIST[name] = DbTestContext(name, dump, dump_type) + for name, dumps in cls._DB_DUMP_LIST.items(): + cls._DB_LIST[name] = DbTestContext(name, dumps) cls._DB_LIST[name].__enter__() super().setUpClass() @classmethod def tearDownClass(cls): super().tearDownClass() for name, context in cls._DB_LIST.items(): context.__exit__() - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.test_db = {} - def setUp(self): self.test_db = {} for name in self._DB_LIST.keys(): self.test_db[name] = DbTestConn(name) self.test_db[name].__enter__() super().setUp() def tearDown(self): super().tearDown() for name in self._DB_LIST.keys(): self.test_db[name].__exit__() def reset_db_tables(self, name, excluded=None): db = self.test_db[name] conn = db.conn cursor = db.cursor cursor.execute("""SELECT table_name FROM information_schema.tables WHERE table_schema = %s""", ('public',)) tables = set(table for (table,) in cursor.fetchall()) if excluded is not None: tables -= set(excluded) for table in tables: cursor.execute('truncate table %s cascade' % table) conn.commit() class SingleDbTestFixture(DbTestFixture): """Simplified fixture like DbTestFixture but that can only handle a single DB. Gives access to shortcuts like self.cursor and self.conn. DO NOT use this with other fixtures that need to access databases, like StorageTestFixture. The class can override the following class attributes: TEST_DB_NAME: name of the DB used for testing TEST_DB_DUMP: DB dump to be restored before running test methods; can - be set to None if no restore from dump is required - TEST_DB_DUMP_TYPE: one of 'pg_dump' (binary dump) or 'psql' (SQL dump) + be set to None if no restore from dump is required. + If the dump file name ends with: + - '.sql' it will be loaded via psql, + - '.dump' it will be loaded via pg_restore. + Other file extensions will be ignored. + Can be a string or a list of strings; each path will be expanded + using glob pattern matching. The test case class will then have the following attributes, accessible via self: dbname: name of the test database conn: psycopg2 connection object cursor: open psycopg2 cursor to the DB """ TEST_DB_NAME = 'softwareheritage-test' TEST_DB_DUMP = None - TEST_DB_DUMP_TYPE = 'pg_dump' @classmethod def setUpClass(cls): - cls.dbname = cls.TEST_DB_NAME + cls.dbname = cls.TEST_DB_NAME # XXX to kill?
+ + dump_files = cls.TEST_DB_DUMP + if isinstance(dump_files, str): + dump_files = [dump_files] + all_dump_files = [] + for files in dump_files: + all_dump_files.extend( + sorted(glob.glob(files), key=sortkey)) + + all_dump_files = [(x, DB_DUMP_TYPES[os.path.splitext(x)[1]]) + for x in all_dump_files] + cls.add_db(name=cls.TEST_DB_NAME, - dump=cls.TEST_DB_DUMP, - dump_type=cls.TEST_DB_DUMP_TYPE) + dumps=all_dump_files) super().setUpClass() def setUp(self): super().setUp() db = self.test_db[self.TEST_DB_NAME] self.conn = db.conn self.cursor = db.cursor diff --git a/swh/core/tests/test_config.py b/swh/core/tests/test_config.py index bdabb1a..b0461b4 100644 --- a/swh/core/tests/test_config.py +++ b/swh/core/tests/test_config.py @@ -1,210 +1,210 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile import unittest from swh.core import config class ConfReaderTest(unittest.TestCase): @classmethod def setUpClass(cls): # create a temporary folder cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-core.') cls.conffile = os.path.join(cls.tmpdir, 'config.ini') conf_contents = """[main] a = 1 b = this is a string c = true h = false ls = list, of, strings li = 1, 2, 3, 4 """ with open(cls.conffile, 'w') as conf: conf.write(conf_contents) cls.non_existing_conffile = os.path.join(cls.tmpdir, 'config-nonexisting.ini') # Create an unreadable, proper configuration file cls.perms_broken_file = os.path.join(cls.tmpdir, 'unreadable.ini') with open(cls.perms_broken_file, 'w') as conf: conf.write(conf_contents) os.chmod(cls.perms_broken_file, 0o000) # Create a proper configuration file in an unreadable directory cls.perms_broken_dir = os.path.join(cls.tmpdir, 'unreadabledir') cls.file_in_broken_dir = os.path.join(cls.perms_broken_dir, 'unreadable.ini') os.makedirs(cls.perms_broken_dir) with open(cls.file_in_broken_dir, 'w') as conf: conf.write(conf_contents) os.chmod(cls.perms_broken_dir, 0o000) cls.empty_conffile = os.path.join(cls.tmpdir, 'empty.ini') open(cls.empty_conffile, 'w').close() cls.default_conf = { 'a': ('int', 2), 'b': ('string', 'default-string'), 'c': ('bool', True), 'd': ('int', 10), 'e': ('int', None), 'f': ('bool', None), 'g': ('string', None), 'h': ('bool', True), 'i': ('bool', True), 'ls': ('list[str]', ['a', 'b', 'c']), 'li': ('list[int]', [42, 43]), } cls.other_default_conf = { 'a': ('int', 3), } cls.full_default_conf = cls.default_conf.copy() cls.full_default_conf['a'] = cls.other_default_conf['a'] cls.parsed_default_conf = { key: value for key, (type, value) in cls.default_conf.items() } cls.parsed_conffile = { 'a': 1, 'b': 'this is a string', 'c': True, 'd': 10, 'e': None, 'f': None, 'g': None, 'h': False, 'i': True, 'ls': ['list', 'of', 'strings'], 'li': [1, 2, 3, 4], } @classmethod def tearDownClass(cls): # Make the broken perms items readable again to be able to remove them os.chmod(cls.perms_broken_dir, 0o755) os.chmod(cls.perms_broken_file, 0o644) shutil.rmtree(cls.tmpdir) def test_read(self): # when res = config.read(self.conffile, self.default_conf) # then - self.assertEquals(res, self.parsed_conffile) + self.assertEqual(res, self.parsed_conffile) def test_read_empty_file(self): # when res = config.read(None, self.default_conf) # then - self.assertEquals(res, self.parsed_default_conf) + self.assertEqual(res, self.parsed_default_conf) def 
test_support_non_existing_conffile(self): # when res = config.read(self.non_existing_conffile, self.default_conf) # then - self.assertEquals(res, self.parsed_default_conf) + self.assertEqual(res, self.parsed_default_conf) def test_support_empty_conffile(self): # when res = config.read(self.empty_conffile, self.default_conf) # then - self.assertEquals(res, self.parsed_default_conf) + self.assertEqual(res, self.parsed_default_conf) def test_raise_on_broken_directory_perms(self): with self.assertRaises(PermissionError): config.read(self.file_in_broken_dir, self.default_conf) def test_raise_on_broken_file_perms(self): with self.assertRaises(PermissionError): config.read(self.perms_broken_file, self.default_conf) def test_merge_default_configs(self): # when res = config.merge_default_configs(self.default_conf, self.other_default_conf) # then - self.assertEquals(res, self.full_default_conf) + self.assertEqual(res, self.full_default_conf) def test_priority_read_nonexist_conf(self): # when res = config.priority_read([self.non_existing_conffile, self.conffile], self.default_conf) # then - self.assertEquals(res, self.parsed_conffile) + self.assertEqual(res, self.parsed_conffile) def test_priority_read_conf_nonexist_empty(self): # when res = config.priority_read([ self.conffile, self.non_existing_conffile, self.empty_conffile, ], self.default_conf) # then - self.assertEquals(res, self.parsed_conffile) + self.assertEqual(res, self.parsed_conffile) def test_priority_read_empty_conf_nonexist(self): # when res = config.priority_read([ self.empty_conffile, self.conffile, self.non_existing_conffile, ], self.default_conf) # then - self.assertEquals(res, self.parsed_default_conf) + self.assertEqual(res, self.parsed_default_conf) def test_swh_config_paths(self): res = config.swh_config_paths('foo/bar.ini') self.assertEqual(res, [ '~/.config/swh/foo/bar.ini', '~/.swh/foo/bar.ini', '/etc/softwareheritage/foo/bar.ini', ]) def test_prepare_folder(self): # given conf = {'path1': os.path.join(self.tmpdir, 'path1'), 'path2': os.path.join(self.tmpdir, 'path2', 'depth1')} # the folders does not exists self.assertFalse(os.path.exists(conf['path1']), "path1 should not exist.") self.assertFalse(os.path.exists(conf['path2']), "path2 should not exist.") # when config.prepare_folders(conf, 'path1') # path1 exists but not path2 self.assertTrue(os.path.exists(conf['path1']), "path1 should now exist!") self.assertFalse(os.path.exists(conf['path2']), "path2 should not exist.") # path1 already exists, skips it but creates path2 config.prepare_folders(conf, 'path1', 'path2') self.assertTrue(os.path.exists(conf['path1']), "path1 should still exist!") self.assertTrue(os.path.exists(conf['path2']), "path2 should now exist.") diff --git a/swh/core/tests/test_logger.py b/swh/core/tests/test_logger.py index 1cbe308..ba4c9f6 100644 --- a/swh/core/tests/test_logger.py +++ b/swh/core/tests/test_logger.py @@ -1,47 +1,45 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import os import unittest -from nose.plugins.attrib import attr +import pytest from swh.core.logger import PostgresHandler from swh.core.tests.db_testing import SingleDbTestFixture -TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -SQL_DIR = os.path.join(TEST_DIR, '../../../sql') +from swh.core.tests import SQL_DIR -@attr('db') +@pytest.mark.db class 
PgLogHandler(SingleDbTestFixture, unittest.TestCase): TEST_DB_DUMP = os.path.join(SQL_DIR, 'log-schema.sql') - TEST_DB_DUMP_TYPE = 'psql' def setUp(self): super().setUp() self.modname = 'swh.core.tests.test_logger' self.logger = logging.Logger(self.modname, logging.DEBUG) self.logger.addHandler(PostgresHandler('dbname=' + self.TEST_DB_NAME)) def tearDown(self): logging.shutdown() super().tearDown() def test_log(self): self.logger.info('notice', extra={'swh_type': 'test entry', 'swh_data': 42}) - self.logger.warn('warning') + self.logger.warning('warning') with self.conn.cursor() as cur: cur.execute('SELECT level, message, data, src_module FROM log') db_log_entries = cur.fetchall() self.assertIn(('info', 'notice', {'type': 'test entry', 'data': 42}, self.modname), db_log_entries) self.assertIn(('warning', 'warning', {}, self.modname), db_log_entries) diff --git a/swh/core/tests/test_serializers.py b/swh/core/tests/test_serializers.py index 166744a..f9e80e9 100644 --- a/swh/core/tests/test_serializers.py +++ b/swh/core/tests/test_serializers.py @@ -1,80 +1,81 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import unittest from uuid import UUID import arrow from swh.core.serializers import ( SWHJSONDecoder, SWHJSONEncoder, msgpack_dumps, msgpack_loads ) class Serializers(unittest.TestCase): def setUp(self): self.tz = datetime.timezone(datetime.timedelta(minutes=118)) self.data = { 'bytes': b'123456789\x99\xaf\xff\x00\x12', 'datetime_naive': datetime.datetime(2015, 1, 1, 12, 4, 42, 231455), 'datetime_tz': datetime.datetime(2015, 3, 4, 18, 25, 13, 1234, tzinfo=self.tz), 'datetime_utc': datetime.datetime(2015, 3, 4, 18, 25, 13, 1234, tzinfo=datetime.timezone.utc), 'datetime_delta': datetime.timedelta(64), 'arrow_date': arrow.get('2018-04-25T16:17:53.533672+00:00'), 'swhtype': 'fake', 'swh_dict': {'swhtype': 42, 'd': 'test'}, 'random_dict': {'swhtype': 43}, 'uuid': UUID('cdd8f804-9db6-40c3-93ab-5955d3836234'), } self.encoded_data = { 'bytes': {'swhtype': 'bytes', 'd': 'F)}kWH8wXmIhn8j01^'}, 'datetime_naive': {'swhtype': 'datetime', 'd': '2015-01-01T12:04:42.231455'}, 'datetime_tz': {'swhtype': 'datetime', 'd': '2015-03-04T18:25:13.001234+01:58'}, 'datetime_utc': {'swhtype': 'datetime', 'd': '2015-03-04T18:25:13.001234+00:00'}, 'datetime_delta': {'swhtype': 'timedelta', - 'd': 'datetime.timedelta(64)'}, + 'd': {'days': 64, 'seconds': 0, + 'microseconds': 0}}, 'arrow_date': {'swhtype': 'arrow', 'd': '2018-04-25T16:17:53.533672+00:00'}, 'swhtype': 'fake', 'swh_dict': {'swhtype': 42, 'd': 'test'}, 'random_dict': {'swhtype': 43}, 'uuid': {'swhtype': 'uuid', 'd': 'cdd8f804-9db6-40c3-93ab-5955d3836234'}, } self.generator = (i for i in range(5)) self.gen_lst = list(range(5)) def test_round_trip_json(self): data = json.dumps(self.data, cls=SWHJSONEncoder) self.assertEqual(self.data, json.loads(data, cls=SWHJSONDecoder)) def test_encode_swh_json(self): data = json.dumps(self.data, cls=SWHJSONEncoder) self.assertEqual(self.encoded_data, json.loads(data)) def test_round_trip_msgpack(self): data = msgpack_dumps(self.data) self.assertEqual(self.data, msgpack_loads(data)) def test_generator_json(self): data = json.dumps(self.generator, cls=SWHJSONEncoder) self.assertEqual(self.gen_lst, json.loads(data, cls=SWHJSONDecoder)) def test_generator_msgpack(self): data = 
msgpack_dumps(self.generator) self.assertEqual(self.gen_lst, msgpack_loads(data)) diff --git a/swh/core/tests/test_utils.py b/swh/core/tests/test_utils.py index 48dab5d..a2a4397 100644 --- a/swh/core/tests/test_utils.py +++ b/swh/core/tests/test_utils.py @@ -1,116 +1,116 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.core import utils class UtilsLib(unittest.TestCase): def test_grouper(self): # given actual_data = utils.grouper((i for i in range(0, 9)), 2) out = [] for d in actual_data: out.append(list(d)) # force generator resolution for checks self.assertEqual(out, [[0, 1], [2, 3], [4, 5], [6, 7], [8]]) # given actual_data = utils.grouper((i for i in range(9, 0, -1)), 4) out = [] for d in actual_data: out.append(list(d)) # force generator resolution for checks self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]]) def test_backslashescape_errors(self): raw_data_err = b'abcd\x80' with self.assertRaises(UnicodeDecodeError): raw_data_err.decode('utf-8', 'strict') - self.assertEquals( + self.assertEqual( raw_data_err.decode('utf-8', 'backslashescape'), 'abcd\\x80', ) raw_data_ok = b'abcd\xc3\xa9' - self.assertEquals( + self.assertEqual( raw_data_ok.decode('utf-8', 'backslashescape'), raw_data_ok.decode('utf-8', 'strict'), ) unicode_data = 'abcdef\u00a3' - self.assertEquals( + self.assertEqual( unicode_data.encode('ascii', 'backslashescape'), b'abcdef\\xa3', ) def test_encode_with_unescape(self): valid_data = '\\x01020304\\x00' valid_data_encoded = b'\x01020304\x00' - self.assertEquals( + self.assertEqual( valid_data_encoded, utils.encode_with_unescape(valid_data) ) def test_encode_with_unescape_invalid_escape(self): invalid_data = 'test\\abcd' with self.assertRaises(ValueError) as exc: utils.encode_with_unescape(invalid_data) self.assertIn('invalid escape', exc.exception.args[0]) self.assertIn('position 4', exc.exception.args[0]) def test_decode_with_escape(self): backslashes = b'foo\\bar\\\\baz' backslashes_escaped = 'foo\\\\bar\\\\\\\\baz' - self.assertEquals( + self.assertEqual( backslashes_escaped, utils.decode_with_escape(backslashes), ) valid_utf8 = b'foo\xc3\xa2' valid_utf8_escaped = 'foo\u00e2' - self.assertEquals( + self.assertEqual( valid_utf8_escaped, utils.decode_with_escape(valid_utf8), ) invalid_utf8 = b'foo\xa2' invalid_utf8_escaped = 'foo\\xa2' - self.assertEquals( + self.assertEqual( invalid_utf8_escaped, utils.decode_with_escape(invalid_utf8), ) valid_utf8_nul = b'foo\xc3\xa2\x00' valid_utf8_nul_escaped = 'foo\u00e2\\x00' - self.assertEquals( + self.assertEqual( valid_utf8_nul_escaped, utils.decode_with_escape(valid_utf8_nul), ) def test_commonname(self): # when actual_commonname = utils.commonname('/some/where/to/', '/some/where/to/go/to') # then - self.assertEquals('go/to', actual_commonname) + self.assertEqual('go/to', actual_commonname) # when actual_commonname2 = utils.commonname(b'/some/where/to/', b'/some/where/to/go/to') # then - self.assertEquals(b'go/to', actual_commonname2) + self.assertEqual(b'go/to', actual_commonname2) diff --git a/swh/core/utils.py b/swh/core/utils.py index 0e748a1..857f8f6 100644 --- a/swh/core/utils.py +++ b/swh/core/utils.py @@ -1,103 +1,117 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License 
version 3, or any later version # See top-level LICENSE file for more information import os import itertools import codecs +import re from contextlib import contextmanager @contextmanager def cwd(path): """Contextually change the working directory to do thy bidding. Then gets back to the original location. """ prev_cwd = os.getcwd() os.chdir(path) try: yield finally: os.chdir(prev_cwd) def grouper(iterable, n): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block Returns: fixed-length chunks as iterables """ args = [iter(iterable)] * n for _data in itertools.zip_longest(*args, fillvalue=None): yield (d for d in _data if d is not None) def backslashescape_errors(exception): if isinstance(exception, UnicodeDecodeError): bad_data = exception.object[exception.start:exception.end] escaped = ''.join(r'\x%02x' % x for x in bad_data) return escaped, exception.end return codecs.backslashreplace_errors(exception) codecs.register_error('backslashescape', backslashescape_errors) def encode_with_unescape(value): """Encode a unicode string containing \\x backslash escapes""" slices = [] start = 0 odd_backslashes = False i = 0 while i < len(value): if value[i] == '\\': odd_backslashes = not odd_backslashes else: if odd_backslashes: if value[i] != 'x': raise ValueError('invalid escape for %r at position %d' % (value, i-1)) slices.append( value[start:i-1].replace('\\\\', '\\').encode('utf-8') ) slices.append(bytes.fromhex(value[i+1:i+3])) odd_backslashes = False start = i = i + 3 continue i += 1 slices.append( value[start:i].replace('\\\\', '\\').encode('utf-8') ) return b''.join(slices) def decode_with_escape(value): """Decode a bytestring as utf-8, escaping the bytes of invalid utf-8 sequences as \\x. We also escape NUL bytes as they are invalid in JSON strings. """ # escape backslashes value = value.replace(b'\\', b'\\\\') value = value.replace(b'\x00', b'\\x00') return value.decode('utf-8', 'backslashescape') def commonname(path0, path1, as_str=False): """Compute the commonname between path0 and path1. """ return path1.split(path0)[1] + + +def numfile_sortkey(fname): + """Sort key for filenames of the form: + + nnxxx.ext + + where nn is a number. Filenames are ordered by that leading number; + filenames without one are given key 99 so they sort after the + numbered ones. + + Typically used to sort sql/nn-swh-xxx.sql files. + """ + num, rem = re.match(r'(\d*)(.*)', fname).groups() + return (num and int(num) or 99, rem) diff --git a/tox.ini b/tox.ini index 7a3f221..c1e2f8b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,23 +1,17 @@ [tox] -envlist=check-manifest,flake8,py3 +envlist=flake8,py3 [testenv:py3] deps = + -r requirements-test.txt + pytest-cov pifpaf - nose commands = - pifpaf run postgresql -- nosetests + pifpaf run postgresql -- pytest --cov=swh --cov-branch {posargs} [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 - -[testenv:check-manifest] -skip_install = true -deps = - check-manifest -commands = - {envpython} -m check_manifest {toxinidir} diff --git a/version.txt b/version.txt index 6e57d71..2595a75 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.43-0-g8363433 \ No newline at end of file +v0.0.44-0-ge118202 \ No newline at end of file
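The new numfile_sortkey utility is what makes the glob-based dump loading in swh-db-init and SingleDbTestFixture deterministic. A minimal sketch of its behaviour (illustrative file names, not taken from the repository):

    >>> from swh.core.utils import numfile_sortkey
    >>> sorted(['30-schema.sql', '10-init.sql', '2-extra.sql', 'notes.txt'],
    ...        key=numfile_sortkey)
    ['2-extra.sql', '10-init.sql', '30-schema.sql', 'notes.txt']

Files are ordered by their numeric prefix rather than lexicographically (plain string sorting would put '10-init.sql' before '2-extra.sql'), and files without a numeric prefix get key 99, so they sort after the numbered ones.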