diff --git a/PKG-INFO b/PKG-INFO index 26fd246..76b5ec9 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.27 +Version: 0.0.28 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 4280875..9da8398 100644 --- a/debian/control +++ b/debian/control @@ -1,21 +1,23 @@ Source: swh-core Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, + python3-flask, + python3-requests, python3-dateutil, python3-msgpack, python3-nose, python3-psycopg2, python3-setuptools, python3-yaml, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DCORE/ Package: python3-swh.core Architecture: all Depends: ${misc:Depends}, ${python3:Depends} Description: Software Heritage core utilities diff --git a/requirements.txt b/requirements.txt index 942ea75..e5c3fc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ msgpack-python psycopg2 python-dateutil vcversioner PyYAML +requests +Flask diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index 26fd246..76b5ec9 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.27 +Version: 0.0.28 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index b545ac6..ac6b3fd 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,35 +1,36 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README.md requirements.txt setup.py version.txt bin/swh-hashdir bin/swh-hashfile debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format sql/log-schema.sql swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py +swh/core/api.py swh/core/config.py swh/core/hashutil.py swh/core/logger.py swh/core/serializers.py swh/core/utils.py swh/core/tests/db_testing.py swh/core/tests/test_config.py swh/core/tests/test_hashutil.py swh/core/tests/test_logger.py swh/core/tests/test_serializers.py swh/core/tests/test_utils.py \ No newline at end of file diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt index 5ee2ebc..11dff34 100644 --- a/swh.core.egg-info/requires.txt +++ b/swh.core.egg-info/requires.txt @@ -1,5 +1,7 @@ +Flask PyYAML msgpack-python psycopg2 python-dateutil +requests vcversioner diff --git a/swh/core/api.py b/swh/core/api.py new file mode 100644 index 0000000..04d2981 --- /dev/null +++ b/swh/core/api.py @@ -0,0 +1,108 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import pickle +import requests + +from flask import Flask, Request, Response +from .serializers import (decode_response, + encode_data_client as encode_data, + msgpack_dumps, msgpack_loads, SWHJSONDecoder) + + +class SWHRemoteAPI: + """Proxy to an internal SWH API + + """ + + def __init__(self, api_exception, url): + super().__init__() + self.api_exception = api_exception + base_url = url if url.endswith('/') else url + '/' + self.url = base_url + self.session = requests.Session() + + def _url(self, endpoint): + return '%s%s' % (self.url, endpoint) + + def post(self, endpoint, data): + try: + response = self.session.post( + self._url(endpoint), + data=encode_data(data), + headers={'content-type': 'application/x-msgpack'}, + ) + except ConnectionError as e: + print(str(e)) + raise self.api_exception(e) + + # XXX: this breaks language-independence and should be + # replaced by proper unserialization + if response.status_code == 400: + raise pickle.loads(decode_response(response)) + + return decode_response(response) + + def get(self, endpoint, data=None): + try: + response = self.session.get( + self._url(endpoint), + params=data, + ) + except ConnectionError as e: + print(str(e)) + raise self.api_exception(e) + + if response.status_code == 404: + return None + + # XXX: this breaks language-independence and should be + # replaced by proper unserialization + if response.status_code == 400: + raise pickle.loads(decode_response(response)) + + else: + return decode_response(response) + + +class BytesRequest(Request): + """Request with proper escaping of arbitrary byte sequences.""" + encoding = 'utf-8' + encoding_errors = 'surrogateescape' + + +def encode_data_server(data): + return Response( + msgpack_dumps(data), + mimetype='application/x-msgpack', + ) + + +def decode_request(request): + content_type = request.mimetype + data = request.get_data() + + if content_type == 'application/x-msgpack': + r = msgpack_loads(data) + elif content_type == 'application/json': + r = json.loads(data, cls=SWHJSONDecoder) + else: + raise ValueError('Wrong content type `%s` for API request' + % content_type) + + return r + + +def error_handler(exception, encoder): + # XXX: this breaks language-independence and should be + # replaced by proper serialization of errors + response = encoder(pickle.dumps(exception)) + response.status_code = 400 + return response + + +class SWHServerAPIApp(Flask): + request_class = BytesRequest diff --git a/swh/core/serializers.py b/swh/core/serializers.py index 8466d53..437fc4f 100644 --- a/swh/core/serializers.py +++ b/swh/core/serializers.py @@ -1,126 +1,148 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import datetime from json import JSONDecoder, JSONEncoder import types from uuid import UUID import dateutil.parser import msgpack +def encode_data_client(data): + try: + return msgpack_dumps(data) + except OverflowError as e: + raise ValueError('Limits were reached. Please, check your input.\n' + + str(e)) + + +def decode_response(response): + content_type = response.headers['content-type'] + + if content_type.startswith('application/x-msgpack'): + r = msgpack_loads(response.content) + elif content_type.startswith('application/json'): + r = response.json(cls=SWHJSONDecoder) + else: + raise ValueError('Wrong content type `%s` for API response' + % content_type) + + return r + + class SWHJSONEncoder(JSONEncoder): """JSON encoder for data structures generated by Software Heritage. This JSON encoder extends the default Python JSON encoder and adds awareness for the following specific types: - bytes (get encoded as a Base85 string); - datetime.datetime (get encoded as an ISO8601 string). Non-standard types get encoded as a a dictionary with two keys: - swhtype with value 'bytes' or 'datetime'; - d containing the encoded value. SWHJSONEncoder also encodes arbitrary iterables as a list (allowing serialization of generators). Caveats: Limitations in the JSONEncoder extension mechanism prevent us from "escaping" dictionaries that only contain the swhtype and d keys, and therefore arbitrary data structures can't be round-tripped through SWHJSONEncoder and SWHJSONDecoder. """ def default(self, o): if isinstance(o, bytes): return { 'swhtype': 'bytes', 'd': base64.b85encode(o).decode('ascii'), } elif isinstance(o, datetime.datetime): return { 'swhtype': 'datetime', 'd': o.isoformat(), } elif isinstance(o, UUID): return { 'swhtype': 'uuid', 'd': str(o), } try: return super().default(o) except TypeError as e: try: iterable = iter(o) except TypeError: raise e from None else: return list(iterable) class SWHJSONDecoder(JSONDecoder): """JSON decoder for data structures encoded with SWHJSONEncoder. This JSON decoder extends the default Python JSON decoder, allowing the decoding of: - bytes (encoded as a Base85 string); - datetime.datetime (encoded as an ISO8601 string). Non-standard types must be encoded as a a dictionary with exactly two keys: - swhtype with value 'bytes' or 'datetime'; - d containing the encoded value. To limit the impact our encoding, if the swhtype key doesn't contain a known value, the dictionary is decoded as-is. """ def decode_data(self, o): if isinstance(o, dict): if set(o.keys()) == {'d', 'swhtype'}: datatype = o['swhtype'] if datatype == 'bytes': return base64.b85decode(o['d']) elif datatype == 'datetime': return dateutil.parser.parse(o['d']) elif datatype == 'uuid': return UUID(o['d']) return {key: self.decode_data(value) for key, value in o.items()} if isinstance(o, list): return [self.decode_data(value) for value in o] else: return o def raw_decode(self, s, idx=0): data, index = super().raw_decode(s, idx) return self.decode_data(data), index def msgpack_dumps(data): """Write data as a msgpack stream""" def encode_types(obj): if isinstance(obj, datetime.datetime): return {b'__datetime__': True, b's': obj.isoformat()} if isinstance(obj, types.GeneratorType): return list(obj) if isinstance(obj, UUID): return {b'__uuid__': True, b's': str(obj)} return obj return msgpack.packb(data, use_bin_type=True, default=encode_types) def msgpack_loads(data): """Read data as a msgpack stream""" def decode_types(obj): if b'__datetime__' in obj and obj[b'__datetime__']: return dateutil.parser.parse(obj[b's']) if b'__uuid__' in obj and obj[b'__uuid__']: return UUID(obj[b's']) return obj return msgpack.unpackb(data, encoding='utf-8', object_hook=decode_types) diff --git a/swh/core/tests/db_testing.py b/swh/core/tests/db_testing.py index eb7ca9f..7895d00 100644 --- a/swh/core/tests/db_testing.py +++ b/swh/core/tests/db_testing.py @@ -1,257 +1,255 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import psycopg2 import subprocess def pg_restore(dbname, dumpfile, dumptype='pg_dump'): """ Args: dbname: name of the DB to restore into dumpfile: path fo the dump file dumptype: one of 'pg_dump' (for binary dumps), 'psql' (for SQL dumps) """ assert dumptype in ['pg_dump', 'psql'] if dumptype == 'pg_dump': subprocess.check_call(['pg_restore', '--no-owner', '--no-privileges', '--dbname', dbname, dumpfile]) elif dumptype == 'psql': subprocess.check_call(['psql', '--quiet', '-f', dumpfile, dbname]) def pg_dump(dbname, dumpfile): subprocess.check_call(['pg_dump', '--no-owner', '--no-privileges', '-Fc', '-f', dumpfile, dbname]) def pg_dropdb(dbname): subprocess.check_call(['dropdb', dbname]) def pg_createdb(dbname): subprocess.check_call(['createdb', dbname]) def db_create(dbname, dump=None, dumptype='pg_dump'): """create the test DB and load the test data dump into it context: setUpClass """ try: pg_createdb(dbname) except subprocess.CalledProcessError: # try recovering once, in case pg_dropdb(dbname) # the db already existed pg_createdb(dbname) if dump: pg_restore(dbname, dump, dumptype) return dbname def db_destroy(dbname): """destroy the test DB context: tearDownClass """ pg_dropdb(dbname) def db_connect(dbname): """connect to the test DB and open a cursor context: setUp """ conn = psycopg2.connect('dbname=' + dbname) return { 'conn': conn, 'cursor': conn.cursor() } def db_close(conn): """rollback current transaction and disconnet from the test DB context: tearDown """ if not conn.closed: conn.rollback() conn.close() class DbTestFixture(): """Mix this in a test subject class to get DB testing support. The class can override the following class attributes: TEST_DB_NAME: name of the DB used for testing TEST_DB_DUMP: DB dump to be restored before running test methods; can be set to None if no restore from dump is required TEST_DB_DUMP_TYPE: one of 'pg_dump' (binary dump) or 'psql' (SQL dump) The test case class will then have the following attributes, accessible via self: dbname: name of the test database conn: psycopg2 connection object cursor: open psycopg2 cursor to the DB To ensure test isolation, each test method of the test case class will execute in its own connection, cursor, and transaction. To ensure setup/teardown methods are called, in case of multiple inheritance DbTestFixture should be the first class in the inheritance hierarchy. Note that if you want to define setup/teardown methods, you need to explicitly call super() to ensure that the fixture setup/teardown methods are invoked. Here is an example where all setup/teardown methods are defined in a test case: class TestDb(DbTestFixture, unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() # your class setup code here def setUp(self): super().setUp() # your instance setup code here def tearDown(self): # your instance teardown code here super().tearDown() @classmethod def tearDownClass(cls): # your class teardown code here super().tearDownClass() """ TEST_DB_NAME = 'softwareheritage-test' TEST_DB_DUMP = None TEST_DB_DUMP_TYPE = 'pg_dump' @classmethod def setUpClass(cls): cls.dbname = db_create(dbname=cls.TEST_DB_NAME, dump=cls.TEST_DB_DUMP, dumptype=cls.TEST_DB_DUMP_TYPE) super().setUpClass() def setUp(self): db_setup = db_connect(self.dbname) self.conn = db_setup['conn'] self.cursor = db_setup['cursor'] super().setUp() def tearDown(self): super().tearDown() db_close(self.conn) @classmethod def tearDownClass(cls): super().tearDownClass() db_destroy(cls.dbname) class DbsTestFixture(): """Mix this in a test subject class to get DB testing support with multiple databases. The class can override the following class attributes: TEST_DB_NAMES: names of the DB used for testing TEST_DB_DUMPS: DB dumps to be restored before running test methods; can be set to [] if no restore from dump is required TEST_DB_DUMP_TYPES: List of one of 'pg_dump' (binary dump) or 'psql' (SQL dump) The test case class will then have the following attributes, accessible via self: dbnames: name of the test database conns: psycopg2 connection object cursors: open psycopg2 cursor to the DB To ensure test isolation, each test method of the test case class will execute in its own connection, cursor, and transaction. To ensure setup/teardown methods are called, in case of multiple inheritance DbTestFixture should be the first class in the inheritance hierarchy. Note that if you want to define setup/teardown methods, you need to explicitly call super() to ensure that the fixture setup/teardown methods are invoked. Here is an example where all setup/teardown methods are defined in a test case: class TestDb(DbTestFixture, unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() # your class setup code here def setUp(self): super().setUp() # your instance setup code here def tearDown(self): # your instance teardown code here super().tearDown() @classmethod def tearDownClass(cls): # your class teardown code here super().tearDownClass() """ TEST_DB_NAMES = ['softwareheritage-test'] TEST_DB_DUMPS = [] TEST_DB_DUMP_TYPES = ['pg_dump'] @classmethod def setUpClass(cls): dbnames = [] for i, dbname in enumerate(cls.TEST_DB_NAMES): try: dbname = db_create(dbname, dump=cls.TEST_DB_DUMPS[i], dumptype=cls.TEST_DB_DUMP_TYPES[i]) finally: - print(dbname) dbnames.append(dbname) - print(dbnames) cls.dbnames = dbnames super().setUpClass() def setUp(self): conns = [] cursors = [] for i, dbname in enumerate(self.dbnames): db_setup = db_connect(dbname) conns.append(db_setup['conn']) cursors.append(db_setup['cursor']) self.conns = conns self.cursors = cursors super().setUp() def tearDown(self): super().tearDown() for conn in self.conns: db_close(conn) @classmethod def tearDownClass(cls): super().tearDownClass() for dbname in cls.dbnames: db_destroy(dbname) diff --git a/swh/core/utils.py b/swh/core/utils.py index 235d704..a4921d0 100644 --- a/swh/core/utils.py +++ b/swh/core/utils.py @@ -1,95 +1,96 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import itertools import codecs from contextlib import contextmanager @contextmanager def cwd(path): """Contextually change the working directory to do thy bidding. Then gets back to the original location. """ prev_cwd = os.getcwd() os.chdir(path) try: yield finally: os.chdir(prev_cwd) def grouper(iterable, n): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n for _data in itertools.zip_longest(*args, fillvalue=None): yield (d for d in _data if d is not None) def backslashescape_errors(exception): if isinstance(exception, UnicodeDecodeError): bad_data = exception.object[exception.start:exception.end] escaped = ''.join(r'\x%02x' % x for x in bad_data) return escaped, exception.end return codecs.backslashreplace_errors(exception) + codecs.register_error('backslashescape', backslashescape_errors) def encode_with_unescape(value): """Encode an unicode string containing \\x backslash escapes""" slices = [] start = 0 odd_backslashes = False i = 0 while i < len(value): if value[i] == '\\': odd_backslashes = not odd_backslashes else: if odd_backslashes: if value[i] != 'x': raise ValueError('invalid escape for %r at position %d' % (value, i-1)) slices.append( value[start:i-1].replace('\\\\', '\\').encode('utf-8') ) slices.append(bytes.fromhex(value[i+1:i+3])) odd_backslashes = False start = i = i + 3 continue i += 1 slices.append( value[start:i].replace('\\\\', '\\').encode('utf-8') ) return b''.join(slices) def decode_with_escape(value): """Decode a bytestring as utf-8, escaping the bytes of invalid utf-8 sequences as \\x. We also escape NUL bytes as they are invalid in JSON strings. """ # escape backslashes value = value.replace(b'\\', b'\\\\') value = value.replace(b'\x00', b'\\x00') return value.decode('utf-8', 'backslashescape') diff --git a/version.txt b/version.txt index 0937988..ce8d225 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.27-0-g24a51d5 \ No newline at end of file +v0.0.28-0-ge3f6ea9 \ No newline at end of file