diff --git a/requirements.txt b/requirements.txt
index c339f52..38490a9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,12 @@
 # Add here external Python modules dependencies, one per line. Module names
 # should match https://pypi.python.org/pypi names. For the full spec of
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 vcversioner

 # remote storage API server
-flask
-
+aiohttp
 click

 # optional dependencies
 # apache-libcloud
 # azure-storage
diff --git a/swh/objstorage/api/client.py b/swh/objstorage/api/client.py
index b359931..f6d3887 100644
--- a/swh/objstorage/api/client.py
+++ b/swh/objstorage/api/client.py
@@ -1,48 +1,64 @@
 # Copyright (C) 2015-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from swh.core.api import SWHRemoteAPI
+from swh.model import hashutil

-from ..objstorage import ObjStorage
+from ..objstorage import ObjStorage, DEFAULT_CHUNK_SIZE
 from ..exc import ObjStorageAPIError


 class RemoteObjStorage(ObjStorage, SWHRemoteAPI):
     """Proxy to a remote object storage.

     This class allows connecting to an object storage server over the HTTP
     protocol.

     Attributes:
         url (string): The url of the server to connect to. Must end with a '/'
         session: The session used to send requests.

     """
     def __init__(self, url):
         super().__init__(api_exception=ObjStorageAPIError, url=url)

     def check_config(self, *, check_write):
         return self.post('check_config', {'check_write': check_write})

     def __contains__(self, obj_id):
         return self.post('content/contains', {'obj_id': obj_id})

     def add(self, content, obj_id=None, check_presence=True):
         return self.post('content/add', {'content': content, 'obj_id': obj_id,
                                          'check_presence': check_presence})

     def get(self, obj_id):
         return self.post('content/get', {'obj_id': obj_id})

     def get_batch(self, obj_ids):
         return self.post('content/get/batch', {'obj_ids': obj_ids})

     def check(self, obj_id):
-        self.post('content/check', {'obj_id': obj_id})
+        return self.post('content/check', {'obj_id': obj_id})
+
+    # Management methods

     def get_random(self, batch_size):
         return self.post('content/get/random', {'batch_size': batch_size})
+
+    # Streaming methods
+
+    def add_stream(self, content_iter, obj_id, check_presence=True):
+        obj_id = hashutil.hash_to_hex(obj_id)
+        return self.post_stream('content/add_stream/{}'.format(obj_id),
+                                params={'check_presence': check_presence},
+                                data=content_iter)
+
+    def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
+        obj_id = hashutil.hash_to_hex(obj_id)
+        return super().get_stream('content/get_stream/{}'.format(obj_id),
+                                  chunk_size=chunk_size)
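Note: the following usage sketch is not part of the patch; it only shows how
the new streaming client API is meant to be driven. It assumes an objstorage
API server listening on an example URL, and made-up content:

    from swh.objstorage.api.client import RemoteObjStorage
    from swh.objstorage.objstorage import compute_hash

    storage = RemoteObjStorage('http://localhost:5003/')  # example URL

    chunks = [b'first chunk', b'second chunk']
    obj_id = compute_hash(b''.join(chunks))

    # Unlike add(), add_stream() needs the object id up front: the server
    # trusts the given id instead of recomputing the hash from the stream.
    storage.add_stream(iter(chunks), obj_id=obj_id)

    # get_stream() yields the content back in chunk_size pieces.
    data = b''.join(storage.get_stream(obj_id, chunk_size=1024))
    assert data == b''.join(chunks)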
diff --git a/swh/objstorage/api/server.py b/swh/objstorage/api/server.py
index 9a41780..cfdd9fe 100644
--- a/swh/objstorage/api/server.py
+++ b/swh/objstorage/api/server.py
@@ -1,113 +1,138 @@
 # Copyright (C) 2015-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+import asyncio
+import aiohttp.web
 import click
-import logging
-
-from flask import g, request

 from swh.core import config
-from swh.core.api import (SWHServerAPIApp, decode_request,
-                          error_handler,
-                          encode_data_server as encode_data)
+from swh.core.api_async import (SWHRemoteAPI, decode_request,
+                                encode_data_server as encode_data)
+from swh.model import hashutil

 from swh.objstorage import get_objstorage


 DEFAULT_CONFIG = {
     'cls': ('str', 'pathslicing'),
     'args': ('dict', {
         'root': '/srv/softwareheritage/objects',
         'slicing': '0:2/2:4/4:6',
     })
 }


-app = SWHServerAPIApp(__name__)
-
-
-@app.errorhandler(Exception)
-def my_error_handler(exception):
-    return error_handler(exception, encode_data)
+@asyncio.coroutine
+def index(request):
+    return aiohttp.web.Response(body="SWH Objstorage API server")


-@app.before_request
-def before_request():
-    g.objstorage = get_objstorage(app.config['cls'], app.config['args'])
+@asyncio.coroutine
+def check_config(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].check_config(**req))


-@app.route('/')
-def index():
-    return "SWH Objstorage API server"
+@asyncio.coroutine
+def contains(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].__contains__(**req))


-@app.route('/check_config', methods=['POST'])
-def check_config():
-    return encode_data(g.objstorage.check_config(**decode_request(request)))
+@asyncio.coroutine
+def add_bytes(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].add(**req))


-@app.route('/content')
-def content():
-    return str(list(g.storage))
+@asyncio.coroutine
+def get_bytes(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].get(**req))


-@app.route('/content/contains', methods=['POST'])
-def contains():
-    return encode_data(g.objstorage.__contains__(**decode_request(request)))
+@asyncio.coroutine
+def get_batch(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].get_batch(**req))


-@app.route('/content/add', methods=['POST'])
-def add_bytes():
-    return encode_data(g.objstorage.add(**decode_request(request)))
+@asyncio.coroutine
+def check(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].check(**req))


-@app.route('/content/get', methods=['POST'])
-def get_bytes():
-    return encode_data(g.objstorage.get(**decode_request(request)))
+# Management methods

-
-@app.route('/content/get/batch', methods=['POST'])
-def get_batch():
-    return encode_data(g.objstorage.get_batch(**decode_request(request)))
+@asyncio.coroutine
+def get_random_contents(request):
+    req = yield from decode_request(request)
+    return encode_data(request.app['objstorage'].get_random(**req))


-@app.route('/content/get/random', methods=['POST'])
-def get_random_contents():
-    return encode_data(
-        g.objstorage.get_random(**decode_request(request))
-    )
+# Streaming methods

+@asyncio.coroutine
+def add_stream(request):
+    hex_id = request.match_info['hex_id']
+    obj_id = hashutil.hash_to_bytes(hex_id)
+    check_pres = (request.query.get('check_presence', '').lower() == 'true')
+    objstorage = request.app['objstorage']

-@app.route('/content/check', methods=['POST'])
-def check():
-    return encode_data(g.objstorage.check(**decode_request(request)))
+    if check_pres and obj_id in objstorage:
+        return encode_data(obj_id)

+    with objstorage.chunk_writer(obj_id) as write:
+        # XXX (3.5): use 'async for chunk in request.content.iter_any()'
+        while not request.content.at_eof():
+            chunk = yield from request.content.readany()
+            write(chunk)

-def run_from_webserver(environ, start_response):
-    """Run the WSGI app from the webserver, loading the configuration.
+    return encode_data(obj_id)

-    """
-    config_path = '/etc/softwareheritage/storage/objstorage.yml'
-    app.config.update(config.read(config_path, DEFAULT_CONFIG))

+@asyncio.coroutine
+def get_stream(request):
+    hex_id = request.match_info['hex_id']
+    obj_id = hashutil.hash_to_bytes(hex_id)
+    response = aiohttp.web.StreamResponse()
+    yield from response.prepare(request)
+    for chunk in request.app['objstorage'].get_stream(obj_id, 2 << 20):
+        response.write(chunk)
+        yield from response.drain()
+    return response

-    handler = logging.StreamHandler()
-    app.logger.addHandler(handler)
-    return app(environ, start_response)

+def make_app(config, **kwargs):
+    app = SWHRemoteAPI(**kwargs)
+    app.router.add_route('GET', '/', index)
+    app.router.add_route('POST', '/check_config', check_config)
+    app.router.add_route('POST', '/content/contains', contains)
+    app.router.add_route('POST', '/content/add', add_bytes)
+    app.router.add_route('POST', '/content/get', get_bytes)
+    app.router.add_route('POST', '/content/get/batch', get_batch)
+    app.router.add_route('POST', '/content/get/random', get_random_contents)
+    app.router.add_route('POST', '/content/check', check)
+    app.router.add_route('POST', '/content/add_stream/{hex_id}', add_stream)
+    app.router.add_route('GET', '/content/get_stream/{hex_id}', get_stream)
+    app.update(config)
+    app['objstorage'] = get_objstorage(app['cls'], app['args'])
+    return app


 @click.command()
 @click.argument('config-path', required=1)
 @click.option('--host', default='0.0.0.0', help="Host to run the server")
 @click.option('--port', default=5003, type=click.INT,
               help="Binding port of the server")
 @click.option('--debug/--nodebug', default=True,
               help="Indicates if the server should run in debug mode")
 def launch(config_path, host, port, debug):
-    app.config.update(config.read(config_path, DEFAULT_CONFIG))
-    app.run(host, port=int(port), debug=bool(debug))
+    app = make_app(config.read(config_path, DEFAULT_CONFIG),
+                   debug=bool(debug))
+    aiohttp.web.run_app(app, host=host, port=int(port))


 if __name__ == '__main__':
     launch()
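Note: besides the `launch` click entry point, make_app() can be used to embed
the server programmatically. A minimal sketch with an example root path (the
directory has to exist, since PathSlicingObjStorage's check_config() validates
it at construction time):

    import os

    import aiohttp.web

    from swh.objstorage.api.server import make_app

    root = '/tmp/objstorage-demo'  # example path
    os.makedirs(root, exist_ok=True)

    app = make_app({'cls': 'pathslicing',
                    'args': {'root': root, 'slicing': '0:2/2:4/4:6'}},
                   debug=True)
    aiohttp.web.run_app(app, host='127.0.0.1', port=5003)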
diff --git a/swh/objstorage/objstorage.py b/swh/objstorage/objstorage.py
index 3619026..4663c24 100644
--- a/swh/objstorage/objstorage.py
+++ b/swh/objstorage/objstorage.py
@@ -1,181 +1,243 @@
 # Copyright (C) 2015-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import abc

 from swh.model import hashutil

 from .exc import ObjNotFoundError


 ID_HASH_ALGO = 'sha1'
 ID_HASH_LENGTH = 40  # Size in bytes of the hash hexadecimal representation.
+DEFAULT_CHUNK_SIZE = 2 * 1024 * 1024  # Size in bytes of the streaming chunks


 def compute_hash(content):
     return hashutil.hash_data(
         content,
         algorithms=[ID_HASH_ALGO]
     ).get(ID_HASH_ALGO)


 class ObjStorage(metaclass=abc.ABCMeta):
     """ High-level API to manipulate the Software Heritage object storage.

     Conceptually, the object storage offers the following methods:

     - check_config()  check if the object storage is properly configured
     - __contains__()  check if an object is present, by object id
     - add()           add a new object, returning an object id
     - restore()       same as add() but erase any already existing content
     - get()           retrieve the content of an object, by object id
     - check()         check the integrity of an object, by object id

     And some management methods:

     - get_random()    get random object ids of existing contents (used for
                       the content integrity checker).

+    Some of the methods have streaming equivalents:
+
+    - add_stream()     same as add() but with a chunked iterator
+    - restore_stream() same as add_stream() but erase already existing content
+    - get_stream()     same as get() but returns a chunked iterator
+
     Each implementation of this interface can have a different behavior and
     its own way to store the contents.
     """

     @abc.abstractmethod
     def check_config(self, *, check_write):
         """Check whether the object storage is properly configured.

         Args:
             check_write (bool): if True, check if writes to the object
                 storage can succeed.

         Returns:
             True if the configuration check worked, an exception if it
             didn't.
         """
         pass

     @abc.abstractmethod
     def __contains__(self, obj_id, *args, **kwargs):
         """Indicate if the given object is present in the storage.

         Args:
             obj_id (bytes): object identifier.

         Returns:
             True iff the object is present in the current object storage.
         """
         pass

     @abc.abstractmethod
     def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
         """Add a new object to the object storage.

         Args:
             content (bytes): object's raw content to add in storage.
             obj_id (bytes): checksum of [bytes] using [ID_HASH_ALGO]
                 algorithm. When given, obj_id will be trusted to match the
                 bytes. If missing, obj_id will be computed on the fly.
             check_presence (bool): indicate if the presence of the
                 content should be verified before adding the file.

         Returns:
             the id (bytes) of the object into the storage.
         """
         pass

     def restore(self, content, obj_id=None, *args, **kwargs):
         """Restore a content that has been corrupted.

-        This function is identical to add_bytes but does not check if
+        This function is identical to add but does not check if
         the object id is already in the file system.
         The default implementation provided by the current class is
         suitable for most cases.

         Args:
             content (bytes): object's raw content to add in storage
             obj_id (bytes): checksum of `bytes` as computed by
                 ID_HASH_ALGO. When given, obj_id will be trusted to match
                 bytes. If missing, obj_id will be computed on the fly.

         """
         # check_presence set to false will erase any previous content.
         return self.add(content, obj_id, check_presence=False)

     @abc.abstractmethod
     def get(self, obj_id, *args, **kwargs):
         """Retrieve the content of a given object.

         Args:
             obj_id (bytes): object id.

         Returns:
             the content of the requested object as bytes.

         Raises:
             ObjNotFoundError: if the requested object is missing.
         """
         pass

     def get_batch(self, obj_ids, *args, **kwargs):
         """Retrieve objects' raw content in bulk from storage.

         Note: This function does have a default implementation in
         ObjStorage that is suitable for most cases.

         For object storages that need to do the minimal number of requests
         possible (ex: remote object storages), that method can be
         overridden to perform a more efficient operation.

         Args:
             obj_ids ([bytes]): list of object ids.

         Returns:
             list of resulting contents, or None if the content could not
             be retrieved. Do not raise any exception as a fail for one
             content will not cancel the whole request.
         """
         for obj_id in obj_ids:
             try:
                 yield self.get(obj_id)
             except ObjNotFoundError:
                 yield None

     @abc.abstractmethod
     def check(self, obj_id, *args, **kwargs):
         """Perform an integrity check for a given object.

         Verify that the file object is in place and that the gzipped
         content matches the object id.

         Args:
             obj_id (bytes): object identifier.

         Raises:
             ObjNotFoundError: if the requested object is missing.
             Error: if the requested object is corrupted.
         """
         pass

+    # Management methods
+
     def get_random(self, batch_size, *args, **kwargs):
         """Get random ids of existing contents.

         This method is used in order to get random ids to perform
         content integrity verifications on random contents.

         Args:
             batch_size (int): Number of ids that will be given

         Yields:
             An iterable of ids (bytes) of contents that are in the
             current object storage.
         """
         pass
+
+    # Streaming methods
+
+    def add_stream(self, content_iter, obj_id, check_presence=True):
+        """Add a new object to the object storage using streaming.
+
+        This function is identical to add() except it takes a generator
+        that yields the chunked content instead of the whole content at
+        once.
+
+        Args:
+            content_iter (bytes generator): chunked generator that yields
+                the object's raw content to add in storage.
+            obj_id (bytes): object identifier
+            check_presence (bool): indicate if the presence of the
+                content should be verified before adding the file.
+
+        Returns:
+            the id (bytes) of the object into the storage.
+
+        """
+        raise NotImplementedError
+
+    def restore_stream(self, content_iter, obj_id=None):
+        """Restore a content that has been corrupted using streaming.
+
+        This function is identical to restore() except it takes a generator
+        that yields the chunked content instead of the whole content at
+        once. The default implementation provided by the current class is
+        suitable for most cases.
+
+        Args:
+            content_iter (bytes generator): chunked generator that yields
+                the object's raw content to add in storage.
+            obj_id (bytes): object identifier
+
+        """
+        # check_presence set to false will erase any previous content.
+        return self.add_stream(content_iter, obj_id, check_presence=False)
+
+    def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
+        """Retrieve the content of a given object as a chunked iterator.
+
+        Args:
+            obj_id (bytes): object id.
+            chunk_size (int): maximum size of each returned chunk.
+
+        Returns:
+            an iterator over the content of the requested object, as
+            chunks of bytes.
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing.
+
+        """
+        raise NotImplementedError
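Note: to make the streaming contract above concrete, here is a toy,
dict-backed implementation. It is not part of the patch: add_stream()
accumulates the chunks, get_stream() re-slices the stored bytes.

    from swh.objstorage.objstorage import (ObjStorage, compute_hash,
                                           DEFAULT_CHUNK_SIZE)
    from swh.objstorage.exc import ObjNotFoundError

    class InMemoryObjStorage(ObjStorage):
        """Toy backend: contents live in a dict keyed by object id."""

        def __init__(self):
            self.state = {}

        def check_config(self, *, check_write):
            return True

        def __contains__(self, obj_id):
            return obj_id in self.state

        def add(self, content, obj_id=None, check_presence=True):
            if obj_id is None:
                obj_id = compute_hash(content)
            if check_presence and obj_id in self:
                return obj_id
            self.state[obj_id] = content
            return obj_id

        def get(self, obj_id):
            if obj_id not in self.state:
                raise ObjNotFoundError(obj_id)
            return self.state[obj_id]

        def check(self, obj_id):
            if obj_id not in self.state:
                raise ObjNotFoundError(obj_id)

        # The streaming pair: accumulate on the way in, re-slice on the
        # way out.
        def add_stream(self, content_iter, obj_id, check_presence=True):
            if check_presence and obj_id in self:
                return obj_id
            self.state[obj_id] = b''.join(content_iter)
            return obj_id

        def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE):
            content = self.get(obj_id)
            for i in range(0, len(content), chunk_size):
                yield content[i:i + chunk_size]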
""" pass + # Management methods + def get_random(self, batch_size, *args, **kwargs): """Get random ids of existing contents. This method is used in order to get random ids to perform content integrity verifications on random contents. Args: batch_size (int): Number of ids that will be given Yields: An iterable of ids (bytes) of contents that are in the current object storage. """ pass + + # Streaming methods + + def add_stream(self, content_iter, obj_id, check_presence=True): + """Add a new object to the object storage using streaming. + + This function is identical to add() except it takes a generator that + yields the chunked content instead of the whole content at once. + + Args: + content (bytes): chunked generator that yields the object's raw + content to add in storage. + obj_id (bytes): object identifier + check_presence (bool): indicate if the presence of the + content should be verified before adding the file. + + Returns: + the id (bytes) of the object into the storage. + + """ + raise NotImplementedError + + def restore_stream(self, content_iter, obj_id=None): + """Restore a content that have been corrupted using streaming. + + This function is identical to restore() except it takes a generator + that yields the chunked content instead of the whole content at once. + The default implementation provided by the current class is + suitable for most cases. + + Args: + content (bytes): chunked generator that yields the object's raw + content to add in storage. + obj_id (bytes): object identifier + + """ + # check_presence to false will erase the potential previous content. + return self.add_stream(content_iter, obj_id, check_presence=False) + + def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE): + """Retrieve the content of a given object as a chunked iterator. + + Args: + obj_id (bytes): object id. + + Returns: + the content of the requested object as bytes. + + Raises: + ObjNotFoundError: if the requested object is missing. + + """ + raise NotImplementedError diff --git a/swh/objstorage/objstorage_pathslicing.py b/swh/objstorage/objstorage_pathslicing.py index 897a5f7..6980374 100644 --- a/swh/objstorage/objstorage_pathslicing.py +++ b/swh/objstorage/objstorage_pathslicing.py @@ -1,296 +1,327 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import functools import os import gzip import tempfile import random from contextlib import contextmanager from swh.model import hashutil -from .objstorage import ObjStorage, compute_hash, ID_HASH_ALGO, ID_HASH_LENGTH +from .objstorage import (ObjStorage, compute_hash, ID_HASH_ALGO, + ID_HASH_LENGTH, DEFAULT_CHUNK_SIZE) from .exc import ObjNotFoundError, Error GZIP_BUFSIZ = 1048576 DIR_MODE = 0o755 FILE_MODE = 0o644 @contextmanager def _write_obj_file(hex_obj_id, objstorage): """ Context manager for writing object files to the object storage. During writing, data are written to a temporary file, which is atomically renamed to the right file name after closing. This context manager also takes care of (gzip) compressing the data on the fly. Usage sample: with _write_obj_file(hex_obj_id, objstorage): f.write(obj_data) Yields: a file-like object open for writing bytes. """ # Get the final paths and create the directory if absent. 
     dir = objstorage._obj_dir(hex_obj_id)
     if not os.path.isdir(dir):
         os.makedirs(dir, DIR_MODE, exist_ok=True)
     path = os.path.join(dir, hex_obj_id)

     # Create a temporary file.
     (tmp, tmp_path) = tempfile.mkstemp(suffix='.tmp', prefix='hex_obj_id.',
                                        dir=dir)

     # Open the file and yield it for writing.
     tmp_f = os.fdopen(tmp, 'wb')
     with gzip.GzipFile(filename=tmp_path, fileobj=tmp_f) as f:
         yield f

     # Then close the temporary file and move it to the right directory.
     tmp_f.close()
     os.chmod(tmp_path, FILE_MODE)
     os.rename(tmp_path, path)


 @contextmanager
 def _read_obj_file(hex_obj_id, objstorage):
     """ Context manager for reading object files from the object storage.

     Usage sample:
         with _read_obj_file(hex_obj_id, objstorage) as f:
             b = f.read()

     Yields:
         a file-like object open for reading bytes.
     """
     path = objstorage._obj_path(hex_obj_id)
     with gzip.GzipFile(path, 'rb') as f:
         yield f


 class PathSlicingObjStorage(ObjStorage):
     """Implementation of the ObjStorage API based on the hash of the content.

     On disk, an object storage is a directory tree containing files
     named after their object IDs. An object ID is a checksum of its
     content, depending on the value of the ID_HASH_ALGO constant (see
     swh.model.hashutil for its meaning).

     To avoid directories that contain too many files, the object storage
     has a given slicing. Each slice corresponds to a directory that is
     named according to the matching part of the hash of its content.

     So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689
     will be stored in the following paths, depending on the slicing:

     - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
     - 0:1/0:5/    : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689

     The files in the storage are stored in a gzip-compressed format.

     Attributes:
         root (string): path to the root directory of the storage on disk.
         bounds: list of slices that indicate the beginning and the end of
             each subdirectory name for a content.
     """

     def __init__(self, root, slicing):
         """ Create an object to access a hash-slicing based object storage.

         Args:
             root (string): path to the root directory of the storage on
                 the disk.
             slicing (string): string that indicates the slicing to perform
                 on the hash of the content to know the path where it
                 should be stored.
         """
         self.root = root
         # Make a list of tuples where each tuple contains the beginning
         # and the end of each slicing.
         self.bounds = [
             slice(*map(int, sbounds.split(':')))
             for sbounds in slicing.split('/')
             if sbounds
         ]

         self.check_config(check_write=False)

     def check_config(self, *, check_write):
         """Check whether this object storage is properly configured"""
         root = self.root

         if not os.path.isdir(root):
             raise ValueError(
                 'PathSlicingObjStorage root "%s" is not a directory' % root
             )

         max_endchar = max(map(lambda bound: bound.stop, self.bounds))
         if ID_HASH_LENGTH < max_endchar:
             raise ValueError(
                 'Algorithm %s has too short hash for slicing to char %d'
                 % (ID_HASH_ALGO, max_endchar)
             )

         if check_write:
             if not os.access(self.root, os.W_OK):
                 raise PermissionError(
                     'PathSlicingObjStorage root "%s" is not writable' % root
                 )

         return True

     def __contains__(self, obj_id):
         hex_obj_id = hashutil.hash_to_hex(obj_id)
         return os.path.exists(self._obj_path(hex_obj_id))

     def __iter__(self):
         """Iterate over the object identifiers currently available in the
         storage.

         Warning: with the current implementation of the object storage,
         this method will walk the filesystem to list objects, meaning that
         listing all objects will be very slow for large storages. You
         almost certainly don't want to use this method in production.
         Return:
             Iterator over object IDs
         """
         def obj_iterator():
             # XXX hackish: it does not verify that the depth of found files
             # matches the slicing depth of the storage
             for root, _dirs, files in os.walk(self.root):
                 for f in files:
                     yield bytes.fromhex(f)

         return obj_iterator()

     def __len__(self):
         """Compute the number of objects available in the storage.

         Warning: this currently uses `__iter__`, so its warning about bad
         performance applies.

         Return:
             number of objects contained in the storage
         """
         return sum(1 for i in self)

     def _obj_dir(self, hex_obj_id):
         """ Compute the storage directory of an object.

         See also: PathSlicingObjStorage::_obj_path

         Args:
             hex_obj_id: object id as hexlified string.

         Returns:
             Path to the directory that contains the required object.
         """
         slices = [hex_obj_id[bound] for bound in self.bounds]
         return os.path.join(self.root, *slices)

     def _obj_path(self, hex_obj_id):
         """ Compute the full path to an object in the current storage.

         See also: PathSlicingObjStorage::_obj_dir

         Args:
             hex_obj_id: object id as hexlified string.

         Returns:
             Path to the actual object corresponding to the given id.
         """
         return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id)

     def add(self, content, obj_id=None, check_presence=True):
         if obj_id is None:
             obj_id = compute_hash(content)

         if check_presence and obj_id in self:
             # If the object is already present, return immediately.
             return obj_id

         hex_obj_id = hashutil.hash_to_hex(obj_id)
         with _write_obj_file(hex_obj_id, self) as f:
             f.write(content)

         return obj_id

     def get(self, obj_id):
         if obj_id not in self:
             raise ObjNotFoundError(obj_id)

         # Open the file and return its content as bytes
         hex_obj_id = hashutil.hash_to_hex(obj_id)
         with _read_obj_file(hex_obj_id, self) as f:
             return f.read()

     def check(self, obj_id):
         if obj_id not in self:
             raise ObjNotFoundError(obj_id)

         hex_obj_id = hashutil.hash_to_hex(obj_id)

         try:
             with gzip.open(self._obj_path(hex_obj_id)) as f:
                 length = None
                 if ID_HASH_ALGO.endswith('_git'):
                     # if the hashing algorithm is git-like, we need to know
                     # the content size to hash on the fly. Do a first pass
                     # here to compute the size.
                     length = 0
                     while True:
                         chunk = f.read(GZIP_BUFSIZ)
                         length += len(chunk)
                         if not chunk:
                             break
                     f.rewind()

                 checksums = hashutil.hash_file(
                     f, length, algorithms=[ID_HASH_ALGO])
                 actual_obj_id = checksums[ID_HASH_ALGO]
                 if obj_id != actual_obj_id:
                     raise Error(
                         'Corrupt object %s should have id %s'
                         % (hashutil.hash_to_hex(obj_id),
                            hashutil.hash_to_hex(actual_obj_id))
                     )
         except (OSError, IOError):
             # IOError is for compatibility with older python versions
             raise Error('Corrupt object %s is not a gzip file' % obj_id)

+    # Management methods
+
     def get_random(self, batch_size):
         def get_random_content(self, batch_size):
             """ Get a batch of content inside a single directory.

             Returns:
                 a tuple (batch size, batch).
""" dirs = [] for level in range(len(self.bounds)): path = os.path.join(self.root, *dirs) dir_list = next(os.walk(path))[1] if 'tmp' in dir_list: dir_list.remove('tmp') dirs.append(random.choice(dir_list)) path = os.path.join(self.root, *dirs) content_list = next(os.walk(path))[2] length = min(batch_size, len(content_list)) return length, map(hashutil.hash_to_bytes, random.sample(content_list, length)) while batch_size: length, it = get_random_content(self, batch_size) batch_size = batch_size - length yield from it + + # Streaming methods + + @contextmanager + def chunk_writer(self, obj_id): + hex_obj_id = hashutil.hash_to_hex(obj_id) + with _write_obj_file(hex_obj_id, self) as f: + yield f.write + + def add_stream(self, content_iter, obj_id, check_presence=True): + if check_presence and obj_id in self: + return obj_id + + with self.chunk_writer(obj_id) as writer: + for chunk in content_iter: + writer(chunk) + + return obj_id + + def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE): + if obj_id not in self: + raise ObjNotFoundError(obj_id) + + hex_obj_id = hashutil.hash_to_hex(obj_id) + with _read_obj_file(hex_obj_id, self) as f: + reader = functools.partial(f.read, chunk_size) + yield from iter(reader, b'') diff --git a/swh/objstorage/tests/objstorage_testing.py b/swh/objstorage/tests/objstorage_testing.py index c15665f..3d9e067 100644 --- a/swh/objstorage/tests/objstorage_testing.py +++ b/swh/objstorage/tests/objstorage_testing.py @@ -1,100 +1,137 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import time + from nose.tools import istest from swh.model import hashutil from swh.objstorage import exc class ObjStorageTestFixture(): def setUp(self): super().setUp() def hash_content(self, content): obj_id = hashutil.hash_data(content)['sha1'] return content, obj_id def assertContentMatch(self, obj_id, expected_content): # noqa content = self.storage.get(obj_id) self.assertEqual(content, expected_content) @istest def check_config(self): self.assertTrue(self.storage.check_config(check_write=False)) self.assertTrue(self.storage.check_config(check_write=True)) @istest def contains(self): content_p, obj_id_p = self.hash_content(b'contains_present') content_m, obj_id_m = self.hash_content(b'contains_missing') self.storage.add(content_p, obj_id=obj_id_p) self.assertIn(obj_id_p, self.storage) self.assertNotIn(obj_id_m, self.storage) @istest def add_get_w_id(self): content, obj_id = self.hash_content(b'add_get_w_id') r = self.storage.add(content, obj_id=obj_id) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def add_get_wo_id(self): content, obj_id = self.hash_content(b'add_get_wo_id') r = self.storage.add(content) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def add_get_batch(self): content1, obj_id1 = self.hash_content(b'add_get_batch_1') content2, obj_id2 = self.hash_content(b'add_get_batch_2') self.storage.add(content1, obj_id1) self.storage.add(content2, obj_id2) cr1, cr2 = self.storage.get_batch([obj_id1, obj_id2]) self.assertEqual(cr1, content1) self.assertEqual(cr2, content2) @istest def get_batch_unexisting_content(self): content, obj_id = self.hash_content(b'get_batch_unexisting_content') result = list(self.storage.get_batch([obj_id])) self.assertTrue(len(result) == 1) self.assertIsNone(result[0]) @istest def 
diff --git a/swh/objstorage/tests/objstorage_testing.py b/swh/objstorage/tests/objstorage_testing.py
index c15665f..3d9e067 100644
--- a/swh/objstorage/tests/objstorage_testing.py
+++ b/swh/objstorage/tests/objstorage_testing.py
@@ -1,100 +1,137 @@
 # Copyright (C) 2015-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+import time
+
 from nose.tools import istest

 from swh.model import hashutil
 from swh.objstorage import exc


 class ObjStorageTestFixture():

     def setUp(self):
         super().setUp()

     def hash_content(self, content):
         obj_id = hashutil.hash_data(content)['sha1']
         return content, obj_id

     def assertContentMatch(self, obj_id, expected_content):  # noqa
         content = self.storage.get(obj_id)
         self.assertEqual(content, expected_content)

     @istest
     def check_config(self):
         self.assertTrue(self.storage.check_config(check_write=False))
         self.assertTrue(self.storage.check_config(check_write=True))

     @istest
     def contains(self):
         content_p, obj_id_p = self.hash_content(b'contains_present')
         content_m, obj_id_m = self.hash_content(b'contains_missing')
         self.storage.add(content_p, obj_id=obj_id_p)
         self.assertIn(obj_id_p, self.storage)
         self.assertNotIn(obj_id_m, self.storage)

     @istest
     def add_get_w_id(self):
         content, obj_id = self.hash_content(b'add_get_w_id')
         r = self.storage.add(content, obj_id=obj_id)
         self.assertEqual(obj_id, r)
         self.assertContentMatch(obj_id, content)

     @istest
     def add_get_wo_id(self):
         content, obj_id = self.hash_content(b'add_get_wo_id')
         r = self.storage.add(content)
         self.assertEqual(obj_id, r)
         self.assertContentMatch(obj_id, content)

     @istest
     def add_get_batch(self):
         content1, obj_id1 = self.hash_content(b'add_get_batch_1')
         content2, obj_id2 = self.hash_content(b'add_get_batch_2')
         self.storage.add(content1, obj_id1)
         self.storage.add(content2, obj_id2)
         cr1, cr2 = self.storage.get_batch([obj_id1, obj_id2])
         self.assertEqual(cr1, content1)
         self.assertEqual(cr2, content2)

     @istest
     def get_batch_unexisting_content(self):
         content, obj_id = self.hash_content(b'get_batch_unexisting_content')
         result = list(self.storage.get_batch([obj_id]))
         self.assertTrue(len(result) == 1)
         self.assertIsNone(result[0])

     @istest
     def restore_content(self):
         valid_content, valid_obj_id = self.hash_content(b'restore_content')
         invalid_content = b'unexpected content'
         id_adding = self.storage.add(invalid_content, valid_obj_id)
         id_restore = self.storage.restore(valid_content)
         # Adding a false content, then restoring it to the right one, and
         # then performing a verification should result in a successful
         # check.
         self.assertEqual(id_adding, valid_obj_id)
         self.assertEqual(id_restore, valid_obj_id)
         self.assertContentMatch(valid_obj_id, valid_content)

     @istest
     def get_missing(self):
         content, obj_id = self.hash_content(b'get_missing')
         with self.assertRaises(exc.Error):
             self.storage.get(obj_id)

     @istest
     def check_missing(self):
         content, obj_id = self.hash_content(b'check_missing')
         with self.assertRaises(exc.Error):
             self.storage.check(obj_id)

     @istest
     def check_present(self):
         content, obj_id = self.hash_content(b'check_present')
         self.storage.add(content)
         try:
             self.storage.check(obj_id)
         except:
             self.fail('Integrity check failed')
+
+    @istest
+    def add_stream(self):
+        content = [b'chunk1', b'chunk2']
+        _, obj_id = self.hash_content(b''.join(content))
+        try:
+            self.storage.add_stream(iter(content), obj_id=obj_id)
+        except NotImplementedError:
+            return
+        self.assertContentMatch(obj_id, b''.join(content))
+
+    @istest
+    def add_stream_sleep(self):
+        def gen_content():
+            yield b'chunk1'
+            time.sleep(0.5)
+            yield b'chunk2'
+        _, obj_id = self.hash_content(b'placeholder_id')
+        try:
+            self.storage.add_stream(gen_content(), obj_id=obj_id)
+        except NotImplementedError:
+            return
+        self.assertContentMatch(obj_id, b'chunk1chunk2')
+
+    @istest
+    def get_stream(self):
+        content_l = [b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9']
+        content = b''.join(content_l)
+        _, obj_id = self.hash_content(content)
+        self.storage.add(content, obj_id=obj_id)
+        try:
+            r = list(self.storage.get_stream(obj_id, chunk_size=1))
+        except NotImplementedError:
+            return
+        self.assertEqual(r, content_l)
""" self.process = None # WSGI app configuration for key, value in self.config.items(): - self.app.config[key] = value + self.app[key] = value # Get an available port number sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.bind(('127.0.0.1', 0)) self.port = sock.getsockname()[1] sock.close() # Worker function for multiprocessing def worker(app, port): - return app.run(port=port, use_reloader=False) + return aiohttp.web.run_app(app, port=port) self.process = multiprocessing.Process( target=worker, args=(self.app, self.port) ) self.process.start() # Wait max 5 seconds for server to spawn i = 0 while i < 20: try: urlopen(self.url()) except Exception: i += 1 time.sleep(0.25) else: return def stop_server(self): """ Terminate the API server's process. """ if self.process: self.process.terminate() diff --git a/swh/objstorage/tests/test_objstorage_api.py b/swh/objstorage/tests/test_objstorage_api.py index 8aa8838..801284b 100644 --- a/swh/objstorage/tests/test_objstorage_api.py +++ b/swh/objstorage/tests/test_objstorage_api.py @@ -1,36 +1,36 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest from nose.plugins.attrib import attr from swh.objstorage import get_objstorage from swh.objstorage.tests.objstorage_testing import ObjStorageTestFixture from swh.objstorage.tests.server_testing import ServerTestFixture -from swh.objstorage.api.server import app +from swh.objstorage.api.server import make_app @attr('db') class TestRemoteObjStorage(ServerTestFixture, ObjStorageTestFixture, unittest.TestCase): """ Test the remote archive API. """ def setUp(self): self.config = { 'cls': 'pathslicing', 'args': { 'root': tempfile.mkdtemp(), 'slicing': '0:1/0:5', } } - self.app = app + self.app = make_app(self.config) super().setUp() self.storage = get_objstorage('remote', { 'url': self.url() }) diff --git a/swh/objstorage/tests/test_objstorage_instantiation.py b/swh/objstorage/tests/test_objstorage_instantiation.py index beda01f..a515855 100644 --- a/swh/objstorage/tests/test_objstorage_instantiation.py +++ b/swh/objstorage/tests/test_objstorage_instantiation.py @@ -1,49 +1,46 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest from nose.tools import istest -from swh.objstorage.tests.server_testing import ServerTestFixture from swh.objstorage import get_objstorage from swh.objstorage.objstorage_pathslicing import PathSlicingObjStorage from swh.objstorage.api.client import RemoteObjStorage -from swh.objstorage.api.server import app -class TestObjStorageInitialization(ServerTestFixture, unittest.TestCase): +class TestObjStorageInitialization(unittest.TestCase): """ Test that the methods for ObjStorage initializations with `get_objstorage` works properly. 
""" def setUp(self): self.path = tempfile.mkdtemp() # Server is launched at self.url() - self.app = app self.config = {'storage_base': tempfile.mkdtemp(), 'storage_slicing': '0:1/0:5'} super().setUp() @istest def pathslicing_objstorage(self): conf = { 'cls': 'pathslicing', 'args': {'root': self.path, 'slicing': '0:2/0:5'} } st = get_objstorage(**conf) self.assertTrue(isinstance(st, PathSlicingObjStorage)) @istest def remote_objstorage(self): conf = { 'cls': 'remote', 'args': { - 'url': self.url() + 'url': 'http://127.0.0.1:4242/' } } st = get_objstorage(**conf) self.assertTrue(isinstance(st, RemoteObjStorage))