diff --git a/PKG-INFO b/PKG-INFO index 76ff779..34aede1 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.objstorage -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage Object Storage Home-page: https://forge.softwareheritage.org/diffusion/DOBJS Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/docs/index.rst b/docs/index.rst index 8b64117..77b43fa 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,15 +1,17 @@ +.. _swh-objstorage: + Software Heritage - Development Documentation ============================================= .. toctree:: :maxdepth: 2 :caption: Contents: Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` diff --git a/swh.objstorage.egg-info/PKG-INFO b/swh.objstorage.egg-info/PKG-INFO index 76ff779..34aede1 100644 --- a/swh.objstorage.egg-info/PKG-INFO +++ b/swh.objstorage.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.objstorage -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage Object Storage Home-page: https://forge.softwareheritage.org/diffusion/DOBJS Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh/objstorage/api/client.py b/swh/objstorage/api/client.py index 61bfcdd..f8fe8d6 100644 --- a/swh/objstorage/api/client.py +++ b/swh/objstorage/api/client.py @@ -1,68 +1,72 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import SWHRemoteAPI from swh.model import hashutil from ..objstorage import ObjStorage, DEFAULT_CHUNK_SIZE -from ..exc import ObjStorageAPIError +from ..exc import ObjNotFoundError, ObjStorageAPIError class RemoteObjStorage(ObjStorage, SWHRemoteAPI): """Proxy to a remote object storage. This class allows to connect to an object storage server via http protocol. Attributes: url (string): The url of the server to connect. Must end with a '/' session: The session to send requests. """ def __init__(self, url, **kwargs): super().__init__(api_exception=ObjStorageAPIError, url=url, **kwargs) def check_config(self, *, check_write): return self.post('check_config', {'check_write': check_write}) def __contains__(self, obj_id): return self.post('content/contains', {'obj_id': obj_id}) def add(self, content, obj_id=None, check_presence=True): return self.post('content/add', {'content': content, 'obj_id': obj_id, 'check_presence': check_presence}) def get(self, obj_id): - return self.post('content/get', {'obj_id': obj_id}) + ret = self.post('content/get', {'obj_id': obj_id}) + if ret is None: + raise ObjNotFoundError(obj_id) + else: + return ret def get_batch(self, obj_ids): return self.post('content/get/batch', {'obj_ids': obj_ids}) def check(self, obj_id): return self.post('content/check', {'obj_id': obj_id}) def delete(self, obj_id): super().delete(obj_id) # Check delete permission return self.post('content/delete', {'obj_id': obj_id}) # Management methods def get_random(self, batch_size): return self.post('content/get/random', {'batch_size': batch_size}) # Streaming methods def add_stream(self, content_iter, obj_id, check_presence=True): obj_id = hashutil.hash_to_hex(obj_id) return self.post_stream('content/add_stream/{}'.format(obj_id), params={'check_presence': check_presence}, data=content_iter) def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE): obj_id = hashutil.hash_to_hex(obj_id) return super().get_stream('content/get_stream/{}'.format(obj_id), chunk_size=chunk_size) diff --git a/swh/objstorage/api/server.py b/swh/objstorage/api/server.py index 9110687..41226bc 100644 --- a/swh/objstorage/api/server.py +++ b/swh/objstorage/api/server.py @@ -1,154 +1,165 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio import aiohttp.web import click from swh.core import config from swh.core.api_async import (SWHRemoteAPI, decode_request, encode_data_server as encode_data) from swh.model import hashutil from swh.objstorage import get_objstorage +from swh.objstorage.exc import ObjNotFoundError DEFAULT_CONFIG_PATH = 'objstorage/server' DEFAULT_CONFIG = { 'cls': ('str', 'pathslicing'), 'args': ('dict', { 'root': '/srv/softwareheritage/objects', 'slicing': '0:2/2:4/4:6', }), 'client_max_size': ('int', 1024 * 1024 * 1024), } @asyncio.coroutine def index(request): return aiohttp.web.Response(body="SWH Objstorage API server") @asyncio.coroutine def check_config(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].check_config(**req)) @asyncio.coroutine def contains(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].__contains__(**req)) @asyncio.coroutine def add_bytes(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].add(**req)) @asyncio.coroutine def get_bytes(request): req = yield from decode_request(request) - return encode_data(request.app['objstorage'].get(**req)) + try: + ret = request.app['objstorage'].get(**req) + except ObjNotFoundError: + ret = { + 'error': 'object_not_found', + 'request': req, + } + return encode_data(ret, status=404) + else: + return encode_data(ret) @asyncio.coroutine def get_batch(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].get_batch(**req)) @asyncio.coroutine def check(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].check(**req)) @asyncio.coroutine def delete(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].delete(**req)) # Management methods @asyncio.coroutine def get_random_contents(request): req = yield from decode_request(request) return encode_data(request.app['objstorage'].get_random(**req)) # Streaming methods @asyncio.coroutine def add_stream(request): hex_id = request.match_info['hex_id'] obj_id = hashutil.hash_to_bytes(hex_id) check_pres = (request.query.get('check_presence', '').lower() == 'true') objstorage = request.app['objstorage'] if check_pres and obj_id in objstorage: return encode_data(obj_id) with objstorage.chunk_writer(obj_id) as write: # XXX (3.5): use 'async for chunk in request.content.iter_any()' while not request.content.at_eof(): chunk = yield from request.content.readany() write(chunk) return encode_data(obj_id) @asyncio.coroutine def get_stream(request): hex_id = request.match_info['hex_id'] obj_id = hashutil.hash_to_bytes(hex_id) response = aiohttp.web.StreamResponse() yield from response.prepare(request) for chunk in request.app['objstorage'].get_stream(obj_id, 2 << 20): response.write(chunk) yield from response.drain() return response def make_app(config, **kwargs): if 'client_max_size' in config: kwargs['client_max_size'] = config['client_max_size'] app = SWHRemoteAPI(**kwargs) app.router.add_route('GET', '/', index) app.router.add_route('POST', '/check_config', check_config) app.router.add_route('POST', '/content/contains', contains) app.router.add_route('POST', '/content/add', add_bytes) app.router.add_route('POST', '/content/get', get_bytes) app.router.add_route('POST', '/content/get/batch', get_batch) app.router.add_route('POST', '/content/get/random', get_random_contents) app.router.add_route('POST', '/content/check', check) app.router.add_route('POST', '/content/delete', delete) app.router.add_route('POST', '/content/add_stream/{hex_id}', add_stream) app.router.add_route('GET', '/content/get_stream/{hex_id}', get_stream) app.update(config) app['objstorage'] = get_objstorage(app['cls'], app['args']) return app def make_app_from_configfile(config_path=DEFAULT_CONFIG_PATH, **kwargs): - return make_app(config.read(config_path, DEFAULT_CONFIG), **kwargs) + cfg = config.load_named_config(config_path, DEFAULT_CONFIG) + return make_app(cfg, **kwargs) @click.command() @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5003, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") def launch(config_path, host, port, debug): app = make_app_from_configfile(config_path, debug=bool(debug)) aiohttp.web.run_app(app, host=host, port=int(port)) if __name__ == '__main__': launch() diff --git a/swh/objstorage/cloud/objstorage_azure.py b/swh/objstorage/cloud/objstorage_azure.py index 8e8c1da..c325638 100644 --- a/swh/objstorage/cloud/objstorage_azure.py +++ b/swh/objstorage/cloud/objstorage_azure.py @@ -1,128 +1,128 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import gzip from swh.objstorage.objstorage import ObjStorage, compute_hash from swh.objstorage.exc import ObjNotFoundError, Error from swh.model import hashutil from azure.storage.blob import BlockBlobService from azure.common import AzureMissingResourceHttpError class AzureCloudObjStorage(ObjStorage): """ObjStorage with azure abilities. """ def __init__(self, account_name, api_secret_key, container_name, **kwargs): super().__init__(**kwargs) self.block_blob_service = BlockBlobService( account_name=account_name, account_key=api_secret_key) self.container_name = container_name def _internal_id(self, obj_id): """Internal id is the hex version in objstorage. """ return hashutil.hash_to_hex(obj_id) def check_config(self, *, check_write): """Check the configuration for this object storage""" props = self.block_blob_service.get_container_properties( self.container_name ) # FIXME: check_write is ignored here return bool(props) def __contains__(self, obj_id): """Does the storage contains the obj_id. """ hex_obj_id = self._internal_id(obj_id) return self.block_blob_service.exists( container_name=self.container_name, blob_name=hex_obj_id) def __iter__(self): """Iterate over the objects present in the storage. """ for obj in self.block_blob_service.list_blobs(self.container_name): yield hashutil.hash_to_bytes(obj.name) def __len__(self): """Compute the number of objects in the current object storage. Returns: number of objects contained in the storage. """ return sum(1 for i in self) def add(self, content, obj_id=None, check_presence=True): """Add an obj in storage if it's not there already. """ if obj_id is None: # Checksum is missing, compute it on the fly. obj_id = compute_hash(content) if check_presence and obj_id in self: return obj_id hex_obj_id = self._internal_id(obj_id) # Send the gzipped content self.block_blob_service.create_blob_from_bytes( container_name=self.container_name, blob_name=hex_obj_id, blob=gzip.compress(content)) return obj_id def restore(self, content, obj_id=None): """Restore a content. """ return self.add(content, obj_id, check_presence=False) def get(self, obj_id): """Retrieve blob's content if found. """ hex_obj_id = self._internal_id(obj_id) try: blob = self.block_blob_service.get_blob_to_bytes( container_name=self.container_name, blob_name=hex_obj_id) except AzureMissingResourceHttpError: - raise ObjNotFoundError('Content %s not found!' % hex_obj_id) + raise ObjNotFoundError(obj_id) return gzip.decompress(blob.content) def check(self, obj_id): """Check the content integrity. """ obj_content = self.get(obj_id) content_obj_id = compute_hash(obj_content) if content_obj_id != obj_id: raise Error(obj_id) def delete(self, obj_id): """Delete an object.""" super().delete(obj_id) # Check delete permission hex_obj_id = self._internal_id(obj_id) try: self.block_blob_service.delete_blob( container_name=self.container_name, blob_name=hex_obj_id) except AzureMissingResourceHttpError: raise ObjNotFoundError('Content {} not found!'.format(hex_obj_id)) return True diff --git a/swh/objstorage/cloud/objstorage_cloud.py b/swh/objstorage/cloud/objstorage_cloud.py index a9ef724..8778f7e 100644 --- a/swh/objstorage/cloud/objstorage_cloud.py +++ b/swh/objstorage/cloud/objstorage_cloud.py @@ -1,169 +1,169 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc from swh.model import hashutil from swh.objstorage.objstorage import ObjStorage, compute_hash from swh.objstorage.exc import ObjNotFoundError, Error from libcloud.storage import providers from libcloud.storage.types import Provider, ObjectDoesNotExistError class CloudObjStorage(ObjStorage, metaclass=abc.ABCMeta): """Abstract ObjStorage that connect to a cloud using Libcloud Implementations of this class must redefine the _get_provider method to make it return a driver provider (i.e. object that supports `get_driver` method) which return a LibCloud driver (see https://libcloud.readthedocs.io/en/latest/storage/api.html). """ def __init__(self, api_key, api_secret_key, container_name, **kwargs): super().__init__(**kwargs) self.driver = self._get_driver(api_key, api_secret_key) self.container_name = container_name self.container = self.driver.get_container( container_name=container_name) def _get_driver(self, api_key, api_secret_key): """Initialize a driver to communicate with the cloud Args: api_key: key to connect to the API. api_secret_key: secret key for authentification. Returns: a Libcloud driver to a cloud storage. """ # Get the driver class from its description. cls = providers.get_driver(self._get_provider()) # Initialize the driver. return cls(api_key, api_secret_key) @abc.abstractmethod def _get_provider(self): """Get a libcloud driver provider This method must be overriden by subclasses to specify which of the native libcloud driver the current storage should connect to. Alternatively, provider for a custom driver may be returned, in which case the provider will have to support `get_driver` method. """ raise NotImplementedError('%s must implement `get_provider` method' % type(self)) def check_config(self, *, check_write): """Check the configuration for this object storage""" # FIXME: hopefully this blew up during instantiation return True def __contains__(self, obj_id): try: self._get_object(obj_id) except ObjNotFoundError: return False else: return True def __iter__(self): """ Iterate over the objects present in the storage Warning: Iteration over the contents of a cloud-based object storage may have bad efficiency: due to the very high amount of objects in it and the fact that it is remote, get all the contents of the current object storage may result in a lot of network requests. You almost certainly don't want to use this method in production. """ yield from map(lambda obj: obj.name, self.driver.iterate_container_objects(self.container)) def __len__(self): """Compute the number of objects in the current object storage. Warning: this currently uses `__iter__`, its warning about bad performance applies. Returns: number of objects contained in the storage. """ return sum(1 for i in self) def add(self, content, obj_id=None, check_presence=True): if obj_id is None: # Checksum is missing, compute it on the fly. obj_id = compute_hash(content) if check_presence and obj_id in self: return obj_id self._put_object(content, obj_id) return obj_id def restore(self, content, obj_id=None): return self.add(content, obj_id, check_presence=False) def get(self, obj_id): return bytes(self._get_object(obj_id).as_stream()) def check(self, obj_id): # Check that the file exists, as _get_object raises ObjNotFoundError self._get_object(obj_id) # Check the content integrity obj_content = self.get(obj_id) content_obj_id = compute_hash(obj_content) if content_obj_id != obj_id: raise Error(obj_id) def delete(self, obj_id): super().delete(obj_id) # Check delete permission obj = self._get_object(obj_id) return self.driver.delete_object(obj) def _get_object(self, obj_id): """Get a Libcloud wrapper for an object pointer. This wrapper does not retrieve the content of the object directly. """ hex_obj_id = hashutil.hash_to_hex(obj_id) try: return self.driver.get_object(self.container_name, hex_obj_id) except ObjectDoesNotExistError as e: - raise ObjNotFoundError(e.object_name) + raise ObjNotFoundError(obj_id) def _put_object(self, content, obj_id): """Create an object in the cloud storage. Created object will contain the content and be referenced by the given id. """ hex_obj_id = hashutil.hash_to_hex(obj_id) self.driver.upload_object_via_stream(iter(content), self.container, hex_obj_id) class AwsCloudObjStorage(CloudObjStorage): """ Amazon's S3 Cloud-based object storage """ def _get_provider(self): return Provider.S3 class OpenStackCloudObjStorage(CloudObjStorage): """ OpenStack Swift Cloud based object storage """ def _get_provider(self): return Provider.OPENSTACK_SWIFT diff --git a/swh/objstorage/tests/objstorage_testing.py b/swh/objstorage/tests/objstorage_testing.py index 23fd950..4d4820c 100644 --- a/swh/objstorage/tests/objstorage_testing.py +++ b/swh/objstorage/tests/objstorage_testing.py @@ -1,175 +1,177 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import time from nose.tools import istest from swh.model import hashutil from swh.objstorage import exc class ObjStorageTestFixture(): def setUp(self): super().setUp() def hash_content(self, content): obj_id = hashutil.hash_data(content)['sha1'] return content, obj_id def assertContentMatch(self, obj_id, expected_content): # noqa content = self.storage.get(obj_id) self.assertEqual(content, expected_content) @istest def check_config(self): self.assertTrue(self.storage.check_config(check_write=False)) self.assertTrue(self.storage.check_config(check_write=True)) @istest def contains(self): content_p, obj_id_p = self.hash_content(b'contains_present') content_m, obj_id_m = self.hash_content(b'contains_missing') self.storage.add(content_p, obj_id=obj_id_p) self.assertIn(obj_id_p, self.storage) self.assertNotIn(obj_id_m, self.storage) @istest def add_get_w_id(self): content, obj_id = self.hash_content(b'add_get_w_id') r = self.storage.add(content, obj_id=obj_id) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def add_big(self): content, obj_id = self.hash_content(b'add_big' * 1024 * 1024) r = self.storage.add(content, obj_id=obj_id) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def add_get_wo_id(self): content, obj_id = self.hash_content(b'add_get_wo_id') r = self.storage.add(content) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def add_get_batch(self): content1, obj_id1 = self.hash_content(b'add_get_batch_1') content2, obj_id2 = self.hash_content(b'add_get_batch_2') self.storage.add(content1, obj_id1) self.storage.add(content2, obj_id2) cr1, cr2 = self.storage.get_batch([obj_id1, obj_id2]) self.assertEqual(cr1, content1) self.assertEqual(cr2, content2) @istest def get_batch_unexisting_content(self): content, obj_id = self.hash_content(b'get_batch_unexisting_content') result = list(self.storage.get_batch([obj_id])) self.assertTrue(len(result) == 1) self.assertIsNone(result[0]) @istest def restore_content(self): valid_content, valid_obj_id = self.hash_content(b'restore_content') invalid_content = b'unexpected content' id_adding = self.storage.add(invalid_content, valid_obj_id) id_restore = self.storage.restore(valid_content) # Adding a false content then restore it to the right one and # then perform a verification should result in a successful check. self.assertEqual(id_adding, valid_obj_id) self.assertEqual(id_restore, valid_obj_id) self.assertContentMatch(valid_obj_id, valid_content) @istest def get_missing(self): content, obj_id = self.hash_content(b'get_missing') - with self.assertRaises(exc.Error): + with self.assertRaises(exc.ObjNotFoundError) as e: self.storage.get(obj_id) + self.assertIn(obj_id, e.exception.args) + @istest def check_missing(self): content, obj_id = self.hash_content(b'check_missing') with self.assertRaises(exc.Error): self.storage.check(obj_id) @istest def check_present(self): content, obj_id = self.hash_content(b'check_missing') self.storage.add(content) try: self.storage.check(obj_id) except: self.fail('Integrity check failed') @istest def delete_missing(self): self.storage.allow_delete = True content, obj_id = self.hash_content(b'missing_content_to_delete') with self.assertRaises(exc.Error): self.storage.delete(obj_id) @istest def delete_present(self): self.storage.allow_delete = True content, obj_id = self.hash_content(b'content_to_delete') self.storage.add(content, obj_id=obj_id) self.assertTrue(self.storage.delete(obj_id)) with self.assertRaises(exc.Error): self.storage.get(obj_id) @istest def delete_not_allowed(self): self.storage.allow_delete = False content, obj_id = self.hash_content(b'content_to_delete') self.storage.add(content, obj_id=obj_id) with self.assertRaises(PermissionError): self.assertTrue(self.storage.delete(obj_id)) @istest def delete_not_allowed_by_default(self): content, obj_id = self.hash_content(b'content_to_delete') self.storage.add(content, obj_id=obj_id) with self.assertRaises(PermissionError): self.assertTrue(self.storage.delete(obj_id)) @istest def add_stream(self): content = [b'chunk1', b'chunk2'] _, obj_id = self.hash_content(b''.join(content)) try: self.storage.add_stream(iter(content), obj_id=obj_id) except NotImplementedError: return self.assertContentMatch(obj_id, b''.join(content)) @istest def add_stream_sleep(self): def gen_content(): yield b'chunk1' time.sleep(0.5) yield b'chunk2' _, obj_id = self.hash_content(b'placeholder_id') try: self.storage.add_stream(gen_content(), obj_id=obj_id) except NotImplementedError: return self.assertContentMatch(obj_id, b'chunk1chunk2') @istest def get_stream(self): content_l = [b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9'] content = b''.join(content_l) _, obj_id = self.hash_content(content) self.storage.add(content, obj_id=obj_id) try: r = list(self.storage.get_stream(obj_id, chunk_size=1)) except NotImplementedError: return self.assertEqual(r, content_l) diff --git a/version.txt b/version.txt index e56dbd5..cb02228 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.23-0-g77b54b4 \ No newline at end of file +v0.0.24-0-g6611f95 \ No newline at end of file