diff --git a/swh/objstorage/api/client.py b/swh/objstorage/api/client.py
index 2daabee..cf89eda 100644
--- a/swh/objstorage/api/client.py
+++ b/swh/objstorage/api/client.py
@@ -1,103 +1,103 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import pickle

 import requests

 from requests.exceptions import ConnectionError
-from ...exc import StorageAPIError
-from ...api.common import (decode_response,
-                           encode_data_client as encode_data)
+from ..exc import ObjStorageAPIError
+from ...storage.api.common import (decode_response,
+                                   encode_data_client as encode_data)


 class RemoteObjStorage():
     """ Proxy to a remote object storage.

     This class allows to connect to an object storage server via
     http protocol.

     Attributes:
         base_url (string): The url of the server to connect. Must end
             with a '/'
         session: The session to send requests.

     """
     def __init__(self, base_url):
         self.base_url = base_url
         self.session = requests.Session()

     def url(self, endpoint):
         return '%s%s' % (self.base_url, endpoint)

     def post(self, endpoint, data):
         try:
             response = self.session.post(
                 self.url(endpoint),
                 data=encode_data(data),
                 headers={'content-type': 'application/x-msgpack'},
             )
         except ConnectionError as e:
             print(str(e))
-            raise StorageAPIError(e)
+            raise ObjStorageAPIError(e)

         # XXX: this breaks language-independence and should be
         # replaced by proper unserialization
         if response.status_code == 400:
             raise pickle.loads(decode_response(response))

         return decode_response(response)

     def content_add(self, bytes, obj_id=None):
         """ Add a new object to the object storage.

         Args:
             bytes: content of the object to be added to the storage.
             obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When
                 given, obj_id will be trusted to match bytes. If missing,
                 obj_id will be computed on the fly.

         """
         return self.post('content/add', {'bytes': bytes, 'obj_id': obj_id})

     def content_get(self, obj_id):
         """ Retrieve the content of a given object.

         Args:
             obj_id: The id of the object.

         Returns:
             The content of the requested objects as bytes.

         Raises:
             ObjNotFoundError: if the requested object is missing

         """
         return self.post('content/get', {'obj_id': obj_id})

     def content_get_random(self, batch_size):
         """ Retrieve a random sample of existing content.

         Args:
             batch_size: Number of content requested.

         Returns:
             A list of random ids that represents existing contents.

         """
         return self.post('content/get/random', {'batch_size': batch_size})

     def content_check(self, obj_id):
         """ Integrity check for a given object

         verify that the file object is in place, and that the gzipped content
         matches the object id

         Args:
             obj_id: The id of the object.
Raises: ObjNotFoundError: if the requested object is missing Error: if the requested object is corrupt """ self.post('content/check', {'obj_id': obj_id}) diff --git a/swh/objstorage/api/server.py b/swh/objstorage/api/server.py index 6cb8885..8d9787f 100644 --- a/swh/objstorage/api/server.py +++ b/swh/objstorage/api/server.py @@ -1,96 +1,96 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import logging from flask import Flask, g, request from swh.core import config -from swh.storage.objstorage import PathSlicingObjStorage +from swh.objstorage import PathSlicingObjStorage from swh.storage.api.common import (BytesRequest, decode_request, error_handler, encode_data_server as encode_data) DEFAULT_CONFIG = { 'storage_base': ('str', '/tmp/swh-storage/objects/'), 'storage_slicing': ('str', '0:2/2:4/4:6') } app = Flask(__name__) app.request_class = BytesRequest @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data) @app.before_request def before_request(): g.objstorage = PathSlicingObjStorage(app.config['storage_base'], app.config['storage_slicing']) @app.route('/') def index(): return "SWH Objstorage API server" @app.route('/content') def content(): return str(list(g.storage)) @app.route('/content/add', methods=['POST']) def add_bytes(): return encode_data(g.objstorage.add(**decode_request(request))) @app.route('/content/get', methods=['POST']) def get_bytes(): return encode_data(g.objstorage.get(**decode_request(request))) @app.route('/content/get/random', methods=['POST']) def get_random_contents(): return encode_data( g.objstorage.get_random(**decode_request(request)) ) @app.route('/content/check', methods=['POST']) def check(): return encode_data(g.objstorage.check(**decode_request(request))) def run_from_webserver(environ, start_response): """Run the WSGI app from the webserver, loading the configuration. 
""" config_path = '/etc/softwareheritage/storage/objstorage.ini' app.config.update(config.read(config_path, DEFAULT_CONFIG)) handler = logging.StreamHandler() app.logger.addHandler(handler) return app(environ, start_response) @click.command() @click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5000, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") def launch(config_path, host, port, debug): app.config.update(config.read(config_path, DEFAULT_CONFIG)) app.run(host, port=int(port), debug=bool(debug)) if __name__ == '__main__': launch() diff --git a/swh/objstorage/multiplexer/filter/id_filter.py b/swh/objstorage/multiplexer/filter/id_filter.py index b7faa30..71039b0 100644 --- a/swh/objstorage/multiplexer/filter/id_filter.py +++ b/swh/objstorage/multiplexer/filter/id_filter.py @@ -1,99 +1,99 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re from swh.core import hashutil from .filter import ObjStorageFilter from ...objstorage import ID_HASH_ALGO -from ....exc import ObjNotFoundError +from ...exc import ObjNotFoundError def compute_hash(bytes): """ Compute the hash of the given content. """ # Checksum is missing, compute it on the fly. h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) h.update(bytes) return h.digest() class IdObjStorageFilter(ObjStorageFilter): """ Filter that only allow operations if the object id match a requirement. Even for read operations, check before if the id match the requirements. This may prevent for unnecesary disk access. """ def is_valid(self, obj_id): """ Indicates if the given id is valid. """ raise NotImplementedError('Implementations of an IdObjStorageFilter ' 'must have a "is_valid" method') def __contains__(self, obj_id, *args, **kwargs): if self.is_valid(obj_id): return self.storage.__contains__(*args, obj_id=obj_id, **kwargs) return False def __len__(self): return sum(1 for i in [id for id in self.storage if self.is_valid(id)]) def __iter__(self): yield from filter(lambda id: self.is_valid(id), iter(self.storage)) def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.add(content, *args, obj_id=obj_id, **kwargs) def restore(self, content, obj_id=None, *args, **kwargs): if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.restore(content, *args, obj_id=obj_id, **kwargs) def get(self, obj_id, *args, **kwargs): if self.is_valid(obj_id): return self.storage.get(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def check(self, obj_id, *args, **kwargs): if self.is_valid(obj_id): return self.storage.check(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def get_random(self, *args, **kwargs): yield from filter(lambda id: self.is_valid(id), self.storage.get_random(*args, **kwargs)) class RegexIdObjStorageFilter(IdObjStorageFilter): """ Filter that allow operations if the content's id as hex match a regex. 
""" def __init__(self, storage, regex): super().__init__(storage) self.regex = re.compile(regex) def is_valid(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return self.regex.match(hex_obj_id) is not None class PrefixIdObjStorageFilter(IdObjStorageFilter): """ Filter that allow operations if the hexlified id have a given prefix. """ def __init__(self, storage, prefix): super().__init__(storage) self.prefix = str(prefix) def is_valid(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return str(hex_obj_id).startswith(self.prefix) diff --git a/swh/objstorage/multiplexer/multiplexer_objstorage.py b/swh/objstorage/multiplexer/multiplexer_objstorage.py index 3b19be4..9376498 100644 --- a/swh/objstorage/multiplexer/multiplexer_objstorage.py +++ b/swh/objstorage/multiplexer/multiplexer_objstorage.py @@ -1,194 +1,194 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random from ..objstorage import ObjStorage -from ...exc import ObjNotFoundError +from ..exc import ObjNotFoundError class MultiplexerObjStorage(ObjStorage): """ Implementation of ObjStorage that distribute between multiple storages The multiplexer object storage allows an input to be demultiplexed among multiple storages that will or will not accept it by themselves (see .filter package). As the ids can be differents, no pre-computed ids should be submitted. Also, there are no guarantees that the returned ids can be used directly into the storages that the multiplexer manage. Use case examples could be: Example 1: storage_v1 = filter.read_only(PathSlicingObjStorage('/dir1', '0:2/2:4/4:6')) storage_v2 = PathSlicingObjStorage('/dir2', '0:1/0:5') storage = MultiplexerObjStorage([storage_v1, storage_v2]) When using 'storage', all the new contents will only be added to the v2 storage, while it will be retrievable from both. Example 2: storage_v1 = filter.id_regex( PathSlicingObjStorage('/dir1', '0:2/2:4/4:6'), r'[^012].*' ) storage_v2 = filter.if_regex( PathSlicingObjStorage('/dir2', '0:1/0:5'), r'[012]/*' ) storage = MultiplexerObjStorage([storage_v1, storage_v2]) When using this storage, the contents with a sha1 starting with 0, 1 or 2 will be redirected (read AND write) to the storage_v2, while the others will be redirected to the storage_v1. If a content starting with 0, 1 or 2 is present in the storage_v1, it would be ignored anyway. """ def __init__(self, storages): self.storages = storages def __contains__(self, obj_id): for storage in self.storages: if obj_id in storage: return True return False def __iter__(self): for storage in self.storages: yield from storage def __len__(self): """ Returns the number of files in the storage. Warning: Multiple files may represent the same content, so this method does not indicate how many different contents are in the storage. """ return sum(map(len, self.storages)) def add(self, content, obj_id=None, check_presence=True): """ Add a new object to the object storage. If the adding step works in all the storages that accept this content, this is a success. Otherwise, the full adding step is an error even if it succeed in some of the storages. Args: content: content of the object to be added to the storage. obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When given, obj_id will be trusted to match the bytes. If missing, obj_id will be computed on the fly. 
check_presence: indicate if the presence of the content should be verified before adding the file. Returns: an id of the object into the storage. As the write-storages are always readable as well, any id will be valid to retrieve a content. """ return [storage.add(content, obj_id, check_presence) for storage in self.storages].pop() def restore(self, content, obj_id=None): """ Restore a content that have been corrupted. This function is identical to add_bytes but does not check if the object id is already in the file system. (see "add" method) Args: content: content of the object to be added to the storage obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When given, obj_id will be trusted to match bytes. If missing, obj_id will be computed on the fly. Returns: an id of the object into the storage. As the write-storages are always readable as well, any id will be valid to retrieve a content. """ return [storage.restore(content, obj_id) for storage in self.storages].pop() def get(self, obj_id): """ Retrieve the content of a given object. Args: obj_id: object id. Returns: the content of the requested object as bytes. Raises: ObjNotFoundError: if the requested object is missing. """ for storage in self.storages: try: return storage.get(obj_id) except ObjNotFoundError: continue # If no storage contains this content, raise the error raise ObjNotFoundError(obj_id) def check(self, obj_id): """ Perform an integrity check for a given object. Verify that the file object is in place and that the gziped content matches the object id. Args: obj_id: object id. Raises: ObjNotFoundError: if the requested object is missing. Error: if the request object is corrupted. """ nb_present = 0 for storage in self.storages: try: storage.check(obj_id) except ObjNotFoundError: continue else: nb_present += 1 # If there is an Error because of a corrupted file, then let it pass. # Raise the ObjNotFoundError only if the content coulnd't be found in # all the storages. if nb_present == 0: raise ObjNotFoundError(obj_id) def get_random(self, batch_size): """ Get random ids of existing contents This method is used in order to get random ids to perform content integrity verifications on random contents. Attributes: batch_size (int): Number of ids that will be given Yields: An iterable of ids of contents that are in the current object storage. 
""" storages_set = [storage for storage in self.storages if len(storage) > 0] if len(storages_set) <= 0: return [] while storages_set: storage = random.choice(storages_set) try: return storage.get_random(batch_size) except NotImplementedError: storages_set.remove(storage) # There is no storage that allow the get_random operation raise NotImplementedError( "There is no storage implementation into the multiplexer that " "support the 'get_random' operation" ) diff --git a/swh/objstorage/objstorage_pathslicing.py b/swh/objstorage/objstorage_pathslicing.py index 7da58b4..1ea2ebe 100644 --- a/swh/objstorage/objstorage_pathslicing.py +++ b/swh/objstorage/objstorage_pathslicing.py @@ -1,347 +1,347 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import gzip import tempfile import random from contextlib import contextmanager from swh.core import hashutil from .objstorage import ObjStorage, ID_HASH_ALGO, ID_HASH_LENGTH -from ..exc import ObjNotFoundError, Error +from .exc import ObjNotFoundError, Error GZIP_BUFSIZ = 1048576 DIR_MODE = 0o755 FILE_MODE = 0o644 @contextmanager def _write_obj_file(hex_obj_id, objstorage): """ Context manager for writing object files to the object storage. During writing, data are written to a temporary file, which is atomically renamed to the right file name after closing. This context manager also takes care of (gzip) compressing the data on the fly. Usage sample: with _write_obj_file(hex_obj_id, objstorage): f.write(obj_data) Yields: a file-like object open for writing bytes. """ # Get the final paths and create the directory if absent. dir = objstorage._obj_dir(hex_obj_id) if not os.path.isdir(dir): os.makedirs(dir, DIR_MODE, exist_ok=True) path = os.path.join(dir, hex_obj_id) # Create a temporary file. (tmp, tmp_path) = tempfile.mkstemp(suffix='.tmp', prefix='hex_obj_id.', dir=dir) # Open the file and yield it for writing. tmp_f = os.fdopen(tmp, 'wb') with gzip.GzipFile(filename=tmp_path, fileobj=tmp_f) as f: yield f # Then close the temporary file and move it to the right directory. tmp_f.close() os.chmod(tmp_path, FILE_MODE) os.rename(tmp_path, path) @contextmanager def _read_obj_file(hex_obj_id, objstorage): """ Context manager for reading object file in the object storage. Usage sample: with _read_obj_file(hex_obj_id, objstorage) as f: b = f.read() Yields: a file-like object open for reading bytes. """ path = objstorage._obj_path(hex_obj_id) with gzip.GzipFile(path, 'rb') as f: yield f class PathSlicingObjStorage(ObjStorage): """ Implementation of the ObjStorage API based on the hash of the content. On disk, an object storage is a directory tree containing files named after their object IDs. An object ID is a checksum of its content, depending on the value of the ID_HASH_ALGO constant (see hashutil for its meaning). To avoid directories that contain too many files, the object storage has a given slicing. Each slicing correspond to a directory that is named according to the hash of its content. So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689 will be stored in the given object storages : - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689 - 0:1/0:5/ : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689 The files in the storage are stored in gzipped compressed format. 
Attributes: root (string): path to the root directory of the storage on the disk. bounds: list of tuples that indicates the beginning and the end of each subdirectory for a content. """ def __init__(self, root, slicing): """ Create an object to access a hash-slicing based object storage. Args: root (string): path to the root directory of the storage on the disk. slicing (string): string that indicates the slicing to perform on the hash of the content to know the path where it should be stored. """ if not os.path.isdir(root): raise ValueError( 'PathSlicingObjStorage root "%s" is not a directory' % root ) self.root = root # Make a list of tuples where each tuple contains the beginning # and the end of each slicing. self.bounds = [ slice(*map(int, sbounds.split(':'))) for sbounds in slicing.split('/') if sbounds ] max_endchar = max(map(lambda bound: bound.stop, self.bounds)) if ID_HASH_LENGTH < max_endchar: raise ValueError( 'Algorithm %s has too short hash for slicing to char %d' % (ID_HASH_ALGO, max_endchar) ) def __contains__(self, obj_id): """ Check whether the given object is present in the storage or not. Returns: True iff the object is present in the storage. """ hex_obj_id = hashutil.hash_to_hex(obj_id) return os.path.exists(self._obj_path(hex_obj_id)) def __iter__(self): """iterate over the object identifiers currently available in the storage Warning: with the current implementation of the object storage, this method will walk the filesystem to list objects, meaning that listing all objects will be very slow for large storages. You almost certainly don't want to use this method in production. Return: iterator over object IDs """ def obj_iterator(): # XXX hackish: it does not verify that the depth of found files # matches the slicing depth of the storage for root, _dirs, files in os.walk(self.root): for f in files: yield bytes.fromhex(f) return obj_iterator() def __len__(self): """compute the number of objects available in the storage Warning: this currently uses `__iter__`, its warning about bad performances applies Return: number of objects contained in the storage """ return sum(1 for i in self) def _obj_dir(self, hex_obj_id): """ Compute the storage directory of an object. See also: PathSlicingObjStorage::_obj_path Args: hex_obj_id: object id as hexlified string. Returns: Path to the directory that contains the required object. """ slices = [hex_obj_id[bound] for bound in self.bounds] return os.path.join(self.root, *slices) def _obj_path(self, hex_obj_id): """ Compute the full path to an object into the current storage. See also: PathSlicingObjStorage::_obj_dir Args: hex_obj_id: object id as hexlified string. Returns: Path to the actual object corresponding to the given id. """ return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id) def add(self, bytes, obj_id=None, check_presence=True): """ Add a new object to the object storage. Args: bytes: content of the object to be added to the storage. obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When given, obj_id will be trusted to match the bytes. If missing, obj_id will be computed on the fly. check_presence: indicate if the presence of the content should be verified before adding the file. Returns: the id of the object into the storage. """ if obj_id is None: # Checksum is missing, compute it on the fly. h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) h.update(bytes) obj_id = h.digest() if check_presence and obj_id in self: # If the object is already present, return immediatly. 
return obj_id hex_obj_id = hashutil.hash_to_hex(obj_id) with _write_obj_file(hex_obj_id, self) as f: f.write(bytes) return obj_id def restore(self, bytes, obj_id=None): """ Restore a content that have been corrupted. This function is identical to add_bytes but does not check if the object id is already in the file system. Args: bytes: content of the object to be added to the storage obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When given, obj_id will be trusted to match bytes. If missing, obj_id will be computed on the fly. """ return self.add(bytes, obj_id, check_presence=False) def get(self, obj_id): """ Retrieve the content of a given object. Args: obj_id: object id. Returns: the content of the requested object as bytes. Raises: ObjNotFoundError: if the requested object is missing. """ if obj_id not in self: raise ObjNotFoundError(obj_id) # Open the file and return its content as bytes hex_obj_id = hashutil.hash_to_hex(obj_id) with _read_obj_file(hex_obj_id, self) as f: return f.read() def check(self, obj_id): """ Perform an integrity check for a given object. Verify that the file object is in place and that the gziped content matches the object id. Args: obj_id: object id. Raises: ObjNotFoundError: if the requested object is missing. Error: if the request object is corrupted. """ if obj_id not in self: raise ObjNotFoundError(obj_id) hex_obj_id = hashutil.hash_to_hex(obj_id) try: with gzip.open(self._obj_path(hex_obj_id)) as f: length = None if ID_HASH_ALGO.endswith('_git'): # if the hashing algorithm is git-like, we need to know the # content size to hash on the fly. Do a first pass here to # compute the size length = 0 while True: chunk = f.read(GZIP_BUFSIZ) length += len(chunk) if not chunk: break f.rewind() checksums = hashutil._hash_file_obj(f, length, algorithms=[ID_HASH_ALGO]) actual_obj_id = checksums[ID_HASH_ALGO] if obj_id != actual_obj_id: raise Error( 'Corrupt object %s should have id %s' % (hashutil.hash_to_hex(obj_id), hashutil.hash_to_hex(actual_obj_id)) ) except (OSError, IOError): # IOError is for compatibility with older python versions raise Error('Corrupt object %s is not a gzip file' % obj_id) def get_random(self, batch_size): """ Get random ids of existing contents This method is used in order to get random ids to perform content integrity verifications on random contents. Attributes: batch_size (int): Number of ids that will be given Yields: An iterable of ids of contents that are in the current object storage. """ def get_random_content(self, batch_size): """ Get a batch of content inside a single directory. Returns: a tuple (batch size, batch). 
""" dirs = [] for level in range(len(self.bounds)): path = os.path.join(self.root, *dirs) dir_list = next(os.walk(path))[1] if 'tmp' in dir_list: dir_list.remove('tmp') dirs.append(random.choice(dir_list)) path = os.path.join(self.root, *dirs) content_list = next(os.walk(path))[2] length = min(batch_size, len(content_list)) return length, map(hashutil.hex_to_hash, random.sample(content_list, length)) while batch_size: length, it = get_random_content(self, batch_size) batch_size = batch_size - length yield from it diff --git a/swh/objstorage/tests/objstorage_testing.py b/swh/objstorage/tests/objstorage_testing.py index 79c3587..3009cd0 100644 --- a/swh/objstorage/tests/objstorage_testing.py +++ b/swh/objstorage/tests/objstorage_testing.py @@ -1,70 +1,70 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from nose.tools import istest from swh.core import hashutil -from swh.storage import exc +from swh.objstorage import exc class ObjStorageTestFixture(): def setUp(self): super().setUp() def hash_content(self, content): obj_id = hashutil.hashdata(content)['sha1'] return content, obj_id def assertContentMatch(self, obj_id, expected_content): content = self.storage.get(obj_id) self.assertEqual(content, expected_content) @istest def add_get_w_id(self): content, obj_id = self.hash_content(b'add_get_w_id') r = self.storage.add(content, obj_id=obj_id) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def add_get_wo_id(self): content, obj_id = self.hash_content(b'add_get_wo_id') r = self.storage.add(content) self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) @istest def restore_content(self): valid_content, valid_obj_id = self.hash_content(b'restore_content') invalid_content = b'unexpected content' id_adding = self.storage.add(invalid_content, valid_obj_id) id_restore = self.storage.restore(valid_content) # Adding a false content then restore it to the right one and # then perform a verification should result in a successful check. 
self.assertEqual(id_adding, valid_obj_id) self.assertEqual(id_restore, valid_obj_id) self.assertContentMatch(valid_obj_id, valid_content) @istest def get_missing(self): content, obj_id = self.hash_content(b'get_missing') with self.assertRaises(exc.Error): self.storage.get(obj_id) @istest def check_missing(self): content, obj_id = self.hash_content(b'check_missing') with self.assertRaises(exc.Error): self.storage.check(obj_id) @istest def check_present(self): content, obj_id = self.hash_content(b'check_missing') self.storage.add(content) try: self.storage.check(obj_id) except: self.fail('Integrity check failed') diff --git a/swh/objstorage/tests/test_multiplexer_filter.py b/swh/objstorage/tests/test_multiplexer_filter.py index eb1533c..79bfe02 100644 --- a/swh/objstorage/tests/test_multiplexer_filter.py +++ b/swh/objstorage/tests/test_multiplexer_filter.py @@ -1,373 +1,373 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import unittest from string import ascii_lowercase from nose.tools import istest from nose.plugins.attrib import attr from swh.core import hashutil -from swh.storage.exc import ObjNotFoundError, Error -from swh.storage.objstorage import ObjStorage -from swh.storage.objstorage.multiplexer.filter import (add_filter, read_only, - id_prefix, id_regex) +from swh.objstorage.exc import ObjNotFoundError, Error +from swh.objstorage import ObjStorage +from swh.objstorage.multiplexer.filter import (add_filter, read_only, + id_prefix, id_regex) def get_random_content(): return bytes(''.join(random.sample(ascii_lowercase, 10)), 'utf8') class MockObjStorage(ObjStorage): """ Mock an object storage for testing the filters. """ def __init__(self): self.objects = {} def __contains__(self, obj_id): return obj_id in self.objects def __len__(self): return len(self.objects) def __iter__(self): return iter(self.objects) def id(self, content): # Id is the content itself for easily choose the id of # a content for filtering. return hashutil.hashdata(content)['sha1'] def add(self, content, obj_id=None, check_presence=True): if obj_id is None: obj_id = self.id(content) if check_presence and obj_id in self.objects: return obj_id self.objects[obj_id] = content return obj_id def restore(self, content, obj_id=None): return self.add(content, obj_id, check_presence=False) def get(self, obj_id): if obj_id not in self: raise ObjNotFoundError(obj_id) return self.objects[obj_id] def check(self, obj_id): if obj_id not in self: raise ObjNotFoundError(obj_id) if obj_id != self.id(self.objects[obj_id]): raise Error(obj_id) def get_random(self, batch_size): batch_size = min(len(self), batch_size) return random.sample(list(self.objects), batch_size) @attr('!db') class MixinTestReadFilter(unittest.TestCase): # Read only filter should not allow writing def setUp(self): super().setUp() storage = MockObjStorage() self.valid_content = b'pre-existing content' self.invalid_content = b'invalid_content' self.true_invalid_content = b'Anything that is not correct' self.absent_content = b'non-existent content' # Create a valid content. self.valid_id = storage.add(self.valid_content) # Create an invalid id and add a content with it. self.invalid_id = storage.id(self.true_invalid_content) storage.add(self.invalid_content, obj_id=self.invalid_id) # Compute an id for a non-existing content. 
self.absent_id = storage.id(self.absent_content) self.storage = add_filter(storage, read_only()) @istest def can_contains(self): self.assertTrue(self.valid_id in self.storage) self.assertTrue(self.invalid_id in self.storage) self.assertFalse(self.absent_id in self.storage) @istest def can_iter(self): self.assertIn(self.valid_id, iter(self.storage)) self.assertIn(self.invalid_id, iter(self.storage)) @istest def can_len(self): self.assertEqual(2, len(self.storage)) @istest def can_get(self): self.assertEqual(self.valid_content, self.storage.get(self.valid_id)) self.assertEqual(self.invalid_content, self.storage.get(self.invalid_id)) @istest def can_check(self): with self.assertRaises(ObjNotFoundError): self.storage.check(self.absent_id) with self.assertRaises(Error): self.storage.check(self.invalid_id) self.storage.check(self.valid_id) @istest def can_get_random(self): self.assertEqual(1, len(self.storage.get_random(1))) self.assertEqual(len(self.storage), len(self.storage.get_random(1000))) @istest def cannot_add(self): new_id = self.storage.add(b'New content') result = self.storage.add(self.valid_content, self.valid_id) self.assertNotIn(new_id, self.storage) self.assertIsNone(result) @istest def cannot_restore(self): new_id = self.storage.restore(b'New content') result = self.storage.restore(self.valid_content, self.valid_id) self.assertNotIn(new_id, self.storage) self.assertIsNone(result) class MixinTestIdFilter(): """ Mixin class that tests the filters based on filter.IdFilter Methods "make_valid", "make_invalid" and "filter_storage" must be implemented by subclasses. """ def setUp(self): super().setUp() # Use a hack here : as the mock uses the content as id, it is easy to # create contents that are filtered or not. self.prefix = '71' storage = MockObjStorage() # Make the storage filtered self.base_storage = storage self.storage = self.filter_storage(storage) # Present content with valid id self.present_valid_content = self.ensure_valid(b'yroqdtotji') self.present_valid_id = storage.id(self.present_valid_content) # Present content with invalid id self.present_invalid_content = self.ensure_invalid(b'glxddlmmzb') self.present_invalid_id = storage.id(self.present_invalid_content) # Missing content with valid id self.missing_valid_content = self.ensure_valid(b'rmzkdclkez') self.missing_valid_id = storage.id(self.missing_valid_content) # Missing content with invalid id self.missing_invalid_content = self.ensure_invalid(b'hlejfuginh') self.missing_invalid_id = storage.id(self.missing_invalid_content) # Present corrupted content with valid id self.present_corrupted_valid_content = self.ensure_valid(b'cdsjwnpaij') self.true_present_corrupted_valid_content = self.ensure_valid( b'mgsdpawcrr') self.present_corrupted_valid_id = storage.id( self.true_present_corrupted_valid_content) # Present corrupted content with invalid id self.present_corrupted_invalid_content = self.ensure_invalid( b'pspjljnrco') self.true_present_corrupted_invalid_content = self.ensure_invalid( b'rjocbnnbso') self.present_corrupted_invalid_id = storage.id( self.true_present_corrupted_invalid_content) # Missing (potentially) corrupted content with valid id self.missing_corrupted_valid_content = self.ensure_valid( b'zxkokfgtou') self.true_missing_corrupted_valid_content = self.ensure_valid( b'royoncooqa') self.missing_corrupted_valid_id = storage.id( self.true_missing_corrupted_valid_content) # Missing (potentially) corrupted content with invalid id self.missing_corrupted_invalid_content = self.ensure_invalid( b'hxaxnrmnyk') 
self.true_missing_corrupted_invalid_content = self.ensure_invalid( b'qhbolyuifr') self.missing_corrupted_invalid_id = storage.id( self.true_missing_corrupted_invalid_content) # Add the content that are supposed to be present storage.add(self.present_valid_content) storage.add(self.present_invalid_content) storage.add(self.present_corrupted_valid_content, obj_id=self.present_corrupted_valid_id) storage.add(self.present_corrupted_invalid_content, obj_id=self.present_corrupted_invalid_id) def filter_storage(self, storage): raise NotImplementedError( 'Id_filter test class must have a filter_storage method') def ensure_valid(self, content=None): if content is None: content = get_random_content() while not self.storage.is_valid(self.base_storage.id(content)): content = get_random_content() return content def ensure_invalid(self, content=None): if content is None: content = get_random_content() while self.storage.is_valid(self.base_storage.id(content)): content = get_random_content() return content @istest def contains(self): # Both contents are present, but the invalid one should be ignored. self.assertTrue(self.present_valid_id in self.storage) self.assertFalse(self.present_invalid_id in self.storage) self.assertFalse(self.missing_valid_id in self.storage) self.assertFalse(self.missing_invalid_id in self.storage) self.assertTrue(self.present_corrupted_valid_id in self.storage) self.assertFalse(self.present_corrupted_invalid_id in self.storage) self.assertFalse(self.missing_corrupted_valid_id in self.storage) self.assertFalse(self.missing_corrupted_invalid_id in self.storage) @istest def iter(self): self.assertIn(self.present_valid_id, iter(self.storage)) self.assertNotIn(self.present_invalid_id, iter(self.storage)) self.assertNotIn(self.missing_valid_id, iter(self.storage)) self.assertNotIn(self.missing_invalid_id, iter(self.storage)) self.assertIn(self.present_corrupted_valid_id, iter(self.storage)) self.assertNotIn(self.present_corrupted_invalid_id, iter(self.storage)) self.assertNotIn(self.missing_corrupted_valid_id, iter(self.storage)) self.assertNotIn(self.missing_corrupted_invalid_id, iter(self.storage)) @istest def len(self): # Four contents are present, but only two should be valid. 
self.assertEqual(2, len(self.storage)) @istest def get(self): self.assertEqual(self.present_valid_content, self.storage.get(self.present_valid_id)) with self.assertRaises(ObjNotFoundError): self.storage.get(self.present_invalid_id) with self.assertRaises(ObjNotFoundError): self.storage.get(self.missing_valid_id) with self.assertRaises(ObjNotFoundError): self.storage.get(self.missing_invalid_id) self.assertEqual(self.present_corrupted_valid_content, self.storage.get(self.present_corrupted_valid_id)) with self.assertRaises(ObjNotFoundError): self.storage.get(self.present_corrupted_invalid_id) with self.assertRaises(ObjNotFoundError): self.storage.get(self.missing_corrupted_valid_id) with self.assertRaises(ObjNotFoundError): self.storage.get(self.missing_corrupted_invalid_id) @istest def check(self): self.storage.check(self.present_valid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(self.present_invalid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(self.missing_valid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(self.missing_invalid_id) with self.assertRaises(Error): self.storage.check(self.present_corrupted_valid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(self.present_corrupted_invalid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(self.missing_corrupted_valid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(self.missing_corrupted_invalid_id) @istest def get_random(self): self.assertEqual(0, len(list(self.storage.get_random(0)))) random_content = list(self.storage.get_random(1000)) self.assertIn(self.present_valid_id, random_content) self.assertNotIn(self.present_invalid_id, random_content) self.assertNotIn(self.missing_valid_id, random_content) self.assertNotIn(self.missing_invalid_id, random_content) self.assertIn(self.present_corrupted_valid_id, random_content) self.assertNotIn(self.present_corrupted_invalid_id, random_content) self.assertNotIn(self.missing_corrupted_valid_id, random_content) self.assertNotIn(self.missing_corrupted_invalid_id, random_content) @istest def add(self): # Add valid and invalid contents to the storage and check their # presence with the unfiltered storage. 
valid_content = self.ensure_valid(b'ulepsrjbgt') valid_id = self.base_storage.id(valid_content) invalid_content = self.ensure_invalid(b'znvghkjked') invalid_id = self.base_storage.id(invalid_content) self.storage.add(valid_content) self.storage.add(invalid_content) self.assertTrue(valid_id in self.base_storage) self.assertFalse(invalid_id in self.base_storage) @istest def restore(self): # Add corrupted content to the storage and the try to restore it valid_content = self.ensure_valid(b'ulepsrjbgt') valid_id = self.base_storage.id(valid_content) corrupted_content = self.ensure_valid(b'ltjkjsloyb') corrupted_id = self.base_storage.id(corrupted_content) self.storage.add(corrupted_content, obj_id=valid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(corrupted_id) with self.assertRaises(Error): self.storage.check(valid_id) self.storage.restore(valid_content) self.storage.check(valid_id) @attr('!db') class TestPrefixFilter(MixinTestIdFilter, unittest.TestCase): def setUp(self): self.prefix = b'71' super().setUp() def ensure_valid(self, content): obj_id = hashutil.hashdata(content)['sha1'] hex_obj_id = hashutil.hash_to_hex(obj_id) self.assertTrue(hex_obj_id.startswith(self.prefix)) return content def ensure_invalid(self, content): obj_id = hashutil.hashdata(content)['sha1'] hex_obj_id = hashutil.hash_to_hex(obj_id) self.assertFalse(hex_obj_id.startswith(self.prefix)) return content def filter_storage(self, storage): return add_filter(storage, id_prefix(self.prefix)) @attr('!db') class TestRegexFilter(MixinTestIdFilter, unittest.TestCase): def setUp(self): self.regex = r'[a-f][0-9].*' super().setUp() def filter_storage(self, storage): return add_filter(storage, id_regex(self.regex)) diff --git a/swh/objstorage/tests/test_objstorage.py b/swh/objstorage/tests/test_objstorage.py new file mode 100644 index 0000000..2579871 --- /dev/null +++ b/swh/objstorage/tests/test_objstorage.py @@ -0,0 +1,17 @@ +# Copyright (C) 2015-2016 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest +import tempfile + +from swh.objstorage import PathSlicingObjStorage + +from swh.objstorage.tests.objstorage_testing import ObjStorageTestFixture + + +class TestObjStorage(ObjStorageTestFixture, unittest.TestCase): + + def setUp(self): + self.storage = PathSlicingObjStorage(tempfile.mkdtemp(), '0:2/0:5') diff --git a/swh/objstorage/tests/test_objstorage_api.py b/swh/objstorage/tests/test_objstorage_api.py index 4e43f18..4fef81b 100644 --- a/swh/objstorage/tests/test_objstorage_api.py +++ b/swh/objstorage/tests/test_objstorage_api.py @@ -1,88 +1,88 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest from nose.tools import istest from nose.plugins.attrib import attr from swh.core import hashutil -from swh.storage.exc import ObjNotFoundError, Error -from swh.storage.tests.server_testing import ServerTestFixture -from swh.storage.objstorage.api.client import RemoteObjStorage -from swh.storage.objstorage.api.server import app +from swh.objstorage.exc import ObjNotFoundError, Error +from swh.objstorage.tests.server_testing import ServerTestFixture +from swh.objstorage.api.client import RemoteObjStorage 
+from swh.objstorage.api.server import app @attr('db') class TestRemoteObjStorage(ServerTestFixture, unittest.TestCase): """ Test the remote archive API. """ def setUp(self): self.config = {'storage_base': tempfile.mkdtemp(), 'storage_slicing': '0:1/0:5'} self.app = app super().setUp() self.objstorage = RemoteObjStorage(self.url()) def tearDown(self): super().tearDown() @istest def content_add(self): content = bytes('Test content', 'utf8') id = self.objstorage.content_add(content) self.assertEquals(self.objstorage.content_get(id), content) @istest def content_get_present(self): content = bytes('content_get_present', 'utf8') content_hash = hashutil.hashdata(content) id = self.objstorage.content_add(content) self.assertEquals(content_hash['sha1'], id) @istest def content_get_missing(self): content = bytes('content_get_missing', 'utf8') content_hash = hashutil.hashdata(content) with self.assertRaises(ObjNotFoundError): self.objstorage.content_get(content_hash['sha1']) @istest def content_get_random(self): ids = [] for i in range(100): content = bytes('content_get_present', 'utf8') id = self.objstorage.content_add(content) ids.append(id) for id in self.objstorage.content_get_random(50): self.assertIn(id, ids) @istest def content_check_invalid(self): content = bytes('content_check_invalid', 'utf8') invalid_id = hashutil.hashdata(b'invalid content')['sha1'] # Add the content with an invalid id. self.objstorage.content_add(content, invalid_id) # Then check it and expect an error. with self.assertRaises(Error): self.objstorage.content_check(invalid_id) @istest def content_check_valid(self): content = bytes('content_check_valid', 'utf8') id = self.objstorage.content_add(content) try: self.objstorage.content_check(id) except: self.fail('Integrity check failed') @istest def content_check_missing(self): content = bytes('content_check_valid', 'utf8') content_hash = hashutil.hashdata(content) with self.assertRaises(ObjNotFoundError): self.objstorage.content_check(content_hash['sha1']) diff --git a/swh/objstorage/tests/test_objstorage_multiplexer.py b/swh/objstorage/tests/test_objstorage_multiplexer.py index bd4bd07..603acf4 100644 --- a/swh/objstorage/tests/test_objstorage_multiplexer.py +++ b/swh/objstorage/tests/test_objstorage_multiplexer.py @@ -1,78 +1,78 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest from nose.tools import istest -from swh.storage.objstorage import PathSlicingObjStorage -from swh.storage.objstorage.multiplexer import MultiplexerObjStorage -from swh.storage.objstorage.multiplexer.filter import add_filter, read_only +from swh.objstorage import PathSlicingObjStorage +from swh.objstorage.multiplexer import MultiplexerObjStorage +from swh.objstorage.multiplexer.filter import add_filter, read_only from objstorage_testing import ObjStorageTestFixture class TestMultiplexerObjStorage(ObjStorageTestFixture, unittest.TestCase): def setUp(self): super().setUp() self.storage_v1 = PathSlicingObjStorage(tempfile.mkdtemp(), '0:2/2:4') self.storage_v2 = PathSlicingObjStorage(tempfile.mkdtemp(), '0:1/0:5') self.r_storage = add_filter(self.storage_v1, read_only()) self.w_storage = self.storage_v2 self.storage = MultiplexerObjStorage([self.r_storage, self.w_storage]) @istest def contains(self): content_p, obj_id_p = self.hash_content(b'contains_present') content_m, 
obj_id_m = self.hash_content(b'contains_missing') self.storage.add(content_p, obj_id=obj_id_p) self.assertIn(obj_id_p, self.storage) self.assertNotIn(obj_id_m, self.storage) @istest def iter(self): content, obj_id = self.hash_content(b'iter') self.assertEqual(list(iter(self.storage)), []) self.storage.add(content, obj_id=obj_id) self.assertEqual(list(iter(self.storage)), [obj_id]) @istest def len(self): content, obj_id = self.hash_content(b'len') self.assertEqual(len(self.storage), 0) self.storage.add(content, obj_id=obj_id) self.assertEqual(len(self.storage), 1) @istest def len_multiple(self): content, obj_id = self.hash_content(b'len_multiple') # Add a content to the read-only storage self.storage_v1.add(content) self.assertEqual(len(self.storage), 1) # By adding the same content to the global storage, it should be # Replicated. # len() behavior is to indicates the number of files, not unique # contents. self.storage.add(content) self.assertEqual(len(self.storage), 2) @istest def get_random_contents(self): content, obj_id = self.hash_content(b'get_random_content') self.storage.add(content) random_contents = list(self.storage.get_random(1)) self.assertEqual(1, len(random_contents)) self.assertIn(obj_id, random_contents) @istest def access_readonly(self): # Add a content to the readonly storage content, obj_id = self.hash_content(b'content in read-only') self.storage_v1.add(content) # Try to retrieve it on the main storage self.assertIn(obj_id, self.storage) diff --git a/swh/objstorage/tests/test_objstorage_pathslicing.py b/swh/objstorage/tests/test_objstorage_pathslicing.py index aedd63e..59b7de5 100644 --- a/swh/objstorage/tests/test_objstorage_pathslicing.py +++ b/swh/objstorage/tests/test_objstorage_pathslicing.py @@ -1,76 +1,76 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest from nose.tools import istest from swh.core import hashutil -from swh.storage import exc -from swh.storage.objstorage import PathSlicingObjStorage +from swh.objstorage import exc +from swh.objstorage import PathSlicingObjStorage from objstorage_testing import ObjStorageTestFixture class TestpathSlicingObjStorage(ObjStorageTestFixture, unittest.TestCase): def setUp(self): super().setUp() self.slicing = '0:2/2:4/4:6' self.tmpdir = tempfile.mkdtemp() self.storage = PathSlicingObjStorage(self.tmpdir, self.slicing) def content_path(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return self.storage._obj_path(hex_obj_id) @istest def contains(self): content_p, obj_id_p = self.hash_content(b'contains_present') content_m, obj_id_m = self.hash_content(b'contains_missing') self.storage.add(content_p, obj_id=obj_id_p) self.assertIn(obj_id_p, self.storage) self.assertNotIn(obj_id_m, self.storage) @istest def iter(self): content, obj_id = self.hash_content(b'iter') self.assertEqual(list(iter(self.storage)), []) self.storage.add(content, obj_id=obj_id) self.assertEqual(list(iter(self.storage)), [obj_id]) @istest def len(self): content, obj_id = self.hash_content(b'check_not_gzip') self.assertEqual(len(self.storage), 0) self.storage.add(content, obj_id=obj_id) self.assertEqual(len(self.storage), 1) @istest def check_not_gzip(self): content, obj_id = self.hash_content(b'check_not_gzip') self.storage.add(content, obj_id=obj_id) with open(self.content_path(obj_id), 'ab') as f: # Add garbage. 
             f.write(b'garbage')
         with self.assertRaises(exc.Error):
             self.storage.check(obj_id)

     @istest
     def check_id_mismatch(self):
         content, obj_id = self.hash_content(b'check_id_mismatch')
         self.storage.add(content, obj_id=obj_id)
         with open(self.content_path(obj_id), 'wb') as f:
             f.write(b'unexpected content')
         with self.assertRaises(exc.Error):
             self.storage.check(obj_id)

     @istest
     def get_random_contents(self):
         content, obj_id = self.hash_content(b'get_random_content')
         self.storage.add(content, obj_id=obj_id)
         random_contents = list(self.storage.get_random(1))
         self.assertEqual(1, len(random_contents))
         self.assertIn(obj_id, random_contents)
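
For reference, a minimal sketch (not part of this patch) of how a slicing string such as '0:2/2:4/4:6' maps an object id to its on-disk path, following the behaviour described in the PathSlicingObjStorage docstring; the sliced_path helper and the '/srv/objects' root are illustrative only, not names from the code base.

import os


def sliced_path(root, hex_obj_id, slicing):
    # Parse '0:2/2:4/4:6' into slice objects, as PathSlicingObjStorage does,
    # then join the resulting directory levels with the full hex id.
    bounds = [slice(*map(int, part.split(':')))
              for part in slicing.split('/') if part]
    return os.path.join(root, *(hex_obj_id[b] for b in bounds), hex_obj_id)


hex_id = '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
print(sliced_path('/srv/objects', hex_id, '0:2/2:4/4:6'))
# -> /srv/objects/34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
print(sliced_path('/srv/objects', hex_id, '0:1/0:5'))
# -> /srv/objects/3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689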