diff --git a/swh/storage/archiver/copier.py b/swh/storage/archiver/copier.py --- a/swh/storage/archiver/copier.py +++ b/swh/storage/archiver/copier.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information from swh.core import hashutil -from ..objstorage.api.client import RemoteObjStorage +from swh.objstorage.api.client import RemoteObjStorage class ArchiverCopier(): diff --git a/swh/storage/checker/__init__.py b/swh/storage/checker/__init__.py deleted file mode 100644 diff --git a/swh/storage/checker/checker.py b/swh/storage/checker/checker.py deleted file mode 100644 --- a/swh/storage/checker/checker.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import click -import logging - -from swh.core import config, hashutil -from .. import get_storage -from ..objstorage import PathSlicingObjStorage -from ..exc import ObjNotFoundError, Error - -DEFAULT_CONFIG = { - 'storage_path': ('str', '/srv/softwareheritage/objects'), - 'storage_depth': ('int', 3), - 'backup_url': ('str', 'http://uffizi:5002/'), - - 'batch_size': ('int', 1000), -} - - -class ContentChecker(): - """ Content integrity checker that will check local objstorage content. - - The checker will check the data of an object storage in order to verify - that no file have been corrupted. - - Attributes: - config: dictionary that contains this - checker configuration - objstorage (ObjStorage): Local object storage that will be checked. - master_storage (RemoteStorage): A distant storage that will be used to - restore corrupted content. - """ - - def __init__(self, config, root, slicing, backup_urls): - """ Create a checker that ensure the objstorage have no corrupted file. - - Args: - config (dict): Dictionary that contains the following keys : - batch_size: Number of content that should be tested each - time the content checker runs. - root (string): Path to the objstorage directory - depth (int): Depth of the object storage. - backup_urls: List of url that can be contacted in order to - get a content. - """ - self.config = config - self.objstorage = PathSlicingObjStorage(root, slicing) - self.backup_storages = [get_storage('remote_storage', [backup_url]) - for backup_url in backup_urls] - - def run(self): - """ Start the check routine - """ - corrupted_contents = [] - batch_size = self.config['batch_size'] - - for content_id in self.get_content_to_check(batch_size): - if not self.check_content(content_id): - corrupted_contents.append(content_id) - logging.error('The content', content_id, 'have been corrupted') - - self.repair_contents(corrupted_contents) - - def run_as_daemon(self): - """ Start the check routine and perform it forever. - - Use this method to run the checker when it's done as a daemon that - will iterate over the content forever in background. - """ - while True: - try: - self.run() - except Exception as e: - logging.error('An error occured while verifing the content: %s' - % e) - - def get_content_to_check(self, batch_size): - """ Get the content that should be verified. - - Returns: - An iterable of the content's id that need to be checked. - """ - contents = self.objstorage.get_random_contents(batch_size) - yield from contents - - def check_content(self, content_id): - """ Check the validity of the given content. - - Returns: - True if the content was valid, false if it was corrupted. - """ - try: - self.objstorage.check(content_id) - except (ObjNotFoundError, Error) as e: - logging.warning(e) - return False - else: - return True - - def repair_contents(self, content_ids): - """ Try to restore the given contents. - - Ask the backup storages for the contents that are corrupted on - the local object storage. - If the first storage does not contain the missing contents, send - a request to the second one with only the content that couldn't be - retrieved, and so on until there is no remaining content or servers. - - If a content couldn't be retrieved on all the servers, then log it as - an error. - """ - contents_to_get = set(content_ids) - # Iterates over the backup storages. - for backup_storage in self.backup_storages: - # Try to get all the contents that still need to be retrieved. - contents = backup_storage.content_get(list(contents_to_get)) - for content in contents: - if content: - hash = content['sha1'] - data = content['data'] - # When a content is retrieved, remove it from the set - # of needed contents. - contents_to_get.discard(hash) - self.objstorage.restore(data) - - # Contents still in contents_to_get couldn't be retrieved. - if contents_to_get: - logging.error( - "Some corrupted contents could not be retrieved : %s" - % [hashutil.hash_to_hex(id) for id in contents_to_get] - ) - - -@click.command() -@click.argument('config-path', required=1) -@click.option('--storage-path', default=DEFAULT_CONFIG['storage_path'][1], - help='Path to the storage to verify') -@click.option('--depth', default=DEFAULT_CONFIG['storage_depth'][1], - type=click.INT, help='Depth of the object storage') -@click.option('--backup-url', default=DEFAULT_CONFIG['backup_url'][1], - help='Url of a remote storage to retrieve corrupted content') -@click.option('--daemon/--nodaemon', default=True, - help='Indicates if the checker should run forever ' - 'or on a single batch of content') -def launch(config_path, storage_path, depth, backup_url, is_daemon): - # The configuration have following priority : - # command line > file config > default config - cl_config = { - 'storage_path': storage_path, - 'storage_depth': depth, - 'backup_url': backup_url - } - conf = config.read(config_path, DEFAULT_CONFIG) - conf.update(cl_config) - # Create the checker and run - checker = ContentChecker( - {'batch_size': conf['batch_size']}, - conf['storage_path'], - conf['storage_depth'], - map(lambda x: x.strip(), conf['backup_url'].split(',')) - ) - if is_daemon: - checker.run_as_daemon() - else: - checker.run() - -if __name__ == '__main__': - launch() diff --git a/swh/storage/exc.py b/swh/storage/exc.py --- a/swh/storage/exc.py +++ b/swh/storage/exc.py @@ -4,18 +4,6 @@ # See top-level LICENSE file for more information -class Error(Exception): - - def __str__(self): - return 'storage error on object: %s' % self.args - - -class ObjNotFoundError(Error): - - def __str__(self): - return 'object not found: %s' % self.args - - class StorageDBError(Exception): """Specific storage db error (connection, erroneous queries, etc...) diff --git a/swh/storage/objstorage/__init__.py b/swh/storage/objstorage/__init__.py deleted file mode 100644 --- a/swh/storage/objstorage/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .objstorage import ObjStorage -from .objstorage_pathslicing import PathSlicingObjStorage - -__all__ = [ObjStorage, PathSlicingObjStorage] diff --git a/swh/storage/objstorage/api/__init__.py b/swh/storage/objstorage/api/__init__.py deleted file mode 100644 diff --git a/swh/storage/objstorage/api/client.py b/swh/storage/objstorage/api/client.py deleted file mode 100644 --- a/swh/storage/objstorage/api/client.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import pickle - -import requests - -from requests.exceptions import ConnectionError -from ...exc import StorageAPIError -from ...api.common import (decode_response, - encode_data_client as encode_data) - - -class RemoteObjStorage(): - """ Proxy to a remote object storage. - - This class allows to connect to an object storage server via - http protocol. - - Attributes: - base_url (string): The url of the server to connect. Must end - with a '/' - session: The session to send requests. - """ - def __init__(self, base_url): - self.base_url = base_url - self.session = requests.Session() - - def url(self, endpoint): - return '%s%s' % (self.base_url, endpoint) - - def post(self, endpoint, data): - try: - response = self.session.post( - self.url(endpoint), - data=encode_data(data), - headers={'content-type': 'application/x-msgpack'}, - ) - except ConnectionError as e: - print(str(e)) - raise StorageAPIError(e) - - # XXX: this breaks language-independence and should be - # replaced by proper unserialization - if response.status_code == 400: - raise pickle.loads(decode_response(response)) - - return decode_response(response) - - def content_add(self, bytes, obj_id=None): - """ Add a new object to the object storage. - - Args: - bytes: content of the object to be added to the storage. - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - - """ - return self.post('content/add', {'bytes': bytes, 'obj_id': obj_id}) - - def content_get(self, obj_id): - """ Retrieve the content of a given object. - - Args: - obj_id: The id of the object. - - Returns: - The content of the requested objects as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing - """ - return self.post('content/get', {'obj_id': obj_id}) - - def content_get_random(self, batch_size): - """ Retrieve a random sample of existing content. - - Args: - batch_size: Number of content requested. - - Returns: - A list of random ids that represents existing contents. - """ - return self.post('content/get/random', {'batch_size': batch_size}) - - def content_check(self, obj_id): - """ Integrity check for a given object - - verify that the file object is in place, and that the gzipped content - matches the object id - - Args: - obj_id: The id of the object. - - Raises: - ObjNotFoundError: if the requested object is missing - Error: if the requested object is corrupt - """ - self.post('content/check', {'obj_id': obj_id}) diff --git a/swh/storage/objstorage/api/server.py b/swh/storage/objstorage/api/server.py deleted file mode 100644 --- a/swh/storage/objstorage/api/server.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import click -import logging - -from flask import Flask, g, request - -from swh.core import config -from swh.storage.objstorage import PathSlicingObjStorage -from swh.storage.api.common import (BytesRequest, decode_request, - error_handler, - encode_data_server as encode_data) - -DEFAULT_CONFIG = { - 'storage_base': ('str', '/tmp/swh-storage/objects/'), - 'storage_slicing': ('str', '0:2/2:4/4:6') -} - -app = Flask(__name__) -app.request_class = BytesRequest - - -@app.errorhandler(Exception) -def my_error_handler(exception): - return error_handler(exception, encode_data) - - -@app.before_request -def before_request(): - g.objstorage = PathSlicingObjStorage(app.config['storage_base'], - app.config['storage_slicing']) - - -@app.route('/') -def index(): - return "SWH Objstorage API server" - - -@app.route('/content') -def content(): - return str(list(g.storage)) - - -@app.route('/content/add', methods=['POST']) -def add_bytes(): - return encode_data(g.objstorage.add(**decode_request(request))) - - -@app.route('/content/get', methods=['POST']) -def get_bytes(): - return encode_data(g.objstorage.get(**decode_request(request))) - - -@app.route('/content/get/random', methods=['POST']) -def get_random_contents(): - return encode_data( - g.objstorage.get_random(**decode_request(request)) - ) - - -@app.route('/content/check', methods=['POST']) -def check(): - return encode_data(g.objstorage.check(**decode_request(request))) - - -def run_from_webserver(environ, start_response): - """Run the WSGI app from the webserver, loading the configuration. - - """ - config_path = '/etc/softwareheritage/storage/objstorage.ini' - - app.config.update(config.read(config_path, DEFAULT_CONFIG)) - - handler = logging.StreamHandler() - app.logger.addHandler(handler) - - return app(environ, start_response) - - -@click.command() -@click.argument('config-path', required=1) -@click.option('--host', default='0.0.0.0', help="Host to run the server") -@click.option('--port', default=5000, type=click.INT, - help="Binding port of the server") -@click.option('--debug/--nodebug', default=True, - help="Indicates if the server should run in debug mode") -def launch(config_path, host, port, debug): - app.config.update(config.read(config_path, DEFAULT_CONFIG)) - app.run(host, port=int(port), debug=bool(debug)) - - -if __name__ == '__main__': - launch() diff --git a/swh/storage/objstorage/multiplexer/__init__.py b/swh/storage/objstorage/multiplexer/__init__.py deleted file mode 100644 --- a/swh/storage/objstorage/multiplexer/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .multiplexer_objstorage import MultiplexerObjStorage - - -__all__ = [MultiplexerObjStorage] diff --git a/swh/storage/objstorage/multiplexer/filter/__init__.py b/swh/storage/objstorage/multiplexer/filter/__init__.py deleted file mode 100644 --- a/swh/storage/objstorage/multiplexer/filter/__init__.py +++ /dev/null @@ -1,98 +0,0 @@ -import functools - -from .read_write_filter import ReadObjStorageFilter -from .id_filter import RegexIdObjStorageFilter, PrefixIdObjStorageFilter - - -_FILTERS_CLASSES = { - 'readonly': ReadObjStorageFilter, - 'regex': RegexIdObjStorageFilter, - 'prefix': PrefixIdObjStorageFilter -} - - -_FILTERS_PRIORITY = { - 'readonly': 0, - 'prefix': 1, - 'regex': 2 -} - - -def read_only(): - return {'type': 'readonly'} - - -def id_prefix(prefix): - return {'type': 'prefix', 'prefix': prefix} - - -def id_regex(regex): - return {'type': 'regex', 'regex': regex} - - -def _filter_priority(self, filter_type): - """ Get the priority of this filter. - - Priority is a value that indicates if the operation of the - filter is time-consuming (smaller values means quick execution), - or very likely to be almost always the same value (False being small, - and True high). - - In case the filters are chained, they will be ordered in a way that - small priorities (quick execution or instantly break the chain) are - executed first. - - Default value is 1. Value 0 is recommended for storages that change - behavior only by disabling some operations (making the method return - None). - """ - return _FILTERS_PRIORITY.get(filter_type, 1) - - -def add_filter(storage, filter_conf): - """ Add a filter to the given storage. - - Args: - storage (ObjStorage): storage which will be filtered. - filter_conf (dict): configuration of an ObjStorageFilter, given as - a dictionnary that contains the keys: - - type: which represent the type of filter, one of the keys - of FILTERS - - Every arguments that this type of filter require. - - Returns: - A filtered storage that perform only the valid operations. - """ - type = filter_conf['type'] - args = {k: v for k, v in filter_conf.items() if k is not 'type'} - filter = _FILTERS_CLASSES[type](storage=storage, **args) - return filter - - -def add_filters(storage, *filter_confs): - """ Add multiple filters to the given storage. - - (See filter.add_filter) - - Args: - storage (ObjStorage): storage which will be filtered. - filter_confs (list): any number of filter conf, as a dict with: - - type: which represent the type of filter, one of the keys of - FILTERS. - - Every arguments that this type of filter require. - - Returns: - A filtered storage that fulfill the requirement of all the given - filters. - """ - # Reverse sorting in order to put the filter with biggest priority first. - filter_confs.sort(key=lambda conf: _filter_priority(conf['type']), - reverse=True) - - # Add the bigest filter to the storage, and reduce it to accumulate filters - # on top of it, until the smallest (fastest, see filter.filter_priority) is - # added. - return functools.reduce( - lambda stor, conf: add_filter(stor, conf), - [storage] + filter_confs - ) diff --git a/swh/storage/objstorage/multiplexer/filter/filter.py b/swh/storage/objstorage/multiplexer/filter/filter.py deleted file mode 100644 --- a/swh/storage/objstorage/multiplexer/filter/filter.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from ...objstorage import ObjStorage - - -class ObjStorageFilter(ObjStorage): - """ Base implementation of a filter that allow inputs on ObjStorage or not - - This class copy the API of ...objstorage in order to filter the inputs - of this class. - If the operation is allowed, return the result of this operation - applied to the destination implementation. Otherwise, just return - without any operation. - - This class is an abstract base class for a classic read/write storage. - Filters can inherit from it and only redefine some methods in order - to change behavior. - """ - - def __init__(self, storage): - self.storage = storage - - def __contains__(self, *args, **kwargs): - return self.storage.__contains__(*args, **kwargs) - - def __iter__(self): - return self.storage.__iter__() - - def __len__(self): - return self.storage.__len__() - - def add(self, *args, **kwargs): - return self.storage.add(*args, **kwargs) - - def restore(self, *args, **kwargs): - return self.storage.restore(*args, **kwargs) - - def get(self, *args, **kwargs): - return self.storage.get(*args, **kwargs) - - def check(self, *args, **kwargs): - return self.storage.check(*args, **kwargs) - - def get_random(self, *args, **kwargs): - return self.storage.get_random(*args, **kwargs) diff --git a/swh/storage/objstorage/multiplexer/filter/id_filter.py b/swh/storage/objstorage/multiplexer/filter/id_filter.py deleted file mode 100644 --- a/swh/storage/objstorage/multiplexer/filter/id_filter.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import re - -from swh.core import hashutil - -from .filter import ObjStorageFilter -from ...objstorage import ID_HASH_ALGO -from ....exc import ObjNotFoundError - - -def compute_hash(bytes): - """ Compute the hash of the given content. - """ - # Checksum is missing, compute it on the fly. - h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) - h.update(bytes) - return h.digest() - - -class IdObjStorageFilter(ObjStorageFilter): - """ Filter that only allow operations if the object id match a requirement. - - Even for read operations, check before if the id match the requirements. - This may prevent for unnecesary disk access. - """ - - def is_valid(self, obj_id): - """ Indicates if the given id is valid. - """ - raise NotImplementedError('Implementations of an IdObjStorageFilter ' - 'must have a "is_valid" method') - - def __contains__(self, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.__contains__(*args, obj_id=obj_id, **kwargs) - return False - - def __len__(self): - return sum(1 for i in [id for id in self.storage if self.is_valid(id)]) - - def __iter__(self): - yield from filter(lambda id: self.is_valid(id), iter(self.storage)) - - def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): - if obj_id is None: - obj_id = compute_hash(content) - if self.is_valid(obj_id): - return self.storage.add(content, *args, obj_id=obj_id, **kwargs) - - def restore(self, content, obj_id=None, *args, **kwargs): - if obj_id is None: - obj_id = compute_hash(content) - if self.is_valid(obj_id): - return self.storage.restore(content, *args, - obj_id=obj_id, **kwargs) - - def get(self, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.get(*args, obj_id=obj_id, **kwargs) - raise ObjNotFoundError(obj_id) - - def check(self, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.check(*args, obj_id=obj_id, **kwargs) - raise ObjNotFoundError(obj_id) - - def get_random(self, *args, **kwargs): - yield from filter(lambda id: self.is_valid(id), - self.storage.get_random(*args, **kwargs)) - - -class RegexIdObjStorageFilter(IdObjStorageFilter): - """ Filter that allow operations if the content's id as hex match a regex. - """ - - def __init__(self, storage, regex): - super().__init__(storage) - self.regex = re.compile(regex) - - def is_valid(self, obj_id): - hex_obj_id = hashutil.hash_to_hex(obj_id) - return self.regex.match(hex_obj_id) is not None - - -class PrefixIdObjStorageFilter(IdObjStorageFilter): - """ Filter that allow operations if the hexlified id have a given prefix. - """ - - def __init__(self, storage, prefix): - super().__init__(storage) - self.prefix = str(prefix) - - def is_valid(self, obj_id): - hex_obj_id = hashutil.hash_to_hex(obj_id) - return str(hex_obj_id).startswith(self.prefix) diff --git a/swh/storage/objstorage/multiplexer/filter/read_write_filter.py b/swh/storage/objstorage/multiplexer/filter/read_write_filter.py deleted file mode 100644 --- a/swh/storage/objstorage/multiplexer/filter/read_write_filter.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from .filter import ObjStorageFilter - - -class ReadObjStorageFilter(ObjStorageFilter): - """ Filter that disable write operation of the storage. - """ - - def add(self, *args, **kwargs): - return - - def restore(self, *args, **kwargs): - return diff --git a/swh/storage/objstorage/multiplexer/multiplexer_objstorage.py b/swh/storage/objstorage/multiplexer/multiplexer_objstorage.py deleted file mode 100644 --- a/swh/storage/objstorage/multiplexer/multiplexer_objstorage.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import random - -from ..objstorage import ObjStorage -from ...exc import ObjNotFoundError - - -class MultiplexerObjStorage(ObjStorage): - """ Implementation of ObjStorage that distribute between multiple storages - - The multiplexer object storage allows an input to be demultiplexed - among multiple storages that will or will not accept it by themselves - (see .filter package). - - As the ids can be differents, no pre-computed ids should be submitted. - Also, there are no guarantees that the returned ids can be used directly - into the storages that the multiplexer manage. - - - Use case examples could be: - Example 1: - storage_v1 = filter.read_only(PathSlicingObjStorage('/dir1', - '0:2/2:4/4:6')) - storage_v2 = PathSlicingObjStorage('/dir2', '0:1/0:5') - storage = MultiplexerObjStorage([storage_v1, storage_v2]) - - When using 'storage', all the new contents will only be added to the v2 - storage, while it will be retrievable from both. - - Example 2: - storage_v1 = filter.id_regex( - PathSlicingObjStorage('/dir1', '0:2/2:4/4:6'), - r'[^012].*' - ) - storage_v2 = filter.if_regex( - PathSlicingObjStorage('/dir2', '0:1/0:5'), - r'[012]/*' - ) - storage = MultiplexerObjStorage([storage_v1, storage_v2]) - - When using this storage, the contents with a sha1 starting with 0, 1 or - 2 will be redirected (read AND write) to the storage_v2, while the - others will be redirected to the storage_v1. - If a content starting with 0, 1 or 2 is present in the storage_v1, it - would be ignored anyway. - """ - - def __init__(self, storages): - self.storages = storages - - def __contains__(self, obj_id): - for storage in self.storages: - if obj_id in storage: - return True - return False - - def __iter__(self): - for storage in self.storages: - yield from storage - - def __len__(self): - """ Returns the number of files in the storage. - - Warning: Multiple files may represent the same content, so this method - does not indicate how many different contents are in the storage. - """ - return sum(map(len, self.storages)) - - def add(self, content, obj_id=None, check_presence=True): - """ Add a new object to the object storage. - - If the adding step works in all the storages that accept this content, - this is a success. Otherwise, the full adding step is an error even if - it succeed in some of the storages. - - Args: - content: content of the object to be added to the storage. - obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When - given, obj_id will be trusted to match the bytes. If missing, - obj_id will be computed on the fly. - check_presence: indicate if the presence of the content should be - verified before adding the file. - - Returns: - an id of the object into the storage. As the write-storages are - always readable as well, any id will be valid to retrieve a - content. - """ - return [storage.add(content, obj_id, check_presence) - for storage in self.storages].pop() - - def restore(self, content, obj_id=None): - """ Restore a content that have been corrupted. - - This function is identical to add_bytes but does not check if - the object id is already in the file system. - - (see "add" method) - - Args: - content: content of the object to be added to the storage - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - - Returns: - an id of the object into the storage. As the write-storages are - always readable as well, any id will be valid to retrieve a - content. - """ - return [storage.restore(content, obj_id) - for storage in self.storages].pop() - - def get(self, obj_id): - """ Retrieve the content of a given object. - - Args: - obj_id: object id. - - Returns: - the content of the requested object as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing. - """ - for storage in self.storages: - try: - return storage.get(obj_id) - except ObjNotFoundError: - continue - # If no storage contains this content, raise the error - raise ObjNotFoundError(obj_id) - - def check(self, obj_id): - """ Perform an integrity check for a given object. - - Verify that the file object is in place and that the gziped content - matches the object id. - - Args: - obj_id: object id. - - Raises: - ObjNotFoundError: if the requested object is missing. - Error: if the request object is corrupted. - """ - nb_present = 0 - for storage in self.storages: - try: - storage.check(obj_id) - except ObjNotFoundError: - continue - else: - nb_present += 1 - # If there is an Error because of a corrupted file, then let it pass. - - # Raise the ObjNotFoundError only if the content coulnd't be found in - # all the storages. - if nb_present == 0: - raise ObjNotFoundError(obj_id) - - def get_random(self, batch_size): - """ Get random ids of existing contents - - This method is used in order to get random ids to perform - content integrity verifications on random contents. - - Attributes: - batch_size (int): Number of ids that will be given - - Yields: - An iterable of ids of contents that are in the current object - storage. - """ - storages_set = [storage for storage in self.storages - if len(storage) > 0] - if len(storages_set) <= 0: - return [] - - while storages_set: - storage = random.choice(storages_set) - try: - return storage.get_random(batch_size) - except NotImplementedError: - storages_set.remove(storage) - # There is no storage that allow the get_random operation - raise NotImplementedError( - "There is no storage implementation into the multiplexer that " - "support the 'get_random' operation" - ) diff --git a/swh/storage/objstorage/objstorage.py b/swh/storage/objstorage/objstorage.py deleted file mode 100644 --- a/swh/storage/objstorage/objstorage.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -ID_HASH_ALGO = 'sha1' -ID_HASH_LENGTH = 40 # Size in bytes of the hash hexadecimal representation. - - -class ObjStorage(): - """ High-level API to manipulate the Software Heritage object storage. - - Conceptually, the object storage offers 5 methods: - - - __contains__() check if an object is present, by object id - - add() add a new object, returning an object id - - restore() same as add() but erase an already existed content - - get() retrieve the content of an object, by object id - - check() check the integrity of an object, by object id - - And some management methods: - - - get_random() get random object id of existing contents (used for the - content integrity checker). - - Each implementation of this interface can have a different behavior and - its own way to store the contents. - """ - - def __contains__(self, *args, **kwargs): - raise NotImplementedError( - "Implementations of ObjStorage must have a '__contains__' method" - ) - - def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): - """ Add a new object to the object storage. - - Args: - content: content of the object to be added to the storage. - obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When - given, obj_id will be trusted to match the bytes. If missing, - obj_id will be computed on the fly. - check_presence: indicate if the presence of the content should be - verified before adding the file. - - Returns: - the id of the object into the storage. - """ - raise NotImplementedError( - "Implementations of ObjStorage must have a 'add' method" - ) - - def restore(self, content, obj_id=None, *args, **kwargs): - """ Restore a content that have been corrupted. - - This function is identical to add_bytes but does not check if - the object id is already in the file system. - - Args: - content: content of the object to be added to the storage - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - """ - raise NotImplemented( - "Implementations of ObjStorage must have a 'restore' method" - ) - - def get(self, obj_id, *args, **kwargs): - """ Retrieve the content of a given object. - - Args: - obj_id: object id. - - Returns: - the content of the requested object as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing. - """ - raise NotImplementedError( - "Implementations of ObjStorage must have a 'get' method" - ) - - def check(self, obj_id, *args, **kwargs): - """ Perform an integrity check for a given object. - - Verify that the file object is in place and that the gziped content - matches the object id. - - Args: - obj_id: object id. - - Raises: - ObjNotFoundError: if the requested object is missing. - Error: if the request object is corrupted. - """ - raise NotImplementedError( - "Implementations of ObjStorage must have a 'check' method" - ) - - def get_random(self, batch_size, *args, **kwargs): - """ Get random ids of existing contents - - This method is used in order to get random ids to perform - content integrity verifications on random contents. - - Attributes: - batch_size (int): Number of ids that will be given - - Yields: - An iterable of ids of contents that are in the current object - storage. - """ - raise NotImplementedError( - "The current implementation of ObjStorage does not support " - "'get_random' operation" - ) diff --git a/swh/storage/objstorage/objstorage_pathslicing.py b/swh/storage/objstorage/objstorage_pathslicing.py deleted file mode 100644 --- a/swh/storage/objstorage/objstorage_pathslicing.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import gzip -import tempfile -import random - -from contextlib import contextmanager - -from swh.core import hashutil - -from .objstorage import ObjStorage, ID_HASH_ALGO, ID_HASH_LENGTH -from ..exc import ObjNotFoundError, Error - - -GZIP_BUFSIZ = 1048576 - -DIR_MODE = 0o755 -FILE_MODE = 0o644 - - -@contextmanager -def _write_obj_file(hex_obj_id, objstorage): - """ Context manager for writing object files to the object storage. - - During writing, data are written to a temporary file, which is atomically - renamed to the right file name after closing. This context manager also - takes care of (gzip) compressing the data on the fly. - - Usage sample: - with _write_obj_file(hex_obj_id, objstorage): - f.write(obj_data) - - Yields: - a file-like object open for writing bytes. - """ - # Get the final paths and create the directory if absent. - dir = objstorage._obj_dir(hex_obj_id) - if not os.path.isdir(dir): - os.makedirs(dir, DIR_MODE, exist_ok=True) - path = os.path.join(dir, hex_obj_id) - - # Create a temporary file. - (tmp, tmp_path) = tempfile.mkstemp(suffix='.tmp', prefix='hex_obj_id.', - dir=dir) - - # Open the file and yield it for writing. - tmp_f = os.fdopen(tmp, 'wb') - with gzip.GzipFile(filename=tmp_path, fileobj=tmp_f) as f: - yield f - - # Then close the temporary file and move it to the right directory. - tmp_f.close() - os.chmod(tmp_path, FILE_MODE) - os.rename(tmp_path, path) - - -@contextmanager -def _read_obj_file(hex_obj_id, objstorage): - """ Context manager for reading object file in the object storage. - - Usage sample: - with _read_obj_file(hex_obj_id, objstorage) as f: - b = f.read() - - Yields: - a file-like object open for reading bytes. - """ - path = objstorage._obj_path(hex_obj_id) - with gzip.GzipFile(path, 'rb') as f: - yield f - - -class PathSlicingObjStorage(ObjStorage): - """ Implementation of the ObjStorage API based on the hash of the content. - - On disk, an object storage is a directory tree containing files named after - their object IDs. An object ID is a checksum of its content, depending on - the value of the ID_HASH_ALGO constant (see hashutil for its meaning). - - To avoid directories that contain too many files, the object storage has a - given slicing. Each slicing correspond to a directory that is named - according to the hash of its content. - - So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689 - will be stored in the given object storages : - - - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689 - - 0:1/0:5/ : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689 - - The files in the storage are stored in gzipped compressed format. - - Attributes: - root (string): path to the root directory of the storage on the disk. - bounds: list of tuples that indicates the beginning and the end of - each subdirectory for a content. - """ - - def __init__(self, root, slicing): - """ Create an object to access a hash-slicing based object storage. - - Args: - root (string): path to the root directory of the storage on - the disk. - slicing (string): string that indicates the slicing to perform - on the hash of the content to know the path where it should - be stored. - """ - if not os.path.isdir(root): - raise ValueError( - 'PathSlicingObjStorage root "%s" is not a directory' % root - ) - - self.root = root - # Make a list of tuples where each tuple contains the beginning - # and the end of each slicing. - self.bounds = [ - slice(*map(int, sbounds.split(':'))) - for sbounds in slicing.split('/') - if sbounds - ] - - max_endchar = max(map(lambda bound: bound.stop, self.bounds)) - if ID_HASH_LENGTH < max_endchar: - raise ValueError( - 'Algorithm %s has too short hash for slicing to char %d' - % (ID_HASH_ALGO, max_endchar) - ) - - def __contains__(self, obj_id): - """ Check whether the given object is present in the storage or not. - - Returns: - True iff the object is present in the storage. - """ - hex_obj_id = hashutil.hash_to_hex(obj_id) - return os.path.exists(self._obj_path(hex_obj_id)) - - def __iter__(self): - """iterate over the object identifiers currently available in the storage - - Warning: with the current implementation of the object storage, this - method will walk the filesystem to list objects, meaning that listing - all objects will be very slow for large storages. You almost certainly - don't want to use this method in production. - - Return: - iterator over object IDs - """ - def obj_iterator(): - # XXX hackish: it does not verify that the depth of found files - # matches the slicing depth of the storage - for root, _dirs, files in os.walk(self.root): - for f in files: - yield bytes.fromhex(f) - - return obj_iterator() - - def __len__(self): - """compute the number of objects available in the storage - - Warning: this currently uses `__iter__`, its warning about bad - performances applies - - Return: - number of objects contained in the storage - - """ - return sum(1 for i in self) - - def _obj_dir(self, hex_obj_id): - """ Compute the storage directory of an object. - - See also: PathSlicingObjStorage::_obj_path - - Args: - hex_obj_id: object id as hexlified string. - - Returns: - Path to the directory that contains the required object. - """ - slices = [hex_obj_id[bound] for bound in self.bounds] - return os.path.join(self.root, *slices) - - def _obj_path(self, hex_obj_id): - """ Compute the full path to an object into the current storage. - - See also: PathSlicingObjStorage::_obj_dir - - Args: - hex_obj_id: object id as hexlified string. - - Returns: - Path to the actual object corresponding to the given id. - """ - return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id) - - def add(self, bytes, obj_id=None, check_presence=True): - """ Add a new object to the object storage. - - Args: - bytes: content of the object to be added to the storage. - obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When - given, obj_id will be trusted to match the bytes. If missing, - obj_id will be computed on the fly. - check_presence: indicate if the presence of the content should be - verified before adding the file. - - Returns: - the id of the object into the storage. - """ - if obj_id is None: - # Checksum is missing, compute it on the fly. - h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) - h.update(bytes) - obj_id = h.digest() - - if check_presence and obj_id in self: - # If the object is already present, return immediatly. - return obj_id - - hex_obj_id = hashutil.hash_to_hex(obj_id) - with _write_obj_file(hex_obj_id, self) as f: - f.write(bytes) - - return obj_id - - def restore(self, bytes, obj_id=None): - """ Restore a content that have been corrupted. - - This function is identical to add_bytes but does not check if - the object id is already in the file system. - - Args: - bytes: content of the object to be added to the storage - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - """ - return self.add(bytes, obj_id, check_presence=False) - - def get(self, obj_id): - """ Retrieve the content of a given object. - - Args: - obj_id: object id. - - Returns: - the content of the requested object as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing. - """ - if obj_id not in self: - raise ObjNotFoundError(obj_id) - - # Open the file and return its content as bytes - hex_obj_id = hashutil.hash_to_hex(obj_id) - with _read_obj_file(hex_obj_id, self) as f: - return f.read() - - def check(self, obj_id): - """ Perform an integrity check for a given object. - - Verify that the file object is in place and that the gziped content - matches the object id. - - Args: - obj_id: object id. - - Raises: - ObjNotFoundError: if the requested object is missing. - Error: if the request object is corrupted. - """ - if obj_id not in self: - raise ObjNotFoundError(obj_id) - - hex_obj_id = hashutil.hash_to_hex(obj_id) - - try: - with gzip.open(self._obj_path(hex_obj_id)) as f: - length = None - if ID_HASH_ALGO.endswith('_git'): - # if the hashing algorithm is git-like, we need to know the - # content size to hash on the fly. Do a first pass here to - # compute the size - length = 0 - while True: - chunk = f.read(GZIP_BUFSIZ) - length += len(chunk) - if not chunk: - break - f.rewind() - - checksums = hashutil._hash_file_obj(f, length, - algorithms=[ID_HASH_ALGO]) - actual_obj_id = checksums[ID_HASH_ALGO] - if obj_id != actual_obj_id: - raise Error( - 'Corrupt object %s should have id %s' - % (hashutil.hash_to_hex(obj_id), - hashutil.hash_to_hex(actual_obj_id)) - ) - except (OSError, IOError): - # IOError is for compatibility with older python versions - raise Error('Corrupt object %s is not a gzip file' % obj_id) - - def get_random(self, batch_size): - """ Get random ids of existing contents - - This method is used in order to get random ids to perform - content integrity verifications on random contents. - - Attributes: - batch_size (int): Number of ids that will be given - - Yields: - An iterable of ids of contents that are in the current object - storage. - """ - def get_random_content(self, batch_size): - """ Get a batch of content inside a single directory. - - Returns: - a tuple (batch size, batch). - """ - dirs = [] - for level in range(len(self.bounds)): - path = os.path.join(self.root, *dirs) - dir_list = next(os.walk(path))[1] - if 'tmp' in dir_list: - dir_list.remove('tmp') - dirs.append(random.choice(dir_list)) - - path = os.path.join(self.root, *dirs) - content_list = next(os.walk(path))[2] - length = min(batch_size, len(content_list)) - return length, map(hashutil.hex_to_hash, - random.sample(content_list, length)) - - while batch_size: - length, it = get_random_content(self, batch_size) - batch_size = batch_size - length - yield from it diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -13,10 +13,11 @@ from . import converters from .db import Db -from .objstorage import PathSlicingObjStorage -from .exc import ObjNotFoundError, StorageDBError +from .exc import StorageDBError from swh.core.hashutil import ALGORITHMS +from swh.objstorage import PathSlicingObjStorage +from swh.objstorage.exc import ObjNotFoundError # Max block size of contents to return BULK_BLOCK_CONTENT_LEN_MAX = 10000 diff --git a/swh/storage/tests/objstorage_testing.py b/swh/storage/tests/objstorage_testing.py deleted file mode 100644 --- a/swh/storage/tests/objstorage_testing.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from nose.tools import istest - -from swh.core import hashutil -from swh.storage import exc - - -class ObjStorageTestFixture(): - - def setUp(self): - super().setUp() - - def hash_content(self, content): - obj_id = hashutil.hashdata(content)['sha1'] - return content, obj_id - - def assertContentMatch(self, obj_id, expected_content): - content = self.storage.get(obj_id) - self.assertEqual(content, expected_content) - - @istest - def add_get_w_id(self): - content, obj_id = self.hash_content(b'add_get_w_id') - r = self.storage.add(content, obj_id=obj_id) - self.assertEqual(obj_id, r) - self.assertContentMatch(obj_id, content) - - @istest - def add_get_wo_id(self): - content, obj_id = self.hash_content(b'add_get_wo_id') - r = self.storage.add(content) - self.assertEqual(obj_id, r) - self.assertContentMatch(obj_id, content) - - @istest - def restore_content(self): - valid_content, valid_obj_id = self.hash_content(b'restore_content') - invalid_content = b'unexpected content' - id_adding = self.storage.add(invalid_content, valid_obj_id) - id_restore = self.storage.restore(valid_content) - # Adding a false content then restore it to the right one and - # then perform a verification should result in a successful check. - self.assertEqual(id_adding, valid_obj_id) - self.assertEqual(id_restore, valid_obj_id) - self.assertContentMatch(valid_obj_id, valid_content) - - @istest - def get_missing(self): - content, obj_id = self.hash_content(b'get_missing') - with self.assertRaises(exc.Error): - self.storage.get(obj_id) - - @istest - def check_missing(self): - content, obj_id = self.hash_content(b'check_missing') - with self.assertRaises(exc.Error): - self.storage.check(obj_id) - - @istest - def check_present(self): - content, obj_id = self.hash_content(b'check_missing') - self.storage.add(content) - try: - self.storage.check(obj_id) - except: - self.fail('Integrity check failed') diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py --- a/swh/storage/tests/test_archiver.py +++ b/swh/storage/tests/test_archiver.py @@ -16,10 +16,10 @@ from server_testing import ServerTestFixture from swh.storage import Storage -from swh.storage.exc import ObjNotFoundError from swh.storage.archiver import ArchiverDirector, ArchiverWorker -from swh.storage.objstorage.api.client import RemoteObjStorage -from swh.storage.objstorage.api.server import app +from swh.objstorage.exc import ObjNotFoundError +from swh.objstorage.api.client import RemoteObjStorage +from swh.objstorage.api.server import app TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DATA_DIR = os.path.join(TEST_DIR, '../../../../swh-storage-testdata') diff --git a/swh/storage/tests/test_checker.py b/swh/storage/tests/test_checker.py deleted file mode 100644 --- a/swh/storage/tests/test_checker.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import gzip -import tempfile -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.core import hashutil -from swh.storage.checker.checker import ContentChecker - - -class MockBackupStorage(): - - def __init__(self): - self.values = {} - - def content_add(self, id, value): - self.values[id] = value - - def content_get(self, ids): - for id in ids: - try: - data = self.values[id] - except KeyError: - yield None - continue - - yield {'sha1': id, 'data': data} - - -@attr('fs') -class TestChecker(unittest.TestCase): - """ Test the content integrity checker - """ - - def setUp(self): - super().setUp() - # Connect to an objstorage - config = {'batch_size': 10} - path = tempfile.mkdtemp() - slicing = '0:2/2:4/4:6' - self.checker = ContentChecker(config, path, slicing, 'http://None') - self.checker.backup_storages = [MockBackupStorage(), - MockBackupStorage()] - - def corrupt_content(self, id): - """ Make the given content invalid. - """ - hex_id = hashutil.hash_to_hex(id) - file_path = self.checker.objstorage._obj_path(hex_id) - with gzip.open(file_path, 'wb') as f: - f.write(b'Unexpected content') - - @istest - def check_valid_content(self): - # Check that a valid content is valid. - content = b'check_valid_content' - id = self.checker.objstorage.add(content) - self.assertTrue(self.checker.check_content(id)) - - @istest - def check_invalid_content(self): - # Check that an invalid content is noticed. - content = b'check_invalid_content' - id = self.checker.objstorage.add(content) - self.corrupt_content(id) - self.assertFalse(self.checker.check_content(id)) - - @istest - def repair_content_present_first(self): - # Try to repair a content that is in the backup storage. - content = b'repair_content_present_first' - id = self.checker.objstorage.add(content) - # Add a content to the mock - self.checker.backup_storages[0].content_add(id, content) - # Corrupt and repair it. - self.corrupt_content(id) - self.assertFalse(self.checker.check_content(id)) - self.checker.repair_contents([id]) - self.assertTrue(self.checker.check_content(id)) - - @istest - def repair_content_present_second(self): - # Try to repair a content that is not in the first backup storage. - content = b'repair_content_present_second' - id = self.checker.objstorage.add(content) - # Add a content to the mock - self.checker.backup_storages[1].content_add(id, content) - # Corrupt and repair it. - self.corrupt_content(id) - self.assertFalse(self.checker.check_content(id)) - self.checker.repair_contents([id]) - self.assertTrue(self.checker.check_content(id)) - - @istest - def repair_content_present_distributed(self): - # Try to repair two contents that are in separate backup storages. - content1 = b'repair_content_present_distributed_2' - content2 = b'repair_content_present_distributed_1' - id1 = self.checker.objstorage.add(content1) - id2 = self.checker.objstorage.add(content2) - # Add content to the mock. - self.checker.backup_storages[0].content_add(id1, content1) - self.checker.backup_storages[0].content_add(id2, content2) - # Corrupt and repair it - self.corrupt_content(id1) - self.corrupt_content(id2) - self.assertFalse(self.checker.check_content(id1)) - self.assertFalse(self.checker.check_content(id2)) - self.checker.repair_contents([id1, id2]) - self.assertTrue(self.checker.check_content(id1)) - self.assertTrue(self.checker.check_content(id2)) - - @istest - def repair_content_missing(self): - # Try to repair a content that is NOT in the backup storage. - content = b'repair_content_present' - id = self.checker.objstorage.add(content) - # Corrupt and repair it. - self.corrupt_content(id) - self.assertFalse(self.checker.check_content(id)) - self.checker.repair_contents([id]) - self.assertFalse(self.checker.check_content(id)) diff --git a/swh/storage/tests/test_multiplexer_filter.py b/swh/storage/tests/test_multiplexer_filter.py deleted file mode 100644 --- a/swh/storage/tests/test_multiplexer_filter.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import random -import unittest - -from string import ascii_lowercase - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.core import hashutil -from swh.storage.exc import ObjNotFoundError, Error -from swh.storage.objstorage import ObjStorage -from swh.storage.objstorage.multiplexer.filter import (add_filter, read_only, - id_prefix, id_regex) - - -def get_random_content(): - return bytes(''.join(random.sample(ascii_lowercase, 10)), 'utf8') - - -class MockObjStorage(ObjStorage): - """ Mock an object storage for testing the filters. - """ - def __init__(self): - self.objects = {} - - def __contains__(self, obj_id): - return obj_id in self.objects - - def __len__(self): - return len(self.objects) - - def __iter__(self): - return iter(self.objects) - - def id(self, content): - # Id is the content itself for easily choose the id of - # a content for filtering. - return hashutil.hashdata(content)['sha1'] - - def add(self, content, obj_id=None, check_presence=True): - if obj_id is None: - obj_id = self.id(content) - - if check_presence and obj_id in self.objects: - return obj_id - - self.objects[obj_id] = content - return obj_id - - def restore(self, content, obj_id=None): - return self.add(content, obj_id, check_presence=False) - - def get(self, obj_id): - if obj_id not in self: - raise ObjNotFoundError(obj_id) - return self.objects[obj_id] - - def check(self, obj_id): - if obj_id not in self: - raise ObjNotFoundError(obj_id) - if obj_id != self.id(self.objects[obj_id]): - raise Error(obj_id) - - def get_random(self, batch_size): - batch_size = min(len(self), batch_size) - return random.sample(list(self.objects), batch_size) - - -@attr('!db') -class MixinTestReadFilter(unittest.TestCase): - # Read only filter should not allow writing - - def setUp(self): - super().setUp() - storage = MockObjStorage() - - self.valid_content = b'pre-existing content' - self.invalid_content = b'invalid_content' - self.true_invalid_content = b'Anything that is not correct' - self.absent_content = b'non-existent content' - # Create a valid content. - self.valid_id = storage.add(self.valid_content) - # Create an invalid id and add a content with it. - self.invalid_id = storage.id(self.true_invalid_content) - storage.add(self.invalid_content, obj_id=self.invalid_id) - # Compute an id for a non-existing content. - self.absent_id = storage.id(self.absent_content) - - self.storage = add_filter(storage, read_only()) - - @istest - def can_contains(self): - self.assertTrue(self.valid_id in self.storage) - self.assertTrue(self.invalid_id in self.storage) - self.assertFalse(self.absent_id in self.storage) - - @istest - def can_iter(self): - self.assertIn(self.valid_id, iter(self.storage)) - self.assertIn(self.invalid_id, iter(self.storage)) - - @istest - def can_len(self): - self.assertEqual(2, len(self.storage)) - - @istest - def can_get(self): - self.assertEqual(self.valid_content, self.storage.get(self.valid_id)) - self.assertEqual(self.invalid_content, - self.storage.get(self.invalid_id)) - - @istest - def can_check(self): - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.absent_id) - with self.assertRaises(Error): - self.storage.check(self.invalid_id) - self.storage.check(self.valid_id) - - @istest - def can_get_random(self): - self.assertEqual(1, len(self.storage.get_random(1))) - self.assertEqual(len(self.storage), len(self.storage.get_random(1000))) - - @istest - def cannot_add(self): - new_id = self.storage.add(b'New content') - result = self.storage.add(self.valid_content, self.valid_id) - self.assertNotIn(new_id, self.storage) - self.assertIsNone(result) - - @istest - def cannot_restore(self): - new_id = self.storage.restore(b'New content') - result = self.storage.restore(self.valid_content, self.valid_id) - self.assertNotIn(new_id, self.storage) - self.assertIsNone(result) - - -class MixinTestIdFilter(): - """ Mixin class that tests the filters based on filter.IdFilter - - Methods "make_valid", "make_invalid" and "filter_storage" must be - implemented by subclasses. - """ - - def setUp(self): - super().setUp() - # Use a hack here : as the mock uses the content as id, it is easy to - # create contents that are filtered or not. - self.prefix = '71' - storage = MockObjStorage() - - # Make the storage filtered - self.base_storage = storage - self.storage = self.filter_storage(storage) - - # Present content with valid id - self.present_valid_content = self.ensure_valid(b'yroqdtotji') - self.present_valid_id = storage.id(self.present_valid_content) - - # Present content with invalid id - self.present_invalid_content = self.ensure_invalid(b'glxddlmmzb') - self.present_invalid_id = storage.id(self.present_invalid_content) - - # Missing content with valid id - self.missing_valid_content = self.ensure_valid(b'rmzkdclkez') - self.missing_valid_id = storage.id(self.missing_valid_content) - - # Missing content with invalid id - self.missing_invalid_content = self.ensure_invalid(b'hlejfuginh') - self.missing_invalid_id = storage.id(self.missing_invalid_content) - - # Present corrupted content with valid id - self.present_corrupted_valid_content = self.ensure_valid(b'cdsjwnpaij') - self.true_present_corrupted_valid_content = self.ensure_valid( - b'mgsdpawcrr') - self.present_corrupted_valid_id = storage.id( - self.true_present_corrupted_valid_content) - - # Present corrupted content with invalid id - self.present_corrupted_invalid_content = self.ensure_invalid( - b'pspjljnrco') - self.true_present_corrupted_invalid_content = self.ensure_invalid( - b'rjocbnnbso') - self.present_corrupted_invalid_id = storage.id( - self.true_present_corrupted_invalid_content) - - # Missing (potentially) corrupted content with valid id - self.missing_corrupted_valid_content = self.ensure_valid( - b'zxkokfgtou') - self.true_missing_corrupted_valid_content = self.ensure_valid( - b'royoncooqa') - self.missing_corrupted_valid_id = storage.id( - self.true_missing_corrupted_valid_content) - - # Missing (potentially) corrupted content with invalid id - self.missing_corrupted_invalid_content = self.ensure_invalid( - b'hxaxnrmnyk') - self.true_missing_corrupted_invalid_content = self.ensure_invalid( - b'qhbolyuifr') - self.missing_corrupted_invalid_id = storage.id( - self.true_missing_corrupted_invalid_content) - - # Add the content that are supposed to be present - storage.add(self.present_valid_content) - storage.add(self.present_invalid_content) - storage.add(self.present_corrupted_valid_content, - obj_id=self.present_corrupted_valid_id) - storage.add(self.present_corrupted_invalid_content, - obj_id=self.present_corrupted_invalid_id) - - def filter_storage(self, storage): - raise NotImplementedError( - 'Id_filter test class must have a filter_storage method') - - def ensure_valid(self, content=None): - if content is None: - content = get_random_content() - while not self.storage.is_valid(self.base_storage.id(content)): - content = get_random_content() - return content - - def ensure_invalid(self, content=None): - if content is None: - content = get_random_content() - while self.storage.is_valid(self.base_storage.id(content)): - content = get_random_content() - return content - - @istest - def contains(self): - # Both contents are present, but the invalid one should be ignored. - self.assertTrue(self.present_valid_id in self.storage) - self.assertFalse(self.present_invalid_id in self.storage) - self.assertFalse(self.missing_valid_id in self.storage) - self.assertFalse(self.missing_invalid_id in self.storage) - self.assertTrue(self.present_corrupted_valid_id in self.storage) - self.assertFalse(self.present_corrupted_invalid_id in self.storage) - self.assertFalse(self.missing_corrupted_valid_id in self.storage) - self.assertFalse(self.missing_corrupted_invalid_id in self.storage) - - @istest - def iter(self): - self.assertIn(self.present_valid_id, iter(self.storage)) - self.assertNotIn(self.present_invalid_id, iter(self.storage)) - self.assertNotIn(self.missing_valid_id, iter(self.storage)) - self.assertNotIn(self.missing_invalid_id, iter(self.storage)) - self.assertIn(self.present_corrupted_valid_id, iter(self.storage)) - self.assertNotIn(self.present_corrupted_invalid_id, iter(self.storage)) - self.assertNotIn(self.missing_corrupted_valid_id, iter(self.storage)) - self.assertNotIn(self.missing_corrupted_invalid_id, iter(self.storage)) - - @istest - def len(self): - # Four contents are present, but only two should be valid. - self.assertEqual(2, len(self.storage)) - - @istest - def get(self): - self.assertEqual(self.present_valid_content, - self.storage.get(self.present_valid_id)) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.present_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_invalid_id) - self.assertEqual(self.present_corrupted_valid_content, - self.storage.get(self.present_corrupted_valid_id)) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.present_corrupted_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_corrupted_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_corrupted_invalid_id) - - @istest - def check(self): - self.storage.check(self.present_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.present_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_invalid_id) - with self.assertRaises(Error): - self.storage.check(self.present_corrupted_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.present_corrupted_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_corrupted_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_corrupted_invalid_id) - - @istest - def get_random(self): - self.assertEqual(0, len(list(self.storage.get_random(0)))) - - random_content = list(self.storage.get_random(1000)) - self.assertIn(self.present_valid_id, random_content) - self.assertNotIn(self.present_invalid_id, random_content) - self.assertNotIn(self.missing_valid_id, random_content) - self.assertNotIn(self.missing_invalid_id, random_content) - self.assertIn(self.present_corrupted_valid_id, random_content) - self.assertNotIn(self.present_corrupted_invalid_id, random_content) - self.assertNotIn(self.missing_corrupted_valid_id, random_content) - self.assertNotIn(self.missing_corrupted_invalid_id, random_content) - - @istest - def add(self): - # Add valid and invalid contents to the storage and check their - # presence with the unfiltered storage. - valid_content = self.ensure_valid(b'ulepsrjbgt') - valid_id = self.base_storage.id(valid_content) - invalid_content = self.ensure_invalid(b'znvghkjked') - invalid_id = self.base_storage.id(invalid_content) - self.storage.add(valid_content) - self.storage.add(invalid_content) - self.assertTrue(valid_id in self.base_storage) - self.assertFalse(invalid_id in self.base_storage) - - @istest - def restore(self): - # Add corrupted content to the storage and the try to restore it - valid_content = self.ensure_valid(b'ulepsrjbgt') - valid_id = self.base_storage.id(valid_content) - corrupted_content = self.ensure_valid(b'ltjkjsloyb') - corrupted_id = self.base_storage.id(corrupted_content) - self.storage.add(corrupted_content, obj_id=valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(corrupted_id) - with self.assertRaises(Error): - self.storage.check(valid_id) - self.storage.restore(valid_content) - self.storage.check(valid_id) - - -@attr('!db') -class TestPrefixFilter(MixinTestIdFilter, unittest.TestCase): - def setUp(self): - self.prefix = b'71' - super().setUp() - - def ensure_valid(self, content): - obj_id = hashutil.hashdata(content)['sha1'] - hex_obj_id = hashutil.hash_to_hex(obj_id) - self.assertTrue(hex_obj_id.startswith(self.prefix)) - return content - - def ensure_invalid(self, content): - obj_id = hashutil.hashdata(content)['sha1'] - hex_obj_id = hashutil.hash_to_hex(obj_id) - self.assertFalse(hex_obj_id.startswith(self.prefix)) - return content - - def filter_storage(self, storage): - return add_filter(storage, id_prefix(self.prefix)) - - -@attr('!db') -class TestRegexFilter(MixinTestIdFilter, unittest.TestCase): - def setUp(self): - self.regex = r'[a-f][0-9].*' - super().setUp() - - def filter_storage(self, storage): - return add_filter(storage, id_regex(self.regex)) diff --git a/swh/storage/tests/test_objstorage_api.py b/swh/storage/tests/test_objstorage_api.py deleted file mode 100644 --- a/swh/storage/tests/test_objstorage_api.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import tempfile -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.core import hashutil -from swh.storage.exc import ObjNotFoundError, Error -from swh.storage.tests.server_testing import ServerTestFixture -from swh.storage.objstorage.api.client import RemoteObjStorage -from swh.storage.objstorage.api.server import app - - -@attr('db') -class TestRemoteObjStorage(ServerTestFixture, unittest.TestCase): - """ Test the remote archive API. - """ - - def setUp(self): - self.config = {'storage_base': tempfile.mkdtemp(), - 'storage_slicing': '0:1/0:5'} - self.app = app - super().setUp() - self.objstorage = RemoteObjStorage(self.url()) - - def tearDown(self): - super().tearDown() - - @istest - def content_add(self): - content = bytes('Test content', 'utf8') - id = self.objstorage.content_add(content) - self.assertEquals(self.objstorage.content_get(id), content) - - @istest - def content_get_present(self): - content = bytes('content_get_present', 'utf8') - content_hash = hashutil.hashdata(content) - id = self.objstorage.content_add(content) - self.assertEquals(content_hash['sha1'], id) - - @istest - def content_get_missing(self): - content = bytes('content_get_missing', 'utf8') - content_hash = hashutil.hashdata(content) - with self.assertRaises(ObjNotFoundError): - self.objstorage.content_get(content_hash['sha1']) - - @istest - def content_get_random(self): - ids = [] - for i in range(100): - content = bytes('content_get_present', 'utf8') - id = self.objstorage.content_add(content) - ids.append(id) - for id in self.objstorage.content_get_random(50): - self.assertIn(id, ids) - - @istest - def content_check_invalid(self): - content = bytes('content_check_invalid', 'utf8') - invalid_id = hashutil.hashdata(b'invalid content')['sha1'] - # Add the content with an invalid id. - self.objstorage.content_add(content, invalid_id) - # Then check it and expect an error. - with self.assertRaises(Error): - self.objstorage.content_check(invalid_id) - - @istest - def content_check_valid(self): - content = bytes('content_check_valid', 'utf8') - id = self.objstorage.content_add(content) - try: - self.objstorage.content_check(id) - except: - self.fail('Integrity check failed') - - @istest - def content_check_missing(self): - content = bytes('content_check_valid', 'utf8') - content_hash = hashutil.hashdata(content) - with self.assertRaises(ObjNotFoundError): - self.objstorage.content_check(content_hash['sha1']) diff --git a/swh/storage/tests/test_objstorage_multiplexer.py b/swh/storage/tests/test_objstorage_multiplexer.py deleted file mode 100644 --- a/swh/storage/tests/test_objstorage_multiplexer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import tempfile -import unittest - -from nose.tools import istest - -from swh.storage.objstorage import PathSlicingObjStorage -from swh.storage.objstorage.multiplexer import MultiplexerObjStorage -from swh.storage.objstorage.multiplexer.filter import add_filter, read_only - -from objstorage_testing import ObjStorageTestFixture - - -class TestMultiplexerObjStorage(ObjStorageTestFixture, unittest.TestCase): - - def setUp(self): - super().setUp() - self.storage_v1 = PathSlicingObjStorage(tempfile.mkdtemp(), '0:2/2:4') - self.storage_v2 = PathSlicingObjStorage(tempfile.mkdtemp(), '0:1/0:5') - - self.r_storage = add_filter(self.storage_v1, read_only()) - self.w_storage = self.storage_v2 - self.storage = MultiplexerObjStorage([self.r_storage, self.w_storage]) - - @istest - def contains(self): - content_p, obj_id_p = self.hash_content(b'contains_present') - content_m, obj_id_m = self.hash_content(b'contains_missing') - self.storage.add(content_p, obj_id=obj_id_p) - self.assertIn(obj_id_p, self.storage) - self.assertNotIn(obj_id_m, self.storage) - - @istest - def iter(self): - content, obj_id = self.hash_content(b'iter') - self.assertEqual(list(iter(self.storage)), []) - self.storage.add(content, obj_id=obj_id) - self.assertEqual(list(iter(self.storage)), [obj_id]) - - @istest - def len(self): - content, obj_id = self.hash_content(b'len') - self.assertEqual(len(self.storage), 0) - self.storage.add(content, obj_id=obj_id) - self.assertEqual(len(self.storage), 1) - - @istest - def len_multiple(self): - content, obj_id = self.hash_content(b'len_multiple') - # Add a content to the read-only storage - self.storage_v1.add(content) - self.assertEqual(len(self.storage), 1) - # By adding the same content to the global storage, it should be - # Replicated. - # len() behavior is to indicates the number of files, not unique - # contents. - self.storage.add(content) - self.assertEqual(len(self.storage), 2) - - @istest - def get_random_contents(self): - content, obj_id = self.hash_content(b'get_random_content') - self.storage.add(content) - random_contents = list(self.storage.get_random(1)) - self.assertEqual(1, len(random_contents)) - self.assertIn(obj_id, random_contents) - - @istest - def access_readonly(self): - # Add a content to the readonly storage - content, obj_id = self.hash_content(b'content in read-only') - self.storage_v1.add(content) - # Try to retrieve it on the main storage - self.assertIn(obj_id, self.storage) diff --git a/swh/storage/tests/test_objstorage_pathslicing.py b/swh/storage/tests/test_objstorage_pathslicing.py deleted file mode 100644 --- a/swh/storage/tests/test_objstorage_pathslicing.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import tempfile -import unittest - -from nose.tools import istest - -from swh.core import hashutil -from swh.storage import exc -from swh.storage.objstorage import PathSlicingObjStorage - -from objstorage_testing import ObjStorageTestFixture - - -class TestpathSlicingObjStorage(ObjStorageTestFixture, unittest.TestCase): - - def setUp(self): - super().setUp() - self.slicing = '0:2/2:4/4:6' - self.tmpdir = tempfile.mkdtemp() - self.storage = PathSlicingObjStorage(self.tmpdir, self.slicing) - - def content_path(self, obj_id): - hex_obj_id = hashutil.hash_to_hex(obj_id) - return self.storage._obj_path(hex_obj_id) - - @istest - def contains(self): - content_p, obj_id_p = self.hash_content(b'contains_present') - content_m, obj_id_m = self.hash_content(b'contains_missing') - self.storage.add(content_p, obj_id=obj_id_p) - self.assertIn(obj_id_p, self.storage) - self.assertNotIn(obj_id_m, self.storage) - - @istest - def iter(self): - content, obj_id = self.hash_content(b'iter') - self.assertEqual(list(iter(self.storage)), []) - self.storage.add(content, obj_id=obj_id) - self.assertEqual(list(iter(self.storage)), [obj_id]) - - @istest - def len(self): - content, obj_id = self.hash_content(b'check_not_gzip') - self.assertEqual(len(self.storage), 0) - self.storage.add(content, obj_id=obj_id) - self.assertEqual(len(self.storage), 1) - - @istest - def check_not_gzip(self): - content, obj_id = self.hash_content(b'check_not_gzip') - self.storage.add(content, obj_id=obj_id) - with open(self.content_path(obj_id), 'ab') as f: # Add garbage. - f.write(b'garbage') - with self.assertRaises(exc.Error): - self.storage.check(obj_id) - - @istest - def check_id_mismatch(self): - content, obj_id = self.hash_content(b'check_id_mismatch') - self.storage.add(content, obj_id=obj_id) - with open(self.content_path(obj_id), 'wb') as f: - f.write(b'unexpected content') - with self.assertRaises(exc.Error): - self.storage.check(obj_id) - - @istest - def get_random_contents(self): - content, obj_id = self.hash_content(b'get_random_content') - self.storage.add(content, obj_id=obj_id) - random_contents = list(self.storage.get_random(1)) - self.assertEqual(1, len(random_contents)) - self.assertIn(obj_id, random_contents)