diff --git a/swh/objstorage/api/client.py b/swh/objstorage/api/client.py index 1b87b9f..59f19ce 100644 --- a/swh/objstorage/api/client.py +++ b/swh/objstorage/api/client.py @@ -1,95 +1,95 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import RPCClient from swh.model import hashutil -from ..objstorage import DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT -from ..exc import Error, ObjNotFoundError, ObjStorageAPIError +from swh.objstorage.objstorage import DEFAULT_CHUNK_SIZE, DEFAULT_LIMIT +from swh.objstorage.exc import Error, ObjNotFoundError, ObjStorageAPIError class RemoteObjStorage: """Proxy to a remote object storage. This class allows to connect to an object storage server via http protocol. Attributes: url (string): The url of the server to connect. Must end with a '/' session: The session to send requests. """ def __init__(self, **kwargs): self._proxy = RPCClient( api_exception=ObjStorageAPIError, reraise_exceptions=[ObjNotFoundError, Error], **kwargs, ) def check_config(self, *, check_write): return self._proxy.post("check_config", {"check_write": check_write}) def __contains__(self, obj_id): return self._proxy.post("content/contains", {"obj_id": obj_id}) def add(self, content, obj_id=None, check_presence=True): return self._proxy.post( "content/add", {"content": content, "obj_id": obj_id, "check_presence": check_presence}, ) def add_batch(self, contents, check_presence=True): return self._proxy.post( "content/add/batch", {"contents": contents, "check_presence": check_presence,}, ) def restore(self, content, obj_id=None, *args, **kwargs): return self.add(content, obj_id, check_presence=False) def get(self, obj_id): return self._proxy.post("content/get", {"obj_id": obj_id}) def get_batch(self, obj_ids): return self._proxy.post("content/get/batch", {"obj_ids": obj_ids}) def check(self, obj_id): return self._proxy.post("content/check", {"obj_id": obj_id}) def delete(self, obj_id): # deletion permission are checked server-side return self._proxy.post("content/delete", {"obj_id": obj_id}) # Management methods def get_random(self, batch_size): return self._proxy.post("content/get/random", {"batch_size": batch_size}) # Streaming methods def add_stream(self, content_iter, obj_id, check_presence=True): obj_id = hashutil.hash_to_hex(obj_id) return self._proxy.post_stream( "content/add_stream/{}".format(obj_id), params={"check_presence": check_presence}, data=content_iter, ) def get_stream(self, obj_id, chunk_size=DEFAULT_CHUNK_SIZE): obj_id = hashutil.hash_to_hex(obj_id) return self._proxy.get_stream( "content/get_stream/{}".format(obj_id), chunk_size=chunk_size ) def __iter__(self): yield from self._proxy.get_stream("content") def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT): params = {"limit": limit} if last_obj_id: params["last_obj_id"] = hashutil.hash_to_hex(last_obj_id) yield from self._proxy.get_stream("content", params=params) diff --git a/swh/objstorage/multiplexer/__init__.py b/swh/objstorage/multiplexer/__init__.py index 82cc913..70f9ad1 100644 --- a/swh/objstorage/multiplexer/__init__.py +++ b/swh/objstorage/multiplexer/__init__.py @@ -1,5 +1,5 @@ -from .multiplexer_objstorage import MultiplexerObjStorage -from .striping_objstorage import StripingObjStorage +from swh.objstorage.multiplexer.multiplexer_objstorage import MultiplexerObjStorage +from swh.objstorage.multiplexer.striping_objstorage import StripingObjStorage __all__ = ["MultiplexerObjStorage", "StripingObjStorage"] diff --git a/swh/objstorage/multiplexer/filter/filter.py b/swh/objstorage/multiplexer/filter/filter.py index 61523ce..7363962 100644 --- a/swh/objstorage/multiplexer/filter/filter.py +++ b/swh/objstorage/multiplexer/filter/filter.py @@ -1,77 +1,77 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from ...objstorage import ObjStorage +from swh.objstorage.objstorage import ObjStorage class ObjStorageFilter(ObjStorage): """Base implementation of a filter that allow inputs on ObjStorage or not. This class copy the API of ...objstorage in order to filter the inputs of this class. If the operation is allowed, return the result of this operation applied to the destination implementation. Otherwise, just return without any operation. This class is an abstract base class for a classic read/write storage. Filters can inherit from it and only redefine some methods in order to change behavior. """ def __init__(self, storage): self.storage = storage def check_config(self, *, check_write): """Check the object storage for proper configuration. Args: check_write: check whether writes to the objstorage will succeed Returns: True if the storage is properly configured """ return self.storage.check_config(check_write=check_write) def __contains__(self, *args, **kwargs): return self.storage.__contains__(*args, **kwargs) def __iter__(self): """ Iterates over the content of each storages Warning: The `__iter__` methods frequently have bad performance. You almost certainly don't want to use this method in production as the wrapped storage may cause performance issues. """ return self.storage.__iter__() def __len__(self): """ Compute the number of objects in the current object storage. Warning: performance issue in `__iter__` also applies here. Returns: number of objects contained in the storage. """ return self.storage.__len__() def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): return self.storage.add(content, obj_id, check_presence, *args, **kwargs) def restore(self, content, obj_id=None, *args, **kwargs): return self.storage.restore(content, obj_id, *args, **kwargs) def get(self, obj_id, *args, **kwargs): return self.storage.get(obj_id, *args, **kwargs) def check(self, obj_id, *args, **kwargs): return self.storage.check(obj_id, *args, **kwargs) def delete(self, obj_id, *args, **kwargs): return self.storage.delete(obj_id, *args, **kwargs) def get_random(self, batch_size, *args, **kwargs): return self.storage.get_random(batch_size, *args, **kwargs) diff --git a/swh/objstorage/multiplexer/filter/id_filter.py b/swh/objstorage/multiplexer/filter/id_filter.py index e01daf2..0894bd0 100644 --- a/swh/objstorage/multiplexer/filter/id_filter.py +++ b/swh/objstorage/multiplexer/filter/id_filter.py @@ -1,93 +1,93 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import abc from swh.model import hashutil -from .filter import ObjStorageFilter -from ...objstorage import compute_hash -from ...exc import ObjNotFoundError +from swh.objstorage.multiplexer.filter.filter import ObjStorageFilter +from swh.objstorage.objstorage import compute_hash +from swh.objstorage.exc import ObjNotFoundError class IdObjStorageFilter(ObjStorageFilter, metaclass=abc.ABCMeta): """ Filter that only allow operations if the object id match a requirement. Even for read operations, check before if the id match the requirements. This may prevent for unnecessary disk access. """ @abc.abstractmethod def is_valid(self, obj_id): """ Indicates if the given id is valid. """ raise NotImplementedError( "Implementations of an IdObjStorageFilter " 'must have a "is_valid" method' ) def __contains__(self, obj_id, *args, **kwargs): if self.is_valid(obj_id): return self.storage.__contains__(*args, obj_id=obj_id, **kwargs) return False def __len__(self): return sum(1 for i in [id for id in self.storage if self.is_valid(id)]) def __iter__(self): yield from filter(lambda id: self.is_valid(id), iter(self.storage)) def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.add(content, *args, obj_id=obj_id, **kwargs) def restore(self, content, obj_id=None, *args, **kwargs): if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.restore(content, *args, obj_id=obj_id, **kwargs) def get(self, obj_id, *args, **kwargs): if self.is_valid(obj_id): return self.storage.get(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def check(self, obj_id, *args, **kwargs): if self.is_valid(obj_id): return self.storage.check(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def get_random(self, *args, **kwargs): yield from filter( lambda id: self.is_valid(id), self.storage.get_random(*args, **kwargs) ) class RegexIdObjStorageFilter(IdObjStorageFilter): """ Filter that allow operations if the content's id as hex match a regex. """ def __init__(self, storage, regex): super().__init__(storage) self.regex = re.compile(regex) def is_valid(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return self.regex.match(hex_obj_id) is not None class PrefixIdObjStorageFilter(IdObjStorageFilter): """ Filter that allow operations if the hexlified id have a given prefix. """ def __init__(self, storage, prefix): super().__init__(storage) self.prefix = str(prefix) def is_valid(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return str(hex_obj_id).startswith(self.prefix) diff --git a/swh/objstorage/multiplexer/filter/read_write_filter.py b/swh/objstorage/multiplexer/filter/read_write_filter.py index 5e24289..753ad67 100644 --- a/swh/objstorage/multiplexer/filter/read_write_filter.py +++ b/swh/objstorage/multiplexer/filter/read_write_filter.py @@ -1,25 +1,25 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from .filter import ObjStorageFilter +from swh.objstorage.multiplexer.filter.filter import ObjStorageFilter class ReadObjStorageFilter(ObjStorageFilter): """ Filter that disable write operation of the storage. Writes will always succeed without doing any actual write operations. """ def check_config(self, *, check_write): return self.storage.check_config(check_write=False) def add(self, *args, **kwargs): return def restore(self, *args, **kwargs): return def delete(self, *args, **kwargs): return True diff --git a/swh/objstorage/multiplexer/multiplexer_objstorage.py b/swh/objstorage/multiplexer/multiplexer_objstorage.py index ebb790f..c61a0cb 100644 --- a/swh/objstorage/multiplexer/multiplexer_objstorage.py +++ b/swh/objstorage/multiplexer/multiplexer_objstorage.py @@ -1,319 +1,319 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import queue import random import threading -from ..objstorage import ObjStorage -from ..exc import ObjNotFoundError +from swh.objstorage.objstorage import ObjStorage +from swh.objstorage.exc import ObjNotFoundError class ObjStorageThread(threading.Thread): def __init__(self, storage): super().__init__(daemon=True) self.storage = storage self.commands = queue.Queue() def run(self): while True: try: mailbox, command, args, kwargs = self.commands.get(True, 0.05) except queue.Empty: continue try: ret = getattr(self.storage, command)(*args, **kwargs) except Exception as exc: self.queue_result(mailbox, "exception", exc) else: self.queue_result(mailbox, "result", ret) def queue_command(self, command, *args, mailbox=None, **kwargs): """Enqueue a new command to be processed by the thread. Args: command (str): one of the method names for the underlying storage. mailbox (queue.Queue): explicit mailbox if the calling thread wants to override it. args, kwargs: arguments for the command. Returns: queue.Queue The mailbox you can read the response from """ if not mailbox: mailbox = queue.Queue() self.commands.put((mailbox, command, args, kwargs)) return mailbox def queue_result(self, mailbox, result_type, result): """Enqueue a new result in the mailbox This also provides a reference to the storage, which can be useful when an exceptional condition arises. Args: mailbox (queue.Queue): the mailbox to which we need to enqueue the result result_type (str): one of 'result', 'exception' result: the result to pass back to the calling thread """ mailbox.put( {"type": result_type, "result": result,} ) @staticmethod def get_result_from_mailbox(mailbox, *args, **kwargs): """Unpack the result from the mailbox. Arguments: mailbox (queue.Queue): A mailbox to unpack a result from args, kwargs: arguments to :func:`mailbox.get` Returns: the next result unpacked from the queue Raises: either the exception we got back from the underlying storage, or :exc:`queue.Empty` if :func:`mailbox.get` raises that. """ result = mailbox.get(*args, **kwargs) if result["type"] == "exception": raise result["result"] from None else: return result["result"] @staticmethod def collect_results(mailbox, num_results): """Collect num_results from the mailbox""" collected = 0 ret = [] while collected < num_results: try: ret.append( ObjStorageThread.get_result_from_mailbox(mailbox, True, 0.05) ) except queue.Empty: continue collected += 1 return ret def __getattr__(self, attr): def call(*args, **kwargs): mailbox = self.queue_command(attr, *args, **kwargs) return self.get_result_from_mailbox(mailbox) return call def __contains__(self, *args, **kwargs): mailbox = self.queue_command("__contains__", *args, **kwargs) return self.get_result_from_mailbox(mailbox) class MultiplexerObjStorage(ObjStorage): """Implementation of ObjStorage that distributes between multiple storages. The multiplexer object storage allows an input to be demultiplexed among multiple storages that will or will not accept it by themselves (see .filter package). As the ids can be different, no pre-computed ids should be submitted. Also, there are no guarantees that the returned ids can be used directly into the storages that the multiplexer manage. Use case examples follow. Example 1:: storage_v1 = filter.read_only(PathSlicingObjStorage('/dir1', '0:2/2:4/4:6')) storage_v2 = PathSlicingObjStorage('/dir2', '0:1/0:5') storage = MultiplexerObjStorage([storage_v1, storage_v2]) When using 'storage', all the new contents will only be added to the v2 storage, while it will be retrievable from both. Example 2:: storage_v1 = filter.id_regex( PathSlicingObjStorage('/dir1', '0:2/2:4/4:6'), r'[^012].*' ) storage_v2 = filter.if_regex( PathSlicingObjStorage('/dir2', '0:1/0:5'), r'[012]/*' ) storage = MultiplexerObjStorage([storage_v1, storage_v2]) When using this storage, the contents with a sha1 starting with 0, 1 or 2 will be redirected (read AND write) to the storage_v2, while the others will be redirected to the storage_v1. If a content starting with 0, 1 or 2 is present in the storage_v1, it would be ignored anyway. """ def __init__(self, storages, **kwargs): super().__init__(**kwargs) self.storages = storages self.storage_threads = [ObjStorageThread(storage) for storage in storages] for thread in self.storage_threads: thread.start() def wrap_call(self, threads, call, *args, **kwargs): threads = list(threads) mailbox = queue.Queue() for thread in threads: thread.queue_command(call, *args, mailbox=mailbox, **kwargs) return ObjStorageThread.collect_results(mailbox, len(threads)) def get_read_threads(self, obj_id=None): yield from self.storage_threads def get_write_threads(self, obj_id=None): yield from self.storage_threads def check_config(self, *, check_write): """Check whether the object storage is properly configured. Args: check_write (bool): if True, check if writes to the object storage can succeed. Returns: True if the configuration check worked, an exception if it didn't. """ return all( self.wrap_call( self.storage_threads, "check_config", check_write=check_write ) ) def __contains__(self, obj_id): """Indicate if the given object is present in the storage. Args: obj_id (bytes): object identifier. Returns: True if and only if the object is present in the current object storage. """ for storage in self.get_read_threads(obj_id): if obj_id in storage: return True return False def __iter__(self): def obj_iterator(): for storage in self.storages: yield from storage return obj_iterator() def add(self, content, obj_id=None, check_presence=True): """ Add a new object to the object storage. If the adding step works in all the storages that accept this content, this is a success. Otherwise, the full adding step is an error even if it succeed in some of the storages. Args: content: content of the object to be added to the storage. obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When given, obj_id will be trusted to match the bytes. If missing, obj_id will be computed on the fly. check_presence: indicate if the presence of the content should be verified before adding the file. Returns: an id of the object into the storage. As the write-storages are always readable as well, any id will be valid to retrieve a content. """ results = self.wrap_call( self.get_write_threads(obj_id), "add", content, obj_id=obj_id, check_presence=check_presence, ) for result in results: if not result: continue return result def add_batch(self, contents, check_presence=True): """Add a batch of new objects to the object storage. """ write_threads = list(self.get_write_threads()) results = self.wrap_call( write_threads, "add_batch", contents, check_presence=check_presence, ) summed = {"object:add": 0, "object:add:bytes": 0} for result in results: summed["object:add"] += result["object:add"] summed["object:add:bytes"] += result["object:add:bytes"] return { "object:add": summed["object:add"] // len(results), "object:add:bytes": summed["object:add:bytes"] // len(results), } def restore(self, content, obj_id=None): return self.wrap_call( self.get_write_threads(obj_id), "restore", content, obj_id=obj_id, ).pop() def get(self, obj_id): for storage in self.get_read_threads(obj_id): try: return storage.get(obj_id) except ObjNotFoundError: continue # If no storage contains this content, raise the error raise ObjNotFoundError(obj_id) def check(self, obj_id): nb_present = 0 for storage in self.get_read_threads(obj_id): try: storage.check(obj_id) except ObjNotFoundError: continue else: nb_present += 1 # If there is an Error because of a corrupted file, then let it pass. # Raise the ObjNotFoundError only if the content couldn't be found in # all the storages. if nb_present == 0: raise ObjNotFoundError(obj_id) def delete(self, obj_id): super().delete(obj_id) # Check delete permission return all(self.wrap_call(self.get_write_threads(obj_id), "delete", obj_id)) def get_random(self, batch_size): storages_set = [storage for storage in self.storages if len(storage) > 0] if len(storages_set) <= 0: return [] while storages_set: storage = random.choice(storages_set) try: return storage.get_random(batch_size) except NotImplementedError: storages_set.remove(storage) # There is no storage that allow the get_random operation raise NotImplementedError( "There is no storage implementation into the multiplexer that " "support the 'get_random' operation" ) diff --git a/swh/objstorage/multiplexer/striping_objstorage.py b/swh/objstorage/multiplexer/striping_objstorage.py index ad06d27..aaa6729 100644 --- a/swh/objstorage/multiplexer/striping_objstorage.py +++ b/swh/objstorage/multiplexer/striping_objstorage.py @@ -1,71 +1,74 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import queue -from .multiplexer_objstorage import ObjStorageThread, MultiplexerObjStorage +from swh.objstorage.multiplexer.multiplexer_objstorage import ( + ObjStorageThread, + MultiplexerObjStorage, +) class StripingObjStorage(MultiplexerObjStorage): """Stripes objects across multiple objstorages This objstorage implementation will write objects to objstorages in a predictable way: it takes the modulo of the last 8 bytes of the object identifier with the number of object storages passed, which will yield an (almost) even distribution. Objects are read from all storages in turn until it succeeds. """ MOD_BYTES = 8 def __init__(self, storages, **kwargs): super().__init__(storages, **kwargs) self.num_storages = len(storages) def get_storage_index(self, obj_id): if obj_id is None: raise ValueError("StripingObjStorage always needs obj_id to be set") index = int.from_bytes(obj_id[: -self.MOD_BYTES], "big") return index % self.num_storages def get_write_threads(self, obj_id): idx = self.get_storage_index(obj_id) yield self.storage_threads[idx] def get_read_threads(self, obj_id=None): if obj_id: idx = self.get_storage_index(obj_id) else: idx = 0 for i in range(self.num_storages): yield self.storage_threads[(idx + i) % self.num_storages] def add_batch(self, contents, check_presence=True): """Add a batch of new objects to the object storage. """ content_by_storage_index = defaultdict(dict) for obj_id, content in contents.items(): storage_index = self.get_storage_index(obj_id) content_by_storage_index[storage_index][obj_id] = content mailbox = queue.Queue() for storage_index, contents in content_by_storage_index.items(): self.storage_threads[storage_index].queue_command( "add_batch", contents, check_presence=check_presence, mailbox=mailbox, ) results = ObjStorageThread.collect_results( mailbox, len(content_by_storage_index) ) summed = {"object:add": 0, "object:add:bytes": 0} for result in results: summed["object:add"] += result["object:add"] summed["object:add:bytes"] += result["object:add:bytes"] return summed