diff --git a/swh/objstorage/multiplexer/filter/id_filter.py b/swh/objstorage/multiplexer/filter/id_filter.py index ca149b9..d549213 100644 --- a/swh/objstorage/multiplexer/filter/id_filter.py +++ b/swh/objstorage/multiplexer/filter/id_filter.py @@ -1,117 +1,119 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re +import abc from swh.core import hashutil from .filter import ObjStorageFilter from ...objstorage import ID_HASH_ALGO from ...exc import ObjNotFoundError def compute_hash(bytes): """ Compute the hash of the given content. """ # Checksum is missing, compute it on the fly. h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) h.update(bytes) return h.digest() -class IdObjStorageFilter(ObjStorageFilter): +class IdObjStorageFilter(ObjStorageFilter, metaclass=abc.ABCMeta): """ Filter that only allow operations if the object id match a requirement. Even for read operations, check before if the id match the requirements. This may prevent for unnecesary disk access. """ + @abc.abstractmethod def is_valid(self, obj_id): """ Indicates if the given id is valid. """ raise NotImplementedError('Implementations of an IdObjStorageFilter ' 'must have a "is_valid" method') def __contains__(self, obj_id, *args, **kwargs): """ Indicates if the given object is present in the storage See base class [ObjStorage]. """ if self.is_valid(obj_id): return self.storage.__contains__(*args, obj_id=obj_id, **kwargs) return False def __len__(self): """ See base class [ObjStorageFilter]. """ return sum(1 for i in [id for id in self.storage if self.is_valid(id)]) def __iter__(self): """ See base class [ObjStorageFilter]. """ yield from filter(lambda id: self.is_valid(id), iter(self.storage)) def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): """ See base class [ObjStorageFilter]. """ if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.add(content, *args, obj_id=obj_id, **kwargs) def restore(self, content, obj_id=None, *args, **kwargs): """ See base class [ObjStorageFilter]. """ if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.restore(content, *args, obj_id=obj_id, **kwargs) def get(self, obj_id, *args, **kwargs): """ See base class [ObjStorageFilter]. """ if self.is_valid(obj_id): return self.storage.get(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def check(self, obj_id, *args, **kwargs): """ See base class [ObjStorageFilter]. """ if self.is_valid(obj_id): return self.storage.check(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def get_random(self, *args, **kwargs): """ See base class [ObjStorageFilter]. """ yield from filter(lambda id: self.is_valid(id), self.storage.get_random(*args, **kwargs)) class RegexIdObjStorageFilter(IdObjStorageFilter): """ Filter that allow operations if the content's id as hex match a regex. """ def __init__(self, storage, regex): super().__init__(storage) self.regex = re.compile(regex) def is_valid(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return self.regex.match(hex_obj_id) is not None class PrefixIdObjStorageFilter(IdObjStorageFilter): """ Filter that allow operations if the hexlified id have a given prefix. """ def __init__(self, storage, prefix): super().__init__(storage) self.prefix = str(prefix) def is_valid(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) return str(hex_obj_id).startswith(self.prefix) diff --git a/swh/objstorage/objstorage.py b/swh/objstorage/objstorage.py index e9883b9..3264dc8 100644 --- a/swh/objstorage/objstorage.py +++ b/swh/objstorage/objstorage.py @@ -1,151 +1,156 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import abc + from .exc import ObjNotFoundError ID_HASH_ALGO = 'sha1' ID_HASH_LENGTH = 40 # Size in bytes of the hash hexadecimal representation. -class ObjStorage(): +class ObjStorage(metaclass=abc.ABCMeta): """ High-level API to manipulate the Software Heritage object storage. Conceptually, the object storage offers 5 methods: - __contains__() check if an object is present, by object id - add() add a new object, returning an object id - restore() same as add() but erase an already existed content - get() retrieve the content of an object, by object id - check() check the integrity of an object, by object id And some management methods: - get_random() get random object id of existing contents (used for the content integrity checker). Each implementation of this interface can have a different behavior and its own way to store the contents. """ - def __contains__(self, *args, **kwargs): + @abc.abstractmethod def __contains__(self, obj_id, *args, **kwargs): """ Indicates if the given object is present in the storage Returns: True iff the object is present in the current object storage. """ raise NotImplementedError( "Implementations of ObjStorage must have a '__contains__' method" ) + @abc.abstractmethod def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): """ Add a new object to the object storage. Args: content: content of the object to be added to the storage. obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When given, obj_id will be trusted to match the bytes. If missing, obj_id will be computed on the fly. check_presence: indicate if the presence of the content should be verified before adding the file. Returns: the id of the object into the storage. """ raise NotImplementedError( "Implementations of ObjStorage must have a 'add' method" ) def restore(self, content, obj_id=None, *args, **kwargs): """ Restore a content that have been corrupted. This function is identical to add_bytes but does not check if the object id is already in the file system. The default implementation provided by the current class is suitable for most cases. Args: content: content of the object to be added to the storage obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When given, obj_id will be trusted to match bytes. If missing, obj_id will be computed on the fly. """ # check_presence to false will erase the potential previous content. return self.add(content, obj_id, check_presence=False) + @abc.abstractmethod def get(self, obj_id, *args, **kwargs): """ Retrieve the content of a given object. Args: obj_id: object id. Returns: the content of the requested object as bytes. Raises: ObjNotFoundError: if the requested object is missing. """ raise NotImplementedError( "Implementations of ObjStorage must have a 'get' method" ) def get_batch(self, obj_ids, *args, **kwargs): """ Retrieve content in bulk. Note: This function does have a default implementation in ObjStorage that is suitable for most cases. For object storages that needs to do the minimal number of requests possible (ex: remote object storages), that method can be overriden to perform a more efficient operation. Args: obj_ids: list of object ids. Returns: list of resulting contents, or None if the content could not be retrieved. Do not raise any exception as a fail for one content will not cancel the whole request. """ for obj_id in obj_ids: try: yield self.get(obj_id) except ObjNotFoundError: yield None + @abc.abstractmethod def check(self, obj_id, *args, **kwargs): """ Perform an integrity check for a given object. Verify that the file object is in place and that the gziped content matches the object id. Args: obj_id: object id. Raises: ObjNotFoundError: if the requested object is missing. Error: if the request object is corrupted. """ raise NotImplementedError( "Implementations of ObjStorage must have a 'check' method" ) def get_random(self, batch_size, *args, **kwargs): """ Get random ids of existing contents This method is used in order to get random ids to perform content integrity verifications on random contents. Args: batch_size (int): Number of ids that will be given Yields: An iterable of ids of contents that are in the current object storage. """ raise NotImplementedError( "The current implementation of ObjStorage does not support " "'get_random' operation" )