diff --git a/swh/objstorage/api/client.py b/swh/objstorage/api/client.py --- a/swh/objstorage/api/client.py +++ b/swh/objstorage/api/client.py @@ -1,9 +1,8 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - import pickle import requests @@ -52,95 +51,44 @@ return decode_response(response) def __contains__(self, obj_id): + """ Indicates if the given object is present in the storage + + See base class [ObjStorage]. + """ return self.post('content/contains', {'obj_id': obj_id}) def add(self, content, obj_id=None, check_presence=True): """ Add a new object to the object storage. - Args: - content: content of the object to be added to the storage. - obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When - given, obj_id will be trusted to match the bytes. If missing, - obj_id will be computed on the fly. - check_presence: indicate if the presence of the content should be - verified before adding the file. - - Returns: - the id of the object into the storage. + See base class [ObjStorage]. """ - return self.post('content/add', {'bytes': content, 'obj_id': obj_id, + return self.post('content/add', {'content': content, 'obj_id': obj_id, 'check_presence': check_presence}) - def restore(self, content, obj_id=None): - """ Restore a content that have been corrupted. - - This function is identical to add_bytes but does not check if - the object id is already in the file system. - - Args: - content: content of the object to be added to the storage - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - """ - return self.add(content, obj_id, check_presence=False) - def get(self, obj_id): """ Retrieve the content of a given object. - Args: - obj_id: object id. - - Returns: - the content of the requested object as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing. + See base class [ObjStorage]. """ return self.post('content/get', {'obj_id': obj_id}) def get_batch(self, obj_ids): """ Retrieve content in bulk. - Note: This function does have a default implementation in ObjStorage - that is suitable for most cases. - - Args: - obj_ids: list of object ids. - - Returns: - list of resulting contents, or None if the content could not - be retrieved. Do not raise any exception as a fail for one content - will not cancel the whole request. + See base class [ObjStorage]. """ return self.post('content/get/batch', {'obj_ids': obj_ids}) def check(self, obj_id): """ Perform an integrity check for a given object. - Verify that the file object is in place and that the gziped content - matches the object id. - - Args: - obj_id: object id. - - Raises: - ObjNotFoundError: if the requested object is missing. - Error: if the request object is corrupted. + See base class [ObjStorage]. """ self.post('content/check', {'obj_id': obj_id}) def get_random(self, batch_size): """ Get random ids of existing contents - This method is used in order to get random ids to perform - content integrity verifications on random contents. - - Attributes: - batch_size (int): Number of ids that will be given - - Yields: - An iterable of ids of contents that are in the current object - storage. + See base class [ObjStorage]. """ return self.post('content/get/random', {'batch_size': batch_size}) diff --git a/swh/objstorage/multiplexer/filter/filter.py b/swh/objstorage/multiplexer/filter/filter.py --- a/swh/objstorage/multiplexer/filter/filter.py +++ b/swh/objstorage/multiplexer/filter/filter.py @@ -24,25 +24,63 @@ self.storage = storage def __contains__(self, *args, **kwargs): + """ Indicates if the given object is present in the storage + + See base class [ObjStorage]. + """ return self.storage.__contains__(*args, **kwargs) def __iter__(self): + """ Iterates over the content of each storages + + Warning: The `__iter__` methods frequently have bad performance. You + almost certainly don't want to use this method in production as the + wrapped storage may cause performance issues. + """ return self.storage.__iter__() def __len__(self): + """ Compute the number of objects in the current object storage. + + Warning: performance issue in `__iter__` also applies here. + + Returns: + number of objects contained in the storage. + """ return self.storage.__len__() - def add(self, *args, **kwargs): - return self.storage.add(*args, **kwargs) + def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): + """ Add a new object to the object storage. + + See base class [ObjStorage]. + """ + return self.storage.add(content, obj_id, check_presence, + *args, **kwargs) + + def restore(self, content, obj_id=None, *args, **kwargs): + """ Restore a content that have been corrupted. + + See base class [ObjStorage] & self.add() method. + """ + return self.storage.restore(content, obj_id, *args, **kwargs) + + def get(self, obj_id, *args, **kwargs): + """ Retrieve the content of a given object. + + See base class [ObjStorage]. + """ + return self.storage.get(obj_id, *args, **kwargs) - def restore(self, *args, **kwargs): - return self.storage.restore(*args, **kwargs) + def check(self, obj_id, *args, **kwargs): + """ Perform an integrity check for a given object. - def get(self, *args, **kwargs): - return self.storage.get(*args, **kwargs) + See base class [ObjStorage]. + """ + return self.storage.check(obj_id, *args, **kwargs) - def check(self, *args, **kwargs): - return self.storage.check(*args, **kwargs) + def get_random(self, batch_size, *args, **kwargs): + """ Get random ids of existing contents - def get_random(self, *args, **kwargs): - return self.storage.get_random(*args, **kwargs) + See base class [ObjStorage]. + """ + return self.storage.get_random(batch_size, *args, **kwargs) diff --git a/swh/objstorage/multiplexer/filter/id_filter.py b/swh/objstorage/multiplexer/filter/id_filter.py --- a/swh/objstorage/multiplexer/filter/id_filter.py +++ b/swh/objstorage/multiplexer/filter/id_filter.py @@ -4,30 +4,23 @@ # See top-level LICENSE file for more information import re +import abc from swh.core import hashutil from .filter import ObjStorageFilter -from ...objstorage import ID_HASH_ALGO +from ...objstorage import compute_hash from ...exc import ObjNotFoundError -def compute_hash(bytes): - """ Compute the hash of the given content. - """ - # Checksum is missing, compute it on the fly. - h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) - h.update(bytes) - return h.digest() - - -class IdObjStorageFilter(ObjStorageFilter): +class IdObjStorageFilter(ObjStorageFilter, metaclass=abc.ABCMeta): """ Filter that only allow operations if the object id match a requirement. Even for read operations, check before if the id match the requirements. This may prevent for unnecesary disk access. """ + @abc.abstractmethod def is_valid(self, obj_id): """ Indicates if the given id is valid. """ @@ -35,23 +28,35 @@ 'must have a "is_valid" method') def __contains__(self, obj_id, *args, **kwargs): + """ Indicates if the given object is present in the storage + + See base class [ObjStorage]. + """ if self.is_valid(obj_id): return self.storage.__contains__(*args, obj_id=obj_id, **kwargs) return False def __len__(self): + """ See base class [ObjStorageFilter]. + """ return sum(1 for i in [id for id in self.storage if self.is_valid(id)]) def __iter__(self): + """ See base class [ObjStorageFilter]. + """ yield from filter(lambda id: self.is_valid(id), iter(self.storage)) def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): + """ See base class [ObjStorageFilter]. + """ if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): return self.storage.add(content, *args, obj_id=obj_id, **kwargs) def restore(self, content, obj_id=None, *args, **kwargs): + """ See base class [ObjStorageFilter]. + """ if obj_id is None: obj_id = compute_hash(content) if self.is_valid(obj_id): @@ -59,16 +64,22 @@ obj_id=obj_id, **kwargs) def get(self, obj_id, *args, **kwargs): + """ See base class [ObjStorageFilter]. + """ if self.is_valid(obj_id): return self.storage.get(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def check(self, obj_id, *args, **kwargs): + """ See base class [ObjStorageFilter]. + """ if self.is_valid(obj_id): return self.storage.check(*args, obj_id=obj_id, **kwargs) raise ObjNotFoundError(obj_id) def get_random(self, *args, **kwargs): + """ See base class [ObjStorageFilter]. + """ yield from filter(lambda id: self.is_valid(id), self.storage.get_random(*args, **kwargs)) diff --git a/swh/objstorage/multiplexer/filter/read_write_filter.py b/swh/objstorage/multiplexer/filter/read_write_filter.py --- a/swh/objstorage/multiplexer/filter/read_write_filter.py +++ b/swh/objstorage/multiplexer/filter/read_write_filter.py @@ -9,7 +9,6 @@ class ReadObjStorageFilter(ObjStorageFilter): """ Filter that disable write operation of the storage. """ - def add(self, *args, **kwargs): return diff --git a/swh/objstorage/multiplexer/multiplexer_objstorage.py b/swh/objstorage/multiplexer/multiplexer_objstorage.py --- a/swh/objstorage/multiplexer/multiplexer_objstorage.py +++ b/swh/objstorage/multiplexer/multiplexer_objstorage.py @@ -53,20 +53,37 @@ self.storages = storages def __contains__(self, obj_id): + """ Indicates if the given object is present in the storage + + See base class [ObjStorage]. + """ for storage in self.storages: if obj_id in storage: return True return False def __iter__(self): + """ Iterates over the content of each storages + + Due to the demultiplexer nature, same content can be in multiple + storages and may be yielded multiple times. + + Warning: The `__iter__` methods frequently have bad performance. You + almost certainly don't want to use this method in production. + """ for storage in self.storages: yield from storage def __len__(self): - """ Returns the number of files in the storage. + """ Compute the number of objects in the current object storage. + + Identical objects present in multiple storages will be counted as + multiple objects. + Warning: this currently uses `__iter__`, its warning about bad + performance applies. - Warning: Multiple files may represent the same content, so this method - does not indicate how many different contents are in the storage. + Returns: + number of objects contained in the storage. """ return sum(map(len, self.storages)) @@ -96,21 +113,7 @@ def restore(self, content, obj_id=None): """ Restore a content that have been corrupted. - This function is identical to add_bytes but does not check if - the object id is already in the file system. - - (see "add" method) - - Args: - content: content of the object to be added to the storage - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - - Returns: - an id of the object into the storage. As the write-storages are - always readable as well, any id will be valid to retrieve a - content. + See base class [ObjStorage] & self.add() method. """ return [storage.restore(content, obj_id) for storage in self.storages].pop() @@ -118,14 +121,7 @@ def get(self, obj_id): """ Retrieve the content of a given object. - Args: - obj_id: object id. - - Returns: - the content of the requested object as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing. + See base class [ObjStorage]. """ for storage in self.storages: try: @@ -138,15 +134,7 @@ def check(self, obj_id): """ Perform an integrity check for a given object. - Verify that the file object is in place and that the gziped content - matches the object id. - - Args: - obj_id: object id. - - Raises: - ObjNotFoundError: if the requested object is missing. - Error: if the request object is corrupted. + See base class [ObjStorage]. """ nb_present = 0 for storage in self.storages: @@ -166,15 +154,7 @@ def get_random(self, batch_size): """ Get random ids of existing contents - This method is used in order to get random ids to perform - content integrity verifications on random contents. - - Attributes: - batch_size (int): Number of ids that will be given - - Yields: - An iterable of ids of contents that are in the current object - storage. + See base class [ObjStorage]. """ storages_set = [storage for storage in self.storages if len(storage) > 0] diff --git a/swh/objstorage/objstorage.py b/swh/objstorage/objstorage.py --- a/swh/objstorage/objstorage.py +++ b/swh/objstorage/objstorage.py @@ -3,6 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import abc + +from swh.core import hashutil + from .exc import ObjNotFoundError @@ -10,7 +14,14 @@ ID_HASH_LENGTH = 40 # Size in bytes of the hash hexadecimal representation. -class ObjStorage(): +def compute_hash(content): + h = hashutil._new_hash(ID_HASH_ALGO, len(content)) + h.update(content) + obj_id = h.digest() + return obj_id + + +class ObjStorage(metaclass=abc.ABCMeta): """ High-level API to manipulate the Software Heritage object storage. Conceptually, the object storage offers 5 methods: @@ -30,11 +41,18 @@ its own way to store the contents. """ - def __contains__(self, *args, **kwargs): + @abc.abstractmethod + def __contains__(self, obj_id, *args, **kwargs): + """ Indicates if the given object is present in the storage + + Returns: + True iff the object is present in the current object storage. + """ raise NotImplementedError( "Implementations of ObjStorage must have a '__contains__' method" ) + @abc.abstractmethod def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): """ Add a new object to the object storage. @@ -58,6 +76,8 @@ This function is identical to add_bytes but does not check if the object id is already in the file system. + The default implementation provided by the current class is + suitable for most cases. Args: content: content of the object to be added to the storage @@ -65,10 +85,10 @@ given, obj_id will be trusted to match bytes. If missing, obj_id will be computed on the fly. """ - raise NotImplemented( - "Implementations of ObjStorage must have a 'restore' method" - ) + # check_presence to false will erase the potential previous content. + return self.add(content, obj_id, check_presence=False) + @abc.abstractmethod def get(self, obj_id, *args, **kwargs): """ Retrieve the content of a given object. @@ -90,6 +110,9 @@ Note: This function does have a default implementation in ObjStorage that is suitable for most cases. + For object storages that needs to do the minimal number of requests + possible (ex: remote object storages), that method can be overriden + to perform a more efficient operation. Args: obj_ids: list of object ids. @@ -105,6 +128,7 @@ except ObjNotFoundError: yield None + @abc.abstractmethod def check(self, obj_id, *args, **kwargs): """ Perform an integrity check for a given object. @@ -128,7 +152,7 @@ This method is used in order to get random ids to perform content integrity verifications on random contents. - Attributes: + Args: batch_size (int): Number of ids that will be given Yields: diff --git a/swh/objstorage/objstorage_pathslicing.py b/swh/objstorage/objstorage_pathslicing.py --- a/swh/objstorage/objstorage_pathslicing.py +++ b/swh/objstorage/objstorage_pathslicing.py @@ -12,7 +12,7 @@ from swh.core import hashutil -from .objstorage import ObjStorage, ID_HASH_ALGO, ID_HASH_LENGTH +from .objstorage import ObjStorage, compute_hash, ID_HASH_ALGO, ID_HASH_LENGTH from .exc import ObjNotFoundError, Error @@ -131,10 +131,9 @@ ) def __contains__(self, obj_id): - """ Check whether the given object is present in the storage or not. + """ Indicates if the given object is present in the storage - Returns: - True iff the object is present in the storage. + See base class [ObjStorage]. """ hex_obj_id = hashutil.hash_to_hex(obj_id) return os.path.exists(self._obj_path(hex_obj_id)) @@ -167,7 +166,6 @@ Return: number of objects contained in the storage - """ return sum(1 for i in self) @@ -198,25 +196,13 @@ """ return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id) - def add(self, bytes, obj_id=None, check_presence=True): - """ Add a new object to the object storage. + def add(self, content, obj_id=None, check_presence=True): + """ Add a new object to the current object storage. - Args: - bytes: content of the object to be added to the storage. - obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When - given, obj_id will be trusted to match the bytes. If missing, - obj_id will be computed on the fly. - check_presence: indicate if the presence of the content should be - verified before adding the file. - - Returns: - the id of the object into the storage. + See base class [ObjStorage]. """ if obj_id is None: - # Checksum is missing, compute it on the fly. - h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) - h.update(bytes) - obj_id = h.digest() + obj_id = compute_hash(content) if check_presence and obj_id in self: # If the object is already present, return immediatly. @@ -224,35 +210,14 @@ hex_obj_id = hashutil.hash_to_hex(obj_id) with _write_obj_file(hex_obj_id, self) as f: - f.write(bytes) + f.write(content) return obj_id - def restore(self, bytes, obj_id=None): - """ Restore a content that have been corrupted. - - This function is identical to add_bytes but does not check if - the object id is already in the file system. - - Args: - bytes: content of the object to be added to the storage - obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When - given, obj_id will be trusted to match bytes. If missing, - obj_id will be computed on the fly. - """ - return self.add(bytes, obj_id, check_presence=False) - def get(self, obj_id): """ Retrieve the content of a given object. - Args: - obj_id: object id. - - Returns: - the content of the requested object as bytes. - - Raises: - ObjNotFoundError: if the requested object is missing. + See base class [ObjStorage]. """ if obj_id not in self: raise ObjNotFoundError(obj_id) @@ -265,15 +230,7 @@ def check(self, obj_id): """ Perform an integrity check for a given object. - Verify that the file object is in place and that the gziped content - matches the object id. - - Args: - obj_id: object id. - - Raises: - ObjNotFoundError: if the requested object is missing. - Error: if the request object is corrupted. + See base class [ObjStorage]. """ if obj_id not in self: raise ObjNotFoundError(obj_id) @@ -311,15 +268,7 @@ def get_random(self, batch_size): """ Get random ids of existing contents - This method is used in order to get random ids to perform - content integrity verifications on random contents. - - Attributes: - batch_size (int): Number of ids that will be given - - Yields: - An iterable of ids of contents that are in the current object - storage. + See base class [ObjStorage]. """ def get_random_content(self, batch_size): """ Get a batch of content inside a single directory. diff --git a/swh/objstorage/tests/test_objstorage.py b/swh/objstorage/tests/test_objstorage.py deleted file mode 100644 --- a/swh/objstorage/tests/test_objstorage.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import tempfile - -from swh.objstorage import PathSlicingObjStorage - -from swh.objstorage.tests.objstorage_testing import ObjStorageTestFixture - - -class TestObjStorage(ObjStorageTestFixture, unittest.TestCase): - - def setUp(self): - self.storage = PathSlicingObjStorage(tempfile.mkdtemp(), '0:2/0:5')