diff --git a/swh/objstorage/objstorage.py b/swh/objstorage/objstorage.py
index 9e42917..1e2dcea 100644
--- a/swh/objstorage/objstorage.py
+++ b/swh/objstorage/objstorage.py
@@ -1,119 +1,141 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from .exc import ObjNotFoundError
+
 
 ID_HASH_ALGO = 'sha1'
 ID_HASH_LENGTH = 40  # Size in bytes of the hash hexadecimal representation.
 
 
 class ObjStorage():
     """ High-level API to manipulate the Software Heritage object storage.
 
     Conceptually, the object storage offers 5 methods:
 
     - __contains__()  check if an object is present, by object id
     - add()           add a new object, returning an object id
     - restore()       same as add() but erase an already existed content
     - get()           retrieve the content of an object, by object id
     - check()         check the integrity of an object, by object id
 
     And some management methods:
 
     - get_random()    get random object id of existing contents (used for the
                       content integrity checker).
 
     Each implementation of this interface can have a different behavior and
     its own way to store the contents.
     """
 
     def __contains__(self, *args, **kwargs):
         raise NotImplementedError(
             "Implementations of ObjStorage must have a '__contains__' method"
         )
 
     def add(self, content, obj_id=None, check_presence=True, *args, **kwargs):
         """ Add a new object to the object storage.
 
         Args:
             content: content of the object to be added to the storage.
             obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When
                 given, obj_id will be trusted to match the bytes. If missing,
                 obj_id will be computed on the fly.
             check_presence: indicate if the presence of the content should be
                 verified before adding the file.
 
         Returns:
             the id of the object into the storage.
         """
         raise NotImplementedError(
             "Implementations of ObjStorage must have a 'add' method"
         )
 
     def restore(self, content, obj_id=None, *args, **kwargs):
         """ Restore a content that have been corrupted.
 
         This function is identical to add_bytes but does not check if
         the object id is already in the file system.
 
         Args:
             content: content of the object to be added to the storage
             obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When
                 given, obj_id will be trusted to match bytes. If missing,
                 obj_id will be computed on the fly.
         """
-        raise NotImplemented(
+        raise NotImplementedError(
             "Implementations of ObjStorage must have a 'restore' method"
         )
 
     def get(self, obj_id, *args, **kwargs):
         """ Retrieve the content of a given object.
 
         Args:
             obj_id: object id.
 
         Returns:
             the content of the requested object as bytes.
 
         Raises:
             ObjNotFoundError: if the requested object is missing.
         """
         raise NotImplementedError(
             "Implementations of ObjStorage must have a 'get' method"
         )
 
+    def get_batch(self, obj_ids, *args, **kwargs):
+        """ Retrieve content in bulk.
+
+        Note: This function does have a default implementation in ObjStorage
+        that is suitable for most cases.
+
+        Args:
+            obj_ids: list of object ids.
+
+        Yields:
+            the content of each requested object, in the order of obj_ids,
+            or None for an object that is missing. A missing object does not
+            raise, so one failure does not cancel the whole request.
+        """
+        for obj_id in obj_ids:
+            try:
+                yield self.get(obj_id)
+            except ObjNotFoundError:
+                yield None
+
     def check(self, obj_id, *args, **kwargs):
         """ Perform an integrity check for a given object.
 
         Verify that the file object is in place and that the gziped content
         matches the object id.
 
         Args:
             obj_id: object id.
 
         Raises:
             ObjNotFoundError: if the requested object is missing.
             Error: if the request object is corrupted.
         """
         raise NotImplementedError(
             "Implementations of ObjStorage must have a 'check' method"
         )
 
     def get_random(self, batch_size, *args, **kwargs):
         """ Get random ids of existing contents
 
         This method is used in order to get random ids to perform
         content integrity verifications on random contents.
 
         Attributes:
             batch_size (int): Number of ids that will be given
 
         Yields:
             An iterable of ids of contents that are in the current object
             storage.
         """
         raise NotImplementedError(
             "The current implementation of ObjStorage does not support "
             "'get_random' operation"
         )
diff --git a/swh/objstorage/tests/objstorage_testing.py b/swh/objstorage/tests/objstorage_testing.py
index 3009cd0..a26a027 100644
--- a/swh/objstorage/tests/objstorage_testing.py
+++ b/swh/objstorage/tests/objstorage_testing.py
@@ -1,70 +1,87 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from nose.tools import istest
 
 from swh.core import hashutil
 from swh.objstorage import exc
 
 
 class ObjStorageTestFixture():
 
     def setUp(self):
         super().setUp()
 
     def hash_content(self, content):
         obj_id = hashutil.hashdata(content)['sha1']
         return content, obj_id
 
     def assertContentMatch(self, obj_id, expected_content):
         content = self.storage.get(obj_id)
         self.assertEqual(content, expected_content)
 
     @istest
     def add_get_w_id(self):
         content, obj_id = self.hash_content(b'add_get_w_id')
         r = self.storage.add(content, obj_id=obj_id)
         self.assertEqual(obj_id, r)
         self.assertContentMatch(obj_id, content)
 
     @istest
     def add_get_wo_id(self):
         content, obj_id = self.hash_content(b'add_get_wo_id')
         r = self.storage.add(content)
         self.assertEqual(obj_id, r)
         self.assertContentMatch(obj_id, content)
 
+    @istest
+    def add_get_batch(self):
+        content1, obj_id1 = self.hash_content(b'add_get_batch_1')
+        content2, obj_id2 = self.hash_content(b'add_get_batch_2')
+        self.storage.add(content1, obj_id1)
+        self.storage.add(content2, obj_id2)
+        cr1, cr2 = self.storage.get_batch([obj_id1, obj_id2])
+        self.assertEqual(cr1, content1)
+        self.assertEqual(cr2, content2)
+
+    @istest
+    def get_batch_unexisting_content(self):
+        content, obj_id = self.hash_content(b'get_batch_unexisting_content')
+        result = list(self.storage.get_batch([obj_id]))
+        self.assertEqual(len(result), 1)
+        self.assertIsNone(result[0])
+
     @istest
     def restore_content(self):
         valid_content, valid_obj_id = self.hash_content(b'restore_content')
         invalid_content = b'unexpected content'
         id_adding = self.storage.add(invalid_content, valid_obj_id)
         id_restore = self.storage.restore(valid_content)
         # Adding a false content then restore it to the right one and
         # then perform a verification should result in a successful check.
         self.assertEqual(id_adding, valid_obj_id)
         self.assertEqual(id_restore, valid_obj_id)
         self.assertContentMatch(valid_obj_id, valid_content)
 
     @istest
     def get_missing(self):
         content, obj_id = self.hash_content(b'get_missing')
         with self.assertRaises(exc.Error):
             self.storage.get(obj_id)
 
     @istest
     def check_missing(self):
         content, obj_id = self.hash_content(b'check_missing')
         with self.assertRaises(exc.Error):
             self.storage.check(obj_id)
 
     @istest
     def check_present(self):
-        content, obj_id = self.hash_content(b'check_missing')
+        content, obj_id = self.hash_content(b'check_present')
         self.storage.add(content)
         try:
             self.storage.check(obj_id)
-        except:
+        except Exception:
             self.fail('Integrity check failed')