diff --git a/swh/storage/checker/checker.py b/swh/storage/checker/checker.py --- a/swh/storage/checker/checker.py +++ b/swh/storage/checker/checker.py @@ -34,7 +34,7 @@ restore corrupted content. """ - def __init__(self, config, root, depth, backup_urls): + def __init__(self, config, root, slicing, backup_urls): """ Create a checker that ensure the objstorage have no corrupted file. Args: @@ -47,7 +47,7 @@ get a content. """ self.config = config - self.objstorage = PathSlicingObjStorage(root, depth, slicing=2) + self.objstorage = PathSlicingObjStorage(root, slicing) self.backup_storages = [get_storage('remote_storage', [backup_url]) for backup_url in backup_urls] diff --git a/swh/storage/objstorage/api/server.py b/swh/storage/objstorage/api/server.py --- a/swh/storage/objstorage/api/server.py +++ b/swh/storage/objstorage/api/server.py @@ -16,7 +16,7 @@ DEFAULT_CONFIG = { 'storage_base': ('str', '/tmp/swh-storage/objects/'), - 'storage_depth': ('int', 3) + 'storage_slicing': ('str', '0:2/2:4/4:6') } app = Flask(__name__) @@ -31,8 +31,7 @@ @app.before_request def before_request(): g.objstorage = PathSlicingObjStorage(app.config['storage_base'], - app.config['storage_depth'], - slicing=2) + app.config['storage_slicing']) @app.route('/') diff --git a/swh/storage/objstorage/objstorage.py b/swh/storage/objstorage/objstorage.py --- a/swh/storage/objstorage/objstorage.py +++ b/swh/storage/objstorage/objstorage.py @@ -4,6 +4,10 @@ # See top-level LICENSE file for more information +ID_HASH_ALGO = 'sha1' +ID_HASH_LENGTH = 20 # Size in bytes of the raw sha1 digest. NOTE(review): the hexadecimal representation is 40 characters; confirm which length slicing bounds should be checked against. + + class ObjStorage(): """ High-level API to manipulate the Software Heritage object storage. @@ -47,7 +51,7 @@ "Implementations of ObjStorage must have a 'add' method" ) - def restore(self, content, obj_id, *args, **kwargs): + def restore(self, content, obj_id=None, *args, **kwargs): """ Restore a content that have been corrupted. 
This function is identical to add_bytes but does not check if diff --git a/swh/storage/objstorage/objstorage_pathslicing.py b/swh/storage/objstorage/objstorage_pathslicing.py --- a/swh/storage/objstorage/objstorage_pathslicing.py +++ b/swh/storage/objstorage/objstorage_pathslicing.py @@ -12,12 +12,10 @@ from swh.core import hashutil -from .objstorage import ObjStorage +from .objstorage import ObjStorage, ID_HASH_ALGO, ID_HASH_LENGTH from ..exc import ObjNotFoundError, Error -ID_HASH_ALGO = 'sha1' - GZIP_BUFSIZ = 1048576 DIR_MODE = 0o755 @@ -84,33 +82,32 @@ the value of the ID_HASH_ALGO constant (see hashutil for its meaning). To avoid directories that contain too many files, the object storage has a - given depth. Each depth level consumes a given amount of characters of - the object id. + given slicing. Each slicing corresponds to a directory that is named + according to the hash of its content. So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689 will be stored in the given object storages : - - depth=3, slicing=2 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689 - - depth=1, slicing=5 : 34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689 + - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689 + - 0:1/0:5/ : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689 The files in the storage are stored in gzipped compressed format. Attributes: root (string): path to the root directory of the storage on the disk. - depth (int): number of subdirectories created to store a file. - slicing (int): number of hash character consumed for each - subdirectories. + bounds: list of slice objects that indicate the beginning and the end of + each subdirectory for a content. """ - def __init__(self, root, depth, slicing): + def __init__(self, root, slicing): """ Create an object to access a hash-slicing based object storage. Args: root (string): path to the root directory of the storage on the disk. - depth (int): number of subdirectories created to store a file. 
- slicing (int): number of hash character consumed for each - subdirectories. + slicing (string): string that indicates the slicing to perform + on the hash of the content to know the path where it should + be stored. """ if not os.path.isdir(root): raise ValueError( @@ -118,8 +115,20 @@ ) self.root = root - self.depth = depth - self.slicing = slicing + # Make a list of tuples where each tuple contains the beginning + # and the end of each slicing. + self.bounds = [ + slice(*map(int, sbounds.split(':'))) + for sbounds in slicing.split('/') + if sbounds + ] + + max_endchar = max(map(lambda bound: bound.stop, self.bounds)) + if ID_HASH_LENGTH < max_endchar: + raise ValueError( + 'Algorithm %s has too short hash for slicing to char %d' + % (ID_HASH_ALGO, max_endchar) + ) def __contains__(self, obj_id): """ Check whether the given object is present in the storage or not. @@ -173,20 +182,8 @@ Returns: Path to the directory that contains the required object. """ - if len(hex_obj_id) < self.depth * self.slicing: - raise ValueError( - 'Object id "%s" is to short for %d-slicing at depth %d' - % (hex_obj_id, self.slicing, self.depth) - ) - - # Compute [depth] substrings of [hex_obj_id], each of length [slicing], - # starting from the beginning. - id_steps = [hex_obj_id[i * self.slicing: - i * self.slicing + self.slicing] - for i in range(self.depth)] - steps = [self.root] + id_steps - - return os.path.join(*steps) + slices = [hex_obj_id[bound] for bound in self.bounds] + return os.path.join(self.root, *slices) def _obj_path(self, hex_obj_id): """ Compute the full path to an object into the current storage. @@ -331,7 +328,7 @@ a tuple (batch size, batch). 
""" dirs = [] - for level in range(self.depth): + for level in range(len(self.bounds)): path = os.path.join(self.root, *dirs) dir_list = next(os.walk(path))[1] if 'tmp' in dir_list: diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -68,7 +68,9 @@ except psycopg2.OperationalError as e: raise StorageDBError(e) - self.objstorage = PathSlicingObjStorage(obj_root, depth=3, slicing=2) + # TODO this needs to be configured + self.objstorage = PathSlicingObjStorage(obj_root, + slicing='0:2/2:4/4:6') def content_add(self, content): """Add content blobs to the storage diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py --- a/swh/storage/tests/test_archiver.py +++ b/swh/storage/tests/test_archiver.py @@ -37,7 +37,7 @@ # Launch the backup server self.backup_objroot = tempfile.mkdtemp(prefix='remote') self.config = {'storage_base': self.backup_objroot, - 'storage_depth': 3} + 'storage_slicing': '0:2/2:4/4:6'} self.app = app super().setUp() diff --git a/swh/storage/tests/test_checker.py b/swh/storage/tests/test_checker.py --- a/swh/storage/tests/test_checker.py +++ b/swh/storage/tests/test_checker.py @@ -43,8 +43,8 @@ # Connect to an objstorage config = {'batch_size': 10} path = tempfile.mkdtemp() - depth = 3 - self.checker = ContentChecker(config, path, depth, 'http://None') + slicing = '0:2/2:4/4:6' + self.checker = ContentChecker(config, path, slicing, 'http://None') self.checker.backup_storages = [MockBackupStorage(), MockBackupStorage()] diff --git a/swh/storage/tests/test_objstorage_api.py b/swh/storage/tests/test_objstorage_api.py --- a/swh/storage/tests/test_objstorage_api.py +++ b/swh/storage/tests/test_objstorage_api.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os import tempfile import unittest @@ -24,7 +23,7 @@ def setUp(self): self.config = {'storage_base': 
tempfile.mkdtemp(), - 'storage_depth': 3} + 'storage_slicing': '0:1/0:5'} self.app = app super().setUp() self.objstorage = RemoteObjStorage(self.url()) @@ -65,20 +64,12 @@ @istest def content_check_invalid(self): content = bytes('content_check_invalid', 'utf8') - id = self.objstorage.content_add(content) - hex_obj_id = hashutil.hash_to_hex(id) - dir_path = os.path.join( - self.config['storage_base'], - *[hex_obj_id[i*2:i*2+2] - for i in range(int(self.config['storage_depth']))] - ) - path = os.path.join(dir_path, hex_obj_id) - content = list(content) - with open(path, 'bw') as f: - content[0] = (content[0] + 1) % 128 - f.write(bytes(content)) + invalid_id = hashutil.hashdata(b'invalid content')['sha1'] + # Add the content with an invalid id. + self.objstorage.content_add(content, invalid_id) + # Then check it and expect an error. with self.assertRaises(Error): - self.objstorage.content_check(id) + self.objstorage.content_check(invalid_id) @istest def content_check_valid(self): diff --git a/swh/storage/tests/test_objstorage_pathslicing.py b/swh/storage/tests/test_objstorage_pathslicing.py --- a/swh/storage/tests/test_objstorage_pathslicing.py +++ b/swh/storage/tests/test_objstorage_pathslicing.py @@ -19,11 +19,9 @@ def setUp(self): super().setUp() - self.depth = 3 - self.slicing = 2 + self.slicing = '0:2/2:4/4:6' self.tmpdir = tempfile.mkdtemp() - self.storage = PathSlicingObjStorage(self.tmpdir, self.depth, - self.slicing) + self.storage = PathSlicingObjStorage(self.tmpdir, self.slicing) def content_path(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id)