Page MenuHomeSoftware Heritage

D49.id178.diff
No OneTemporary

D49.id178.diff

diff --git a/swh/storage/checker/checker.py b/swh/storage/checker/checker.py
--- a/swh/storage/checker/checker.py
+++ b/swh/storage/checker/checker.py
@@ -34,7 +34,7 @@
restore corrupted content.
"""
- def __init__(self, config, root, depth, backup_urls):
+ def __init__(self, config, root, slicing, backup_urls):
""" Create a checker that ensures the objstorage has no corrupted files.
Args:
@@ -47,7 +47,7 @@
get a content.
"""
self.config = config
- self.objstorage = PathSlicingObjStorage(root, depth, slicing=2)
+ self.objstorage = PathSlicingObjStorage(root, slicing)
self.backup_storages = [get_storage('remote_storage', [backup_url])
for backup_url in backup_urls]
diff --git a/swh/storage/objstorage/api/server.py b/swh/storage/objstorage/api/server.py
--- a/swh/storage/objstorage/api/server.py
+++ b/swh/storage/objstorage/api/server.py
@@ -16,7 +16,7 @@
DEFAULT_CONFIG = {
'storage_base': ('str', '/tmp/swh-storage/objects/'),
- 'storage_depth': ('int', 3)
+ 'storage_slicing': ('str', '0:2/2:4/4:6')
}
app = Flask(__name__)
@@ -31,8 +31,7 @@
@app.before_request
def before_request():
g.objstorage = PathSlicingObjStorage(app.config['storage_base'],
- app.config['storage_depth'],
- slicing=2)
+ app.config['storage_slicing'])
@app.route('/')
diff --git a/swh/storage/objstorage/objstorage.py b/swh/storage/objstorage/objstorage.py
--- a/swh/storage/objstorage/objstorage.py
+++ b/swh/storage/objstorage/objstorage.py
@@ -4,6 +4,9 @@
# See top-level LICENSE file for more information
+ID_HASH_ALGO = 'sha1'
+
+
class ObjStorage():
""" High-level API to manipulate the Software Heritage object storage.
@@ -47,7 +50,7 @@
"Implementations of ObjStorage must have a 'add' method"
)
- def restore(self, content, obj_id, *args, **kwargs):
+ def restore(self, content, obj_id=None, *args, **kwargs):
""" Restore a content that has been corrupted.
This function is identical to add_bytes but does not check if
diff --git a/swh/storage/objstorage/objstorage_pathslicing.py b/swh/storage/objstorage/objstorage_pathslicing.py
--- a/swh/storage/objstorage/objstorage_pathslicing.py
+++ b/swh/storage/objstorage/objstorage_pathslicing.py
@@ -12,12 +12,10 @@
from swh.core import hashutil
-from .objstorage import ObjStorage
+from .objstorage import ObjStorage, ID_HASH_ALGO
from ..exc import ObjNotFoundError, Error
-ID_HASH_ALGO = 'sha1'
-
GZIP_BUFSIZ = 1048576
DIR_MODE = 0o755
@@ -84,33 +82,32 @@
the value of the ID_HASH_ALGO constant (see hashutil for its meaning).
To avoid directories that contain too many files, the object storage has a
- given depth. Each depth level consumes a given amount of characters of
- the object id.
+ given slicing. Each slicing corresponds to a directory that is named
+ according to the hash of its content.
So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689
will be stored in the given object storages :
- - depth=3, slicing=2 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
- - depth=1, slicing=5 : 34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+ - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+ - 0:1/0:5/ : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689
The files in the storage are stored in gzipped compressed format.
Attributes:
root (string): path to the root directory of the storage on the disk.
- depth (int): number of subdirectories created to store a file.
- slicing (int): number of hash character consumed for each
- subdirectories.
+ bounds: list of slice objects that indicate the beginning and the end of
+ each subdirectory for a content.
"""
- def __init__(self, root, depth, slicing):
+ def __init__(self, root, slicing):
""" Create an object to access a hash-slicing based object storage.
Args:
root (string): path to the root directory of the storage on
the disk.
- depth (int): number of subdirectories created to store a file.
- slicing (int): number of hash character consumed for each
- subdirectories.
+ slicing (string): string that indicates the slicing to perform
+ on the hash of the content to know the path where it should
+ be stored.
"""
if not os.path.isdir(root):
raise ValueError(
@@ -118,8 +115,13 @@
)
self.root = root
- self.depth = depth
- self.slicing = slicing
+ # Make a list of slice objects, each holding the beginning
+ # and the end of one slicing component.
+ self.bounds = [
+ slice(*map(int, sbounds.split(':')))
+ for sbounds in slicing.split('/')
+ if sbounds
+ ]
def __contains__(self, obj_id):
""" Check whether the given object is present in the storage or not.
@@ -173,20 +175,15 @@
Returns:
Path to the directory that contains the required object.
"""
- if len(hex_obj_id) < self.depth * self.slicing:
+ max_endchar = max(map(lambda bound: bound.stop, self.bounds))
+ if len(hex_obj_id) < max_endchar:
raise ValueError(
- 'Object id "%s" is to short for %d-slicing at depth %d'
- % (hex_obj_id, self.slicing, self.depth)
+ 'Object id "%s" is to short for slicing to char %d'
+ % (hex_obj_id, max_endchar)
)
- # Compute [depth] substrings of [hex_obj_id], each of length [slicing],
- # starting from the beginning.
- id_steps = [hex_obj_id[i * self.slicing:
- i * self.slicing + self.slicing]
- for i in range(self.depth)]
- steps = [self.root] + id_steps
-
- return os.path.join(*steps)
+ slices = [hex_obj_id[bound] for bound in self.bounds]
+ return os.path.join(self.root, *slices)
def _obj_path(self, hex_obj_id):
""" Compute the full path to an object into the current storage.
@@ -331,7 +328,7 @@
a tuple (batch size, batch).
"""
dirs = []
- for level in range(self.depth):
+ for level in range(len(self.bounds)):
path = os.path.join(self.root, *dirs)
dir_list = next(os.walk(path))[1]
if 'tmp' in dir_list:
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -68,7 +68,9 @@
except psycopg2.OperationalError as e:
raise StorageDBError(e)
- self.objstorage = PathSlicingObjStorage(obj_root, depth=3, slicing=2)
+ # TODO this needs to be configured
+ self.objstorage = PathSlicingObjStorage(obj_root,
+ slicing='0:2/2:4/4:6')
def content_add(self, content):
"""Add content blobs to the storage
diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py
--- a/swh/storage/tests/test_archiver.py
+++ b/swh/storage/tests/test_archiver.py
@@ -37,7 +37,7 @@
# Launch the backup server
self.backup_objroot = tempfile.mkdtemp(prefix='remote')
self.config = {'storage_base': self.backup_objroot,
- 'storage_depth': 3}
+ 'storage_slicing': '0:2/2:4/4:6'}
self.app = app
super().setUp()
diff --git a/swh/storage/tests/test_checker.py b/swh/storage/tests/test_checker.py
--- a/swh/storage/tests/test_checker.py
+++ b/swh/storage/tests/test_checker.py
@@ -43,8 +43,8 @@
# Connect to an objstorage
config = {'batch_size': 10}
path = tempfile.mkdtemp()
- depth = 3
- self.checker = ContentChecker(config, path, depth, 'http://None')
+ slicing = '0:2/2:4/4:6'
+ self.checker = ContentChecker(config, path, slicing, 'http://None')
self.checker.backup_storages = [MockBackupStorage(),
MockBackupStorage()]
diff --git a/swh/storage/tests/test_objstorage_api.py b/swh/storage/tests/test_objstorage_api.py
--- a/swh/storage/tests/test_objstorage_api.py
+++ b/swh/storage/tests/test_objstorage_api.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
import tempfile
import unittest
@@ -24,7 +23,7 @@
def setUp(self):
self.config = {'storage_base': tempfile.mkdtemp(),
- 'storage_depth': 3}
+ 'storage_slicing': '0:1/0:5'}
self.app = app
super().setUp()
self.objstorage = RemoteObjStorage(self.url())
@@ -65,20 +64,12 @@
@istest
def content_check_invalid(self):
content = bytes('content_check_invalid', 'utf8')
- id = self.objstorage.content_add(content)
- hex_obj_id = hashutil.hash_to_hex(id)
- dir_path = os.path.join(
- self.config['storage_base'],
- *[hex_obj_id[i*2:i*2+2]
- for i in range(int(self.config['storage_depth']))]
- )
- path = os.path.join(dir_path, hex_obj_id)
- content = list(content)
- with open(path, 'bw') as f:
- content[0] = (content[0] + 1) % 128
- f.write(bytes(content))
+ invalid_id = hashutil.hashdata(b'invalid content')['sha1']
+ # Add the content with an invalid id.
+ self.objstorage.content_add(content, invalid_id)
+ # Then check it and expect an error.
with self.assertRaises(Error):
- self.objstorage.content_check(id)
+ self.objstorage.content_check(invalid_id)
@istest
def content_check_valid(self):
diff --git a/swh/storage/tests/test_objstorage_pathslicing.py b/swh/storage/tests/test_objstorage_pathslicing.py
--- a/swh/storage/tests/test_objstorage_pathslicing.py
+++ b/swh/storage/tests/test_objstorage_pathslicing.py
@@ -19,11 +19,9 @@
def setUp(self):
super().setUp()
- self.depth = 3
- self.slicing = 2
+ self.slicing = '0:2/2:4/4:6'
self.tmpdir = tempfile.mkdtemp()
- self.storage = PathSlicingObjStorage(self.tmpdir, self.depth,
- self.slicing)
+ self.storage = PathSlicingObjStorage(self.tmpdir, self.slicing)
def content_path(self, obj_id):
hex_obj_id = hashutil.hash_to_hex(obj_id)

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 10:21 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233746

Event Timeline