Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9696985
D49.id178.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
D49.id178.diff
View Options
diff --git a/swh/storage/checker/checker.py b/swh/storage/checker/checker.py
--- a/swh/storage/checker/checker.py
+++ b/swh/storage/checker/checker.py
@@ -34,7 +34,7 @@
restore corrupted content.
"""
- def __init__(self, config, root, depth, backup_urls):
+ def __init__(self, config, root, slicing, backup_urls):
""" Create a checker that ensure the objstorage have no corrupted file.
Args:
@@ -47,7 +47,7 @@
get a content.
"""
self.config = config
- self.objstorage = PathSlicingObjStorage(root, depth, slicing=2)
+ self.objstorage = PathSlicingObjStorage(root, slicing)
self.backup_storages = [get_storage('remote_storage', [backup_url])
for backup_url in backup_urls]
diff --git a/swh/storage/objstorage/api/server.py b/swh/storage/objstorage/api/server.py
--- a/swh/storage/objstorage/api/server.py
+++ b/swh/storage/objstorage/api/server.py
@@ -16,7 +16,7 @@
DEFAULT_CONFIG = {
'storage_base': ('str', '/tmp/swh-storage/objects/'),
- 'storage_depth': ('int', 3)
+ 'storage_slicing': ('str', '0:2/2:4/4:6')
}
app = Flask(__name__)
@@ -31,8 +31,7 @@
@app.before_request
def before_request():
g.objstorage = PathSlicingObjStorage(app.config['storage_base'],
- app.config['storage_depth'],
- slicing=2)
+ app.config['storage_slicing'])
@app.route('/')
diff --git a/swh/storage/objstorage/objstorage.py b/swh/storage/objstorage/objstorage.py
--- a/swh/storage/objstorage/objstorage.py
+++ b/swh/storage/objstorage/objstorage.py
@@ -4,6 +4,9 @@
# See top-level LICENSE file for more information
+ID_HASH_ALGO = 'sha1'
+
+
class ObjStorage():
""" High-level API to manipulate the Software Heritage object storage.
@@ -47,7 +50,7 @@
"Implementations of ObjStorage must have a 'add' method"
)
- def restore(self, content, obj_id, *args, **kwargs):
+ def restore(self, content, obj_id=None, *args, **kwargs):
""" Restore a content that have been corrupted.
This function is identical to add_bytes but does not check if
diff --git a/swh/storage/objstorage/objstorage_pathslicing.py b/swh/storage/objstorage/objstorage_pathslicing.py
--- a/swh/storage/objstorage/objstorage_pathslicing.py
+++ b/swh/storage/objstorage/objstorage_pathslicing.py
@@ -12,12 +12,10 @@
from swh.core import hashutil
-from .objstorage import ObjStorage
+from .objstorage import ObjStorage, ID_HASH_ALGO
from ..exc import ObjNotFoundError, Error
-ID_HASH_ALGO = 'sha1'
-
GZIP_BUFSIZ = 1048576
DIR_MODE = 0o755
@@ -84,33 +82,32 @@
the value of the ID_HASH_ALGO constant (see hashutil for its meaning).
To avoid directories that contain too many files, the object storage has a
- given depth. Each depth level consumes a given amount of characters of
- the object id.
+ given slicing. Each slicing corresponds to a directory that is named
+ according to the hash of its content.
So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689
will be stored in the given object storages :
- - depth=3, slicing=2 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
- - depth=1, slicing=5 : 34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+ - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+ - 0:1/0:5/ : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689
The files in the storage are stored in gzipped compressed format.
Attributes:
root (string): path to the root directory of the storage on the disk.
- depth (int): number of subdirectories created to store a file.
- slicing (int): number of hash character consumed for each
- subdirectories.
+ bounds: list of slice objects that indicate the beginning and the end
+ each subdirectory for a content.
"""
- def __init__(self, root, depth, slicing):
+ def __init__(self, root, slicing):
""" Create an object to access a hash-slicing based object storage.
Args:
root (string): path to the root directory of the storage on
the disk.
- depth (int): number of subdirectories created to store a file.
- slicing (int): number of hash character consumed for each
- subdirectories.
+ slicing (string): string that indicates the slicing to perform
+ on the hash of the content to know the path where it should
+ be stored.
"""
if not os.path.isdir(root):
raise ValueError(
@@ -118,8 +115,13 @@
)
self.root = root
- self.depth = depth
- self.slicing = slicing
+ # Make a list of slice objects, one per slicing step, each giving
+ # the character range to extract from the hex object id.
+ self.bounds = [
+ slice(*map(int, sbounds.split(':')))
+ for sbounds in slicing.split('/')
+ if sbounds
+ ]
def __contains__(self, obj_id):
""" Check whether the given object is present in the storage or not.
@@ -173,20 +175,15 @@
Returns:
Path to the directory that contains the required object.
"""
- if len(hex_obj_id) < self.depth * self.slicing:
+ max_endchar = max(map(lambda bound: bound.stop, self.bounds))
+ if len(hex_obj_id) < max_endchar:
raise ValueError(
- 'Object id "%s" is to short for %d-slicing at depth %d'
- % (hex_obj_id, self.slicing, self.depth)
+ 'Object id "%s" is too short for slicing to char %d'
+ % (hex_obj_id, max_endchar)
)
- # Compute [depth] substrings of [hex_obj_id], each of length [slicing],
- # starting from the beginning.
- id_steps = [hex_obj_id[i * self.slicing:
- i * self.slicing + self.slicing]
- for i in range(self.depth)]
- steps = [self.root] + id_steps
-
- return os.path.join(*steps)
+ slices = [hex_obj_id[bound] for bound in self.bounds]
+ return os.path.join(self.root, *slices)
def _obj_path(self, hex_obj_id):
""" Compute the full path to an object into the current storage.
@@ -331,7 +328,7 @@
a tuple (batch size, batch).
"""
dirs = []
- for level in range(self.depth):
+ for level in range(len(self.bounds)):
path = os.path.join(self.root, *dirs)
dir_list = next(os.walk(path))[1]
if 'tmp' in dir_list:
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -68,7 +68,9 @@
except psycopg2.OperationalError as e:
raise StorageDBError(e)
- self.objstorage = PathSlicingObjStorage(obj_root, depth=3, slicing=2)
+ # TODO this needs to be configured
+ self.objstorage = PathSlicingObjStorage(obj_root,
+ slicing='0:2/2:4/4:6')
def content_add(self, content):
"""Add content blobs to the storage
diff --git a/swh/storage/tests/test_archiver.py b/swh/storage/tests/test_archiver.py
--- a/swh/storage/tests/test_archiver.py
+++ b/swh/storage/tests/test_archiver.py
@@ -37,7 +37,7 @@
# Launch the backup server
self.backup_objroot = tempfile.mkdtemp(prefix='remote')
self.config = {'storage_base': self.backup_objroot,
- 'storage_depth': 3}
+ 'storage_slicing': '0:2/2:4/4:6'}
self.app = app
super().setUp()
diff --git a/swh/storage/tests/test_checker.py b/swh/storage/tests/test_checker.py
--- a/swh/storage/tests/test_checker.py
+++ b/swh/storage/tests/test_checker.py
@@ -43,8 +43,8 @@
# Connect to an objstorage
config = {'batch_size': 10}
path = tempfile.mkdtemp()
- depth = 3
- self.checker = ContentChecker(config, path, depth, 'http://None')
+ slicing = '0:2/2:4/4:6'
+ self.checker = ContentChecker(config, path, slicing, 'http://None')
self.checker.backup_storages = [MockBackupStorage(),
MockBackupStorage()]
diff --git a/swh/storage/tests/test_objstorage_api.py b/swh/storage/tests/test_objstorage_api.py
--- a/swh/storage/tests/test_objstorage_api.py
+++ b/swh/storage/tests/test_objstorage_api.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
import tempfile
import unittest
@@ -24,7 +23,7 @@
def setUp(self):
self.config = {'storage_base': tempfile.mkdtemp(),
- 'storage_depth': 3}
+ 'storage_slicing': '0:1/0:5'}
self.app = app
super().setUp()
self.objstorage = RemoteObjStorage(self.url())
@@ -65,20 +64,12 @@
@istest
def content_check_invalid(self):
content = bytes('content_check_invalid', 'utf8')
- id = self.objstorage.content_add(content)
- hex_obj_id = hashutil.hash_to_hex(id)
- dir_path = os.path.join(
- self.config['storage_base'],
- *[hex_obj_id[i*2:i*2+2]
- for i in range(int(self.config['storage_depth']))]
- )
- path = os.path.join(dir_path, hex_obj_id)
- content = list(content)
- with open(path, 'bw') as f:
- content[0] = (content[0] + 1) % 128
- f.write(bytes(content))
+ invalid_id = hashutil.hashdata(b'invalid content')['sha1']
+ # Add the content with an invalid id.
+ self.objstorage.content_add(content, invalid_id)
+ # Then check it and expect an error.
with self.assertRaises(Error):
- self.objstorage.content_check(id)
+ self.objstorage.content_check(invalid_id)
@istest
def content_check_valid(self):
diff --git a/swh/storage/tests/test_objstorage_pathslicing.py b/swh/storage/tests/test_objstorage_pathslicing.py
--- a/swh/storage/tests/test_objstorage_pathslicing.py
+++ b/swh/storage/tests/test_objstorage_pathslicing.py
@@ -19,11 +19,9 @@
def setUp(self):
super().setUp()
- self.depth = 3
- self.slicing = 2
+ self.slicing = '0:2/2:4/4:6'
self.tmpdir = tempfile.mkdtemp()
- self.storage = PathSlicingObjStorage(self.tmpdir, self.depth,
- self.slicing)
+ self.storage = PathSlicingObjStorage(self.tmpdir, self.slicing)
def content_path(self, obj_id):
hex_obj_id = hashutil.hash_to_hex(obj_id)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 17, 10:21 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233746
Attached To
D49: Arbitrary slicing on PathSlicingObjStorage
Event Timeline
Log In to Comment