diff --git a/swh/objstorage/backends/pathslicing.py b/swh/objstorage/backends/pathslicing.py
--- a/swh/objstorage/backends/pathslicing.py
+++ b/swh/objstorage/backends/pathslicing.py
@@ -9,6 +9,7 @@
 import os
 import random
 import tempfile
+from typing import List

 from swh.model import hashutil
 from swh.objstorage.exc import Error, ObjNotFoundError
@@ -29,59 +30,101 @@
 FILE_MODE = 0o644


-@contextmanager
-def _write_obj_file(hex_obj_id, objstorage):
-    """ Context manager for writing object files to the object storage.
+class PathSlicer:
+    """Helper class to compute a path based on a hash.

-    During writing, data are written to a temporary file, which is atomically
-    renamed to the right file name after closing.
+    Used to compute a directory path based on the object hash, according to a
+    given slicing. Each slice corresponds to a directory level, named after
+    the matching portion of the hash.
+
+    For instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689
+    will have the following computed path:

-    Usage sample:
-        with _write_obj_file(hex_obj_id, objstorage):
-            f.write(obj_data)
+    - 0:2/2:4/4:6 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+    - 0:1/0:5/    : 3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689

-    Yields:
-        a file-like object open for writing bytes.
+    Args:
+        root (str): path to the root directory of the storage on the disk.
+        slicing (str): the slicing configuration.
     """
-    # Get the final paths and create the directory if absent.
-    dir = objstorage._obj_dir(hex_obj_id)
-    if not os.path.isdir(dir):
-        os.makedirs(dir, DIR_MODE, exist_ok=True)
-    path = os.path.join(dir, hex_obj_id)

-    # Create a temporary file.
-    (tmp, tmp_path) = tempfile.mkstemp(suffix=".tmp", prefix="hex_obj_id.", dir=dir)
+    def __init__(self, root: str, slicing: str):
+        self.root = root
+        # Make a list of tuples where each tuple contains the beginning
+        # and the end of each slicing.
+        try:
+            self.bounds = [
+                slice(*(int(x) if x else None for x in sbounds.split(":")))
+                for sbounds in slicing.split("/")
+                if sbounds
+            ]
+        except TypeError:
+            raise ValueError(
+                "Invalid slicing declaration; "
+                "it should be of the form '<int>:<int>[/<int>:<int>]...'"
+            )
+
+    def check_config(self):
+        """Check that the slicing configuration is valid.
+
+        Raises:
+            ValueError: if the slicing configuration is invalid.
+        """
+        if len(self):
+            max_char = max(
+                max(bound.start or 0, bound.stop or 0) for bound in self.bounds
+            )
+            if ID_HASH_LENGTH < max_char:
+                raise ValueError(
+                    "Algorithm %s has too short hash for slicing to char %d"
+                    % (ID_HASH_ALGO, max_char)
+                )
+
+    def get_directory(self, hex_obj_id: str) -> str:
+        """ Compute the storage directory of an object.

-    # Open the file and yield it for writing.
-    tmp_f = os.fdopen(tmp, "wb")
-    yield tmp_f
+        See also: PathSlicer::get_path

-    # Make sure the contents of the temporary file are written to disk
-    tmp_f.flush()
-    if objstorage.use_fdatasync:
-        os.fdatasync(tmp)
-    else:
-        os.fsync(tmp)
+        Args:
+            hex_obj_id: object id as hexlified string.

-    # Then close the temporary file and move it to the right path.
-    tmp_f.close()
-    os.chmod(tmp_path, FILE_MODE)
-    os.rename(tmp_path, path)
+        Returns:
+            Absolute path (including root) to the directory that contains
+            the given object id.
+        """
+        return os.path.join(self.root, *self.get_slices(hex_obj_id))

+    def get_path(self, hex_obj_id: str) -> str:
+        """ Compute the full path to an object in the current storage.

-def _read_obj_file(hex_obj_id, objstorage):
-    """ Context manager for reading object file in the object storage.
+        See also: PathSlicer::get_directory

-    Usage sample:
-        with _read_obj_file(hex_obj_id, objstorage) as f:
-            b = f.read()
+        Args:
+            hex_obj_id(str): object id as hexlified string.

-    Yields:
-        a file-like object open for reading bytes.
-    """
-    path = objstorage._obj_path(hex_obj_id)
+        Returns:
+            Absolute path (including root) to the object corresponding
+            to the given object id.
+        """
+        return os.path.join(self.get_directory(hex_obj_id), hex_obj_id)
+
+    def get_slices(self, hex_obj_id: str) -> List[str]:
+        """Compute the path elements for the given hash.
+
+        Args:
+            hex_obj_id(str): object id as hexlified string.
+
+        Returns:
+            Relative path to the object corresponding to the given id,
+            as a list of path elements.
+        """
+
+        assert len(hex_obj_id) == ID_HASH_LENGTH
+        return [hex_obj_id[bound] for bound in self.bounds]

-    return open(path, "rb")
+    def __len__(self) -> int:
+        """Number of slices of the slicer"""
+        return len(self.bounds)


 class PathSlicingObjStorage(ObjStorage):
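
A minimal usage sketch of the new PathSlicer helper (illustrative only: the root
directory is a made-up example, and the hash is the one from the docstring above):

    from swh.objstorage.backends.pathslicing import PathSlicer

    slicer = PathSlicer("/srv/objects", "0:2/2:4/4:6")
    slicer.check_config()  # raises ValueError if the slicing is unusable

    hex_obj_id = "34973274ccef6ab4dfaaf86599792fa9c3fe4689"
    print(slicer.get_slices(hex_obj_id))     # ['34', '97', '32']
    print(slicer.get_directory(hex_obj_id))  # /srv/objects/34/97/32
    print(slicer.get_path(hex_obj_id))
    # /srv/objects/34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
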
@@ -104,32 +147,19 @@

     The files in the storage are stored in gzipped compressed format.

-    Attributes:
-        root (string): path to the root directory of the storage on the disk.
-        bounds: list of tuples that indicates the beginning and the end of
-            each subdirectory for a content.
+    Args:
+        root (str): path to the root directory of the storage on
+            the disk.
+        slicing (str): string that indicates the slicing to perform
+            on the hash of the content to know the path where it should
+            be stored (see the documentation of the PathSlicer class).

     """

     def __init__(self, root, slicing, compression="gzip", **kwargs):
-        """ Create an object to access a hash-slicing based object storage.
-
-        Args:
-            root (string): path to the root directory of the storage on
-                the disk.
-            slicing (string): string that indicates the slicing to perform
-                on the hash of the content to know the path where it should
-                be stored.
-        """
         super().__init__(**kwargs)
         self.root = root
-        # Make a list of tuples where each tuple contains the beginning
-        # and the end of each slicing.
-        self.bounds = [
-            slice(*map(int, sbounds.split(":")))
-            for sbounds in slicing.split("/")
-            if sbounds
-        ]
+        self.slicer = PathSlicer(root, slicing)

         self.use_fdatasync = hasattr(os, "fdatasync")
         self.compression = compression
@@ -139,24 +169,17 @@

     def check_config(self, *, check_write):
         """Check whether this object storage is properly configured"""
-        root = self.root
+        self.slicer.check_config()

-        if not os.path.isdir(root):
+        if not os.path.isdir(self.root):
             raise ValueError(
-                'PathSlicingObjStorage root "%s" is not a directory' % root
-            )
-
-        max_endchar = max(map(lambda bound: bound.stop, self.bounds))
-        if ID_HASH_LENGTH < max_endchar:
-            raise ValueError(
-                "Algorithm %s has too short hash for slicing to char %d"
-                % (ID_HASH_ALGO, max_endchar)
+                'PathSlicingObjStorage root "%s" is not a directory' % self.root
             )

         if check_write:
             if not os.access(self.root, os.W_OK):
                 raise PermissionError(
-                    'PathSlicingObjStorage root "%s" is not writable' % root
+                    'PathSlicingObjStorage root "%s" is not writable' % self.root
                 )

         if self.compression not in compressors:
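
With this change the constructor only builds a PathSlicer, and check_config()
delegates the slicing validation to it before checking the root directory. A
configuration sketch (the root path is an example; it must exist, be a directory
and be writable for the check to pass):

    from swh.objstorage.backends.pathslicing import PathSlicingObjStorage

    storage = PathSlicingObjStorage("/srv/objects", "0:2/2:4/4:6")
    storage.check_config(check_write=True)  # also runs self.slicer.check_config()
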
@@ -169,7 +192,7 @@

     def __contains__(self, obj_id):
         hex_obj_id = hashutil.hash_to_hex(obj_id)
-        return os.path.isfile(self._obj_path(hex_obj_id))
+        return os.path.isfile(self.slicer.get_path(hex_obj_id))

     def __iter__(self):
         """Iterate over the object identifiers currently available in the
@@ -207,33 +230,6 @@
         """
         return sum(1 for i in self)

-    def _obj_dir(self, hex_obj_id):
-        """ Compute the storage directory of an object.
-
-        See also: PathSlicingObjStorage::_obj_path
-
-        Args:
-            hex_obj_id: object id as hexlified string.
-
-        Returns:
-            Path to the directory that contains the required object.
-        """
-        slices = [hex_obj_id[bound] for bound in self.bounds]
-        return os.path.join(self.root, *slices)
-
-    def _obj_path(self, hex_obj_id):
-        """ Compute the full path to an object into the current storage.
-
-        See also: PathSlicingObjStorage::_obj_dir
-
-        Args:
-            hex_obj_id: object id as hexlified string.
-
-        Returns:
-            Path to the actual object corresponding to the given id.
-        """
-        return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id)
-
     def add(self, content, obj_id=None, check_presence=True):
         if obj_id is None:
             obj_id = compute_hash(content)
@@ -245,7 +241,7 @@
         if not isinstance(content, Iterator):
             content = [content]
         compressor = compressors[self.compression]()
-        with _write_obj_file(hex_obj_id, self) as f:
+        with self._write_obj_file(hex_obj_id) as f:
             for chunk in content:
                 f.write(compressor.compress(chunk))
             f.write(compressor.flush())
@@ -259,7 +255,7 @@
         # Open the file and return its content as bytes
         hex_obj_id = hashutil.hash_to_hex(obj_id)
         d = decompressors[self.compression]()
-        with _read_obj_file(hex_obj_id, self) as f:
+        with open(self.slicer.get_path(hex_obj_id), "rb") as f:
             out = d.decompress(f.read())
         if d.unused_data:
             raise Error("Corrupt object %s: trailing data found" % hex_obj_id,)
@@ -293,7 +289,7 @@
         hex_obj_id = hashutil.hash_to_hex(obj_id)

         try:
-            os.remove(self._obj_path(hex_obj_id))
+            os.remove(self.slicer.get_path(hex_obj_id))
         except FileNotFoundError:
             raise ObjNotFoundError(obj_id)
         return True
@@ -308,7 +304,7 @@
             a tuple (batch size, batch).
         """
         dirs = []
-        for level in range(len(self.bounds)):
+        for level in range(len(self.slicer)):
             path = os.path.join(self.root, *dirs)
             dir_list = next(os.walk(path))[1]
             if "tmp" in dir_list:
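
add() and get() keep using the compressors/decompressors registry imported by this
module; the sketch below uses plain zlib only to illustrate the same incremental
compress-on-write / decompress-on-read pattern, including the unused_data check
that get() turns into a corruption Error:

    import zlib

    blob = b"some object payload"

    # Write side, as in add()/chunk_writer(): compress chunk by chunk, then flush.
    compressor = zlib.compressobj()
    compressed = compressor.compress(blob) + compressor.flush()

    # Read side, as in get(): decompress, then treat leftover bytes as corruption.
    decompressor = zlib.decompressobj()
    assert decompressor.decompress(compressed) == blob
    assert not decompressor.unused_data  # get() raises Error when this is non-empty
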
@@ -334,7 +330,7 @@
     def chunk_writer(self, obj_id):
         hex_obj_id = hashutil.hash_to_hex(obj_id)
         compressor = compressors[self.compression]()
-        with _write_obj_file(hex_obj_id, self) as f:
+        with self._write_obj_file(hex_obj_id) as f:
             yield lambda c: f.write(compressor.compress(c))
             f.write(compressor.flush())

@@ -354,7 +350,7 @@
         hex_obj_id = hashutil.hash_to_hex(obj_id)

         decompressor = decompressors[self.compression]()
-        with _read_obj_file(hex_obj_id, self) as f:
+        with open(self.slicer.get_path(hex_obj_id), "rb") as f:
             while True:
                 raw = f.read(chunk_size)
                 if not raw:
@@ -373,7 +369,7 @@

     def iter_from(self, obj_id, n_leaf=False):
         hex_obj_id = hashutil.hash_to_hex(obj_id)
-        slices = [hex_obj_id[bound] for bound in self.bounds]
+        slices = self.slicer.get_slices(hex_obj_id)
         rlen = len(self.root.split("/"))

         i = 0
@@ -392,3 +388,42 @@
             yield bytes.fromhex(f)
         if n_leaf:
             yield i
+
+    @contextmanager
+    def _write_obj_file(self, hex_obj_id):
+        """ Context manager for writing object files to the object storage.
+
+        During writing, data are written to a temporary file, which is atomically
+        renamed to the right file name after closing.
+
+        Usage sample:
+            with objstorage._write_obj_file(hex_obj_id) as f:
+                f.write(obj_data)
+
+        Yields:
+            a file-like object open for writing bytes.
+        """
+        # Get the final paths and create the directory if absent.
+        dir = self.slicer.get_directory(hex_obj_id)
+        if not os.path.isdir(dir):
+            os.makedirs(dir, DIR_MODE, exist_ok=True)
+        path = os.path.join(dir, hex_obj_id)
+
+        # Create a temporary file.
+        (tmp, tmp_path) = tempfile.mkstemp(suffix=".tmp", prefix="hex_obj_id.", dir=dir)
+
+        # Open the file and yield it for writing.
+        tmp_f = os.fdopen(tmp, "wb")
+        yield tmp_f
+
+        # Make sure the contents of the temporary file are written to disk.
+        tmp_f.flush()
+        if self.use_fdatasync:
+            os.fdatasync(tmp)
+        else:
+            os.fsync(tmp)
+
+        # Then close the temporary file and move it to the right path.
+        tmp_f.close()
+        os.chmod(tmp_path, FILE_MODE)
+        os.rename(tmp_path, path)
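
_write_obj_file() is unchanged in substance but is now a method, so it can reach
self.slicer and self.use_fdatasync. The pattern it implements (temporary file in
the destination directory, flush and sync, then rename) can be sketched on its own;
atomic_write and the paths below are invented for the example:

    import os
    import tempfile

    def atomic_write(path: str, data: bytes, mode: int = 0o644) -> None:
        # Keep the temporary file in the destination directory so the final
        # os.rename() stays on one filesystem and is atomic on POSIX.
        dirname = os.path.dirname(path)
        os.makedirs(dirname, exist_ok=True)
        fd, tmp_path = tempfile.mkstemp(suffix=".tmp", dir=dirname)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
            f.flush()
            os.fsync(f.fileno())  # the method prefers os.fdatasync() when available
        os.chmod(tmp_path, mode)
        os.rename(tmp_path, path)

    atomic_write("/tmp/objstore-demo/34/97/32/obj", b"payload")
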
diff --git a/swh/objstorage/tests/test_objstorage_pathslicing.py b/swh/objstorage/tests/test_objstorage_pathslicing.py
--- a/swh/objstorage/tests/test_objstorage_pathslicing.py
+++ b/swh/objstorage/tests/test_objstorage_pathslicing.py
@@ -38,7 +38,7 @@

     def content_path(self, obj_id):
         hex_obj_id = hashutil.hash_to_hex(obj_id)
-        return self.storage._obj_path(hex_obj_id)
+        return self.storage.slicer.get_path(hex_obj_id)

     def test_iter(self):
         content, obj_id = self.hash_content(b"iter")
diff --git a/swh/objstorage/tests/test_pathslicer.py b/swh/objstorage/tests/test_pathslicer.py
new file
--- /dev/null
+++ b/swh/objstorage/tests/test_pathslicer.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.objstorage.backends.pathslicing import PathSlicer
+
+
+def test_pathslicer():
+    slicer = PathSlicer("/", "0:2/2:4/4:6")
+    assert len(slicer) == 3
+    assert slicer.check_config() is None
+    assert (
+        slicer.get_path("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
+        == "/34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689"
+    )
+    assert (
+        slicer.get_directory("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == "/34/97/32"
+    )
+    assert slicer.get_slices("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == [
+        "34",
+        "97",
+        "32",
+    ]
+
+    slicer = PathSlicer("/", "/0:1/0:5/")  # leading/trailing '/' are ignored
+    assert slicer.check_config() is None
+    assert len(slicer) == 2
+    assert (
+        slicer.get_path("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
+        == "/3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689"
+    )
+    assert (
+        slicer.get_directory("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == "/3/34973"
+    )
+    assert slicer.get_slices("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == [
+        "3",
+        "34973",
+    ]
+
+    # funny one, with steps
+    slicer = PathSlicer("/", "0:6:2/1:7:2")
+    assert slicer.check_config() is None
+    assert slicer.get_slices("123456789".ljust(40, "0")) == ["135", "246"]
+
+    # reverse works too!
+    slicer = PathSlicer("/", "-1::-1")
+    assert slicer.check_config() is None
+    print(slicer.bounds)
+    assert slicer.get_slices("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == [
+        "34973274ccef6ab4dfaaf86599792fa9c3fe4689"[::-1]
+    ]
+
+
+def test_pathslicer_noop():
+    "test the 'empty' pathslicer"
+    slicer = PathSlicer("/", "")
+    assert len(slicer) == 0
+    assert slicer.check_config() is None
+    assert (
+        slicer.get_path("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
+        == "/34973274ccef6ab4dfaaf86599792fa9c3fe4689"
+    )
+
+
+def test_pathslicer_bad_hash():
+    slicer = PathSlicer("/", "0:2/2:4/4:6")
+    for hexhash in ("0" * 39, "0" * 41, ""):
+        with pytest.raises(AssertionError):
+            slicer.get_path(hexhash)
+
+
+def test_pathslicer_check_config():
+    with pytest.raises(ValueError):
+        PathSlicer("/", "toto")
+
+    with pytest.raises(ValueError):
+        PathSlicer("/", "/1:2/a:b/")
+
+    assert PathSlicer("/", "0:40").check_config() is None
+    with pytest.raises(ValueError):
+        PathSlicer("/", "0:41").check_config()
+    assert PathSlicer("/", "40:").check_config() is None
+    with pytest.raises(ValueError):
+        PathSlicer("/", "41:").check_config()
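
The step and reverse slicings exercised by test_pathslicer() are ordinary Python
string slices; evaluating the same slices directly shows where the expected values
come from:

    h = "123456789".ljust(40, "0")
    print(h[slice(0, 6, 2)])  # '135' -- characters 0, 2 and 4
    print(h[slice(1, 7, 2)])  # '246' -- characters 1, 3 and 5

    sha1 = "34973274ccef6ab4dfaaf86599792fa9c3fe4689"
    print(sha1[slice(-1, None, -1)] == sha1[::-1])  # True: "-1::-1" reverses the hash
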