diff --git a/swh/objstorage/backends/pathslicing.py b/swh/objstorage/backends/pathslicing.py --- a/swh/objstorage/backends/pathslicing.py +++ b/swh/objstorage/backends/pathslicing.py @@ -9,6 +9,7 @@ import os import random import tempfile +from typing import List from swh.model import hashutil from swh.objstorage.exc import Error, ObjNotFoundError @@ -29,59 +30,73 @@ FILE_MODE = 0o644 -@contextmanager -def _write_obj_file(hex_obj_id, objstorage): - """ Context manager for writing object files to the object storage. - - During writing, data are written to a temporary file, which is atomically - renamed to the right file name after closing. +class PathSlicer: + def __init__(self, root: str, slicing: str): + self.root = root + # Make a list of tuples where each tuple contains the beginning + # and the end of each slicing. + try: + self.bounds = [ + slice(*map(lambda x: int(x) if x else None, sbounds.split(":"))) + for sbounds in slicing.split("/") + if sbounds + ] + except TypeError: + raise ValueError( + "Invalid slicing declaration; " + "it should be a of the form ':[/:]..." + ) - Usage sample: - with _write_obj_file(hex_obj_id, objstorage): - f.write(obj_data) + def check_config(self): + if len(self): + max_char = max( + max( + bound.start if bound.start is not None else 0 + for bound in self.bounds + ), + max( + bound.stop if bound.stop is not None else 0 for bound in self.bounds + ), + ) + if ID_HASH_LENGTH < max_char: + raise ValueError( + "Algorithm %s has too short hash for slicing to char %d" + % (ID_HASH_ALGO, max_char) + ) - Yields: - a file-like object open for writing bytes. - """ - # Get the final paths and create the directory if absent. - dir = objstorage._obj_dir(hex_obj_id) - if not os.path.isdir(dir): - os.makedirs(dir, DIR_MODE, exist_ok=True) - path = os.path.join(dir, hex_obj_id) + def get_directory(self, hex_obj_id: str) -> str: + """ Compute the storage directory of an object. - # Create a temporary file. - (tmp, tmp_path) = tempfile.mkstemp(suffix=".tmp", prefix="hex_obj_id.", dir=dir) + See also: PathSlicer::get_path - # Open the file and yield it for writing. - tmp_f = os.fdopen(tmp, "wb") - yield tmp_f + Args: + hex_obj_id: object id as hexlified string. - # Make sure the contents of the temporary file are written to disk - tmp_f.flush() - if objstorage.use_fdatasync: - os.fdatasync(tmp) - else: - os.fsync(tmp) + Returns: + Path to the directory that contains the required object. + """ + return os.path.join(self.root, *self.get_slices(hex_obj_id)) - # Then close the temporary file and move it to the right path. - tmp_f.close() - os.chmod(tmp_path, FILE_MODE) - os.rename(tmp_path, path) + def get_path(self, hex_obj_id: str) -> str: + """ Compute the full path to an object into the current storage. + See also: PathSlicer::get_directory -def _read_obj_file(hex_obj_id, objstorage): - """ Context manager for reading object file in the object storage. + Args: + hex_obj_id: object id as hexlified string. - Usage sample: - with _read_obj_file(hex_obj_id, objstorage) as f: - b = f.read() + Returns: + Path to the actual object corresponding to the given id. + """ + return os.path.join(self.get_directory(hex_obj_id), hex_obj_id) - Yields: - a file-like object open for reading bytes. - """ - path = objstorage._obj_path(hex_obj_id) + def get_slices(self, hex_obj_id: str) -> List[str]: + assert len(hex_obj_id) == ID_HASH_LENGTH + return [hex_obj_id[bound] for bound in self.bounds] - return open(path, "rb") + def __len__(self) -> int: + """Number of slices of the slicer""" + return len(self.bounds) class PathSlicingObjStorage(ObjStorage): @@ -106,8 +121,7 @@ Attributes: root (string): path to the root directory of the storage on the disk. - bounds: list of tuples that indicates the beginning and the end of - each subdirectory for a content. + slicer: the PathSlicer instance dealing with the path slicing logic. """ @@ -123,13 +137,7 @@ """ super().__init__(**kwargs) self.root = root - # Make a list of tuples where each tuple contains the beginning - # and the end of each slicing. - self.bounds = [ - slice(*map(int, sbounds.split(":"))) - for sbounds in slicing.split("/") - if sbounds - ] + self.slicer = PathSlicer(root, slicing) self.use_fdatasync = hasattr(os, "fdatasync") self.compression = compression @@ -139,24 +147,17 @@ def check_config(self, *, check_write): """Check whether this object storage is properly configured""" - root = self.root - - if not os.path.isdir(root): - raise ValueError( - 'PathSlicingObjStorage root "%s" is not a directory' % root - ) + self.slicer.check_config() - max_endchar = max(map(lambda bound: bound.stop, self.bounds)) - if ID_HASH_LENGTH < max_endchar: + if not os.path.isdir(self.root): raise ValueError( - "Algorithm %s has too short hash for slicing to char %d" - % (ID_HASH_ALGO, max_endchar) + 'PathSlicingObjStorage root "%s" is not a directory' % self.root ) if check_write: if not os.access(self.root, os.W_OK): raise PermissionError( - 'PathSlicingObjStorage root "%s" is not writable' % root + 'PathSlicingObjStorage root "%s" is not writable' % self.root ) if self.compression not in compressors: @@ -169,7 +170,7 @@ def __contains__(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) - return os.path.isfile(self._obj_path(hex_obj_id)) + return os.path.isfile(self.slicer.get_path(hex_obj_id)) def __iter__(self): """Iterate over the object identifiers currently available in the @@ -207,33 +208,6 @@ """ return sum(1 for i in self) - def _obj_dir(self, hex_obj_id): - """ Compute the storage directory of an object. - - See also: PathSlicingObjStorage::_obj_path - - Args: - hex_obj_id: object id as hexlified string. - - Returns: - Path to the directory that contains the required object. - """ - slices = [hex_obj_id[bound] for bound in self.bounds] - return os.path.join(self.root, *slices) - - def _obj_path(self, hex_obj_id): - """ Compute the full path to an object into the current storage. - - See also: PathSlicingObjStorage::_obj_dir - - Args: - hex_obj_id: object id as hexlified string. - - Returns: - Path to the actual object corresponding to the given id. - """ - return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id) - def add(self, content, obj_id=None, check_presence=True): if obj_id is None: obj_id = compute_hash(content) @@ -245,7 +219,7 @@ if not isinstance(content, Iterator): content = [content] compressor = compressors[self.compression]() - with _write_obj_file(hex_obj_id, self) as f: + with self._write_obj_file(hex_obj_id) as f: for chunk in content: f.write(compressor.compress(chunk)) f.write(compressor.flush()) @@ -259,7 +233,7 @@ # Open the file and return its content as bytes hex_obj_id = hashutil.hash_to_hex(obj_id) d = decompressors[self.compression]() - with _read_obj_file(hex_obj_id, self) as f: + with open(self.slicer.get_path(hex_obj_id), "rb") as f: out = d.decompress(f.read()) if d.unused_data: raise Error("Corrupt object %s: trailing data found" % hex_obj_id,) @@ -293,7 +267,7 @@ hex_obj_id = hashutil.hash_to_hex(obj_id) try: - os.remove(self._obj_path(hex_obj_id)) + os.remove(self.slicer.get_path(hex_obj_id)) except FileNotFoundError: raise ObjNotFoundError(obj_id) return True @@ -308,7 +282,7 @@ a tuple (batch size, batch). """ dirs = [] - for level in range(len(self.bounds)): + for level in range(len(self.slicer)): path = os.path.join(self.root, *dirs) dir_list = next(os.walk(path))[1] if "tmp" in dir_list: @@ -334,7 +308,7 @@ def chunk_writer(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) compressor = compressors[self.compression]() - with _write_obj_file(hex_obj_id, self) as f: + with self._write_obj_file(hex_obj_id) as f: yield lambda c: f.write(compressor.compress(c)) f.write(compressor.flush()) @@ -354,7 +328,7 @@ hex_obj_id = hashutil.hash_to_hex(obj_id) decompressor = decompressors[self.compression]() - with _read_obj_file(hex_obj_id, self) as f: + with open(self.slicer.get_path(hex_obj_id), "rb") as f: while True: raw = f.read(chunk_size) if not raw: @@ -373,7 +347,7 @@ def iter_from(self, obj_id, n_leaf=False): hex_obj_id = hashutil.hash_to_hex(obj_id) - slices = [hex_obj_id[bound] for bound in self.bounds] + slices = self.slicer.get_slices(hex_obj_id) rlen = len(self.root.split("/")) i = 0 @@ -392,3 +366,42 @@ yield bytes.fromhex(f) if n_leaf: yield i + + @contextmanager + def _write_obj_file(self, hex_obj_id): + """ Context manager for writing object files to the object storage. + + During writing, data are written to a temporary file, which is atomically + renamed to the right file name after closing. + + Usage sample: + with objstorage._write_obj_file(hex_obj_id): + f.write(obj_data) + + Yields: + a file-like object open for writing bytes. + """ + # Get the final paths and create the directory if absent. + dir = self.slicer.get_directory(hex_obj_id) + if not os.path.isdir(dir): + os.makedirs(dir, DIR_MODE, exist_ok=True) + path = os.path.join(dir, hex_obj_id) + + # Create a temporary file. + (tmp, tmp_path) = tempfile.mkstemp(suffix=".tmp", prefix="hex_obj_id.", dir=dir) + + # Open the file and yield it for writing. + tmp_f = os.fdopen(tmp, "wb") + yield tmp_f + + # Make sure the contents of the temporary file are written to disk + tmp_f.flush() + if self.use_fdatasync: + os.fdatasync(tmp) + else: + os.fsync(tmp) + + # Then close the temporary file and move it to the right path. + tmp_f.close() + os.chmod(tmp_path, FILE_MODE) + os.rename(tmp_path, path) diff --git a/swh/objstorage/tests/test_objstorage_pathslicing.py b/swh/objstorage/tests/test_objstorage_pathslicing.py --- a/swh/objstorage/tests/test_objstorage_pathslicing.py +++ b/swh/objstorage/tests/test_objstorage_pathslicing.py @@ -38,7 +38,7 @@ def content_path(self, obj_id): hex_obj_id = hashutil.hash_to_hex(obj_id) - return self.storage._obj_path(hex_obj_id) + return self.storage.slicer.get_path(hex_obj_id) def test_iter(self): content, obj_id = self.hash_content(b"iter") diff --git a/swh/objstorage/tests/test_pathslicer.py b/swh/objstorage/tests/test_pathslicer.py new file mode 100644 --- /dev/null +++ b/swh/objstorage/tests/test_pathslicer.py @@ -0,0 +1,86 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import pytest + +from swh.objstorage.backends.pathslicing import PathSlicer + + +def test_pathslicer(): + slicer = PathSlicer("/", "0:2/2:4/4:6") + assert len(slicer) == 3 + assert slicer.check_config() is None + assert ( + slicer.get_path("34973274ccef6ab4dfaaf86599792fa9c3fe4689") + == "/34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689" + ) + assert ( + slicer.get_directory("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == "/34/97/32" + ) + assert slicer.get_slices("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == [ + "34", + "97", + "32", + ] + + slicer = PathSlicer("/", "/0:1/0:5/") # trailing '/' are ignored + assert slicer.check_config() is None + assert len(slicer) == 2 + assert ( + slicer.get_path("34973274ccef6ab4dfaaf86599792fa9c3fe4689") + == "/3/34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689" + ) + assert ( + slicer.get_directory("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == "/3/34973" + ) + assert slicer.get_slices("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == [ + "3", + "34973", + ] + + # funny one, with steps + slicer = PathSlicer("/", "0:6:2/1:7:2") + assert slicer.check_config() is None + assert slicer.get_slices("123456789".ljust(40, "0")) == ["135", "246"] + + # reverse works too! + slicer = PathSlicer("/", "-1::-1") + assert slicer.check_config() is None + print(slicer.bounds) + assert slicer.get_slices("34973274ccef6ab4dfaaf86599792fa9c3fe4689") == [ + "34973274ccef6ab4dfaaf86599792fa9c3fe4689"[::-1] + ] + + +def test_pathslicer_noop(): + "test the 'empty' pathslicer" + slicer = PathSlicer("/", "") + assert len(slicer) == 0 + assert slicer.check_config() is None + assert ( + slicer.get_path("34973274ccef6ab4dfaaf86599792fa9c3fe4689") + == "/34973274ccef6ab4dfaaf86599792fa9c3fe4689" + ) + + +def test_pathslicer_bad_hash(): + slicer = PathSlicer("/", "0:2/2:4/4:6") + for hexhash in ("0" * 39, "0" * 41, ""): + with pytest.raises(AssertionError): + slicer.get_path(hexhash) + + +def test_pathslicer_check_config(): + with pytest.raises(ValueError): + PathSlicer("/", "toto") + + with pytest.raises(ValueError): + PathSlicer("/", "/1:2/a:b/") + + assert PathSlicer("/", "0:40").check_config() is None + with pytest.raises(ValueError): + PathSlicer("/", "0:41").check_config() + assert PathSlicer("/", "40:").check_config() is None + with pytest.raises(ValueError): + PathSlicer("/", "41:").check_config()