Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/objstorage/objstorage_pathslicing.py
# Copyright (C) 2015-2016 The Software Heritage developers | # Copyright (C) 2015-2016 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import gzip | import gzip | ||||
import tempfile | import tempfile | ||||
import random | import random | ||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
from swh.core import hashutil | from swh.core import hashutil | ||||
from .objstorage import ObjStorage | from .objstorage import ObjStorage, ID_HASH_ALGO | ||||
from ..exc import ObjNotFoundError, Error | from ..exc import ObjNotFoundError, Error | ||||
ID_HASH_ALGO = 'sha1' | |||||
GZIP_BUFSIZ = 1048576 | GZIP_BUFSIZ = 1048576 | ||||
DIR_MODE = 0o755 | DIR_MODE = 0o755 | ||||
FILE_MODE = 0o644 | FILE_MODE = 0o644 | ||||
@contextmanager | @contextmanager | ||||
def _write_obj_file(hex_obj_id, objstorage): | def _write_obj_file(hex_obj_id, objstorage): | ||||
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines | class PathSlicingObjStorage(ObjStorage): | ||||
To avoid directories that contain too many files, the object storage has a | To avoid directories that contain too many files, the object storage has a | ||||
given depth. Each depth level consumes a given amount of characters of | given depth. Each depth level consumes a given amount of characters of | ||||
the object id. | the object id. | ||||
So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689 | So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689 | ||||
will be stored in the given object storages : | will be stored in the given object storages : | ||||
- depth=3, slicing=2 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689 | - depth=3, slicing=2 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689 | ||||
- depth=1, slicing=5 : 34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689 | - depth=1, slicing=5 : 34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689 | ||||
olasd: typo : `0:4` is only four characters long | |||||
The files in the storage are stored in gzipped compressed format. | The files in the storage are stored in gzipped compressed format. | ||||
Attributes: | Attributes: | ||||
root (string): path to the root directory of the storage on the disk. | root (string): path to the root directory of the storage on the disk. | ||||
depth (int): number of subdirectories created to store a file. | depth (int): number of subdirectories created to store a file. | ||||
slicing (int): number of hash character consumed for each | slicing (int): number of hash character consumed for each | ||||
subdirectories. | subdirectories. | ||||
Show All 13 Lines | def __init__(self, root, depth, slicing): | ||||
raise ValueError( | raise ValueError( | ||||
'PathSlicingObjStorage root "%s" is not a directory' % root | 'PathSlicingObjStorage root "%s" is not a directory' % root | ||||
) | ) | ||||
self.root = root | self.root = root | ||||
self.depth = depth | self.depth = depth | ||||
self.slicing = slicing | self.slicing = slicing | ||||
def __contains__(self, obj_id): | def __contains__(self, obj_id): | ||||
Done Inline ActionsYou could instantiate the slice objects here (use slice(map(int)) instead of tuple(map(int))), and reuse them directly when constructing the path. olasd: You could instantiate the slice objects here (use `slice(map(int))` instead of `tuple(map… | |||||
""" Check whether the given object is present in the storage or not. | """ Check whether the given object is present in the storage or not. | ||||
Returns: | Returns: | ||||
True iff the object is present in the storage. | True iff the object is present in the storage. | ||||
""" | """ | ||||
hex_obj_id = hashutil.hash_to_hex(obj_id) | hex_obj_id = hashutil.hash_to_hex(obj_id) | ||||
return os.path.exists(self._obj_path(hex_obj_id)) | return os.path.exists(self._obj_path(hex_obj_id)) | ||||
Show All 35 Lines | def _obj_dir(self, hex_obj_id): | ||||
See also: PathSlicingObjStorage::_obj_path | See also: PathSlicingObjStorage::_obj_path | ||||
Args: | Args: | ||||
hex_obj_id: object id as hexlified string. | hex_obj_id: object id as hexlified string. | ||||
Returns: | Returns: | ||||
Path to the directory that contains the required object. | Path to the directory that contains the required object. | ||||
""" | """ | ||||
if len(hex_obj_id) < self.depth * self.slicing: | if len(hex_obj_id) < self.depth * self.slicing: | ||||
raise ValueError( | raise ValueError( | ||||
'Object id "%s" is to short for %d-slicing at depth %d' | 'Object id "%s" is to short for %d-slicing at depth %d' | ||||
% (hex_obj_id, self.slicing, self.depth) | % (hex_obj_id, self.slicing, self.depth) | ||||
) | ) | ||||
# Compute [depth] substrings of [hex_obj_id], each of length [slicing], | # Compute [depth] substrings of [hex_obj_id], each of length [slicing], | ||||
# starting from the beginning. | # starting from the beginning. | ||||
id_steps = [hex_obj_id[i * self.slicing: | id_steps = [hex_obj_id[i * self.slicing: | ||||
i * self.slicing + self.slicing] | i * self.slicing + self.slicing] | ||||
for i in range(self.depth)] | for i in range(self.depth)] | ||||
steps = [self.root] + id_steps | steps = [self.root] + id_steps | ||||
Done Inline ActionsWe should probably move that check to the instantiation of the storage rather than do it on each access: the length of an object id is constant. olasd: We should probably move that check to the instantiation of the storage rather than do it on… | |||||
Done Inline ActionsDo we have a way, at instantiation, to know the size of a hash given the ID_HASH_ALGO algorithm without hard-coding it? qcampos: Do we have a way, at instantiation, to know the size of a hash given the ID_HASH_ALGO algorithm… | |||||
Done Inline ActionsNot really, no; we can add an ID_HASH_LENGTH variable next to ID_HASH_ALGO. olasd: Not really, no; we can add an ID_HASH_LENGTH variable next to ID_HASH_ALGO. | |||||
return os.path.join(*steps) | return os.path.join(*steps) | ||||
Done Inline Actionshex_obj_id[bounds] for bounds in self.bounds instead of unpacking start, end and repacking them. olasd: `hex_obj_id[bounds] for bounds in self.bounds` instead of unpacking `start, end` and repacking… | |||||
def _obj_path(self, hex_obj_id): | def _obj_path(self, hex_obj_id): | ||||
""" Compute the full path to an object into the current storage. | """ Compute the full path to an object into the current storage. | ||||
See also: PathSlicingObjStorage::_obj_dir | See also: PathSlicingObjStorage::_obj_dir | ||||
Args: | Args: | ||||
hex_obj_id: object id as hexlified string. | hex_obj_id: object id as hexlified string. | ||||
▲ Show 20 Lines • Show All 153 Lines • Show Last 20 Lines |
typo : 0:4 is only four characters long