
diff --git a/__init__.py b/__init__.py
index afe05f2..02d3b0f 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,4 @@
from .objstorage import ObjStorage
+from .objstorage_pathslicing import PathSlicingObjStorage
-__all__ = [ObjStorage]
+__all__ = ['ObjStorage', 'PathSlicingObjStorage']
diff --git a/objstorage_pathslicing.py b/objstorage_pathslicing.py
new file mode 100644
index 0000000..875fd75
--- /dev/null
+++ b/objstorage_pathslicing.py
@@ -0,0 +1,350 @@
+# Copyright (C) 2015-2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import gzip
+import tempfile
+import random
+
+from contextlib import contextmanager
+
+from swh.core import hashutil
+
+from .objstorage import ObjStorage
+from ..exc import ObjNotFoundError, Error
+
+
+ID_HASH_ALGO = 'sha1'
+
+GZIP_BUFSIZ = 1048576
+
+DIR_MODE = 0o755
+FILE_MODE = 0o644
+
+
+@contextmanager
+def _write_obj_file(hex_obj_id, objstorage):
+ """ Context manager for writing object files to the object storage.
+
+ During writing, data are written to a temporary file, which is atomically
+ renamed to the right file name after closing. This context manager also
+ takes care of (gzip) compressing the data on the fly.
+
+ Usage sample:
+ with _write_obj_file(hex_obj_id, objstorage):
+ f.write(obj_data)
+
+ Yields:
+ a file-like object open for writing bytes.
+ """
+ # Get the final paths and create the directory if absent.
+ dir = objstorage._obj_dir(hex_obj_id)
+ if not os.path.isdir(dir):
+ os.makedirs(dir, DIR_MODE, exist_ok=True)
+ path = os.path.join(dir, hex_obj_id)
+
+ # Create a temporary file.
+ (tmp, tmp_path) = tempfile.mkstemp(suffix='.tmp', prefix='hex_obj_id.',
+ dir=dir)
+
+ # Open the file and yield it for writing.
+ tmp_f = os.fdopen(tmp, 'wb')
+ with gzip.GzipFile(filename=tmp_path, fileobj=tmp_f) as f:
+ yield f
+
+ # Then close the temporary file and move it to the right directory.
+ tmp_f.close()
+ os.chmod(tmp_path, FILE_MODE)
+ os.rename(tmp_path, path)
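+
+# Design note: the temporary file is created inside the target directory, so
+# the final os.rename() never crosses a filesystem boundary and is atomic on
+# POSIX systems; readers cannot observe a partially written object.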
+
+
+@contextmanager
+def _read_obj_file(hex_obj_id, objstorage):
+ """ Context manager for reading object file in the object storage.
+
+ Usage sample:
+ with _read_obj_file(hex_obj_id, objstorage) as f:
+ b = f.read()
+
+ Yields:
+ a file-like object open for reading bytes.
+ """
+ path = objstorage._obj_path(hex_obj_id)
+ with gzip.GzipFile(path, 'rb') as f:
+ yield f
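+
+# Illustrative round trip through the two helpers above (a sketch, not part
+# of the original change; assumes `storage` is an initialized
+# PathSlicingObjStorage and `hex_obj_id` a hexlified object id):
+#
+#     with _write_obj_file(hex_obj_id, storage) as f:
+#         f.write(b'raw object payload')
+#     with _read_obj_file(hex_obj_id, storage) as f:
+#         assert f.read() == b'raw object payload'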
+
+
+class PathSlicingObjStorage(ObjStorage):
+ """ Implementation of the ObjStorage API based on the hash of the content.
+
+ On disk, an object storage is a directory tree containing files named after
+ their object IDs. An object ID is a checksum of its content, depending on
+ the value of the ID_HASH_ALGO constant (see hashutil for its meaning).
+
+ To avoid directories that contain too many files, the object storage has a
+ given depth. Each depth level consumes a given amount of characters of
+ the object id.
+
+ So for instance a file with SHA1 34973274ccef6ab4dfaaf86599792fa9c3fe4689
+ will be stored in the given object storages :
+
+ - depth=3, slicing=2 : 34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+ - depth=1, slicing=5 : 34973/34973274ccef6ab4dfaaf86599792fa9c3fe4689
+
+ The files in the storage are stored in gzipped compressed format.
+
+ Attributes:
+ root (string): path to the root directory of the storage on the disk.
+ depth (int): number of subdirectories created to store a file.
+ slicing (int): number of hash character consumed for each
+ subdirectories.
+ """
+
+    def __init__(self, root, depth, slicing):
+        """ Create an object to access a hash-slicing based object storage.
+
+        Args:
+            root (string): path to the root directory of the storage on
+                the disk.
+            depth (int): number of subdirectory levels used to store a file.
+            slicing (int): number of hash characters consumed at each
+                subdirectory level.
+        """
+        if not os.path.isdir(root):
+            raise ValueError(
+                'PathSlicingObjStorage root "%s" is not a directory' % root
+            )
+
+        self.root = root
+        self.depth = depth
+        self.slicing = slicing
+
+    def __contains__(self, obj_id):
+        """ Check whether the given object is present in the storage or not.
+
+        Returns:
+            True iff the object is present in the storage.
+        """
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+        return os.path.exists(self._obj_path(hex_obj_id))
+
+    def __iter__(self):
+        """Iterate over the object identifiers currently available in the
+        storage.
+
+        Warning: with the current implementation of the object storage, this
+        method will walk the filesystem to list objects, meaning that listing
+        all objects will be very slow for large storages. You almost certainly
+        don't want to use this method in production.
+
+        Returns:
+            iterator over object IDs
+        """
+        def obj_iterator():
+            # XXX hackish: it does not verify that the depth of found files
+            # matches the slicing depth of the storage
+            for root, _dirs, files in os.walk(self.root):
+                for f in files:
+                    yield bytes.fromhex(f)
+
+        return obj_iterator()
+
+    def __len__(self):
+        """Compute the number of objects available in the storage.
+
+        Warning: this currently uses `__iter__`, so its performance warning
+        applies here as well.
+
+        Returns:
+            number of objects contained in the storage
+        """
+        return sum(1 for i in self)
+
+    def _obj_dir(self, hex_obj_id):
+        """ Compute the storage directory of an object.
+
+        See also: PathSlicingObjStorage::_obj_path
+
+        Args:
+            hex_obj_id: object id as hexlified string.
+
+        Returns:
+            Path to the directory that contains the required object.
+        """
+        if len(hex_obj_id) < self.depth * self.slicing:
+            raise ValueError(
+                'Object id "%s" is too short for %d-slicing at depth %d'
+                % (hex_obj_id, self.slicing, self.depth)
+            )
+
+        # Compute [depth] substrings of [hex_obj_id], each of length
+        # [slicing], starting from the beginning.
+        id_steps = [hex_obj_id[i * self.slicing:
+                               i * self.slicing + self.slicing]
+                    for i in range(self.depth)]
+        steps = [self.root] + id_steps
+
+        return os.path.join(*steps)
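+
+    # Illustrative example (values assumed): with root='/srv/objects',
+    # depth=3 and slicing=2, the id
+    # '34973274ccef6ab4dfaaf86599792fa9c3fe4689' slices into
+    # ['34', '97', '32'], so _obj_dir returns '/srv/objects/34/97/32'.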
+
+    def _obj_path(self, hex_obj_id):
+        """ Compute the full path to an object in the current storage.
+
+        See also: PathSlicingObjStorage::_obj_dir
+
+        Args:
+            hex_obj_id: object id as hexlified string.
+
+        Returns:
+            Path to the actual object corresponding to the given id.
+        """
+        return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id)
+
+    def add(self, bytes, obj_id=None, check_presence=True):
+        """ Add a new object to the object storage.
+
+        Args:
+            bytes: content of the object to be added to the storage.
+            obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When
+                given, obj_id will be trusted to match the bytes. If missing,
+                obj_id will be computed on the fly.
+            check_presence: indicate if the presence of the content should be
+                verified before adding the file.
+
+        Returns:
+            the id of the object in the storage.
+        """
+        if obj_id is None:
+            # Checksum is missing, compute it on the fly.
+            h = hashutil._new_hash(ID_HASH_ALGO, len(bytes))
+            h.update(bytes)
+            obj_id = h.digest()
+
+        if check_presence and obj_id in self:
+            # If the object is already present, return immediately.
+            return obj_id
+
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+        with _write_obj_file(hex_obj_id, self) as f:
+            f.write(bytes)
+
+        return obj_id
+
+    def restore(self, bytes, obj_id=None):
+        """ Restore content that has been corrupted.
+
+        This function is identical to `add` but does not check whether
+        the object id is already present in the file system.
+
+        Args:
+            bytes: content of the object to be added to the storage.
+            obj_id: checksum of `bytes` as computed by ID_HASH_ALGO. When
+                given, obj_id will be trusted to match bytes. If missing,
+                obj_id will be computed on the fly.
+        """
+        return self.add(bytes, obj_id, check_presence=False)
+
+    def get(self, obj_id):
+        """ Retrieve the content of a given object.
+
+        Args:
+            obj_id: object id.
+
+        Returns:
+            the content of the requested object as bytes.
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing.
+        """
+        if obj_id not in self:
+            raise ObjNotFoundError(obj_id)
+
+        # Open the file and return its content as bytes
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+        with _read_obj_file(hex_obj_id, self) as f:
+            return f.read()
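+
+    # Illustrative usage sketch (root path assumed, not part of this diff):
+    #
+    #     storage = PathSlicingObjStorage('/srv/objects', depth=3, slicing=2)
+    #     obj_id = storage.add(b'some content')
+    #     assert obj_id in storage
+    #     assert storage.get(obj_id) == b'some content'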
+
+    def check(self, obj_id):
+        """ Perform an integrity check for a given object.
+
+        Verify that the file object is in place and that the gzipped content
+        matches the object id.
+
+        Args:
+            obj_id: object id.
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing.
+            Error: if the requested object is corrupted.
+        """
+        if obj_id not in self:
+            raise ObjNotFoundError(obj_id)
+
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+        try:
+            with gzip.open(self._obj_path(hex_obj_id)) as f:
+                length = None
+                if ID_HASH_ALGO.endswith('_git'):
+                    # if the hashing algorithm is git-like, we need to know
+                    # the content size to hash on the fly. Do a first pass
+                    # here to compute the size.
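+                    # (git-like ids hash a header that embeds the content
+                    # length, so the length must be known before hashing
+                    # can start)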
+                    length = 0
+                    while True:
+                        chunk = f.read(GZIP_BUFSIZ)
+                        length += len(chunk)
+                        if not chunk:
+                            break
+                    f.rewind()
+
+                checksums = hashutil._hash_file_obj(f, length,
+                                                    algorithms=[ID_HASH_ALGO])
+                actual_obj_id = checksums[ID_HASH_ALGO]
+                if obj_id != actual_obj_id:
+                    raise Error(
+                        'Corrupt object %s should have id %s'
+                        % (hashutil.hash_to_hex(obj_id),
+                           hashutil.hash_to_hex(actual_obj_id))
+                    )
+        except (OSError, IOError):
+            # IOError is for compatibility with older python versions
+            raise Error('Corrupt object %s is not a gzip file'
+                        % hashutil.hash_to_hex(obj_id))
+
+    def get_random(self, batch_size):
+        """ Get random ids of existing contents.
+
+        This method is used in order to get random ids to perform
+        content integrity verifications on random contents.
+
+        Args:
+            batch_size (int): number of ids that will be yielded.
+
+        Yields:
+            ids of contents that are in the current object storage.
+        """
+        def get_random_content(self, batch_size):
+            """ Get a batch of content inside a single directory.
+
+            Returns:
+                a tuple (batch size, batch).
+            """
+            dirs = []
+            for level in range(self.depth):
+                path = os.path.join(self.root, *dirs)
+                dir_list = next(os.walk(path))[1]
+                if 'tmp' in dir_list:
+                    dir_list.remove('tmp')
+                dirs.append(random.choice(dir_list))
+
+            path = os.path.join(self.root, *dirs)
+            content_list = next(os.walk(path))[2]
+            length = min(batch_size, len(content_list))
+            return length, map(hashutil.hex_to_hash,
+                               random.sample(content_list, length))
+
+        while batch_size:
+            length, it = get_random_content(self, batch_size)
+            batch_size = batch_size - length
+            yield from it
