diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..6ce4572
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+from .objstorage import ObjStorage, DIR_MODE, FILE_MODE # NOQA
diff --git a/objstorage.py b/objstorage.py
new file mode 100644
index 0000000..1f78de0
--- /dev/null
+++ b/objstorage.py
@@ -0,0 +1,386 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import gzip
+import os
+import shutil
+import tempfile
+
+from contextlib import contextmanager
+
+from ..exc import ObjNotFoundError, Error
+from swh.core import hashutil
+
+
+ID_HASH_ALGO = 'sha1'
+# ID_HASH_ALGO = 'sha1_git'
+
+GZIP_BUFSIZ = 1048576  # 1 MiB
+
+DIR_MODE = 0o755
+FILE_MODE = 0o644
+
+
+def _obj_dir(hex_obj_id, root_dir, depth):
+    """compute the storage directory of an object
+
+    Args:
+        hex_obj_id: object id as hexlified string
+        root_dir: object storage root directory
+        depth: slicing depth of object IDs in the storage
+
+    see also: `_obj_path`
+
+    """
+    if len(hex_obj_id) < depth * 2:
+        raise ValueError('object id "%s" is too short for slicing at depth'
+                         ' %d' % (hex_obj_id, depth))
+
+    # compute [depth] substrings of [hex_obj_id], each of length 2, starting
+    # from the beginning
+    id_steps = [hex_obj_id[i*2:i*2+2] for i in range(0, depth)]
+    steps = [root_dir] + id_steps
+
+    return os.path.join(*steps)
+
+
+def _obj_path(hex_obj_id, root_dir, depth):
+    """similar to `_obj_dir`, but also includes the actual object file name
+    in the returned path
+
+    """
+    return os.path.join(_obj_dir(hex_obj_id, root_dir, depth), hex_obj_id)
+
+
+@contextmanager
+def _write_obj_file(hex_obj_id, root_dir, depth):
+    """context manager for writing object files to the object storage
+
+    During writing, data is written to a temporary file, which is atomically
+    renamed to the right file name after closing. This context manager also
+    takes care of (gzip) compressing the data on the fly.
+
+    Yields:
+        a file-like object open for writing bytes
+
+    Sample usage:
+
+        with _write_obj_file(hex_obj_id, root_dir, depth) as f:
+            f.write(obj_data)
+
+    """
+    dir = _obj_dir(hex_obj_id, root_dir, depth)
+    if not os.path.isdir(dir):
+        os.makedirs(dir, DIR_MODE, exist_ok=True)
+
+    path = os.path.join(dir, hex_obj_id)
+    (tmp, tmp_path) = tempfile.mkstemp(suffix='.tmp', prefix=hex_obj_id + '.',
+                                       dir=dir)
+    tmp_f = os.fdopen(tmp, 'wb')
+    try:
+        with gzip.GzipFile(filename=tmp_path, fileobj=tmp_f) as f:
+            yield f
+        tmp_f.close()
+        os.chmod(tmp_path, FILE_MODE)
+        os.rename(tmp_path, path)
+    except BaseException:
+        # do not leave a stale temporary file behind on errors
+        tmp_f.close()
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+        raise
+
+
+class ObjStorage:
+    """high-level API to manipulate the Software Heritage object storage
+
+    Conceptually, the object storage offers 4 methods:
+
+    - add()           add a new object, returning an object id
+    - __contains__()  check if an object is present, by object id
+    - get()           retrieve the content of an object, by object id
+    - check()         check the integrity of an object, by object id
+
+    Variants of the above methods are implemented by this class, depending on
+    how the content of an object is specified (bytes, file-like object,
+    etc.).
+
+    On disk, an object storage is a directory tree containing files named
+    after their object IDs. An object ID is a checksum of its content,
+    depending on the value of the ID_HASH_ALGO constant (see hashutil for its
+    meaning).
+
+    To avoid directories that contain too many files, the object storage has
+    a given depth (default: 3).
+    Each depth level consumes two characters of the object id. So for
+    instance a file with (git) SHA1 of
+    34973274ccef6ab4dfaaf86599792fa9c3fe4689 will be stored in an object
+    storage configured at depth 3 at
+    34/97/32/34973274ccef6ab4dfaaf86599792fa9c3fe4689.
+
+    The actual files in the storage are stored in gzip-compressed format.
+
+    Each file can hence be self-verified (on the shell) with something like:
+
+        expected_id=34973274ccef6ab4dfaaf86599792fa9c3fe4689
+        actual_id=$(zcat "$filename" | sha1sum | cut -f 1 -d' ')
+        if [ "$actual_id" != "$expected_id" ] ; then
+            echo "AYEEE, invalid object $expected_id /o\\"
+        fi
+
+    """
+
+    def __init__(self, root, depth=3):
+        """create a proxy object to the object storage
+
+        Args:
+            root: object storage root directory
+            depth: slicing depth of object IDs in the storage
+
+        """
+        if not os.path.isdir(root):
+            raise ValueError('obj storage root "%s" is not a directory'
+                             % root)
+
+        self._root_dir = root
+        self._depth = depth
+
+        self._temp_dir = os.path.join(root, 'tmp')
+        if not os.path.isdir(self._temp_dir):
+            os.makedirs(self._temp_dir, DIR_MODE, exist_ok=True)
+
+    def __obj_dir(self, hex_obj_id):
+        """_obj_dir wrapper using this storage configuration"""
+        return _obj_dir(hex_obj_id, self._root_dir, self._depth)
+
+    def __obj_path(self, hex_obj_id):
+        """_obj_path wrapper using this storage configuration"""
+        return _obj_path(hex_obj_id, self._root_dir, self._depth)
+
+    def __contains__(self, obj_id):
+        """check whether a given object id is present in the storage or not
+
+        Return:
+            True iff the object id is present in the storage
+
+        """
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+        return os.path.exists(self.__obj_path(hex_obj_id))
+
+    def add_bytes(self, bytes, obj_id=None):
+        """add a new object to the object storage
+
+        Args:
+            bytes: content of the object to be added to the storage
+            obj_id: checksum of `bytes` as computed by ID_HASH_ALGO. When
+                given, obj_id will be trusted to match bytes. If missing,
+                obj_id will be computed on the fly.
+
+        Returns:
+            the id of the added object
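+
+        Sample usage (a minimal sketch; assumes `objstorage` is an existing
+        ObjStorage instance):
+
+            obj_id = objstorage.add_bytes(b'some content')
+            assert obj_id in objstorage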
+
+        """
+        if obj_id is None:
+            # missing checksum, compute it in memory and write to file
+            h = hashutil._new_hash(ID_HASH_ALGO, len(bytes))
+            h.update(bytes)
+            obj_id = h.digest()
+
+        if obj_id in self:
+            return obj_id
+
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+        # the object is absent: write it out
+        with _write_obj_file(hex_obj_id,
+                             root_dir=self._root_dir,
+                             depth=self._depth) as f:
+            f.write(bytes)
+
+        return obj_id
+
+    def add_file(self, f, length, obj_id=None):
+        """similar to `add_bytes`, but adds the content of file-like object f
+        to the object storage
+
+        add_file will read the file content only once, and avoids storing all
+        of it in memory
+
+        Args:
+            f: file-like object, open for reading bytes
+            length: length of the content of `f`
+            obj_id: see `add_bytes`
+
+        """
+        if obj_id is None:
+            # unknown object id: work on a temporary file, computing the
+            # checksum as we go, then move the file into place
+            (tmp, tmp_path) = tempfile.mkstemp(dir=self._temp_dir)
+            try:
+                t = os.fdopen(tmp, 'wb')
+                tz = gzip.GzipFile(fileobj=t, mode='wb')
+                sums = hashutil._hash_file_obj(f, length,
+                                               algorithms=[ID_HASH_ALGO],
+                                               chunk_cb=lambda b: tz.write(b))
+                tz.close()
+                t.close()
+
+                obj_id = sums[ID_HASH_ALGO]
+                if obj_id in self:
+                    return obj_id
+
+                hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+                dir = self.__obj_dir(hex_obj_id)
+                if not os.path.isdir(dir):
+                    os.makedirs(dir, DIR_MODE, exist_ok=True)
+                path = os.path.join(dir, hex_obj_id)
+
+                os.chmod(tmp_path, FILE_MODE)
+                os.rename(tmp_path, path)
+            finally:
+                if os.path.exists(tmp_path):
+                    os.unlink(tmp_path)
+        else:
+            # known object id: write to a temporary file, atomically renamed
+            # into place by _write_obj_file
+            if obj_id in self:
+                return obj_id
+
+            hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+            with _write_obj_file(hex_obj_id,
+                                 root_dir=self._root_dir,
+                                 depth=self._depth) as obj:
+                shutil.copyfileobj(f, obj)
+
+        return obj_id
+
+    @contextmanager
+    def get_file_obj(self, obj_id):
+        """context manager to read the content of an object
+
+        Args:
+            obj_id: object id
+
+        Yields:
+            a file-like object open for reading (bytes)
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing
+
+        Sample usage:
+
+            with objstorage.get_file_obj(obj_id) as f:
+                do_something(f.read())
+
+        """
+        if obj_id not in self:
+            raise ObjNotFoundError(obj_id)
+
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+        path = self.__obj_path(hex_obj_id)
+        with gzip.GzipFile(path, 'rb') as f:
+            yield f
+
+    def get_bytes(self, obj_id):
+        """retrieve the content of a given object
+
+        Args:
+            obj_id: object id
+
+        Returns:
+            the content of the requested object as bytes
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing
+
+        """
+        with self.get_file_obj(obj_id) as f:
+            return f.read()
+
+    def _get_file_path(self, obj_id):
+        """retrieve the path of a given object in the objects storage
+
+        Note that the path points to a gzip-compressed file, so you need
+        gzip.open() or equivalent to get the actual object content.
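+
+        Sample usage (a sketch, mirroring get_file_obj's example; assumes
+        `objstorage` is an ObjStorage instance):
+
+            path = objstorage._get_file_path(obj_id)
+            with gzip.open(path, 'rb') as f:
+                content = f.read()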
+
+        Args:
+            obj_id: object id
+
+        Returns:
+            a file path pointing into the object storage
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing
+
+        """
+        if obj_id not in self:
+            raise ObjNotFoundError(obj_id)
+
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+        return self.__obj_path(hex_obj_id)
+
+    def check(self, obj_id):
+        """integrity check for a given object
+
+        verify that the file object is in place, and that the gzipped content
+        matches the object id
+
+        Args:
+            obj_id: object id
+
+        Raises:
+            ObjNotFoundError: if the requested object is missing
+            Error: if the requested object is corrupt
+
+        """
+        if obj_id not in self:
+            raise ObjNotFoundError(obj_id)
+
+        hex_obj_id = hashutil.hash_to_hex(obj_id)
+
+        try:
+            with gzip.open(self.__obj_path(hex_obj_id)) as f:
+                length = None
+                if ID_HASH_ALGO.endswith('_git'):
+                    # if the hashing algorithm is git-like, we need to know
+                    # the content size to hash on the fly. Do a first pass
+                    # here to compute the size
+                    length = 0
+                    while True:
+                        chunk = f.read(GZIP_BUFSIZ)
+                        length += len(chunk)
+                        if not chunk:
+                            break
+                    f.rewind()
+
+                checksums = hashutil._hash_file_obj(f, length,
+                                                    algorithms=[ID_HASH_ALGO])
+                actual_obj_id = checksums[ID_HASH_ALGO]
+                if obj_id != actual_obj_id:
+                    raise Error('corrupt object %s should have id %s'
+                                % (hex_obj_id,
+                                   hashutil.hash_to_hex(actual_obj_id)))
+        except (OSError, IOError):
+            # IOError is for compatibility with older python versions
+            raise Error('corrupt object %s is not a gzip file' % hex_obj_id)
+
+    def __iter__(self):
+        """iterate over the object identifiers currently available in the
+        storage
+
+        Warning: with the current implementation of the object storage, this
+        method will walk the filesystem to list objects, meaning that listing
+        all objects will be very slow for large storages. You almost
+        certainly don't want to use this method in production.
+
+        Return:
+            iterator over object IDs
+
+        """
+        def obj_iterator():
+            # XXX hackish: it does not verify that the depth of found files
+            # matches the slicing depth of the storage
+            for root, dirs, files in os.walk(self._root_dir):
+                if root == self._root_dir and 'tmp' in dirs:
+                    # do not recurse into the temporary directory: it only
+                    # holds in-flight uploads, not stored objects
+                    dirs.remove('tmp')
+                for f in files:
+                    yield bytes.fromhex(f)
+
+        return obj_iterator()
+
+    def __len__(self):
+        """compute the number of objects available in the storage
+
+        Warning: this currently uses `__iter__`, so its warning about poor
+        performance applies here as well
+
+        Return:
+            number of objects contained in the storage
+
+        """
+        return sum(1 for i in self)
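
A minimal end-to-end usage sketch (not part of the patch; the import path is
hypothetical and depends on where this package is mounted in the source tree,
and the root directory must exist beforehand):

    from swh.storage.objstorage import ObjStorage

    objstorage = ObjStorage('/srv/objects', depth=3)
    obj_id = objstorage.add_bytes(b'some content')
    assert obj_id in objstorage
    assert objstorage.get_bytes(obj_id) == b'some content'
    objstorage.check(obj_id)  # raises Error if the stored object is corrupt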