Changeset View
Changeset View
Standalone View
Standalone View
swh/objstorage/objstorage_pathslicing.py
# Copyright (C) 2015-2016 The Software Heritage developers | # Copyright (C) 2015-2016 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import gzip | import gzip | ||||
import tempfile | import tempfile | ||||
import random | import random | ||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
from swh.core import hashutil | from swh.core import hashutil | ||||
from .objstorage import ObjStorage, ID_HASH_ALGO, ID_HASH_LENGTH | from .objstorage import ObjStorage, compute_hash, ID_HASH_ALGO, ID_HASH_LENGTH | ||||
from .exc import ObjNotFoundError, Error | from .exc import ObjNotFoundError, Error | ||||
GZIP_BUFSIZ = 1048576 | GZIP_BUFSIZ = 1048576 | ||||
DIR_MODE = 0o755 | DIR_MODE = 0o755 | ||||
FILE_MODE = 0o644 | FILE_MODE = 0o644 | ||||
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines | def __init__(self, root, slicing): | ||||
max_endchar = max(map(lambda bound: bound.stop, self.bounds)) | max_endchar = max(map(lambda bound: bound.stop, self.bounds)) | ||||
if ID_HASH_LENGTH < max_endchar: | if ID_HASH_LENGTH < max_endchar: | ||||
raise ValueError( | raise ValueError( | ||||
'Algorithm %s has too short hash for slicing to char %d' | 'Algorithm %s has too short hash for slicing to char %d' | ||||
% (ID_HASH_ALGO, max_endchar) | % (ID_HASH_ALGO, max_endchar) | ||||
) | ) | ||||
def __contains__(self, obj_id): | def __contains__(self, obj_id): | ||||
""" Check whether the given object is present in the storage or not. | |||||
Returns: | |||||
True iff the object is present in the storage. | |||||
""" | |||||
hex_obj_id = hashutil.hash_to_hex(obj_id) | hex_obj_id = hashutil.hash_to_hex(obj_id) | ||||
return os.path.exists(self._obj_path(hex_obj_id)) | return os.path.exists(self._obj_path(hex_obj_id)) | ||||
def __iter__(self): | def __iter__(self): | ||||
"""iterate over the object identifiers currently available in the storage | """iterate over the object identifiers currently available in the storage | ||||
Warning: with the current implementation of the object storage, this | Warning: with the current implementation of the object storage, this | ||||
method will walk the filesystem to list objects, meaning that listing | method will walk the filesystem to list objects, meaning that listing | ||||
Show All 15 Lines | class PathSlicingObjStorage(ObjStorage): | ||||
def __len__(self): | def __len__(self): | ||||
"""compute the number of objects available in the storage | """compute the number of objects available in the storage | ||||
Warning: this currently uses `__iter__`, its warning about bad | Warning: this currently uses `__iter__`, its warning about bad | ||||
performances applies | performances applies | ||||
Return: | Return: | ||||
number of objects contained in the storage | number of objects contained in the storage | ||||
""" | """ | ||||
return sum(1 for i in self) | return sum(1 for i in self) | ||||
def _obj_dir(self, hex_obj_id): | def _obj_dir(self, hex_obj_id): | ||||
""" Compute the storage directory of an object. | """ Compute the storage directory of an object. | ||||
See also: PathSlicingObjStorage::_obj_path | See also: PathSlicingObjStorage::_obj_path | ||||
Show All 13 Lines | def _obj_path(self, hex_obj_id): | ||||
Args: | Args: | ||||
hex_obj_id: object id as hexlified string. | hex_obj_id: object id as hexlified string. | ||||
Returns: | Returns: | ||||
Path to the actual object corresponding to the given id. | Path to the actual object corresponding to the given id. | ||||
""" | """ | ||||
return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id) | return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id) | ||||
def add(self, bytes, obj_id=None, check_presence=True): | def add(self, content, obj_id=None, check_presence=True): | ||||
zack: Same comment about these docstrings that I made in D94. //If// they add nothing wrt parent… | |||||
""" Add a new object to the object storage. | |||||
Args: | |||||
bytes: content of the object to be added to the storage. | |||||
obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When | |||||
given, obj_id will be trusted to match the bytes. If missing, | |||||
obj_id will be computed on the fly. | |||||
check_presence: indicate if the presence of the content should be | |||||
verified before adding the file. | |||||
Returns: | |||||
the id of the object into the storage. | |||||
""" | |||||
if obj_id is None: | if obj_id is None: | ||||
# Checksum is missing, compute it on the fly. | obj_id = compute_hash(content) | ||||
h = hashutil._new_hash(ID_HASH_ALGO, len(bytes)) | |||||
h.update(bytes) | |||||
obj_id = h.digest() | |||||
if check_presence and obj_id in self: | if check_presence and obj_id in self: | ||||
# If the object is already present, return immediately. | # If the object is already present, return immediately. | ||||
return obj_id | return obj_id | ||||
hex_obj_id = hashutil.hash_to_hex(obj_id) | hex_obj_id = hashutil.hash_to_hex(obj_id) | ||||
with _write_obj_file(hex_obj_id, self) as f: | with _write_obj_file(hex_obj_id, self) as f: | ||||
f.write(bytes) | f.write(content) | ||||
return obj_id | return obj_id | ||||
def restore(self, bytes, obj_id=None): | |||||
""" Restore a content that has been corrupted. | |||||
This function is identical to add but does not check if | |||||
the object id is already in the file system. | |||||
Args: | |||||
bytes: content of the object to be added to the storage | |||||
obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When | |||||
given, obj_id will be trusted to match bytes. If missing, | |||||
obj_id will be computed on the fly. | |||||
""" | |||||
return self.add(bytes, obj_id, check_presence=False) | |||||
def get(self, obj_id): | def get(self, obj_id): | ||||
""" Retrieve the content of a given object. | |||||
Args: | |||||
obj_id: object id. | |||||
Returns: | |||||
the content of the requested object as bytes. | |||||
Raises: | |||||
ObjNotFoundError: if the requested object is missing. | |||||
""" | |||||
if obj_id not in self: | if obj_id not in self: | ||||
raise ObjNotFoundError(obj_id) | raise ObjNotFoundError(obj_id) | ||||
# Open the file and return its content as bytes | # Open the file and return its content as bytes | ||||
hex_obj_id = hashutil.hash_to_hex(obj_id) | hex_obj_id = hashutil.hash_to_hex(obj_id) | ||||
with _read_obj_file(hex_obj_id, self) as f: | with _read_obj_file(hex_obj_id, self) as f: | ||||
return f.read() | return f.read() | ||||
def check(self, obj_id): | def check(self, obj_id): | ||||
""" Perform an integrity check for a given object. | |||||
Verify that the file object is in place and that the gzipped content | |||||
matches the object id. | |||||
Args: | |||||
obj_id: object id. | |||||
Raises: | |||||
ObjNotFoundError: if the requested object is missing. | |||||
Error: if the requested object is corrupted. | |||||
""" | |||||
if obj_id not in self: | if obj_id not in self: | ||||
raise ObjNotFoundError(obj_id) | raise ObjNotFoundError(obj_id) | ||||
hex_obj_id = hashutil.hash_to_hex(obj_id) | hex_obj_id = hashutil.hash_to_hex(obj_id) | ||||
try: | try: | ||||
with gzip.open(self._obj_path(hex_obj_id)) as f: | with gzip.open(self._obj_path(hex_obj_id)) as f: | ||||
length = None | length = None | ||||
Show All 18 Lines | def check(self, obj_id): | ||||
% (hashutil.hash_to_hex(obj_id), | % (hashutil.hash_to_hex(obj_id), | ||||
hashutil.hash_to_hex(actual_obj_id)) | hashutil.hash_to_hex(actual_obj_id)) | ||||
) | ) | ||||
except (OSError, IOError): | except (OSError, IOError): | ||||
# IOError is for compatibility with older python versions | # IOError is for compatibility with older python versions | ||||
raise Error('Corrupt object %s is not a gzip file' % obj_id) | raise Error('Corrupt object %s is not a gzip file' % obj_id) | ||||
def get_random(self, batch_size): | def get_random(self, batch_size): | ||||
""" Get random ids of existing contents | |||||
This method is used in order to get random ids to perform | |||||
content integrity verifications on random contents. | |||||
Attributes: | |||||
batch_size (int): Number of ids that will be given | |||||
Yields: | |||||
An iterable of ids of contents that are in the current object | |||||
storage. | |||||
""" | |||||
def get_random_content(self, batch_size): | def get_random_content(self, batch_size): | ||||
""" Get a batch of content inside a single directory. | """ Get a batch of content inside a single directory. | ||||
Returns: | Returns: | ||||
a tuple (batch size, batch). | a tuple (batch size, batch). | ||||
""" | """ | ||||
dirs = [] | dirs = [] | ||||
for level in range(len(self.bounds)): | for level in range(len(self.bounds)): | ||||
Show All 16 Lines |
Same comment about these docstrings that I made in D94. If they add nothing with respect to the parent class, please remove them.