Differential D95 Diff 320 swh/objstorage/objstorage_pathslicing.py

Changeset View

Standalone View

swh/objstorage/objstorage_pathslicing.py

# Copyright (C) 2015-2016 The Software Heritage developers		# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

import os		import os
import gzip		import gzip
import tempfile		import tempfile
import random		import random

from contextlib import contextmanager		from contextlib import contextmanager

from swh.core import hashutil		from swh.core import hashutil

from .objstorage import ObjStorage, ID_HASH_ALGO, ID_HASH_LENGTH		from .objstorage import ObjStorage, compute_hash, ID_HASH_ALGO, ID_HASH_LENGTH
from .exc import ObjNotFoundError, Error		from .exc import ObjNotFoundError, Error


GZIP_BUFSIZ = 1048576		GZIP_BUFSIZ = 1048576

DIR_MODE = 0o755		DIR_MODE = 0o755
FILE_MODE = 0o644		FILE_MODE = 0o644

▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	def __init__(self, root, slicing):
max_endchar = max(map(lambda bound: bound.stop, self.bounds))		max_endchar = max(map(lambda bound: bound.stop, self.bounds))
if ID_HASH_LENGTH < max_endchar:		if ID_HASH_LENGTH < max_endchar:
raise ValueError(		raise ValueError(
'Algorithm %s has too short hash for slicing to char %d'		'Algorithm %s has too short hash for slicing to char %d'
% (ID_HASH_ALGO, max_endchar)		% (ID_HASH_ALGO, max_endchar)
)		)

def __contains__(self, obj_id):		def __contains__(self, obj_id):
""" Check whether the given object is present in the storage or not.

Returns:
True iff the object is present in the storage.
"""
hex_obj_id = hashutil.hash_to_hex(obj_id)		hex_obj_id = hashutil.hash_to_hex(obj_id)
return os.path.exists(self._obj_path(hex_obj_id))		return os.path.exists(self._obj_path(hex_obj_id))

def __iter__(self):		def __iter__(self):
"""iterate over the object identifiers currently available in the storage		"""iterate over the object identifiers currently available in the storage

Warning: with the current implementation of the object storage, this		Warning: with the current implementation of the object storage, this
method will walk the filesystem to list objects, meaning that listing		method will walk the filesystem to list objects, meaning that listing
Show All 15 Lines	class PathSlicingObjStorage(ObjStorage):
def __len__(self):		def __len__(self):
"""compute the number of objects available in the storage		"""compute the number of objects available in the storage

Warning: this currently uses `__iter__`, its warning about bad		Warning: this currently uses `__iter__`, its warning about bad
performances applies		performances applies

Return:		Return:
number of objects contained in the storage		number of objects contained in the storage

"""		"""
return sum(1 for i in self)		return sum(1 for i in self)

def _obj_dir(self, hex_obj_id):		def _obj_dir(self, hex_obj_id):
""" Compute the storage directory of an object.		""" Compute the storage directory of an object.

See also: PathSlicingObjStorage::_obj_path		See also: PathSlicingObjStorage::_obj_path

Show All 13 Lines	def _obj_path(self, hex_obj_id):

Args:		Args:
hex_obj_id: object id as hexlified string.		hex_obj_id: object id as hexlified string.

Returns:		Returns:
Path to the actual object corresponding to the given id.		Path to the actual object corresponding to the given id.
"""		"""
return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id)		return os.path.join(self._obj_dir(hex_obj_id), hex_obj_id)

def add(self, bytes, obj_id=None, check_presence=True):		def add(self, content, obj_id=None, check_presence=True):
		zack (inline comment, marked done): Same comment about these docstrings that I made in D94. If they add nothing with respect to the parent class, please remove them.
""" Add a new object to the object storage.

Args:
bytes: content of the object to be added to the storage.
obj_id: checksum of [bytes] using [ID_HASH_ALGO] algorithm. When
given, obj_id will be trusted to match the bytes. If missing,
obj_id will be computed on the fly.
check_presence: indicate if the presence of the content should be
verified before adding the file.

Returns:
the id of the object into the storage.
"""
if obj_id is None:		if obj_id is None:
# Checksum is missing, compute it on the fly.		obj_id = compute_hash(content)
h = hashutil._new_hash(ID_HASH_ALGO, len(bytes))
h.update(bytes)
obj_id = h.digest()

if check_presence and obj_id in self:		if check_presence and obj_id in self:
# If the object is already present, return immediately.		# If the object is already present, return immediately.
return obj_id		return obj_id

hex_obj_id = hashutil.hash_to_hex(obj_id)		hex_obj_id = hashutil.hash_to_hex(obj_id)
with _write_obj_file(hex_obj_id, self) as f:		with _write_obj_file(hex_obj_id, self) as f:
f.write(bytes)		f.write(content)

return obj_id		return obj_id

def restore(self, bytes, obj_id=None):
""" Restore a content that has been corrupted.

This function is identical to add but does not check if
the object id is already in the file system.

Args:
bytes: content of the object to be added to the storage
obj_id: checksums of `bytes` as computed by ID_HASH_ALGO. When
given, obj_id will be trusted to match bytes. If missing,
obj_id will be computed on the fly.
"""
return self.add(bytes, obj_id, check_presence=False)

def get(self, obj_id):		def get(self, obj_id):
""" Retrieve the content of a given object.

Args:
obj_id: object id.

Returns:
the content of the requested object as bytes.

Raises:
ObjNotFoundError: if the requested object is missing.
"""
if obj_id not in self:		if obj_id not in self:
raise ObjNotFoundError(obj_id)		raise ObjNotFoundError(obj_id)

# Open the file and return its content as bytes		# Open the file and return its content as bytes
hex_obj_id = hashutil.hash_to_hex(obj_id)		hex_obj_id = hashutil.hash_to_hex(obj_id)
with _read_obj_file(hex_obj_id, self) as f:		with _read_obj_file(hex_obj_id, self) as f:
return f.read()		return f.read()

def check(self, obj_id):		def check(self, obj_id):
""" Perform an integrity check for a given object.

Verify that the file object is in place and that the gzipped content
matches the object id.

Args:
obj_id: object id.

Raises:
ObjNotFoundError: if the requested object is missing.
Error: if the requested object is corrupted.
"""
if obj_id not in self:		if obj_id not in self:
raise ObjNotFoundError(obj_id)		raise ObjNotFoundError(obj_id)

hex_obj_id = hashutil.hash_to_hex(obj_id)		hex_obj_id = hashutil.hash_to_hex(obj_id)

try:		try:
with gzip.open(self._obj_path(hex_obj_id)) as f:		with gzip.open(self._obj_path(hex_obj_id)) as f:
length = None		length = None
Show All 18 Lines	def check(self, obj_id):
% (hashutil.hash_to_hex(obj_id),		% (hashutil.hash_to_hex(obj_id),
hashutil.hash_to_hex(actual_obj_id))		hashutil.hash_to_hex(actual_obj_id))
)		)
except (OSError, IOError):		except (OSError, IOError):
# IOError is for compatibility with older python versions		# IOError is for compatibility with older python versions
raise Error('Corrupt object %s is not a gzip file' % obj_id)		raise Error('Corrupt object %s is not a gzip file' % obj_id)

def get_random(self, batch_size):		def get_random(self, batch_size):
""" Get random ids of existing contents

This method is used in order to get random ids to perform
content integrity verifications on random contents.

Attributes:
batch_size (int): Number of ids that will be given

Yields:
An iterable of ids of contents that are in the current object
storage.
"""
def get_random_content(self, batch_size):		def get_random_content(self, batch_size):
""" Get a batch of content inside a single directory.		""" Get a batch of content inside a single directory.

Returns:		Returns:
a tuple (batch size, batch).		a tuple (batch size, batch).
"""		"""
dirs = []		dirs = []
for level in range(len(self.bounds)):		for level in range(len(self.bounds)):
Show All 16 Lines