Changeset View
Changeset View
Standalone View
Standalone View
swh/model/hashutil.py
Show All 37 Lines | |||||
"""Algorithms computed by default when calling the functions from this module. | """Algorithms computed by default when calling the functions from this module. | ||||
Subset of :const:`ALGORITHMS`. | Subset of :const:`ALGORITHMS`. | ||||
""" | """ | ||||
HASH_BLOCK_SIZE = 32 * 1024
"""Size of the blocks used by the streaming hash computations of this module"""
_blake2_hash_cache = {}
"""Maps lowercased blake2 algorithm names to zero-argument callables that
return a new digest object for that algorithm."""


def _new_blake2_hash(algo):
    """Return a freshly initialized blake2 digest object for ``algo``.

    ``algo`` is a blake2 family name (``blake2b`` or ``blake2s``), optionally
    followed by the digest size in bits (e.g. ``blake2s256``, ``blake2b512``).
    The name is matched case-insensitively.

    The constructor for a given name is memoized in
    :data:`_blake2_hash_cache` once it has successfully produced a digest
    object, so the name is only parsed on first use.

    Args:
        algo (str): name of the blake2 algorithm to instantiate

    Returns:
        a new hashlib-compatible digest object

    Raises:
        ValueError: if ``algo`` does not name a blake2 algorithm, if its
            digest size is not a decimal number of bits that is a multiple
            of 8, or if the underlying implementation rejects the size
    """
    lalgo = algo.lower()

    factory = _blake2_hash_cache.get(lalgo)
    if factory is not None:
        return factory()

    blake_family, size_suffix = lalgo[:7], lalgo[7:]
    if blake_family not in ('blake2b', 'blake2s'):
        raise ValueError('Algorithm %s is not a blake2 hash' % algo)

    # Only pass digest_size to the constructor when the name carries one, so
    # that a bare family name uses the implementation's default size.
    constructor_kwargs = {}
    if size_suffix:
        # isdecimal() rejects signs, whitespace and underscores, which int()
        # would otherwise silently accept.
        if not size_suffix.isdecimal():
            raise ValueError('Unknown digest size for algo %s' % algo)
        digest_size, remainder = divmod(int(size_suffix), 8)
        if remainder:
            raise ValueError(
                'Digest size for algorithm %s must be a multiple of 8' % algo
            )
        constructor_kwargs['digest_size'] = digest_size

    if lalgo in hashlib.algorithms_available:
        # Handle the case where OpenSSL ships the given algorithm
        # (e.g. Python 3.5 on Debian 9 stretch)
        def factory():
            return hashlib.new(lalgo)
    else:
        # Try using the built-in implementation for Python 3.6+, and fall
        # back to the pyblake2 module otherwise.
        blake2 = getattr(hashlib, blake_family, None)
        if blake2 is None:
            import pyblake2
            blake2 = getattr(pyblake2, blake_family)

        def factory():
            return blake2(**constructor_kwargs)

    # Instantiate before caching: a constructor that rejects its arguments
    # (e.g. an out-of-range digest size) must not be memoized.
    new_hash = factory()
    _blake2_hash_cache[lalgo] = factory
    return new_hash
def _new_hashlib_hash(algo):
    """Create a digest object for the algorithm named ``algo``.

    Names starting with ``blake2`` are delegated to
    :func:`_new_blake2_hash`, which handles the swh-specific names of the
    blake2-related algorithms; every other name is passed unchanged to
    :func:`hashlib.new`.
    """
    if not algo.startswith('blake2'):
        return hashlib.new(algo)
    return _new_blake2_hash(algo)
def _new_git_hash(base_algo, git_type, length):
    """Create a digest object for ``base_algo`` pre-seeded with a git object
    header.

    The header fed to the digest is the concatenation of:

    - the object type, encoded in ASCII
    - one ASCII space (0x20)
    - the object length, as a decimal number encoded in ASCII
    - one NUL byte

    Args:
        base_algo (str from :const:`ALGORITHMS`): a hashlib-supported algorithm
        git_type: the type of the git object (supposedly one of 'blob',
            'commit', 'tag', 'tree')
        length: the length of the git object you're encoding

    Returns:
        the digest object, already updated with the git header
    """
    digest = _new_hashlib_hash(base_algo)
    digest.update(('%s %d\0' % (git_type, length)).encode('ascii'))
    return digest
def _new_hash(algo, length=None): | def _new_hash(algo, length=None): | ||||
"""Initialize a digest object (as returned by python's hashlib) for | """Initialize a digest object (as returned by python's hashlib) for | ||||
Show All 21 Lines | if algo not in ALGORITHMS: | ||||
(algo, ', '.join(sorted(ALGORITHMS)))) | (algo, ', '.join(sorted(ALGORITHMS)))) | ||||
if algo.endswith('_git'): | if algo.endswith('_git'): | ||||
if length is None: | if length is None: | ||||
raise ValueError('Missing length for git hashing algorithm') | raise ValueError('Missing length for git hashing algorithm') | ||||
base_algo = algo[:-4] | base_algo = algo[:-4] | ||||
return _new_git_hash(base_algo, 'blob', length) | return _new_git_hash(base_algo, 'blob', length) | ||||
return hashlib.new(algo) | return _new_hashlib_hash(algo) | ||||
def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): | def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): | ||||
"""Hash the contents of the given file object with the given algorithms. | """Hash the contents of the given file object with the given algorithms. | ||||
Args: | Args: | ||||
fobj: a file-like object | fobj: a file-like object | ||||
length: the length of the contents of the file-like object (for the | length: the length of the contents of the file-like object (for the | ||||
▲ Show 20 Lines • Show All 151 Lines • Show Last 20 Lines |