diff --git a/debian/control b/debian/control --- a/debian/control +++ b/debian/control @@ -7,6 +7,7 @@ python3-all, python3-nose, python3-setuptools, + python3-pyblake2, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DMOD/ diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -0,0 +1 @@ +pyblake2 diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -1,21 +1,53 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +"""Module in charge of hashing functions definition. This is the base +module used to compute swh's hashes. + +Only a subset of hashing algorithms is supported as defined in the +ALGORITHMS set. Any provided algorithm not in that set +will result in a ValueError. + +This module defines the following hashing functions: +- hash_file: Hash the contents of the given file object with the given + algorithms. +- hash_data: Hash the given binary blob with the given algorithms. +- hash_path: Hash the contents of the file at the given path with the + given algorithms. + +All those functions use by default the DEFAULT_ALGORITHMS set of +hashing algorithms. This can be overridden on a call basis by providing +the algorithms parameter. 
+ +""" + +import sys import binascii import functools import hashlib -from io import BytesIO import os -# supported hashing algorithms -ALGORITHMS = set(['sha1', 'sha256', 'sha1_git']) +from io import BytesIO + +# Supported algorithms +ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256', 'blake2b512']) + +# Default algorithms used +DEFAULT_ALGORITHMS = set(['sha1', 'sha256', 'sha1_git']) # should be a multiple of 64 (sha1/sha256's block size) # FWIW coreutils' sha1sum uses 32768 HASH_BLOCK_SIZE = 32768 +# For old python3 version, dependency on pyblake2 +if sys.version_info.major == 3 and sys.version_info.minor <= 4: + import pyblake2 + cache = hashlib.__builtin_constructor_cache + cache['blake2s256'] = pyblake2.blake2s + cache['blake2b512'] = pyblake2.blake2b + def _new_git_hash(base_algo, git_type, length): """Initialize a digest object (as returned by python's hashlib) for the @@ -46,16 +78,16 @@ def _new_hash(algo, length=None): - """Initialize a digest object (as returned by python's hashlib) for the - requested algorithm. See the constant ALGORITHMS for the list of supported - algorithms. If a git-specific hashing algorithm is requested (e.g., - "sha1_git"), the hashing object will be pre-fed with the needed header; for - this to work, length must be given. + """Initialize a digest object (as returned by python's hashlib) for + the requested algorithm. See the constant ALGORITHMS for the list + of supported algorithms. If a git-specific hashing algorithm is + requested (e.g., "sha1_git"), the hashing object will be pre-fed + with the needed header; for this to work, length must be given. 
Args: - algo: a hashing algorithm (one of ALGORITHMS) - length: the length of the hashed payload (needed for git-specific - algorithms) + algo (str): a hashing algorithm (one of ALGORITHMS) + length (int): the length of the hashed payload (needed for + git-specific algorithms) Returns: a hashutil.hash object @@ -63,25 +95,23 @@ Raises: ValueError if algo is unknown, or length is missing for a git-specific hash. + """ if algo not in ALGORITHMS: - raise ValueError('Unexpected hashing algorithm %s, ' - 'expected one of %s' % - (algo, ', '.join(sorted(ALGORITHMS)))) + raise ValueError( + 'Unexpected hashing algorithm %s, expected one of %s' % + (algo, ', '.join(sorted(ALGORITHMS)))) - h = None if algo.endswith('_git'): if length is None: raise ValueError('Missing length for git hashing algorithm') base_algo = algo[:-4] - h = _new_git_hash(base_algo, 'blob', length) - else: - h = hashlib.new(algo) + return _new_git_hash(base_algo, 'blob', length) - return h + return hashlib.new(algo) -def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None): +def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): """Hash the contents of the given file object with the given algorithms. Args: @@ -109,8 +139,9 @@ return {algo: hash.digest() for algo, hash in hashes.items()} -def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None): - """Hash the contents of the file at the given path with the given algorithms. +def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): + """Hash the contents of the file at the given path with the given + algorithms. Args: path: the path of the file to hash @@ -122,6 +153,7 @@ Raises: ValueError if algorithms contains an unknown hash algorithm. 
OSError on file access error + """ length = os.path.getsize(path) with open(path, 'rb') as fobj: @@ -130,7 +162,7 @@ return hash -def hash_data(data, algorithms=ALGORITHMS): +def hash_data(data, algorithms=DEFAULT_ALGORITHMS): """Hash the given binary blob with the given algorithms. Args: diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -8,6 +8,7 @@ import unittest from nose.tools import istest +from unittest.mock import patch from swh.model import hashutil @@ -124,6 +125,36 @@ hashutil.bytehex_to_hash( self.hex_checksums[algo].encode())) + @istest + def new_hash_unsupported_hashing_algorithm(self): + try: + hashutil._new_hash('blake2:10') + except ValueError as e: + self.assertEquals(str(e), + 'Unexpected hashing algorithm blake2:10, ' + 'expected one of blake2b512, blake2s256, ' + 'sha1, sha1_git, sha256') + + @patch('swh.model.hashutil.hashlib') + @istest + def new_hash_blake2b(self, mock_hashlib): + mock_hashlib.new.return_value = 'some-hashlib-object' + + h = hashutil._new_hash('blake2b512') + + self.assertEquals(h, 'some-hashlib-object') + mock_hashlib.new.assert_called_with('blake2b512') + + @patch('swh.model.hashutil.hashlib') + @istest + def new_hash_blake2s(self, mock_hashlib): + mock_hashlib.new.return_value = 'some-hashlib-object' + + h = hashutil._new_hash('blake2s256') + + self.assertEquals(h, 'some-hashlib-object') + mock_hashlib.new.assert_called_with('blake2s256') + class HashlibGit(unittest.TestCase):