diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index ea28414..0075e63 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -1,201 +1,226 @@
-# Copyright (C) 2015 The Software Heritage developers
+# Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import functools
 import hashlib
-from io import BytesIO
 import os
+import sys
+
+from io import BytesIO
 
 # supported hashing algorithms
 ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
 
 # should be a multiple of 64 (sha1/sha256's block size)
 # FWIW coreutils' sha1sum uses 32768
 HASH_BLOCK_SIZE = 32768
 
 
 def _new_git_hash(base_algo, git_type, length):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm, and feed it with the header for a git object of the
     given type and length.
 
     The header for hashing a git object consists of:
      - The type of the object (encoded in ASCII)
      - One ASCII space (\x20)
      - The length of the object (decimal encoded in ASCII)
      - One NUL byte
 
     Args:
         base_algo: a hashlib-supported algorithm
         git_type: the type of the git object (supposedly one of 'blob',
                   'commit', 'tag', 'tree')
         length: the length of the git object you're encoding
 
     Returns:
         a hashutil.hash object
     """
 
     h = hashlib.new(base_algo)
     git_header = '%s %d\0' % (git_type, length)
     h.update(git_header.encode('ascii'))
 
     return h
 
 
 def _new_hash(algo, length=None):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm. See the constant ALGORITHMS for the list of supported
     algorithms. If a git-specific hashing algorithm is requested (e.g.,
     "sha1_git"), the hashing object will be pre-fed with the needed header; for
     this to work, length must be given.
+
+    Variable-length algorithms are requested as '<name>:<digest_size>'
+    (e.g. 'blake2b:64'), with name one of 'blake2b' or 'blake2s' and
+    digest_size the digest length in bytes; they need python3 >= 3.6.
 
     Args:
         algo: a hashing algorithm (one of ALGORITHMS)
         length: the length of the hashed payload (needed for git-specific
                 algorithms)
 
     Returns:
         a hashutil.hash object
 
     Raises:
         ValueError if algo is unknown, or length is missing for a git-specific
         hash.
     """
-    if algo not in ALGORITHMS:
+    if algo not in ALGORITHMS and ':' not in algo:
         raise ValueError('Unexpected hashing algorithm %s, '
                          'expected one of %s' %
                          (algo, ', '.join(sorted(ALGORITHMS))))
 
     h = None
     if algo.endswith('_git'):
         if length is None:
             raise ValueError('Missing length for git hashing algorithm')
         base_algo = algo[:-4]
         h = _new_git_hash(base_algo, 'blob', length)
+    elif ':' in algo:  # variable length hashing algorithms (only from
+                       # python3 >= 3.6)
+        python_version = (sys.version_info.major, sys.version_info.minor)
+        if python_version < (3, 6):
+            raise ValueError('Unsupported hashing algorithm %s' % algo)
+
+        base_algo, _, size = algo.partition(':')
+        try:
+            digest_size = int(size)
+        except ValueError:
+            digest_size = None
+        if digest_size is None or base_algo not in ('blake2b', 'blake2s'):
+            raise ValueError('Unexpected hashing algorithm %s, '
+                             'expected one of %s' %
+                             (algo, ', '.join(sorted(ALGORITHMS))))
+
+        # digest_size is in bytes; hashlib itself raises ValueError when it
+        # is out of range for the requested algorithm
+        h = getattr(hashlib, base_algo)(digest_size=digest_size)
     else:
         h = hashlib.new(algo)
 
     return h
 
 
 def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
     """Hash the contents of the given file object with the given algorithms.
 
     Args:
         fobj: a file-like object
         length: the length of the contents of the file-like object (for the
                 git-specific algorithms)
         algorithms: the hashing algorithms used
 
     Returns: a dict mapping each algorithm to a bytes digest.
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
     """
     hashes = {algo: _new_hash(algo, length) for algo in algorithms}
 
     while True:
         chunk = fobj.read(HASH_BLOCK_SIZE)
         if not chunk:
             break
         for hash in hashes.values():
             hash.update(chunk)
         if chunk_cb:
             chunk_cb(chunk)
 
     return {algo: hash.digest() for algo, hash in hashes.items()}
 
 
 def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
     """Hash the contents of the file at the given path with the given
     algorithms.
 
     Args:
         path: the path of the file to hash
         algorithms: the hashing algorithms used
         chunk_cb: a callback
 
     Returns: a dict mapping each algorithm to a bytes digest.
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
         OSError on file access error
     """
     length = os.path.getsize(path)
     with open(path, 'rb') as fobj:
         hash = hash_file(fobj, length, algorithms, chunk_cb)
     hash['length'] = length
     return hash
 
 
 def hash_data(data, algorithms=ALGORITHMS):
     """Hash the given binary blob with the given algorithms.
 
     Args:
         data: a bytes object
         algorithms: the hashing algorithms used
 
     Returns: a dict mapping each algorithm to a bytes digest
 
     Raises:
         TypeError if data does not support the buffer interface.
         ValueError if algorithms contains an unknown hash algorithm.
     """
     fobj = BytesIO(data)
     return hash_file(fobj, len(data), algorithms)
 
 
 def hash_git_data(data, git_type, base_algo='sha1'):
     """Hash the given data as a git object of type git_type.
 
     Args:
         data: a bytes object
         git_type: the git object type
         base_algo: the base hashing algorithm used (default: sha1)
 
     Returns: a dict mapping each algorithm to a bytes digest
 
     Raises:
         ValueError if the git_type is unexpected.
     """
 
     git_object_types = {'blob', 'tree', 'commit', 'tag'}
     if git_type not in git_object_types:
         raise ValueError('Unexpected git object type %s, expected one of %s' %
                          (git_type, ', '.join(sorted(git_object_types))))
 
     h = _new_git_hash(base_algo, git_type, len(data))
     h.update(data)
 
     return h.digest()
 
 
 @functools.lru_cache()
 def hash_to_hex(hash):
     """Converts a hash (in hex or bytes form) to its hexadecimal ascii form"""
     if isinstance(hash, str):
         return hash
     return binascii.hexlify(hash).decode('ascii')
 
 
 @functools.lru_cache()
 def hash_to_bytehex(hash):
     """Converts a hash to its hexadecimal bytes representation"""
     return binascii.hexlify(hash)
 
 
 @functools.lru_cache()
 def hash_to_bytes(hash):
     """Converts a hash (in hex or bytes form) to its raw bytes form"""
     if isinstance(hash, bytes):
         return hash
     return bytes.fromhex(hash)
 
 
 @functools.lru_cache()
 def bytehex_to_hash(hex):
     """Converts a hexadecimal bytes representation of a hash to that hash"""
     return hash_to_bytes(hex.decode())
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
index 614e7ee..afa5a58 100644
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -1,210 +1,269 @@
 # Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
 import tempfile
 import unittest
 
 from nose.tools import istest
+from unittest.mock import MagicMock, patch
 
 from swh.model import hashutil
 
 
 class Hashutil(unittest.TestCase):
     def setUp(self):
         self.data = b'1984\n'
         self.hex_checksums = {
             'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
             'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d',
             'sha256': '26602113b4b9afd9d55466b08580d3c2'
                       '4a9b50ee5b5866c0d91fab0e65907311',
         }
 
         self.checksums = {
             type: bytes.fromhex(cksum)
             for type, cksum in self.hex_checksums.items()
         }
 
         self.git_hex_checksums = {
             'blob': self.hex_checksums['sha1_git'],
             'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
             'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
             'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
         }
 
         self.git_checksums = {
             type: bytes.fromhex(cksum)
             for type, cksum in self.git_hex_checksums.items()
         }
 
     @istest
     def hash_data(self):
         checksums = hashutil.hash_data(self.data)
         self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_data_unknown_hash(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_data(self.data, ['unknown-hash'])
 
         self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
         self.assertIn('unknown-hash', cm.exception.args[0])
 
     @istest
     def hash_git_data(self):
         checksums = {
             git_type: hashutil.hash_git_data(self.data, git_type)
             for git_type in self.git_checksums
         }
 
         self.assertEqual(checksums, self.git_checksums)
 
     @istest
     def hash_git_data_unknown_git_type(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_git_data(self.data, 'unknown-git-type')
 
         self.assertIn('Unexpected git object type', cm.exception.args[0])
         self.assertIn('unknown-git-type', cm.exception.args[0])
 
     @istest
     def hash_file(self):
         fobj = io.BytesIO(self.data)
 
         checksums = hashutil.hash_file(fobj, length=len(self.data))
         self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_file_missing_length(self):
         fobj = io.BytesIO(self.data)
 
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_file(fobj, algorithms=['sha1_git'])
 
         self.assertIn('Missing length', cm.exception.args[0])
 
     @istest
     def hash_path(self):
         with tempfile.NamedTemporaryFile(delete=False) as f:
             f.write(self.data)
             f.close()
             hashes = hashutil.hash_path(f.name)
 
         self.checksums['length'] = len(self.data)
         self.assertEquals(self.checksums, hashes)
 
     @istest
     def hash_to_hex(self):
         for type in self.checksums:
             hex = self.hex_checksums[type]
             hash = self.checksums[type]
             self.assertEquals(hashutil.hash_to_hex(hex), hex)
             self.assertEquals(hashutil.hash_to_hex(hash), hex)
 
     @istest
     def hash_to_bytes(self):
         for type in self.checksums:
             hex = self.hex_checksums[type]
             hash = self.checksums[type]
             self.assertEquals(hashutil.hash_to_bytes(hex), hash)
             self.assertEquals(hashutil.hash_to_bytes(hash), hash)
 
     @istest
     def hash_to_bytehex(self):
         for algo in self.checksums:
             self.assertEqual(self.hex_checksums[algo].encode('ascii'),
                              hashutil.hash_to_bytehex(self.checksums[algo]))
 
     @istest
     def bytehex_to_hash(self):
         for algo in self.checksums:
             self.assertEqual(self.checksums[algo],
                              hashutil.bytehex_to_hash(
                                  self.hex_checksums[algo].encode()))
 
+    @patch('swh.model.hashutil.sys')
+    @istest
+    def new_hash_unsupported_hashing_algorithm(self, mock_sys):
+        # pin an interpreter older than 3.6 so the outcome is deterministic
+        mock_sys.version_info = MagicMock(major=3, minor=5)
+
+        with self.assertRaises(ValueError) as cm:
+            hashutil._new_hash('blake2:10')
+
+        self.assertEqual(str(cm.exception),
+                         'Unsupported hashing algorithm blake2:10')
+
+    @patch('swh.model.hashutil.sys')
+    @istest
+    def new_hash_unexpected_hashing_algo(self, mock_sys):
+        mock_sys.version_info = MagicMock(major=3, minor=6)
+
+        with self.assertRaises(ValueError) as cm:
+            hashutil._new_hash('blake3:256')
+
+        self.assertEqual(str(cm.exception),
+                         'Unexpected hashing algorithm blake3:256, '
+                         'expected one of sha1, sha1_git, sha256')
+
+    @patch('swh.model.hashutil.sys')
+    @istest
+    def new_hash_invalid_digest_size(self, mock_sys):
+        mock_sys.version_info = MagicMock(major=3, minor=6)
+
+        with self.assertRaises(ValueError) as cm:
+            hashutil._new_hash('blake2b:abc')
+
+        self.assertIn('Unexpected hashing algorithm', str(cm.exception))
+
+    @patch('swh.model.hashutil.sys')
+    @patch('swh.model.hashutil.hashlib')
+    @istest
+    def new_hash_blake2b(self, mock_hashlib, mock_sys):
+        mock_sys.version_info = MagicMock(major=3, minor=6)
+        mock_hashlib.blake2b.return_value = 'some-hashlib-object'
+
+        h = hashutil._new_hash('blake2b:64')
+
+        self.assertEqual(h, 'some-hashlib-object')
+        mock_hashlib.blake2b.assert_called_with(digest_size=64)
+
+    @patch('swh.model.hashutil.sys')
+    @patch('swh.model.hashutil.hashlib')
+    @istest
+    def new_hash_blake2s(self, mock_hashlib, mock_sys):
+        mock_sys.version_info = MagicMock(major=3, minor=6)
+        mock_hashlib.blake2s.return_value = 'some-hashlib-object'
+
+        h = hashutil._new_hash('blake2s:32')
+
+        self.assertEqual(h, 'some-hashlib-object')
+        mock_hashlib.blake2s.assert_called_with(digest_size=32)
+
 
 class HashlibGit(unittest.TestCase):
 
     def setUp(self):
         self.blob_data = b'42\n'
 
         self.tree_data = b''.join([b'40000 barfoo\0',
                                    bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087'),
                                    b'100644 blah\0',
                                    bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20'),
                                    b'100644 hello\0',
                                    bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')])
 
         self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
 author Antoine R. Dumont (@ardumont) 1444054085 +0200
 committer Antoine R. Dumont (@ardumont) 1444054085 +0200
 
 initial
 """.encode('utf-8')  # NOQA
         self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
 type commit
 tag 0.0.1
 tagger Antoine R. Dumont (@ardumont) 1444225145 +0200
 
 blah
 """.encode('utf-8')  # NOQA
 
         self.checksums = {
             'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1'
                                            'e07157b6cd'),
             'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
                                            '121dacdb1c'),
             'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
                                              'd629189653'),
             'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
                                           'e9e959f120'),
         }
 
     @istest
     def unknown_header_type(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_git_data(b'any-data', 'some-unknown-type')
 
         self.assertIn('Unexpected git object type', cm.exception.args[0])
 
     @istest
     def hashdata_content(self):
         # when
         actual_hash = hashutil.hash_git_data(self.blob_data, git_type='blob')
 
         # then
         self.assertEqual(actual_hash,
                          self.checksums['blob_sha1_git'])
 
     @istest
     def hashdata_tree(self):
         # when
         actual_hash = hashutil.hash_git_data(self.tree_data, git_type='tree')
 
         # then
         self.assertEqual(actual_hash,
                          self.checksums['tree_sha1_git'])
 
     @istest
     def hashdata_revision(self):
         # when
         actual_hash = hashutil.hash_git_data(self.commit_data,
                                              git_type='commit')
 
         # then
         self.assertEqual(actual_hash,
                          self.checksums['commit_sha1_git'])
 
     @istest
     def hashdata_tag(self):
         # when
         actual_hash = hashutil.hash_git_data(self.tag_data, git_type='tag')
 
         # then
         self.assertEqual(actual_hash,
                          self.checksums['tag_sha1_git'])