diff --git a/swh/core/hashutil.py b/swh/core/hashutil.py
index b643a6c..8f04b2f 100644
--- a/swh/core/hashutil.py
+++ b/swh/core/hashutil.py
@@ -1,106 +1,160 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import functools
 import hashlib
 import os
 
 from io import BytesIO
 
 # supported hashing algorithms
 ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
 
 # should be a multiple of 64 (sha1/sha256's block size)
 # FWIW coreutils' sha1sum uses 32768
 HASH_BLOCK_SIZE = 32768
 
 
+def _new_git_hash(base_algo, git_type, length):
+    """Initialize a digest object (as returned by python's hashlib) for the
+    requested algorithm, and feed it with the header for a git object of the
+    given type and length.
+
+    The header for hashing a git object consists of:
+     - The type of the object (encoded in ASCII)
+     - One ASCII space (0x20)
+     - The length of the object (decimal encoded in ASCII)
+     - One NUL byte
+
+    Args:
+        base_algo: a hashlib-supported algorithm
+        git_type: the type of the git object (supposedly one of 'blob',
+                  'commit', 'tag', 'tree')
+        length: the length of the git object you're encoding
+
+    Returns:
+        a hashlib hash object, already fed with the git header
+    """
+
+    h = hashlib.new(base_algo)
+    git_header = '%s %d\0' % (git_type, length)
+    h.update(git_header.encode('ascii'))
+
+    return h
+
+
 def _new_hash(algo, length=None):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm. See the constant ALGORITHMS for the list of supported
     algorithms. If a git-specific hashing algorithm is requested (e.g.,
     "sha1_git"), the hashing object will be pre-fed with the needed header; for
     this to work, length must be given.
 
     """
     if algo not in ALGORITHMS:
         raise ValueError('unknown hashing algorithm ' + algo)
 
     h = None
     if algo.endswith('_git'):
         if length is None:
             raise ValueError('missing length for git hashing algorithm')
-        h = hashlib.new(algo.split('_')[0])
-        h.update(('blob %d\0' % length).encode('ascii'))  # git hash header
+        base_algo = algo[:-4]
+        h = _new_git_hash(base_algo, 'blob', length)
     else:
         h = hashlib.new(algo)
 
     return h
 
 
 def _hash_file_obj(f, length, algorithms=ALGORITHMS, chunk_cb=None):
     """hash the content of a file-like object
 
     If chunk_cb is given, call it on each data chunk after updating the hash
 
     """
     hashers = {algo: _new_hash(algo, length)
                for algo in algorithms}
     while True:
         chunk = f.read(HASH_BLOCK_SIZE)
         if not chunk:
             break
         for h in hashers.values():
             h.update(chunk)
         if chunk_cb:
             chunk_cb(chunk)
 
     return {algo: hashers[algo].digest() for algo in hashers}
 
 
 def _hash_fname(fname, algorithms=ALGORITHMS):
     """hash the content of a file specified by file name
 
     """
     length = os.path.getsize(fname)
     with open(fname, 'rb') as f:
-        return _hash_file_obj(f, length)
+        return _hash_file_obj(f, length, algorithms)
 
 
 def hashfile(f, length=None, algorithms=ALGORITHMS):
     """Hash the content of a given file, given either as a file-like object or
     a file name. All specified hash algorithms will be computed, reading the
-    file only once. Returns a dictionary mapping algorithm names to hex-encoded
-    checksums.
+    file only once. Returns a dictionary mapping algorithm names to raw
+    (binary, not hex-encoded) checksums.
 
     When passing a file-like object, content length must be given; when passing
     a file name, content length is ignored.
 
     """
     if isinstance(f, (str, bytes)):
         return _hash_fname(f, algorithms)
     else:
         return _hash_file_obj(f, length, algorithms)
 
 
 def hashdata(data, algorithms=ALGORITHMS):
     """Like hashfile, but hashes content passed as a string (of bytes)
 
     """
     buf = BytesIO(data)
     return _hash_file_obj(buf, len(data), algorithms)
 
 
+def hash_git_object(git_object, git_type, hash_algo='sha1'):
+    """Hash a git_object of git_type using hash_algo.
+
+    Args:
+        git_object: a bytestring containing a git object
+        git_type: one of ('blob', 'commit', 'tag', 'tree')
+        hash_algo: a hashlib-supported algorithm (defaults to 'sha1')
+    Returns:
+        The resulting hashlib hash object, fed with all the needed data.
+    Raises:
+        ValueError: if git_type is not one of the supported types
+    """
+
+    git_types = ('blob', 'commit', 'tag', 'tree')
+    if git_type not in git_types:
+        raise ValueError('Unexpected git object type %s. Expected one of %s' %
+                         (git_type, ', '.join(git_types)))
+
+    length = len(git_object)
+
+    h = _new_git_hash(hash_algo, git_type, length)
+    h.update(git_object)
+
+    return h
+
+
 @functools.lru_cache()
 def hash_to_hex(hash):
     """Converts a hash to its hexadecimal string representation"""
     return binascii.hexlify(hash).decode('ascii')
 
 
 @functools.lru_cache()
 def hex_to_hash(hex):
     """Converts a hexadecimal string representation of a hash to that hash"""
     return bytes.fromhex(hex)
diff --git a/swh/core/tests/test_hashutil.py b/swh/core/tests/test_hashutil.py
index f277707..2fd2f9b 100644
--- a/swh/core/tests/test_hashutil.py
+++ b/swh/core/tests/test_hashutil.py
@@ -1,85 +1,177 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import tempfile
 import unittest
 
 from nose.tools import istest
 
 from swh.core import hashutil
 
 
 class Hashlib(unittest.TestCase):
 
     def setUp(self):
         self.data = b'42\n'
         self.hex_checksums = {
             'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689',
             'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd',
             'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36'
                       '122ee5c793c1d08a19839cc0',
         }
         self.checksums = {
             'sha1': bytes.fromhex('34973274ccef6ab4dfaaf865997'
                                   '92fa9c3fe4689'),
             'sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a845'
                                       '3e1e07157b6cd'),
             'sha256': bytes.fromhex('084c799cd551dd1d8d5c5f9a5d5'
                                     '93b2e931f5e36122ee5c793c1d0'
                                     '8a19839cc0'),
         }
 
     @istest
     def hashdata(self):
         checksums = hashutil.hashdata(self.data)
         self.assertEqual(checksums, self.checksums)
 
     @istest
     def unknown_algo(self):
         with self.assertRaises(ValueError):
             hashutil.hashdata(self.data, algorithms=['does-not-exist'])
 
     @istest
     def algo_selection(self):
         checksums = hashutil.hashdata(self.data, algorithms=['sha1', 'sha256'])
         self.assertIn('sha1', checksums)
         self.assertIn('sha256', checksums)
         self.assertNotIn('sha1_git', checksums)
 
     @istest
     def hashfile_by_name(self):
         with tempfile.NamedTemporaryFile() as f:
             f.write(self.data)
             f.flush()
             checksums = hashutil.hashfile(f.name)
             self.assertEqual(checksums, self.checksums)
 
     @istest
     def hashfile_by_name_as_bytes(self):
         with tempfile.NamedTemporaryFile() as f:
             f.write(self.data)
             f.flush()
             checksums = hashutil.hashfile(f.name.encode('utf-8'))
             self.assertEqual(checksums, self.checksums)
 
+    @istest
+    def hashfile_by_name_algo_selection(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.write(self.data)
+            f.flush()
+            checksums = hashutil.hashfile(f.name, algorithms=['sha1'])
+            self.assertEqual(checksums, {'sha1': self.checksums['sha1']})
+
     @istest
     def hashfile_by_obj(self):
         with tempfile.TemporaryFile() as f:
             f.write(self.data)
             f.seek(0)
             checksums = hashutil.hashfile(f, len(self.data))
             self.assertEqual(checksums, self.checksums)
 
     @istest
     def hex_to_hash(self):
         for algo in self.checksums:
             self.assertEqual(self.checksums[algo],
                              hashutil.hex_to_hash(self.hex_checksums[algo]))
 
     @istest
     def hash_to_hex(self):
         for algo in self.checksums:
             self.assertEqual(self.hex_checksums[algo],
                              hashutil.hash_to_hex(self.checksums[algo]))
+
+
+class HashlibGit(unittest.TestCase):
+
+    def setUp(self):
+        self.blob_data = b'42\n'
+
+        self.tree_data = b''.join([b'40000 barfoo\0',
+                                   bytes.fromhex('c3020f6bf135a38c6df'
+                                                 '3afeb5fb38232c5e07087'),
+                                   b'100644 blah\0',
+                                   bytes.fromhex('63756ef0df5e4f10b6efa'
+                                                 '33cfe5c758749615f20'),
+                                   b'100644 hello\0',
+                                   bytes.fromhex('907b308167f0880fb2a'
+                                                 '5c0e1614bb0c7620f9dc3')])
+
+        self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
+author Antoine R. Dumont (@ardumont) 1444054085 +0200
+committer Antoine R. Dumont (@ardumont) 1444054085 +0200
+
+initial
+""".encode('utf-8')  # NOQA
+        self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
+type commit
+tag 0.0.1
+tagger Antoine R. Dumont (@ardumont) 1444225145 +0200
+
+blah
+""".encode('utf-8')  # NOQA
+
+        self.checksums = {
+            'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1'
+                                           'e07157b6cd'),
+            'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
+                                           '121dacdb1c'),
+            'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
+                                             'd629189653'),
+            'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
+                                          'e9e959f120'),
+        }
+
+    @istest
+    def unknown_header_type(self):
+        with self.assertRaises(ValueError) as cm:
+            hashutil.hash_git_object(b'any-data', 'some-unknown-type')
+
+        self.assertIn('Unexpected git object type', cm.exception.args[0])
+
+    @istest
+    def hashdata_content(self):
+        # when
+        hashobj = hashutil.hash_git_object(self.blob_data, 'blob')
+
+        # then
+        self.assertEqual(hashobj.digest(),
+                         self.checksums['blob_sha1_git'])
+
+    @istest
+    def hashdata_tree(self):
+        # when
+        hashobj = hashutil.hash_git_object(self.tree_data, 'tree')
+
+        # then
+        self.assertEqual(hashobj.digest(),
+                         self.checksums['tree_sha1_git'])
+
+    @istest
+    def hashdata_revision(self):
+        # when
+        hashobj = hashutil.hash_git_object(self.commit_data, 'commit')
+
+        # then
+        self.assertEqual(hashobj.digest(),
+                         self.checksums['commit_sha1_git'])
+
+    @istest
+    def hashdata_tag(self):
+        # when
+        hashobj = hashutil.hash_git_object(self.tag_data, 'tag')
+
+        # then
+        self.assertEqual(hashobj.digest(),
+                         self.checksums['tag_sha1_git'])