diff --git a/swh/core/hashutil.py b/swh/core/hashutil.py deleted file mode 100644 index 43bd331..0000000 --- a/swh/core/hashutil.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import binascii -import functools -import hashlib -import os - -from io import BytesIO - -# supported hashing algorithms -ALGORITHMS = set(['sha1', 'sha256', 'sha1_git']) - -# should be a multiple of 64 (sha1/sha256's block size) -# FWIW coreutils' sha1sum uses 32768 -HASH_BLOCK_SIZE = 32768 - - -def _new_git_hash(base_algo, git_type, length): - """Initialize a digest object (as returned by python's hashlib) for the - requested algorithm, and feed it with the header for a git object of the - given type and length. - - The header for hashing a git object consists of: - - The type of the object (encoded in ASCII) - - One ASCII space (\x20) - - The length of the object (decimal encoded in ASCII) - - One NUL byte - - Args: - base_algo: a hashlib-supported algorithm - git_type: the type of the git object (supposedly one of 'blob', - 'commit', 'tag', 'tree') - length: the length of the git object you're encoding - - Returns: - a hashutil.hash object - """ - - h = hashlib.new(base_algo) - git_header = '%s %d\0' % (git_type, length) - h.update(git_header.encode('ascii')) - - return h - - -def _new_hash(algo, length=None): - """Initialize a digest object (as returned by python's hashlib) for the - requested algorithm. See the constant ALGORITHMS for the list of supported - algorithms. If a git-specific hashing algorithm is requested (e.g., - "sha1_git"), the hashing object will be pre-fed with the needed header; for - this to work, length must be given. - - """ - if algo not in ALGORITHMS: - raise ValueError('unknown hashing algorithm ' + algo) - - h = None - if algo.endswith('_git'): - if length is None: - raise ValueError('missing length for git hashing algorithm') - base_algo = algo[:-4] - h = _new_git_hash(base_algo, 'blob', length) - else: - h = hashlib.new(algo) - - return h - - -def _hash_file_obj(f, length, algorithms=ALGORITHMS, chunk_cb=None): - """hash the content of a file-like object - - If chunk_cb is given, call it on each data chunk after updating the hash - - """ - hashers = {algo: _new_hash(algo, length) - for algo in algorithms} - while True: - chunk = f.read(HASH_BLOCK_SIZE) - if not chunk: - break - for h in hashers.values(): - h.update(chunk) - if chunk_cb: - chunk_cb(chunk) - - return {algo: hashers[algo].digest() for algo in hashers} - - -def _hash_fname(fname, algorithms=ALGORITHMS): - """hash the content of a file specified by file name - - """ - length = os.path.getsize(fname) - with open(fname, 'rb') as f: - return _hash_file_obj(f, length) - - -def hashfile(f, length=None, algorithms=ALGORITHMS): - """Hash the content of a given file, given either as a file-like object or a - file name. All specified hash algorithms will be computed, reading the file - only once. Returns a dictionary mapping algorithm names to hex-encoded - checksums. - - When passing a file-like object, content length must be given; when passing - a file name, content length is ignored. - - """ - if isinstance(f, (str, bytes)): - return _hash_fname(f, algorithms) - else: - return _hash_file_obj(f, length, algorithms) - - -def hashdata(data, algorithms=ALGORITHMS): - """Like hashfile, but hashes content passed as a string (of bytes) - - """ - buf = BytesIO(data) - return _hash_file_obj(buf, len(data), algorithms) - - -def hash_git_object(git_object, git_type, hash_algo='sha1'): - """Hash a git_object of git_type using hash_algo. - - Args: - git_object: a bytestring containing a git object - git_type: one of ('blob', 'commit', 'tag', 'tree') - hash_algo: one of BASE_ALGORITHMS - Returns: - The resulting hashutil.hash object, fed with all the needed data. - """ - - git_types = ('blob', 'commit', 'tag', 'tree') - if git_type not in git_types: - raise ValueError('Unexpected git object type %s. Expected one of %s' % - (git_type, ', '.join(git_types))) - - length = len(git_object) - - h = _new_git_hash(hash_algo, git_type, length) - h.update(git_object) - - return h - - -@functools.lru_cache() -def hash_to_hex(hash): - """Converts a hash to its hexadecimal string representation""" - return hash_to_bytehex(hash).decode('ascii') - - -@functools.lru_cache() -def hash_to_bytehex(hash): - """Converts a hash to its hexadecimal bytes representation""" - return binascii.hexlify(hash) - - -@functools.lru_cache() -def hex_to_hash(hex): - """Converts a hexadecimal string representation of a hash to that hash""" - return bytes.fromhex(hex) - - -@functools.lru_cache() -def bytehex_to_hash(hex): - """Converts a hexadecimal bytes representation of a hash to that hash""" - return hex_to_hash(hex.decode()) diff --git a/swh/core/tests/test_hashutil.py b/swh/core/tests/test_hashutil.py deleted file mode 100644 index e797437..0000000 --- a/swh/core/tests/test_hashutil.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import tempfile -import unittest - -from nose.tools import istest - -from swh.core import hashutil - - -class Hashlib(unittest.TestCase): - - def setUp(self): - self.data = b'42\n' - self.hex_checksums = { - 'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689', - 'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd', - 'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36' - '122ee5c793c1d08a19839cc0', - } - self.checksums = { - 'sha1': bytes.fromhex('34973274ccef6ab4dfaaf865997' - '92fa9c3fe4689'), - 'sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a845' - '3e1e07157b6cd'), - 'sha256': bytes.fromhex('084c799cd551dd1d8d5c5f9a5d5' - '93b2e931f5e36122ee5c793c1d0' - '8a19839cc0'), - } - - @istest - def hashdata(self): - checksums = hashutil.hashdata(self.data) - self.assertEqual(checksums, self.checksums) - - @istest - def unknown_algo(self): - with self.assertRaises(ValueError): - hashutil.hashdata(self.data, algorithms=['does-not-exist']) - - @istest - def algo_selection(self): - checksums = hashutil.hashdata(self.data, algorithms=['sha1', 'sha256']) - self.assertIn('sha1', checksums) - self.assertIn('sha256', checksums) - self.assertNotIn('sha1_git', checksums) - - @istest - def hashfile_by_name(self): - with tempfile.NamedTemporaryFile() as f: - f.write(self.data) - f.flush() - checksums = hashutil.hashfile(f.name) - self.assertEqual(checksums, self.checksums) - - @istest - def hashfile_by_name_as_bytes(self): - with tempfile.NamedTemporaryFile() as f: - f.write(self.data) - f.flush() - checksums = hashutil.hashfile(f.name.encode('utf-8')) - self.assertEqual(checksums, self.checksums) - - @istest - def hashfile_by_obj(self): - with tempfile.TemporaryFile() as f: - f.write(self.data) - f.seek(0) - checksums = hashutil.hashfile(f, len(self.data)) - self.assertEqual(checksums, self.checksums) - - @istest - def hex_to_hash(self): - for algo in self.checksums: - self.assertEqual(self.checksums[algo], - hashutil.hex_to_hash(self.hex_checksums[algo])) - - @istest - def hash_to_hex(self): - for algo in self.checksums: - self.assertEqual(self.hex_checksums[algo], - hashutil.hash_to_hex(self.checksums[algo])) - - @istest - def hash_to_bytehex(self): - for algo in self.checksums: - self.assertEqual(self.hex_checksums[algo].encode('ascii'), - hashutil.hash_to_bytehex(self.checksums[algo])) - - @istest - def bytehex_to_hash(self): - for algo in self.checksums: - self.assertEqual(self.checksums[algo], - hashutil.bytehex_to_hash( - self.hex_checksums[algo].encode())) - - -class HashlibGit(unittest.TestCase): - - def setUp(self): - self.blob_data = b'42\n' - - self.tree_data = b''.join([b'40000 barfoo\0', - bytes.fromhex('c3020f6bf135a38c6df' - '3afeb5fb38232c5e07087'), - b'100644 blah\0', - bytes.fromhex('63756ef0df5e4f10b6efa' - '33cfe5c758749615f20'), - b'100644 hello\0', - bytes.fromhex('907b308167f0880fb2a' - '5c0e1614bb0c7620f9dc3')]) - - self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970 -author Antoine R. Dumont (@ardumont) 1444054085 +0200 -committer Antoine R. Dumont (@ardumont) 1444054085 +0200 - -initial -""".encode('utf-8') # NOQA - self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241 -type commit -tag 0.0.1 -tagger Antoine R. Dumont (@ardumont) 1444225145 +0200 - -blah -""".encode('utf-8') # NOQA - - self.checksums = { - 'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1' - 'e07157b6cd'), - 'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db' - '121dacdb1c'), - 'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399' - 'd629189653'), - 'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534' - 'e9e959f120'), - } - - @istest - def unknown_header_type(self): - with self.assertRaises(ValueError) as cm: - hashutil.hash_git_object(b'any-data', 'some-unknown-type') - - self.assertIn('Unexpected git object type', cm.exception.args[0]) - - @istest - def hashdata_content(self): - # when - hashobj = hashutil.hash_git_object(self.blob_data, 'blob') - - # then - self.assertEqual(hashobj.digest(), - self.checksums['blob_sha1_git']) - - @istest - def hashdata_tree(self): - # when - hashobj = hashutil.hash_git_object(self.tree_data, 'tree') - - # then - self.assertEqual(hashobj.digest(), - self.checksums['tree_sha1_git']) - - @istest - def hashdata_revision(self): - # when - hashobj = hashutil.hash_git_object(self.commit_data, 'commit') - - # then - self.assertEqual(hashobj.digest(), - self.checksums['commit_sha1_git']) - - @istest - def hashdata_tag(self): - # when - hashobj = hashutil.hash_git_object(self.tag_data, 'tag') - - # then - self.assertEqual(hashobj.digest(), - self.checksums['tag_sha1_git'])