diff --git a/PKG-INFO b/PKG-INFO index be8fa1c..b26d95b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.11 +Version: 0.0.12 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index be8fa1c..b26d95b 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.11 +Version: 0.0.12 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh/core/hashutil.py b/swh/core/hashutil.py index 9a4ee54..ff0679c 100644 --- a/swh/core/hashutil.py +++ b/swh/core/hashutil.py @@ -1,131 +1,106 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import binascii import functools import hashlib import os from io import BytesIO # supported hashing algorithms ALGORITHMS = set(['sha1', 'sha256', 'sha1_git']) -# Default algorithms when not mentioned -KNOWN_ALGORITHMS = ALGORITHMS | set(['sha1_blob_git', 'sha1_tree_git', - 'sha1_commit_git']) - # should be a multiple of 64 (sha1/sha256's block size) # FWIW coreutils' sha1sum uses 32768 HASH_BLOCK_SIZE = 32768 def _new_hash(algo, length=None): """Initialize a digest object (as returned by python's hashlib) for the requested algorithm. See the constant ALGORITHMS for the list of supported algorithms. If a git-specific hashing algorithm is requested (e.g., - "sha1_git", "sha1_blob_git", "sha1_tree_git", "sha1_commit_git"), the - hashing object will be pre-fed with the needed header; for + "sha1_git"), the hashing object will be pre-fed with the needed header; for this to work, length must be given. - Args: - algo: List of algorithms in ALGORITHMS - length: Length of content to hash. Could be None if when hashing - with sha1 and sha256 - - Returns: - A digest object - - Raises: - ValueError when on sha1_*git algorithms with length to None - ValueError when sha1_*git with * not in ('blob', 'commit', 'tree') - """ - if algo not in KNOWN_ALGORITHMS: + if algo not in ALGORITHMS: raise ValueError('unknown hashing algorithm ' + algo) h = None if algo.endswith('_git'): if length is None: raise ValueError('missing length for git hashing algorithm') - - algo_hash = algo.split('_') - h = hashlib.new(algo_hash[0]) - obj_type = 'blob' if algo_hash[1] == 'git' else algo_hash[1] - if obj_type not in ('blob', 'commit', 'tree'): - raise ValueError( - 'For `a la git` sha1 computation, the only supported types are' - ' blob, commit, tree') - - h.update(('%s %d\0' % (obj_type, length)).encode('ascii')) # git hash header + h = hashlib.new(algo.split('_')[0]) + h.update(('blob %d\0' % length).encode('ascii')) # git hash header else: h = hashlib.new(algo) return h def _hash_file_obj(f, length, algorithms=ALGORITHMS, chunk_cb=None): """hash the content of a file-like object If chunk_cb is given, call it on each data chunk after updating the hash """ hashers = {algo: _new_hash(algo, length) for algo in algorithms} while True: chunk = f.read(HASH_BLOCK_SIZE) if not chunk: break for h in hashers.values(): h.update(chunk) if chunk_cb: chunk_cb(chunk) return {algo: hashers[algo].digest() for algo in hashers} def _hash_fname(fname, algorithms=ALGORITHMS): """hash the content of a file specified by file name """ length = os.path.getsize(fname) with open(fname, 'rb') as f: - return _hash_file_obj(f, length, algorithms) + return _hash_file_obj(f, length) def hashfile(f, length=None, algorithms=ALGORITHMS): """Hash the content of a given file, given either as a file-like object or a file name. All specified hash algorithms will be computed, reading the file only once. Returns a dictionary mapping algorithm names to hex-encoded checksums. When passing a file-like object, content length must be given; when passing a file name, content length is ignored. """ if isinstance(f, str): return _hash_fname(f, algorithms) else: return _hash_file_obj(f, length, algorithms) def hashdata(data, algorithms=ALGORITHMS): """Like hashfile, but hashes content passed as a string (of bytes) """ buf = BytesIO(data) return _hash_file_obj(buf, len(data), algorithms) @functools.lru_cache() def hash_to_hex(hash): """Converts a hash to its hexadecimal string representation""" return binascii.hexlify(hash).decode('ascii') @functools.lru_cache() def hex_to_hash(hex): """Converts a hexadecimal string representation of a hash to that hash""" return bytes.fromhex(hex) diff --git a/swh/core/tests/test_hashutil.py b/swh/core/tests/test_hashutil.py index 8b482db..0931019 100644 --- a/swh/core/tests/test_hashutil.py +++ b/swh/core/tests/test_hashutil.py @@ -1,102 +1,77 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest from nose.tools import istest from swh.core import hashutil class Hashlib(unittest.TestCase): def setUp(self): self.data = b'42\n' self.hex_checksums = { - 'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689', - 'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd', - 'sha1_blob_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd', - 'sha1_tree_git': 'a3b4138923e146bdf5b51bd6fd7c64e3f59dfcad', - 'sha1_commit_git': 'ebbb89b3165385c35a0dfa78ff9059ddd28d5126', - 'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36' - '122ee5c793c1d08a19839cc0', - } + 'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689', + 'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd', + 'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36' + '122ee5c793c1d08a19839cc0', + } self.checksums = { 'sha1': bytes.fromhex('34973274ccef6ab4dfaaf865997' '92fa9c3fe4689'), 'sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a845' '3e1e07157b6cd'), - 'sha1_blob_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a845' - '3e1e07157b6cd'), - 'sha1_tree_git': bytes.fromhex('a3b4138923e146bdf5b51bd6fd7' - 'c64e3f59dfcad'), - 'sha1_commit_git': bytes.fromhex('ebbb89b3165385c35a0dfa78ff9' - '059ddd28d5126'), 'sha256': bytes.fromhex('084c799cd551dd1d8d5c5f9a5d5' '93b2e931f5e36122ee5c793c1d0' '8a19839cc0'), - } + } @istest def hashdata(self): - checksums = hashutil.hashdata(self.data, algorithms=hashutil.KNOWN_ALGORITHMS) + checksums = hashutil.hashdata(self.data) self.assertEqual(checksums, self.checksums) @istest def unknown_algo(self): with self.assertRaises(ValueError): hashutil.hashdata(self.data, algorithms=['does-not-exist']) - for known_hash_algo in hashutil.KNOWN_ALGORITHMS: - self.assertIsNotNone(hashutil._new_hash(known_hash_algo, length=10)) - - @istest - def fail_without_length_on_sha1_git_but_ok_otherwise(self): - for hash_algo in ['sha1_git', 'sha1_blob_git', 'sha1_tree_git', 'sha1_commit_git']: - with self.assertRaises(ValueError): - hashutil._new_hash(hash_algo, length=None) - - for other_hash_algo in ['sha1', 'sha256']: - self.assertIsNotNone(hashutil._new_hash(other_hash_algo, length=None)) - @istest def algo_selection(self): checksums = hashutil.hashdata(self.data, algorithms=['sha1', 'sha256']) self.assertIn('sha1', checksums) self.assertIn('sha256', checksums) self.assertNotIn('sha1_git', checksums) @istest def hashfile_by_name(self): with tempfile.NamedTemporaryFile() as f: f.write(self.data) f.flush() - checksums = hashutil.hashfile(f.name, - length=None, - algorithms=hashutil.KNOWN_ALGORITHMS) + checksums = hashutil.hashfile(f.name) self.assertEqual(checksums, self.checksums) @istest def hashfile_by_obj(self): with tempfile.TemporaryFile() as f: f.write(self.data) f.seek(0) - checksums = hashutil.hashfile(f, - len(self.data), - algorithms=hashutil.KNOWN_ALGORITHMS) + checksums = hashutil.hashfile(f, len(self.data)) self.assertEqual(checksums, self.checksums) @istest def hex_to_hash(self): for algo in self.checksums: self.assertEqual(self.checksums[algo], hashutil.hex_to_hash(self.hex_checksums[algo])) @istest def hash_to_hex(self): for algo in self.checksums: self.assertEqual(self.hex_checksums[algo], hashutil.hash_to_hex(self.checksums[algo])) diff --git a/version.txt b/version.txt index a2c51d5..a3e195a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.11-0-g5c25456 \ No newline at end of file +v0.0.12-0-g444daaf \ No newline at end of file