diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6350e98
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.coverage
diff --git a/bin/swh-hashfile b/bin/swh-hashfile
new file mode 100755
index 0000000..af066e5
--- /dev/null
+++ b/bin/swh-hashfile
@@ -0,0 +1,17 @@
+#!/usr/bin/python3
+
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import sys
+
+from swh.core.hashutil import hashfile
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        sys.exit('usage: %s FILE' % sys.argv[0])
+    fname = sys.argv[1]
+    for (algo, checksum) in sorted(hashfile(fname).items()):
+        print('%s\t%s' % (algo, checksum))
diff --git a/swh/__init__.py b/swh/__init__.py
new file mode 100644
index 0000000..3ad9513
--- /dev/null
+++ b/swh/__init__.py
@@ -0,0 +1,2 @@
+from pkgutil import extend_path
+__path__ = extend_path(__path__, __name__)
diff --git a/swh/core/__init__.py b/swh/core/__init__.py
new file mode 100644
index 0000000..fdffa2a
--- /dev/null
+++ b/swh/core/__init__.py
@@ -0,0 +1 @@
+# placeholder
diff --git a/swh/core/hashutil.py b/swh/core/hashutil.py
new file mode 100644
index 0000000..08bd0dc
--- /dev/null
+++ b/swh/core/hashutil.py
@@ -0,0 +1,98 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import hashlib
+import os
+
+from io import BytesIO
+
+# supported hashing algorithms
+ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
+
+# should be a multiple of 64 (sha1/sha256's block size)
+# FWIW coreutils' sha1sum uses 32768
+HASH_BLOCK_SIZE = 32768
+
+
+def _new_hash(algo, length=None):
+    """Initialize a digest object (as returned by python's hashlib) for the
+    requested algorithm. See the constant ALGORITHMS for the list of supported
+    algorithms. If a git-specific hashing algorithm is requested (e.g.,
+    "sha1_git"), the hashing object will be pre-fed with the needed header; for
+    this to work, length must be given.
+
+    Raises ValueError if algo is not in ALGORITHMS, or if a git-specific
+    algorithm is requested without a length.
+
+    """
+    if algo not in ALGORITHMS:
+        raise ValueError('unknown hashing algorithm ' + algo)
+
+    h = None
+    if algo.endswith('_git'):
+        if length is None:
+            raise ValueError('missing length for git hashing algorithm')
+        h = hashlib.new(algo.split('_')[0])
+        h.update(('blob %d\0' % length).encode('ascii'))  # git hash header
+    else:
+        h = hashlib.new(algo)
+
+    return h
+
+
+def _hash_file_obj(f, length, algorithms=ALGORITHMS):
+    """Hash the content of a file-like object opened in binary mode, reading
+    it in chunks of HASH_BLOCK_SIZE bytes and feeding each chunk to every
+    requested algorithm. Returns a dictionary mapping algorithm names to
+    hex-encoded checksums.
+
+    length is the content length in bytes; it is only used by git-specific
+    algorithms (see _new_hash).
+
+    """
+    hashers = {algo: _new_hash(algo, length)
+               for algo in algorithms}
+    while True:
+        chunk = f.read(HASH_BLOCK_SIZE)
+        if not chunk:
+            break
+        for h in hashers.values():
+            h.update(chunk)
+
+    return {algo: hashers[algo].hexdigest() for algo in hashers}
+
+
+def _hash_fname(fname, algorithms=ALGORITHMS):
+    """Hash the content of a file specified by file name. The content length
+    needed by git-specific algorithms is taken from the file size.
+
+    """
+    length = os.path.getsize(fname)
+    with open(fname, 'rb') as f:
+        return _hash_file_obj(f, length, algorithms)
+
+
+def hashfile(f, length=None, algorithms=ALGORITHMS):
+    """Hash the content of a given file, given either as a file-like object
+    or a file name. All specified hash algorithms will be computed, reading
+    the file only once. Returns a dictionary mapping algorithm names to
+    hex-encoded checksums.
+
+    When passing a file-like object, content length must be given; when passing
+    a file name, content length is ignored.
+
+    """
+    if isinstance(f, str):
+        return _hash_fname(f, algorithms)
+    else:
+        return _hash_file_obj(f, length, algorithms)
+
+
+def hashdata(data, algorithms=ALGORITHMS):
+    """Like hashfile, but hashes content passed as a string (of bytes)
+
+    """
+    buf = BytesIO(data)
+    return _hash_file_obj(buf, len(data), algorithms)
diff --git a/swh/tests/test_hashlib.py b/swh/tests/test_hashlib.py
new file mode 100644
index 0000000..21b416f
--- /dev/null
+++ b/swh/tests/test_hashlib.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import tempfile
+import unittest
+
+from nose.tools import istest
+
+from swh.core import hashutil
+
+
+class Hashlib(unittest.TestCase):
+
+    def setUp(self):
+        self.data = b'42\n'
+        self.checksums = {
+            'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689',
+            'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd',
+            'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0',  # NOQA
+        }
+
+    @istest
+    def hashdata(self):
+        checksums = hashutil.hashdata(self.data)
+        self.assertEqual(checksums, self.checksums)
+
+    @istest
+    def unknown_algo(self):
+        with self.assertRaises(ValueError):
+            hashutil.hashdata(self.data, algorithms=['does-not-exist'])
+
+    @istest
+    def hashfile_by_name(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.write(self.data)
+            f.flush()
+            checksums = hashutil.hashfile(f.name)
+            self.assertEqual(checksums, self.checksums)
+
+    @istest
+    def hashfile_by_obj(self):
+        with tempfile.TemporaryFile() as f:
+            f.write(self.data)
+            f.seek(0)
+            checksums = hashutil.hashfile(f, len(self.data))
+            self.assertEqual(checksums, self.checksums)
+
+    @istest
+    def hashfile_by_name_algorithms(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.write(self.data)
+            f.flush()
+            checksums = hashutil.hashfile(f.name, algorithms=['sha1'])
+            self.assertEqual(checksums, {'sha1': self.checksums['sha1']})