Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123796
D410.id1277.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D410.id1277.diff
View Options
diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -12,6 +12,10 @@
This modules defines the following hashing functions:
+- hash_stream: Hash the contents of something iterable (file, stream,
+ ...) with the given algorithms (defaulting to DEFAULT_ALGORITHMS if
+ none provided).
+
- hash_file: Hash the contents of the given file object with the given
algorithms (defaulting to DEFAULT_ALGORITHMS if none provided).
@@ -45,6 +49,73 @@
_blake2_hash_cache = {}
+HASH_FORMATS = set(['bytes', 'bytehex', 'hex'])
+"""Supported output hash formats
+"""
+
+
+class MultiHash:
+    """Hashutil class to support multiple hashes computation.
+
+    One instance holds one hash object per requested algorithm and feeds
+    every chunk passed to update() to all of them.
+
+    Args:
+
+        hash_names (set): Set of hash algorithms (+ length) to compute
+                          hashes (cf. DEFAULT_ALGORITHMS)
+        length (int): Length of the total sum of chunks to read
+                      (forwarded as-is to _new_hash for each algorithm)
+
+    If the length is provided as algorithm, the length is also
+    computed and returned.
+
+    """
+    def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
+        # Maps algorithm name -> hash object; the special 'length' key
+        # maps to a plain int counter instead of a hash object.
+        self.state = {}
+        self.track_length = False
+        for name in hash_names:
+            if name == 'length':
+                self.state['length'] = 0
+                self.track_length = True
+            else:
+                self.state[name] = _new_hash(name, length)
+
+    @classmethod
+    def from_state(cls, state, track_length):
+        """Build an instance around an already populated state.
+
+        Args:
+            state (dict): mapping of algorithm name to hash object (and
+                          'length' to an int counter, if tracked)
+            track_length (bool): whether update() must maintain
+                                 state['length']
+
+        Returns:
+            a new instance using the given state
+
+        """
+        ret = cls([])
+        ret.state = state
+        ret.track_length = track_length
+        # Without this return the alternate constructor (and copy(),
+        # which relies on it) would hand back None.
+        return ret
+
+    def update(self, chunk):
+        """Feed chunk to every hash object and bump the length counter
+        when length tracking is enabled.
+
+        """
+        for name, h in self.state.items():
+            if name == 'length':
+                # Not a hash object: handled once, after the loop.
+                continue
+            h.update(chunk)
+        if self.track_length:
+            self.state['length'] += len(chunk)
+
+    def digest(self):
+        """Return a dict mapping each algorithm to its digest() value;
+        the 'length' entry, if any, is returned as the raw counter.
+
+        """
+        return {
+            name: h.digest() if name != 'length' else h
+            for name, h in self.state.items()
+        }
+
+    def hexdigest(self):
+        """Return a dict mapping each algorithm to its hexdigest() value;
+        the 'length' entry, if any, is returned as the raw counter.
+
+        """
+        return {
+            name: h.hexdigest() if name != 'length' else h
+            for name, h in self.state.items()
+        }
+
+    def bytehexdigest(self):
+        """Return a dict mapping each algorithm to its digest converted
+        with hash_to_bytehex; the 'length' entry, if any, is returned as
+        the raw counter.
+
+        """
+        return {
+            name: hash_to_bytehex(h.digest()) if name != 'length' else h
+            for name, h in self.state.items()
+        }
+
+    def copy(self):
+        """Return a new instance whose hash objects are copies of the
+        current ones, so both can be updated independently afterwards.
+
+        """
+        copied_state = {
+            name: h.copy() if name != 'length' else h
+            for name, h in self.state.items()
+        }
+        return self.from_state(copied_state, self.track_length)
+
def _new_blake2_hash(algo):
"""Return a function that initializes a blake2 hash.
@@ -162,7 +233,8 @@
return _new_hashlib_hash(algo)
-def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
+def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS,
+ chunk_cb=None, hash_format='bytes'):
"""Hash the contents of the given file object with the given algorithms.
Args:
@@ -171,69 +243,91 @@
git-specific algorithms)
algorithms: the hashing algorithms to be used, as an iterable over
strings
+ hash_format (str): Format required for the output of the
+ computed hashes (cf. HASH_FORMATS)
- Returns: a dict mapping each algorithm to a bytes digest.
+ Returns: a dict mapping each algorithm to a digest (bytes by default).
Raises:
- ValueError if algorithms contains an unknown hash algorithm.
+ ValueError if:
+
+ algorithms contains an unknown hash algorithm.
+ hash_format is an unknown hash format
+
"""
- hashes = {algo: _new_hash(algo, length) for algo in algorithms}
+ if hash_format not in HASH_FORMATS:
+ raise ValueError('Unexpected hash format %s, expected one of %s' % (
+ hash_format, HASH_FORMATS))
+ h = MultiHash(algorithms, length)
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
- for hash in hashes.values():
- hash.update(chunk)
+ h.update(chunk)
if chunk_cb:
chunk_cb(chunk)
- return {algo: hash.digest() for algo, hash in hashes.items()}
+ if hash_format == 'bytes':
+ return h.digest()
+ if hash_format == 'bytehex':
+ return h.bytehexdigest()
+ return h.hexdigest()
-def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
+def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None,
+ hash_format='bytes', track_length=True):
"""Hash the contents of the file at the given path with the given
algorithms.
Args:
- path: the path of the file to hash
- algorithms: the hashing algorithms used
- chunk_cb: a callback
+ path (str): the path of the file to hash
+ algorithms (set): the hashing algorithms used
+ chunk_cb (def): a callback
+ hash_format (str): Format required for the output of the
+ computed hashes (cf. HASH_FORMATS)
+ track_length (bool): when true (the default), 'length' is added
+ to the algorithms so the result also holds a 'length' entry
Returns: a dict mapping each algorithm to a bytes digest.
Raises:
- ValueError if algorithms contains an unknown hash algorithm.
+ ValueError if:
+
+ algorithms contains an unknown hash algorithm.
+ hash_format is an unknown hash format
+
OSError on file access error
"""
+ if track_length:
+ algorithms = set(['length']).union(algorithms)
length = os.path.getsize(path)
with open(path, 'rb') as fobj:
- hash = hash_file(fobj, length, algorithms, chunk_cb)
- hash['length'] = length
- return hash
+ return hash_file(fobj, length, algorithms, chunk_cb=chunk_cb,
+ hash_format=hash_format)
-def hash_data(data, algorithms=DEFAULT_ALGORITHMS, with_length=False):
+def hash_data(data, algorithms=DEFAULT_ALGORITHMS, hash_format='bytes'):
"""Hash the given binary blob with the given algorithms.
Args:
data (bytes): raw content to hash
algorithms (list): the hashing algorithms used
+ (add 'length' to also get a 'length' entry in the result)
- with_length (bool): add the length key in the resulting dict
+ hash_format (str): Format required for the output of the
+ computed hashes (cf. HASH_FORMATS)
Returns: a dict mapping each algorithm to a bytes digest
Raises:
TypeError if data does not support the buffer interface.
- ValueError if algorithms contains an unknown hash algorithm.
+ ValueError if:
+
+ algorithms contains an unknown hash algorithm.
+ hash_format is an unknown hash format
+
"""
fobj = BytesIO(data)
length = len(data)
- data = hash_file(fobj, length, algorithms)
- if with_length:
- data['length'] = length
- return data
+ return hash_file(fobj, length, algorithms, hash_format=hash_format)
def hash_git_data(data, git_type, base_algo='sha1'):
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -35,6 +35,11 @@
for type, cksum in self.hex_checksums.items()
}
+ self.bytehex_checksums = {
+ type: hashutil.hash_to_bytehex(cksum)
+ for type, cksum in self.checksums.items()
+ }
+
self.git_hex_checksums = {
'blob': self.hex_checksums['sha1_git'],
'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
@@ -58,7 +63,8 @@
expected_checksums = self.checksums.copy()
expected_checksums['length'] = len(self.data)
- checksums = hashutil.hash_data(self.data, with_length=True)
+ algos = set(['length']).union(hashutil.DEFAULT_ALGORITHMS)
+ checksums = hashutil.hash_data(self.data, algorithms=algos)
self.assertEqual(checksums, expected_checksums)
self.assertTrue('length' in checksums)
@@ -71,6 +77,16 @@
self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
self.assertIn('unknown-hash', cm.exception.args[0])
+ @istest
+ def hash_data_unknown_hash_format(self):
+ with self.assertRaises(ValueError) as cm:
+ hashutil.hash_data(
+ self.data, hashutil.DEFAULT_ALGORITHMS,
+ hash_format='unknown-format')
+
+ self.assertIn('Unexpected hash format', cm.exception.args[0])
+ self.assertIn('unknown-format', cm.exception.args[0])
+
@istest
def hash_git_data(self):
checksums = {
@@ -95,6 +111,20 @@
checksums = hashutil.hash_file(fobj, length=len(self.data))
self.assertEqual(checksums, self.checksums)
+ @istest
+ def hash_file_hexdigest(self):
+ fobj = io.BytesIO(self.data)
+ checksums = hashutil.hash_file(
+ fobj, length=len(self.data), hash_format='hex')
+ self.assertEqual(checksums, self.hex_checksums)
+
+ @istest
+ def hash_file_bytehexdigest(self):
+ fobj = io.BytesIO(self.data)
+ checksums = hashutil.hash_file(
+ fobj, length=len(self.data), hash_format='bytehex')
+ self.assertEqual(checksums, self.bytehex_checksums)
+
@istest
def hash_file_missing_length(self):
fobj = io.BytesIO(self.data)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 20 2024, 2:23 AM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3232239
Attached To
D410: model.hashutil: Open new endpoint to allow to hash stream
Event Timeline
Log In to Comment