D410.id1281.diff
D410: model.hashutil: Open new endpoint to allow to hash stream
diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,7 +10,35 @@
ALGORITHMS set. Any provided algorithms not in that list will result
in a ValueError explaining the error.
-This modules defines the following hashing functions:
+This module defines the MultiHash class, which eases computing the
+Software Heritage hashes. As before (with the hash_* functions), it
+can compute hashes from a file object, a path, or raw data.
+
+Basic usage examples:
+
+- file object: MultiHash.from_file(file_object).digest()
+
+- path (filepath): MultiHash.from_path(b'foo').hexdigest()
+
+- data (bytes): MultiHash.from_data(b'foo').bytehexdigest()
+
+More complex usage (previously done through callbacks):
+
+- To compute the length as well, add 'length' to the set of
+  algorithms to compute, for example:
+
+ h = MultiHash(hash_names=set({'length'}).union(DEFAULT_ALGORITHMS))
+
+- To write a stream to disk while computing its hashes, for example:
+
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+        for chunk in r.iter_content():  # r is a stream of some sort
+ h.update(chunk)
+ f.write(chunk)
+
+
+This module also defines the following (deprecated) hashing functions:
- hash_file: Hash the contents of the given file object with the given
algorithms (defaulting to DEFAULT_ALGORITHMS if none provided).
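
Note: the stream-write example in the new docstring above leaves `r`, `length` and `filepath` undefined. Here is a self-contained sketch of the same pattern, reading from an in-memory buffer; the destination path and the buffer are stand-ins of this note, not part of the diff:

    import io
    from swh.model.hashutil import MultiHash

    data = b'foo bar baz'
    source = io.BytesIO(data)
    h = MultiHash(length=len(data))
    with open('/tmp/copy.bin', 'wb') as f:  # arbitrary destination path
        while True:
            chunk = source.read(1024)
            if not chunk:
                break
            h.update(chunk)  # feed the hashers...
            f.write(chunk)   # ...and the output file in lockstep
    print(h.hexdigest())     # one hex digest per algorithm in DEFAULT_ALGORITHMS
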
@@ -46,6 +74,95 @@
_blake2_hash_cache = {}
+class MultiHash:
+ """Hashutil class to support multiple hashes computation.
+
+ Args:
+
+ hash_names (set): Set of hash algorithms (+ optionally length)
+ to compute hashes (cf. DEFAULT_ALGORITHMS)
+ length (int): Length of the total sum of chunks to read
+
+    If 'length' is provided as an algorithm, the length is also
+    computed and returned.
+
+ """
+ def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
+ self.state = {}
+ self.track_length = False
+ for name in hash_names:
+ if name == 'length':
+ self.state['length'] = 0
+ self.track_length = True
+ else:
+ self.state[name] = _new_hash(name, length)
+
+    @classmethod
+    def from_state(cls, state, track_length):
+        ret = cls([])
+        ret.state = state
+        ret.track_length = track_length
+        return ret
+
+ @classmethod
+ def from_file(cls, file, hash_names=DEFAULT_ALGORITHMS, length=None):
+ ret = cls(length=length, hash_names=hash_names)
+ for chunk in file:
+ ret.update(chunk)
+ return ret
+
+ @classmethod
+ def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS, length=None,
+ track_length=True):
+ if not length:
+ length = os.path.getsize(path)
+ with open(path, 'rb') as f:
+ ret = cls.from_file(f, hash_names=hash_names, length=length)
+        # For compatibility reasons with `hash_path`
+ if track_length:
+ ret.state['length'] = length
+ return ret
+
+ @classmethod
+ def from_data(cls, data, hash_names=DEFAULT_ALGORITHMS, length=None):
+ if not length:
+ length = len(data)
+ fobj = BytesIO(data)
+ return cls.from_file(fobj, hash_names=hash_names, length=length)
+
+ def update(self, chunk):
+ for name, h in self.state.items():
+ if name == 'length':
+ continue
+ h.update(chunk)
+ if self.track_length:
+ self.state['length'] += len(chunk)
+
+ def digest(self):
+ return {
+ name: h.digest() if name != 'length' else h
+ for name, h in self.state.items()
+ }
+
+ def hexdigest(self):
+ return {
+ name: h.hexdigest() if name != 'length' else h
+ for name, h in self.state.items()
+ }
+
+ def bytehexdigest(self):
+ return {
+ name: hash_to_bytehex(h.digest()) if name != 'length' else h
+ for name, h in self.state.items()
+ }
+
+ def copy(self):
+ copied_state = {
+ name: h.copy() if name != 'length' else h
+ for name, h in self.state.items()
+ }
+ return self.from_state(copied_state, self.track_length)
+
+
def _new_blake2_hash(algo):
"""Return a function that initializes a blake2 hash.
@@ -162,43 +279,50 @@
return _new_hashlib_hash(algo)
-def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
- """Hash the contents of the given file object with the given algorithms.
+def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS,
+ chunk_cb=None):
+ """(Deprecated) cf. MultiHash.from_file
+
+ Hash the contents of the given file object with the given algorithms.
Args:
fobj: a file-like object
- length: the length of the contents of the file-like object (for the
- git-specific algorithms)
- algorithms: the hashing algorithms to be used, as an iterable over
- strings
+ length (int): the length of the contents of the file-like
+ object (for the git-specific algorithms)
+ algorithms (set): the hashing algorithms to be used, as an
+ iterable over strings
+ chunk_cb (fun): a callback function taking a chunk of data as
+ parameter
- Returns: a dict mapping each algorithm to a bytes digest.
+ Returns:
+ a dict mapping each algorithm to a digest (bytes by default).
Raises:
ValueError if algorithms contains an unknown hash algorithm.
- """
- hashes = {algo: _new_hash(algo, length) for algo in algorithms}
+ """
+ h = MultiHash(algorithms, length)
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
- for hash in hashes.values():
- hash.update(chunk)
+ h.update(chunk)
if chunk_cb:
chunk_cb(chunk)
- return {algo: hash.digest() for algo, hash in hashes.items()}
+ return h.digest()
def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
- """Hash the contents of the file at the given path with the given
- algorithms.
+ """(deprecated) cf. MultiHash.from_path
+
+ Hash the contents of the file at the given path with the given
+ algorithms.
Args:
- path: the path of the file to hash
- algorithms: the hashing algorithms used
- chunk_cb: a callback
+ path (str): the path of the file to hash
+ algorithms (set): the hashing algorithms used
+ chunk_cb (fun): a callback function taking a chunk of data as parameter
Returns: a dict mapping each algorithm to a bytes digest.
@@ -209,31 +333,28 @@
"""
length = os.path.getsize(path)
with open(path, 'rb') as fobj:
- hash = hash_file(fobj, length, algorithms, chunk_cb)
- hash['length'] = length
- return hash
+ hashes = hash_file(fobj, length, algorithms, chunk_cb=chunk_cb)
+ hashes['length'] = length
+ return hashes
+
+def hash_data(data, algorithms=DEFAULT_ALGORITHMS):
+ """(deprecated) cf. MultiHash.from_data
-def hash_data(data, algorithms=DEFAULT_ALGORITHMS, with_length=False):
- """Hash the given binary blob with the given algorithms.
+ Hash the given binary blob with the given algorithms.
Args:
data (bytes): raw content to hash
- algorithms (list): the hashing algorithms used
- with_length (bool): add the length key in the resulting dict
+ algorithms (set): the hashing algorithms used
Returns: a dict mapping each algorithm to a bytes digest
Raises:
TypeError if data does not support the buffer interface.
ValueError if algorithms contains an unknown hash algorithm.
+
"""
- fobj = BytesIO(data)
- length = len(data)
- data = hash_file(fobj, length, algorithms)
- if with_length:
- data['length'] = length
- return data
+ return MultiHash.from_data(data, hash_names=algorithms).digest()
def hash_git_data(data, git_type, base_algo='sha1'):
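
For callers migrating off the deprecated helpers, a rough mapping onto the new class (digest() can be swapped for hexdigest() or bytehexdigest() as needed):

    from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash

    data = b'foo'

    # old: hashutil.hash_data(data)
    digests = MultiHash.from_data(data).digest()

    # old: hashutil.hash_data(data, with_length=True)
    # new: request 'length' as if it were one more algorithm
    algos = {'length'} | DEFAULT_ALGORITHMS
    with_length = MultiHash.from_data(data, hash_names=algos).digest()
    assert with_length['length'] == len(data)

    # old: hashutil.hash_path(path)
    # new: MultiHash.from_path(path).digest() -- it keeps the 'length'
    # key by default (track_length=True) for compatibility
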
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -13,9 +13,10 @@
from unittest.mock import patch
from swh.model import hashutil
+from swh.model.hashutil import MultiHash
-class Hashutil(unittest.TestCase):
+class BaseHashutil(unittest.TestCase):
def setUp(self):
# Reset function cache
hashutil._blake2_hash_cache = {}
@@ -35,6 +36,11 @@
for type, cksum in self.hex_checksums.items()
}
+ self.bytehex_checksums = {
+ type: hashutil.hash_to_bytehex(cksum)
+ for type, cksum in self.checksums.items()
+ }
+
self.git_hex_checksums = {
'blob': self.hex_checksums['sha1_git'],
'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
@@ -47,6 +53,75 @@
for type, cksum in self.git_hex_checksums.items()
}
+
+class MultiHashTest(BaseHashutil):
+ @istest
+ def multi_hash_data(self):
+ checksums = MultiHash.from_data(self.data).digest()
+ self.assertEqual(checksums, self.checksums)
+ self.assertFalse('length' in checksums)
+
+ @istest
+ def multi_hash_data_with_length(self):
+ expected_checksums = self.checksums.copy()
+ expected_checksums['length'] = len(self.data)
+
+ algos = set(['length']).union(hashutil.DEFAULT_ALGORITHMS)
+ checksums = MultiHash.from_data(self.data, hash_names=algos).digest()
+
+ self.assertEqual(checksums, expected_checksums)
+ self.assertTrue('length' in checksums)
+
+ @istest
+ def multi_hash_data_unknown_hash(self):
+ with self.assertRaises(ValueError) as cm:
+ MultiHash.from_data(self.data, ['unknown-hash'])
+
+ self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
+ self.assertIn('unknown-hash', cm.exception.args[0])
+
+ @istest
+ def multi_hash_file(self):
+ fobj = io.BytesIO(self.data)
+
+ checksums = MultiHash.from_file(fobj, length=len(self.data)).digest()
+ self.assertEqual(checksums, self.checksums)
+
+ @istest
+ def multi_hash_file_hexdigest(self):
+ fobj = io.BytesIO(self.data)
+ length = len(self.data)
+ checksums = MultiHash.from_file(fobj, length=length).hexdigest()
+ self.assertEqual(checksums, self.hex_checksums)
+
+ @istest
+ def multi_hash_file_bytehexdigest(self):
+ fobj = io.BytesIO(self.data)
+ length = len(self.data)
+ checksums = MultiHash.from_file(fobj, length=length).bytehexdigest()
+ self.assertEqual(checksums, self.bytehex_checksums)
+
+ @istest
+ def multi_hash_file_missing_length(self):
+ fobj = io.BytesIO(self.data)
+ with self.assertRaises(ValueError) as cm:
+ MultiHash.from_file(fobj, hash_names=['sha1_git'])
+
+ self.assertIn('Missing length', cm.exception.args[0])
+
+ @istest
+ def multi_hash_path(self):
+ with tempfile.NamedTemporaryFile(delete=False) as f:
+ f.write(self.data)
+
+ hashes = MultiHash.from_path(f.name).digest()
+ os.remove(f.name)
+
+ self.checksums['length'] = len(self.data)
+        self.assertEqual(self.checksums, hashes)
+
+
+class Hashutil(BaseHashutil):
@istest
def hash_data(self):
checksums = hashutil.hash_data(self.data)
@@ -58,7 +133,8 @@
expected_checksums = self.checksums.copy()
expected_checksums['length'] = len(self.data)
- checksums = hashutil.hash_data(self.data, with_length=True)
+ algos = set(['length']).union(hashutil.DEFAULT_ALGORITHMS)
+ checksums = hashutil.hash_data(self.data, algorithms=algos)
self.assertEqual(checksums, expected_checksums)
self.assertTrue('length' in checksums)
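
The 'missing length' test above exercises the git-specific hashes, which fold the content length into the object header and therefore cannot be initialized without it. A quick interactive check of the same behavior:

    import io
    from swh.model.hashutil import MultiHash

    try:
        MultiHash.from_file(io.BytesIO(b'foo'), hash_names=['sha1_git'])
    except ValueError as e:
        print(e)  # expected to mention the missing length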