diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -162,7 +162,8 @@
     return _new_hashlib_hash(algo)
 
 
-def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
+def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS,
+              chunk_cb=None, with_length=False, hexdigest=False):
     """Hash the contents of the given file object with the given algorithms.
 
     Args:
@@ -171,11 +172,15 @@
             git-specific algorithms)
         algorithms: the hashing algorithms to be used, as an iterable over
             strings
+        with_length (bool): Include length in the dict result
+        hexdigest (bool): False returns the hash as binary, otherwise
+            returns as hex
 
-    Returns: a dict mapping each algorithm to a bytes digest.
+    Returns: a dict mapping each algorithm to a digest (bytes by default).
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
+
     """
     hashes = {algo: _new_hash(algo, length) for algo in algorithms}
 
@@ -188,10 +193,61 @@
             if chunk_cb:
                 chunk_cb(chunk)
 
-    return {algo: hash.digest() for algo, hash in hashes.items()}
+    if hexdigest:
+        h = {algo: hash.hexdigest() for algo, hash in hashes.items()}
+    else:
+        h = {algo: hash.digest() for algo, hash in hashes.items()}
+    if with_length:
+        h['length'] = length
+    return h
+
+
+def hash_stream(s, length=None, algorithms=DEFAULT_ALGORITHMS,
+                chunk_cb=None, with_length=False, hexdigest=False,
+                chunk_size=32768):
+    """Hash the contents of the given stream with the given algorithms.
+
+    Args:
+        s (stream): a stream object (e.g. requests.get(url, stream=True))
+        length (int): the length of the contents of the stream (for the
+            git-specific algorithms)
+        algorithms (iterable): the hashing algorithms to be used, as an
+            iterable over strings
+        chunk_cb: a callback, called with each chunk read from the stream
+        with_length (bool): Include length in the dict result
+        hexdigest (bool): False returns the hash as binary, otherwise
+            returns as hex
+        chunk_size (int): number of bytes requested from the stream per
+            iteration (requests itself defaults to a single byte)
+
+    Returns: a dict mapping each algorithm to a digest (bytes by default).
+
+    Raises:
+        ValueError if algorithms contains an unknown hash algorithm.
+
+    """
+    hashes = {algo: _new_hash(algo, length) for algo in algorithms}
+
+    for chunk in s.iter_content(chunk_size=chunk_size):
+        if not chunk:
+            # An empty chunk is not end-of-stream; the iterator stops itself
+            continue
+        for hash in hashes.values():
+            hash.update(chunk)
+        if chunk_cb:
+            chunk_cb(chunk)
+
+    if hexdigest:
+        h = {algo: hash.hexdigest() for algo, hash in hashes.items()}
+    else:
+        h = {algo: hash.digest() for algo, hash in hashes.items()}
+    if with_length:
+        h['length'] = length
+    return h
 
 
-def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
+def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None,
+              with_length=True, hexdigest=False):
     """Hash the contents of the file at the given path with the given
        algorithms.
 
@@ -199,6 +255,9 @@
         path: the path of the file to hash
         algorithms: the hashing algorithms used
         chunk_cb: a callback
+        with_length (bool): Include length in the dict result
+        hexdigest (bool): False returns the hash as binary, otherwise
+            returns as hex
 
-    Returns: a dict mapping each algorithm to a bytes digest.
+    Returns: a dict mapping each algorithm to a digest (bytes by default).
 
@@ -209,9 +268,8 @@
     """
     length = os.path.getsize(path)
     with open(path, 'rb') as fobj:
-        hash = hash_file(fobj, length, algorithms, chunk_cb)
-    hash['length'] = length
-    return hash
+        return hash_file(fobj, length, algorithms, chunk_cb=chunk_cb,
+                         with_length=with_length, hexdigest=hexdigest)
 
 
 def hash_data(data, algorithms=DEFAULT_ALGORITHMS, with_length=False):
@@ -230,10 +288,7 @@
     """
     fobj = BytesIO(data)
     length = len(data)
-    data = hash_file(fobj, length, algorithms)
-    if with_length:
-        data['length'] = length
-    return data
+    return hash_file(fobj, length, algorithms, with_length=with_length)
 
 
 def hash_git_data(data, git_type, base_algo='sha1'):
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -95,6 +95,27 @@
         checksums = hashutil.hash_file(fobj, length=len(self.data))
         self.assertEqual(checksums, self.checksums)
 
+    @istest
+    def hash_file_hexdigest(self):
+        fobj = io.BytesIO(self.data)
+        checksums = hashutil.hash_file(fobj, length=len(self.data),
+                                       hexdigest=True)
+        self.assertEqual(checksums, self.hex_checksums)
+
+    @istest
+    def hash_stream(self):
+        class StreamStub:
+            def __init__(self, data):
+                self.data = data
+
+            def iter_content(self, chunk_size=None):
+                yield from io.BytesIO(self.data)
+
+        s = StreamStub(self.data)
+        checksums = hashutil.hash_stream(s, length=len(self.data),
+                                         hexdigest=True)
+        self.assertEqual(checksums, self.hex_checksums)
+
     @istest
     def hash_file_missing_length(self):
         fobj = io.BytesIO(self.data)