diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,7 +7,7 @@ import os import stat -from . import hashutil +from .hashutil import MultiHash, HASH_BLOCK_SIZE from .merkle import MerkleLeaf, MerkleNode from .identifiers import ( directory_identifier, @@ -77,8 +77,9 @@ mode (int): a file mode (passed to :func:`mode_to_perms`) data (bytes): raw contents of the file """ - ret = hashutil.hash_data(data) - ret['length'] = len(data) + length = len(data) + ret = MultiHash.from_data(data, length=length).digest() + ret['length'] = length ret['perms'] = mode_to_perms(mode) ret['data'] = data @@ -91,8 +92,8 @@ @classmethod def from_file(cls, *, path, data=False, save_path=False): - """Compute the Software Heritage content entry corresponding to an on-disk - file. + """Compute the Software Heritage content entry corresponding to an + on-disk file. The returned dictionary contains keys useful for both: - loading the content in the archive (hashes, `length`) @@ -103,6 +104,7 @@ content entry data (bool): add the file data to the entry save_path (bool): add the file path to the entry + """ file_stat = os.lstat(path) mode = file_stat.st_mode @@ -117,17 +119,16 @@ length = file_stat.st_size if not data: - ret = hashutil.hash_path(path) + ret = MultiHash.from_path(path).digest() else: + h = MultiHash(length=length) chunks = [] - - def append_chunk(x, chunks=chunks): - chunks.append(x) - with open(path, 'rb') as fobj: - ret = hashutil.hash_file(fobj, length=length, - chunk_cb=append_chunk) + for chunk in fobj: + h.update(chunk) + chunks.append(chunk) + ret = h.digest() ret['data'] = b''.join(chunks) if save_path: diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -11,8 +11,7 @@ from .exceptions import ValidationError from .fields.hashes import validate_sha1 -from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS -from .hashutil import hash_to_hex +from .hashutil import hash_git_data, hash_to_hex, MultiHash SNAPSHOT = 'snapshot' @@ -104,7 +103,7 @@ """ - return hash_data(content['data'], DEFAULT_ALGORITHMS) + return MultiHash.from_data(content['data']).digest() def _sort_key(entry): diff --git a/swh/model/validators.py b/swh/model/validators.py --- a/swh/model/validators.py +++ b/swh/model/validators.py @@ -1,10 +1,11 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .exceptions import ValidationError, NON_FIELD_ERRORS -from . import fields, hashutil +from . import fields +from .hashutil import MultiHash, hash_to_bytes def validate_content(content): @@ -44,11 +45,11 @@ def validate_hashes(content): errors = [] if 'data' in content: - hashes = hashutil.hash_data(content['data']) + hashes = MultiHash.from_data(content['data']).digest() for hash_type, computed_hash in hashes.items(): if hash_type not in content: continue - content_hash = hashutil.hash_to_bytes(content[hash_type]) + content_hash = hash_to_bytes(content[hash_type]) if content_hash != computed_hash: errors.append(ValidationError( 'hash mismatch in content for hash %(hash)s',