class DentryPerms(enum.IntEnum):
    """Permission values a Software Heritage directory entry may carry."""

    #: Content
    content = 0o100644
    #: Executable content (e.g. executable script)
    executable_content = 0o100755
    #: Symbolic link
    symlink = 0o120000
    #: Directory
    directory = 0o040000
    #: Revision (e.g. submodule)
    revision = 0o160000


def mode_to_perms(mode):
    """Map a file mode onto the matching directory entry permission.

    Args:
        mode (int): a file mode as returned by :func:`os.stat` in
            :attr:`os.stat_result.st_mode`

    Returns:
        DentryPerms: one of the following values:
            :const:`DentryPerms.symlink`: symbolic link
            :const:`DentryPerms.directory`: directory
            :const:`DentryPerms.executable_content`: any other mode with at
            least one executable bit set
            :const:`DentryPerms.content`: anything else

    """
    if stat.S_ISLNK(mode):
        return DentryPerms.symlink
    if stat.S_ISDIR(mode):
        return DentryPerms.directory

    # 0o111 covers the user, group and other executable bits: the entry is
    # flagged executable as soon as any one of them is set.
    has_exec_bit = bool(mode & 0o111)
    if has_exec_bit:
        return DentryPerms.executable_content
    return DentryPerms.content
@classmethod
def from_file(cls, *, path, data=False, save_path=False):
    """Build the content entry describing a file found on disk.

    The resulting entry carries what is needed both to load the content in
    the archive (hashes, `length`) and to reference it from a directory
    (`perms`).

    Symbolic links are turned into a content holding the link target; any
    other non-regular file is represented as an empty content.

    Args:
      path (bytes): path to the file for which we're computing the
        content entry
      data (bool): add the file data to the entry
      save_path (bool): add the file path to the entry

    """
    st = os.lstat(path)
    mode = st.st_mode

    if stat.S_ISLNK(mode):
        # Symbolic link: the "file contents" are the link target
        return cls.from_symlink(path=path, mode=mode)
    if not stat.S_ISREG(mode):
        # Neither a link nor a regular file: stand in an empty file
        return cls.from_bytes(mode=mode, data=b'')

    size = st.st_size

    if data:
        # Hash and keep the bytes in a single pass over the file
        hasher = MultiHash(length=size)
        blocks = []
        with open(path, 'rb') as stream:
            block = stream.read(HASH_BLOCK_SIZE)
            while block:
                hasher.update(block)
                blocks.append(block)
                block = stream.read(HASH_BLOCK_SIZE)
        entry = hasher.digest()
        entry['data'] = b''.join(blocks)
    else:
        entry = MultiHash.from_path(path).digest()

    if save_path:
        entry['path'] = path

    entry['perms'] = mode_to_perms(mode)
    entry['length'] = size

    return cls(entry)
def accept_all_directories(dirname, entries):
    """Default filter for :func:`Directory.from_disk` accepting all
    directories

    Args:
      dirname (bytes): directory name
      entries (list): directory entries

    Returns:
      True, whatever the arguments
    """
    return True


def ignore_empty_directories(dirname, entries):
    """Filter for :func:`Directory.from_disk` ignoring empty directories

    Args:
      dirname (bytes): directory name
      entries (list): directory entries

    Returns:
      True if the directory is not empty, False if the directory is empty
    """
    return bool(entries)


def ignore_named_directories(names, *, case_sensitive=True):
    """Filter for :func:`Directory.from_disk` to ignore directories named
    one of names.

    Args:
      names (iterable of bytes): names to ignore
      case_sensitive (bool): whether to do the filtering in a case sensitive
        way

    Returns:
      a directory filter for :func:`Directory.from_disk`; it returns False
      for the directories whose name is in `names`, True for all others
    """
    # Snapshot the names once: the filter is called for every directory, so
    # it must neither exhaust a one-shot iterable nor silently follow later
    # mutations of the caller's list.
    if case_sensitive:
        names = tuple(names)
    else:
        names = tuple(name.lower() for name in names)

    def named_filter(dirname, entries,
                     names=names, case_sensitive=case_sensitive):
        if case_sensitive:
            return dirname not in names
        else:
            return dirname.lower() not in names

    return named_filter
@classmethod
def from_disk(cls, *, path, data=False, save_path=False,
              dir_filter=accept_all_directories):
    """Build the tree of Software Heritage objects for an on-disk directory

    Args:
      path (bytes): the directory to traverse
      data (bool): whether to add the data to the content objects
      save_path (bool): whether to add the path to the content objects
      dir_filter (function): a filter to ignore some directories by
        name or contents. Takes two arguments: dirname and entries, and
        returns True if the directory should be added, False if the
        directory should be ignored.

    Returns:
      the node built for `path` itself
    """
    # Maps the path of every directory visited so far to its node
    built = {}

    # Bottom-up walk: a directory's children are built before the
    # directory itself needs them.
    for current, subdir_names, file_names in os.walk(path, topdown=False):
        children = {}

        # Files and directories go through the same loop, because symbolic
        # links to directories are reported among the directory names.
        for child_name in file_names + subdir_names:
            child_path = os.path.join(current, child_name)
            is_real_dir = (os.path.isdir(child_path)
                           and not os.path.islink(child_path))

            if is_real_dir:
                subdir = built[child_path]
                if dir_filter(child_name, subdir.entries):
                    children[child_name] = subdir
            else:
                children[child_name] = Content.from_file(
                    path=child_path, data=data, save_path=save_path)

        node = cls({'name': os.path.basename(current)})
        built[current] = node
        node.update(children)

    return built[path]
def __setitem__(self, key, value):
    """Set the entry at `key`, a (possibly `/`-separated) bytes path.

    When `key` contains a `/`, everything before the last `/` is looked up
    from this directory and the entry is set on the node found there.

    Raises:
      ValueError: if `key` is not bytes, is empty or contains a nul byte,
        or if `value` is neither a Content nor a Directory
    """
    if not isinstance(key, bytes):
        raise ValueError('Can only set a bytes Directory entry')
    if not isinstance(value, (Content, Directory)):
        raise ValueError('Can only set a Directory entry to a Content or '
                         'Directory')
    if key == b'':
        raise ValueError('Directory entry must have a name')
    if b'\x00' in key:
        raise ValueError('Directory entry name must not contain nul bytes')

    parent, separator, leaf = key.rpartition(b'/')
    if not separator:
        # Plain name: the entry lives directly in this directory
        return super().__setitem__(key, value)

    # Nested path: delegate to the node holding the last path component
    self[parent].__setitem__(leaf, value)
Any provided algorithms not in that list will result in a ValueError
explaining the error.

This module defines a MultiHash class to ease the computation of the
Software Heritage hashing algorithms. It makes it possible to compute hashes
from a file object, a path or raw data, using an interface similar to the one
provided by the standard hashlib module.

Basic usage examples:

- file object: MultiHash.from_file(
    file_object, hash_names=DEFAULT_ALGORITHMS, length=length).digest()

- path (filepath): MultiHash.from_path(b'foo').hexdigest()

- data (bytes): MultiHash.from_data(b'foo').bytehexdigest()


"Complex" usage, defining a swh hashlib instance first:

- To compute the length, add 'length' to the set of algorithms to compute.
  The git-specific algorithms (e.g. sha1_git) additionally need the total
  length up front, for example:

    h = MultiHash(hash_names=set({'length'}).union(DEFAULT_ALGORITHMS),
                  length=os.path.getsize(filepath))
    with open(filepath, 'rb') as f:
        while True:
            chunk = f.read(HASH_BLOCK_SIZE)
            if not chunk:
                break
            h.update(chunk)
    hashes = h.digest()  # returns a dict of {hash_algo_name: hash_in_bytes}

- To write the data while computing the hashes (from a stream), for example:

    h = MultiHash(length=length)
    with open(filepath, 'wb') as f:
        for chunk in r.iter_content():  # r: a stream of sorts
            h.update(chunk)
            f.write(chunk)
    hashes = h.hexdigest()  # returns a dict of {hash_algo_name: hash_in_hex}

  Note: Prior to this, we would have to use chunk_cb (cf. hash_file,
  hash_path)


This module also defines the following (deprecated) hashing functions:

- hash_file: Hash the contents of the given file object with the given
  algorithms (defaulting to DEFAULT_ALGORITHMS if none provided).

- hash_data: Hash the given binary blob with the given algorithms
  (defaulting to DEFAULT_ALGORITHMS if none provided).

- hash_path: Hash the contents of the file at the given path with the
  given algorithms (defaulting to DEFAULT_ALGORITHMS if none provided).

class MultiHash:
    """Hashutil class to support multiple hashes computation.

    Args:

        hash_names (set): Set of hash algorithms (+ optionally length)
            to compute hashes (cf. DEFAULT_ALGORITHMS)
        length (int): Length of the total sum of chunks to read; it is
            handed to `_new_hash` for every algorithm, which requires it
            for the git-specific ones (e.g. sha1_git)

    If 'length' is listed in `hash_names`, the number of bytes fed through
    :meth:`update` is also tracked and returned by the digest methods
    under the 'length' key.

    """

    def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
        # Maps each algorithm name to its hash object; the special 'length'
        # key, when present, maps to the running byte count instead.
        self.state = {}
        self.track_length = False
        for name in hash_names:
            if name == 'length':
                self.state['length'] = 0
                self.track_length = True
            else:
                self.state[name] = _new_hash(name, length)

    @classmethod
    def from_state(cls, state, track_length):
        """Wrap an existing `state` mapping into a new instance.

        Args:
            state (dict): mapping of algorithm names to hash objects (and
                optionally 'length' to an int), used as-is
            track_length (bool): whether :meth:`update` must keep
                state['length'] up to date

        Returns:
            the new instance
        """
        ret = cls([])
        ret.state = state
        ret.track_length = track_length
        # Hand the instance back: without this, callers such as copy()
        # would receive None.
        return ret

    @classmethod
    def from_file(cls, fobj, hash_names=DEFAULT_ALGORITHMS, length=None):
        """Hash everything that can be read from the file object `fobj`,
        in blocks of HASH_BLOCK_SIZE bytes."""
        ret = cls(length=length, hash_names=hash_names)
        while True:
            chunk = fobj.read(HASH_BLOCK_SIZE)
            if not chunk:
                break
            ret.update(chunk)
        return ret

    @classmethod
    def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS):
        """Hash the file at `path`, using its on-disk size as length."""
        length = os.path.getsize(path)
        with open(path, 'rb') as f:
            ret = cls.from_file(f, hash_names=hash_names, length=length)
        return ret

    @classmethod
    def from_data(cls, data, hash_names=DEFAULT_ALGORITHMS):
        """Hash the in-memory `data`, using `len(data)` as length."""
        length = len(data)
        fobj = BytesIO(data)
        return cls.from_file(fobj, hash_names=hash_names, length=length)

    def update(self, chunk):
        """Feed `chunk` to every hash object, and count its bytes when the
        length is tracked."""
        for name, h in self.state.items():
            if name == 'length':
                continue
            h.update(chunk)
        if self.track_length:
            self.state['length'] += len(chunk)

    def digest(self):
        """Return a dict mapping each algorithm name to its digest as
        bytes ('length', when present, maps to the int byte count)."""
        return {
            name: h.digest() if name != 'length' else h
            for name, h in self.state.items()
        }

    def hexdigest(self):
        """Return a dict mapping each algorithm name to its digest as a
        hexadecimal str ('length', when present, maps to the int byte
        count)."""
        return {
            name: h.hexdigest() if name != 'length' else h
            for name, h in self.state.items()
        }

    def bytehexdigest(self):
        """Return a dict mapping each algorithm name to its digest as
        hexadecimal bytes ('length', when present, maps to the int byte
        count)."""
        return {
            name: hash_to_bytehex(h.digest()) if name != 'length' else h
            for name, h in self.state.items()
        }

    def copy(self):
        """Return a new instance whose hash objects are copies of this
        one's, so both can be updated independently afterwards."""
        copied_state = {
            name: h.copy() if name != 'length' else h
            for name, h in self.state.items()
        }
        return self.from_state(copied_state, self.track_length)
def _new_hashlib_hash(algo):
    """Create a digest object for `algo`.

    Names starting with 'blake2' follow the swh-specific naming and are
    delegated to `_new_blake2_hash`; every other name is passed to
    :func:`hashlib.new` directly.
    """
    if not algo.startswith('blake2'):
        return hashlib.new(algo)
    return _new_blake2_hash(algo)


def _new_git_hash(base_algo, git_type, length):
    """Create a digest object for `base_algo`, already fed with the header
    of a git object of the given type and length.

    The header for hashing a git object consists of:
     - The type of the object (encoded in ASCII)
     - One ASCII space (\x20)
     - The length of the object (decimal encoded in ASCII)
     - One NUL byte

    Args:
        base_algo (str from :const:`ALGORITHMS`): a hashlib-supported
            algorithm
        git_type: the type of the git object (supposedly one of 'blob',
                  'commit', 'tag', 'tree')
        length: the length of the git object you're encoding

    Returns:
        a hashutil.hash object
    """
    hasher = _new_hashlib_hash(base_algo)
    header = ('%s %d\0' % (git_type, length)).encode('ascii')
    hasher.update(header)
    return hasher
def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS,
              chunk_cb=None):
    """(Deprecated) cf. MultiHash.from_file

    Hash everything that can be read from the given file object with the
    given algorithms, optionally handing each block to a callback.

    Args:
        fobj: a file-like object
        length (int): the length of the contents of the file-like
            object (for the git-specific algorithms)
        algorithms (set): the hashing algorithms to be used, as an
            iterable over strings
        chunk_cb (fun): a callback function taking a chunk of data as
            parameter

    Returns:
        a dict mapping each algorithm to a digest (bytes by default).

    Raises:
        ValueError if algorithms contains an unknown hash algorithm.

    """
    multi = MultiHash(algorithms, length)

    block = fobj.read(HASH_BLOCK_SIZE)
    while block:
        multi.update(block)
        if chunk_cb:
            chunk_cb(block)
        block = fobj.read(HASH_BLOCK_SIZE)

    return multi.digest()
def hash_data(data, algorithms=DEFAULT_ALGORITHMS):
    """(deprecated) cf. MultiHash.from_data

    Hash the given binary blob with the given algorithms.

    Args:
        data (bytes): raw content to hash
        algorithms (set): the hashing algorithms used

    Returns:
        a dict mapping each algorithm to a bytes digest

    Raises:
        TypeError if data does not support the buffer interface.
        ValueError if algorithms contains an unknown hash algorithm.

    """
    multi = MultiHash.from_data(data, hash_names=algorithms)
    return multi.digest()
@functools.lru_cache()
def hash_to_hex(hash):
    """Return the hexadecimal ascii form of a hash.

    Args:
      hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing
        the hexadecimal form of the hash

    Returns:
      str: the hexadecimal form of the hash; a :class:`str` argument is
      returned unchanged
    """
    if not isinstance(hash, str):
        hash = binascii.hexlify(hash).decode('ascii')
    return hash


@functools.lru_cache()
def hash_to_bytehex(hash):
    """Return the hexadecimal form of a hash, as bytes.

    Args:
      hash (bytes): a :class:`bytes` hash

    Returns:
      bytes: the hexadecimal form of the hash, as :class:`bytes`
    """
    hexlified = binascii.hexlify(hash)
    return hexlified


@functools.lru_cache()
def hash_to_bytes(hash):
    """Return the raw bytes form of a hash.

    Args:
      hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing
        the hexadecimal form of the hash

    Returns:
      bytes: the :class:`bytes` form of the hash; a :class:`bytes` argument
      is returned unchanged
    """
    return hash if isinstance(hash, bytes) else bytes.fromhex(hash)


@functools.lru_cache()
def bytehex_to_hash(hex):
    """Return the hash whose hexadecimal form is given as bytes.

    Args:
      hex (bytes): a :class:`bytes` containing the hexadecimal form of the
        hash encoded in ascii

    Returns:
      bytes: the :class:`bytes` form of the hash
    """
    hex_text = hex.decode()
    return hash_to_bytes(hex_text)
class BaseHashutil(unittest.TestCase):
    """Shared fixture: a small payload and its expected checksums in raw,
    hexadecimal and hexadecimal-bytes form."""

    def setUp(self):
        # Start every test from an empty blake2 constructor cache
        hashutil._blake2_hash_cache = {}

        self.data = b'1984\n'

        self.hex_checksums = {
            'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
            'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d',
            'sha256': '26602113b4b9afd9d55466b08580d3c2'
                      '4a9b50ee5b5866c0d91fab0e65907311',
            'blake2s256': '63cfb259e1fdb485bc5c55749697a6b21ef31fb7445f6c78a'
                          'c9422f9f2dc8906',
        }
        self.checksums = {
            algo: bytes.fromhex(hex_digest)
            for algo, hex_digest in self.hex_checksums.items()
        }
        self.bytehex_checksums = {
            algo: hashutil.hash_to_bytehex(raw_digest)
            for algo, raw_digest in self.checksums.items()
        }

        # Expected sha1 of the payload hashed as each git object type
        self.git_hex_checksums = {
            'blob': self.hex_checksums['sha1_git'],
            'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
            'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
            'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
        }
        self.git_checksums = {
            git_type: bytes.fromhex(hex_digest)
            for git_type, hex_digest in self.git_hex_checksums.items()
        }
@istest
def multi_hash_path(self):
    """MultiHash.from_path yields the expected digests, without any
    'length' entry."""
    # delete=False: the file must outlive the `with` block so that it is
    # closed (hence flushed) before being hashed by path.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(self.data)

    try:
        hashes = MultiHash.from_path(f.name).digest()
    finally:
        # Remove the temporary file even when hashing fails
        os.remove(f.name)

    self.assertEqual(self.checksums, hashes)
@istest
def new_hash_unsupported_hashing_algorithm(self):
    """_new_hash rejects an unknown algorithm name with a ValueError
    listing the supported ones."""
    # assertRaises makes the test fail when no exception is raised at all,
    # which a bare try/except would silently accept.
    with self.assertRaises(ValueError) as cm:
        hashutil._new_hash('blake2:10')

    self.assertEqual(str(cm.exception),
                     'Unexpected hashing algorithm blake2:10, '
                     'expected one of blake2b512, blake2s256, '
                     'sha1, sha1_git, sha256')
@istest
def new_hash_blake2s_pyblake2(self):
    """When hashlib provides neither blake2s256 nor blake2s, _new_hash
    builds the hash through pyblake2.blake2s."""
    skip_reasons = (
        ('blake2s256', 'blake2s256 built in'),
        ('blake2s', 'blake2s built in'),
    )
    for algo, reason in skip_reasons:
        if algo in hashlib.algorithms_available:
            self.skipTest(reason)

    with patch('pyblake2.blake2s') as mock_blake2s:
        sentinel = object()
        mock_blake2s.return_value = sentinel

        h = hashutil._new_hash('blake2s256')

        self.assertIs(h, sentinel)
        mock_blake2s.assert_called_with(digest_size=256//8)
@istest
def unknown_header_type(self):
    """hash_git_data rejects a git object type it does not know."""
    with self.assertRaises(ValueError) as raised:
        hashutil.hash_git_data(b'any-data', 'some-unknown-type')

    message = raised.exception.args[0]
    self.assertIn('Unexpected git object type', message)