diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index b8c6025..2d5ff12 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -1,147 +1,190 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import binascii
+import functools
 import hashlib
 from io import BytesIO
+import os
 
 # supported hashing algorithms
 ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
 
 # should be a multiple of 64 (sha1/sha256's block size)
 # FWIW coreutils' sha1sum uses 32768
 HASH_BLOCK_SIZE = 32768
 
 
 def _new_git_hash(base_algo, git_type, length):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm, and feed it with the header for a git object of the
     given type and length.
 
     The header for hashing a git object consists of:
      - The type of the object (encoded in ASCII)
      - One ASCII space (\x20)
      - The length of the object (decimal encoded in ASCII)
      - One NUL byte
 
     Args:
         base_algo: a hashlib-supported algorithm
         git_type: the type of the git object (supposedly one of 'blob',
                   'commit', 'tag', 'tree')
         length: the length of the git object you're encoding
 
     Returns:
         a hashutil.hash object
     """
 
     h = hashlib.new(base_algo)
     git_header = '%s %d\0' % (git_type, length)
     h.update(git_header.encode('ascii'))
 
     return h
 
 
 def _new_hash(algo, length=None):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm. See the constant ALGORITHMS for the list of supported
     algorithms. If a git-specific hashing algorithm is requested (e.g.,
     "sha1_git"), the hashing object will be pre-fed with the needed header; for
     this to work, length must be given.
 
     Args:
         algo: a hashing algorithm (one of ALGORITHMS)
         length: the length of the hashed payload (needed for git-specific
                 algorithms)
 
     Returns:
         a hashutil.hash object
 
     Raises:
         ValueError if algo is unknown, or length is missing for a git-specific
         hash.
     """
     if algo not in ALGORITHMS:
         raise ValueError('Unexpected hashing algorithm %s, '
                          'expected one of %s' %
                          (algo, ', '.join(sorted(ALGORITHMS))))
 
     h = None
     if algo.endswith('_git'):
         if length is None:
             raise ValueError('Missing length for git hashing algorithm')
         base_algo = algo[:-4]
         h = _new_git_hash(base_algo, 'blob', length)
     else:
         h = hashlib.new(algo)
 
     return h
 
 
-def hash_file(fobj, length=None, algorithms=ALGORITHMS):
+def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
     """Hash the contents of the given file object with the given algorithms.
 
     Args:
         fobj: a file-like object
         length: the length of the contents of the file-like object (for the
                 git-specific algorithms)
         algorithms: the hashing algorithms used
+        chunk_cb: an optional callable invoked with each chunk of data read
 
-    Returns: a dict mapping each algorithm to a hexadecimal digest
+    Returns: a dict mapping each algorithm to a bytes digest.
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
     """
     hashes = {algo: _new_hash(algo, length) for algo in algorithms}
 
     while True:
         chunk = fobj.read(HASH_BLOCK_SIZE)
         if not chunk:
             break
         for hash in hashes.values():
             hash.update(chunk)
+        if chunk_cb:
+            chunk_cb(chunk)
 
-    return {algo: hash.hexdigest() for algo, hash in hashes.items()}
+    return {algo: hash.digest() for algo, hash in hashes.items()}
+
+
+def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
+    """Hash the file at the given path with the given algorithms.
+
+    Args:
+        path: the path of the file to hash
+        algorithms: the hashing algorithms used
+        chunk_cb: an optional callable invoked with each chunk of data read
+
+    Returns: a dict mapping each algorithm to a bytes digest.
+
+    Raises:
+        ValueError if algorithms contains an unknown hash algorithm.
+        OSError on file access error
+    """
+    with open(path, 'rb') as fobj:
+        # Take the length from the open file rather than from the path, so it
+        # describes the file actually read even if the path gets replaced.
+        length = os.fstat(fobj.fileno()).st_size
+        return hash_file(fobj, length, algorithms, chunk_cb)
 
 
 def hash_data(data, algorithms=ALGORITHMS):
     """Hash the given binary blob with the given algorithms.
 
     Args:
         data: a bytes object
         algorithms: the hashing algorithms used
 
-    Returns: a dict mapping each algorithm to a hexadecimal digest
+    Returns: a dict mapping each algorithm to a bytes digest
 
     Raises:
         TypeError if data does not support the buffer interface.
         ValueError if algorithms contains an unknown hash algorithm.
     """
     fobj = BytesIO(data)
     return hash_file(fobj, len(data), algorithms)
 
 
 def hash_git_data(data, git_type, base_algo='sha1'):
     """Hash the given data as a git object of type git_type.
 
     Args:
         data: a bytes object
         git_type: the git object type
         base_algo: the base hashing algorithm used (default: sha1)
 
-    Returns: a dict mapping each algorithm to a hexadecimal digest
+    Returns: the bytes digest of the git object (a single value, not a dict)
 
     Raises:
         ValueError if the git_type is unexpected.
     """
 
     git_object_types = {'blob', 'tree', 'commit', 'tag'}
 
     if git_type not in git_object_types:
         raise ValueError('Unexpected git object type %s, expected one of %s' %
                          (git_type, ', '.join(sorted(git_object_types))))
 
     h = _new_git_hash(base_algo, git_type, len(data))
     h.update(data)
 
-    return h.hexdigest()
+    return h.digest()
+
+
+@functools.lru_cache()
+def hash_to_hex(hash):
+    """Converts a hash (in hex or bytes form) to its hexadecimal ascii form"""
+    if isinstance(hash, str):
+        return hash
+    return binascii.hexlify(hash).decode('ascii')
+
+
+@functools.lru_cache()
+def hash_to_bytes(hash):
+    """Converts a hash (in hex or bytes form) to its raw bytes form"""
+    if isinstance(hash, bytes):
+        return hash
+    return bytes.fromhex(hash)
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index e44d894..311e58c 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -1,297 +1,300 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import datetime
 from functools import lru_cache
 
 from . import hashutil
 
 
 @lru_cache()
 def identifier_to_bytes(identifier):
     """Convert a text identifier to bytes.
 
     Args:
         identifier: an identifier, either a 40-char hexadecimal string or a
                     bytes object of length 20
 
     Returns:
         The length 20 bytestring corresponding to the given identifier
 
     Raises:
         ValueError if the identifier is of an unexpected type or length.
     """
 
     if isinstance(identifier, bytes):
         if len(identifier) != 20:
             raise ValueError(
                 'Wrong length for bytes identifier %s, expected 20' %
                 len(identifier))
         return identifier
 
     if isinstance(identifier, str):
         if len(identifier) != 40:
             raise ValueError(
                 'Wrong length for str identifier %s, expected 40' %
                 len(identifier))
         return bytes.fromhex(identifier)
 
     raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
                      identifier.__class__.__name__)
 
 
 @lru_cache()
 def identifier_to_str(identifier):
     """Convert an identifier to an hexadecimal string.
 
     Args:
         identifier: an identifier, either a 40-char hexadecimal string or a
                     bytes object of length 20
 
     Returns:
         The length 40 string corresponding to the given identifier, hex encoded
 
     Raises:
         ValueError if the identifier is of an unexpected type or length.
     """
 
     if isinstance(identifier, str):
         if len(identifier) != 40:
             raise ValueError(
                 'Wrong length for str identifier %s, expected 40' %
                 len(identifier))
         return identifier
 
     if isinstance(identifier, bytes):
         if len(identifier) != 20:
             raise ValueError(
                 'Wrong length for bytes identifier %s, expected 20' %
                 len(identifier))
         return binascii.hexlify(identifier).decode()
 
     raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
                      identifier.__class__.__name__)
 
 
 def content_identifier(content):
     """Return the intrinsic identifier for a content.
 
     A content's identifier is the sha1, sha1_git and sha256 checksums of its
     data.
 
     Args:
         content: a content conforming to the Software Heritage schema
 
     Returns:
         A dictionary with all the hashes for the data
 
     Raises:
         KeyError if the content doesn't have a data member.
     """
 
     hashes = hashutil.hash_data(
         content['data'],
         {'sha1', 'sha1_git', 'sha256'},
     )
 
     return hashes
 
 
 def _sort_key(entry):
     """The sorting key for tree entries"""
     if entry['type'] == 'dir':
         return entry['name'] + b'/'
     else:
         return entry['name']
 
 
 @lru_cache()
 def _perms_to_bytes(perms):
     """Convert the perms value to its bytes representation"""
     oc = oct(perms)[2:]
     return oc.encode('ascii')
 
 
 def directory_identifier(directory):
     """Return the intrinsic identifier for a directory.
 
     A directory's identifier is the tree sha1 à la git of a directory listing,
     using the following algorithm, which is equivalent to the git algorithm for
     trees:
 
     1. Entries of the directory are sorted using the name (or the name with '/'
        appended for directory entries) as key, in bytes order.
 
     2. For each entry of the directory, the following bytes are output:
       - the octal representation of the permissions for the entry (stored in
         the 'perms' member), which is a representation of the entry type:
           b'100644' (int 33188) for files
           b'100755' (int 33261) for executable files
           b'120000' (int 40960) for symbolic links
           b'40000' (int 16384) for directories
           b'160000' (int 57344) for references to revisions
       - an ascii space (b'\x20')
       - the entry's name (as raw bytes), stored in the 'name' member
       - a null byte (b'\x00')
       - the 20 byte long identifier of the object pointed at by the entry,
         stored in the 'target' member:
           for files or executable files: their blob sha1_git
           for symbolic links: the blob sha1_git of a file containing the link
               destination
           for directories: their intrinsic identifier
           for revisions: their intrinsic identifier
 
       (Note that there is no separator between entries)
 
     """
 
     components = []
 
     for entry in sorted(directory['entries'], key=_sort_key):
         components.extend([
             _perms_to_bytes(entry['perms']),
             b'\x20',
             entry['name'],
             b'\x00',
             identifier_to_bytes(entry['target']),
         ])
 
-    return hashutil.hash_git_data(b''.join(components), 'tree')
+    return identifier_to_str(hashutil.hash_git_data(b''.join(components),
+                                                    'tree'))
 
 
 def format_date(date):
     """Convert a date object into an UTC timestamp encoded as ascii bytes.
 
     Git stores timestamps as an integer number of seconds since the UNIX epoch.
 
     However, Software Heritage stores timestamps as an integer number of
     microseconds (postgres type "datetime with timezone").
 
     Therefore, we print timestamps with no microseconds as integers, and
     timestamps with microseconds as floating point values.
 
     """
     if isinstance(date, datetime.datetime):
         if date.microsecond == 0:
             date = int(date.timestamp())
         else:
             date = date.timestamp()
         return str(date).encode()
     else:
         if date == int(date):
             date = int(date)
         return str(date).encode()
 
 
 @lru_cache()
 def format_offset(offset):
     """Convert an integer number of minutes into an offset representation.
 
     The offset representation is [+-]hhmm where:
         hh is the number of hours;
         mm is the number of minutes.
 
     A null offset is represented as +0000.
     """
     if offset >= 0:
         sign = '+'
     else:
         sign = '-'
 
     hours = abs(offset) // 60
     minutes = abs(offset) % 60
 
     t = '%s%02d%02d' % (sign, hours, minutes)
     return t.encode()
 
 
 def format_date_offset(date_offset):
     """Format a date-compatible object with its timezone offset.
 
     A date-compatible object is either:
         - a dict with two members
             timestamp: floating point number of seconds since the unix epoch
             offset: (int) number of minutes representing the offset from UTC
         - a datetime.datetime object with a timezone
         - a numeric value (in which case the offset is hardcoded to 0)
     """
 
     # FIXME: move normalization to another module
 
     if isinstance(date_offset, dict):
         date = date_offset['timestamp']
         offset = date_offset['offset']
     elif isinstance(date_offset, datetime.datetime):
         date = date_offset
         utcoffset = date_offset.utcoffset()
         if utcoffset is None:
             raise ValueError('Received a datetime without a timezone')
         seconds_offset = utcoffset.total_seconds()
         if seconds_offset - int(seconds_offset) != 0 or seconds_offset % 60:
             raise ValueError('Offset is not an integer number of minutes')
         offset = int(seconds_offset) // 60
     else:
         date = date_offset
         offset = 0
 
     return b''.join([format_date(date), b' ', format_offset(offset)])
 
 
 def format_author(author):
     return b''.join([author['name'], b' <', author['email'], b'>'])
 
 
 def revision_identifier(revision):
     """Return the intrinsic identifier for a revision.
     """
     components = [
         b'tree ', identifier_to_str(revision['directory']).encode(), b'\n',
     ]
     for parent in revision['parents']:
         if parent:
             components.extend([
                 b'parent ', identifier_to_str(parent).encode(), b'\n',
             ])
 
     components.extend([
         b'author ', format_author(revision['author']),
         b' ', format_date_offset(revision['date']), b'\n',
         b'committer ', format_author(revision['committer']),
         b' ', format_date_offset(revision['committer_date']), b'\n',
         b'\n',
         revision['message'],
     ])
 
-    return hashutil.hash_git_data(b''.join(components), 'commit')
+    return identifier_to_str(hashutil.hash_git_data(b''.join(components),
+                                                    'commit'))
 
 
 def target_type_to_git(target_type):
     """Convert a software heritage target type to a git object type"""
     return {
         'content': b'blob',
         'directory': b'tree',
         'revision': b'commit',
         'release': b'tag',
     }[target_type]
 
 
 def release_identifier(release):
     """Return the intrinsic identifier for a release."""
     components = [
         b'object ', identifier_to_str(release['target']).encode(), b'\n',
         b'type ', target_type_to_git(release['target_type']), b'\n',
         b'tag ', release['name'].encode('utf-8'), b'\n',
     ]
 
     if 'author' in release and release['author']:
         components.extend([
             b'tagger ', format_author(release['author']), b' ',
             format_date_offset(release['date']), b'\n',
         ])
 
     components.extend([b'\n', release['message']])
 
-    return hashutil.hash_git_data(b''.join(components), 'tag')
+    return identifier_to_str(hashutil.hash_git_data(b''.join(components),
+                                                    'tag'))
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
index 45c55f8..79cdc9e 100644
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -1,75 +1,112 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
+import os
+import tempfile
 import unittest
 
 from nose.tools import istest
 
 from swh.model import hashutil
 
 
 class Hashutil(unittest.TestCase):
     def setUp(self):
         self.data = b'1984\n'
         self.hex_checksums = {
             'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
             'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d',
             'sha256': '26602113b4b9afd9d55466b08580d3c2'
                       '4a9b50ee5b5866c0d91fab0e65907311',
         }
 
-        self.git_checksums = {
+        self.checksums = {
+            type: bytes.fromhex(cksum)
+            for type, cksum in self.hex_checksums.items()
+        }
+
+        self.git_hex_checksums = {
             'blob': self.hex_checksums['sha1_git'],
             'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
             'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
             'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
         }
 
+        self.git_checksums = {
+            type: bytes.fromhex(cksum)
+            for type, cksum in self.git_hex_checksums.items()
+        }
+
     @istest
     def hash_data(self):
         checksums = hashutil.hash_data(self.data)
-        self.assertEqual(checksums, self.hex_checksums)
+        self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_data_unknown_hash(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_data(self.data, ['unknown-hash'])
 
         self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
         self.assertIn('unknown-hash', cm.exception.args[0])
 
     @istest
     def hash_git_data(self):
         checksums = {
             git_type: hashutil.hash_git_data(self.data, git_type)
             for git_type in self.git_checksums
         }
 
         self.assertEqual(checksums, self.git_checksums)
 
     @istest
     def hash_git_data_unknown_git_type(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_git_data(self.data, 'unknown-git-type')
 
         self.assertIn('Unexpected git object type', cm.exception.args[0])
         self.assertIn('unknown-git-type', cm.exception.args[0])
 
     @istest
     def hash_file(self):
         fobj = io.BytesIO(self.data)
 
         checksums = hashutil.hash_file(fobj, length=len(self.data))
-        self.assertEqual(checksums, self.hex_checksums)
+        self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_file_missing_length(self):
         fobj = io.BytesIO(self.data)
 
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_file(fobj, algorithms=['sha1_git'])
 
         self.assertIn('Missing length', cm.exception.args[0])
+
+    @istest
+    def hash_path(self):
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            f.write(self.data)
+        self.addCleanup(os.unlink, f.name)
+
+        hashes = hashutil.hash_path(f.name)
+        self.assertEqual(self.checksums, hashes)
+
+    @istest
+    def hash_to_hex(self):
+        for type in self.checksums:
+            hex = self.hex_checksums[type]
+            hash = self.checksums[type]
+            self.assertEqual(hashutil.hash_to_hex(hex), hex)
+            self.assertEqual(hashutil.hash_to_hex(hash), hex)
+
+    @istest
+    def hash_to_bytes(self):
+        for type in self.checksums:
+            hex = self.hex_checksums[type]
+            hash = self.checksums[type]
+            self.assertEqual(hashutil.hash_to_bytes(hex), hash)
+            self.assertEqual(hashutil.hash_to_bytes(hash), hash)
diff --git a/swh/model/validators.py b/swh/model/validators.py
index cb2e277..ea64b40 100644
--- a/swh/model/validators.py
+++ b/swh/model/validators.py
@@ -1,80 +1,76 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import binascii
-
 from .exceptions import ValidationError, NON_FIELD_ERRORS
 from . import fields, hashutil
 
 
 def validate_content(content):
     """Validate that a content has the correct schema.
 
     Args: a content (dictionary) to validate."""
 
     def validate_content_status(status):
         return fields.validate_enum(status, {'absent', 'visible', 'hidden'})
 
     def validate_keys(content):
         hashes = {'sha1', 'sha1_git', 'sha256'}
         errors = []
         out = True
         if content['status'] == 'absent':
             try:
                 out = out and fields.validate_all_keys(content, {'reason',
                                                                  'origin'})
             except ValidationError as e:
                 errors.append(e)
             try:
                 out = out and fields.validate_any_key(content, hashes)
             except ValidationError as e:
                 errors.append(e)
         else:
             try:
                 out = out and fields.validate_all_keys(content, hashes)
             except ValidationError as e:
                 errors.append(e)
 
         if errors:
             raise ValidationError(errors)
 
         return out
 
     def validate_hashes(content):
         errors = []
         if 'data' in content:
             hashes = hashutil.hash_data(content['data'])
             for hash_type, computed_hash in hashes.items():
                 if hash_type not in content:
                     continue
-                content_hash = content[hash_type]
-                if isinstance(content_hash, bytes):
-                    content_hash = binascii.hexlify(content_hash).decode()
+                content_hash = hashutil.hash_to_bytes(content[hash_type])
                 if content_hash != computed_hash:
                     errors.append(ValidationError(
                         'hash mismatch in content for hash %(hash)s',
                         params={'hash': hash_type},
                         code='content-hash-mismatch',
                     ))
 
         if errors:
             raise ValidationError(errors)
 
         return True
 
     content_schema = {
         'sha1': (False, fields.validate_sha1),
         'sha1_git': (False, fields.validate_sha1_git),
         'sha256': (False, fields.validate_sha256),
         'status': (True, validate_content_status),
         'length': (True, fields.validate_int),
         'ctime': (True, fields.validate_datetime),
         'reason': (False, fields.validate_str),
         'origin': (False, fields.validate_int),
         'data': (False, fields.validate_bytes),
         NON_FIELD_ERRORS: [validate_keys, validate_hashes],
     }
 
     return fields.validate_against_schema('content', content_schema, content)