diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index b8c6025..2d5ff12 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -1,147 +1,187 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import binascii
+import functools
 import hashlib
 from io import BytesIO
+import os
 
 # supported hashing algorithms
 ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
 
 # should be a multiple of 64 (sha1/sha256's block size)
 # FWIW coreutils' sha1sum uses 32768
 HASH_BLOCK_SIZE = 32768
 
 
 def _new_git_hash(base_algo, git_type, length):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm, and feed it with the header for a git object of the
     given type and length.
 
     The header for hashing a git object consists of:
      - The type of the object (encoded in ASCII)
      - One ASCII space (\x20)
      - The length of the object (decimal encoded in ASCII)
      - One NUL byte
 
     Args:
         base_algo: a hashlib-supported algorithm
         git_type: the type of the git object (supposedly one of 'blob',
                   'commit', 'tag', 'tree')
         length: the length of the git object you're encoding
 
     Returns:
         a hashutil.hash object
     """
 
     h = hashlib.new(base_algo)
     git_header = '%s %d\0' % (git_type, length)
     h.update(git_header.encode('ascii'))
 
     return h
 
 
 def _new_hash(algo, length=None):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm. See the constant ALGORITHMS for the list of supported
     algorithms. If a git-specific hashing algorithm is requested (e.g.,
     "sha1_git"), the hashing object will be pre-fed with the needed header; for
     this to work, length must be given.
 
     Args:
         algo: a hashing algorithm (one of ALGORITHMS)
         length: the length of the hashed payload (needed for git-specific
                 algorithms)
 
     Returns:
         a hashutil.hash object
 
     Raises:
         ValueError if algo is unknown, or length is missing for a git-specific
         hash.
     """
     if algo not in ALGORITHMS:
         raise ValueError('Unexpected hashing algorithm %s, '
                          'expected one of %s' %
                          (algo, ', '.join(sorted(ALGORITHMS))))
 
     h = None
     if algo.endswith('_git'):
         if length is None:
             raise ValueError('Missing length for git hashing algorithm')
         base_algo = algo[:-4]
         h = _new_git_hash(base_algo, 'blob', length)
     else:
         h = hashlib.new(algo)
 
     return h
 
 
-def hash_file(fobj, length=None, algorithms=ALGORITHMS):
+def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
     """Hash the contents of the given file object with the given algorithms.
 
     Args:
         fobj: a file-like object
         length: the length of the contents of the file-like object (for the
                 git-specific algorithms)
         algorithms: the hashing algorithms used
 
-    Returns: a dict mapping each algorithm to a hexadecimal digest
+    Returns: a dict mapping each algorithm to a bytes digest.
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
     """
     hashes = {algo: _new_hash(algo, length) for algo in algorithms}
 
     while True:
         chunk = fobj.read(HASH_BLOCK_SIZE)
         if not chunk:
             break
         for hash in hashes.values():
             hash.update(chunk)
+        if chunk_cb:
+            chunk_cb(chunk)
 
-    return {algo: hash.hexdigest() for algo, hash in hashes.items()}
+    return {algo: hash.digest() for algo, hash in hashes.items()}
+
+
+def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
+    """Hash the contents of the file at the given path with the given algorithms.
+
+    Args:
+        path: the path of the file to hash
+        algorithms: the hashing algorithms used
+        chunk_cb: optional callable, called with each chunk of bytes read
+
+    Returns: a dict mapping each algorithm to a bytes digest.
+
+    Raises:
+        ValueError if algorithms contains an unknown hash algorithm.
+        OSError on file access error
+    """
+    with open(path, 'rb') as fobj:
+        length = os.fstat(fobj.fileno()).st_size  # size of the opened file
+        return hash_file(fobj, length, algorithms, chunk_cb)
 
 
 def hash_data(data, algorithms=ALGORITHMS):
     """Hash the given binary blob with the given algorithms.
 
     Args:
         data: a bytes object
         algorithms: the hashing algorithms used
 
-    Returns: a dict mapping each algorithm to a hexadecimal digest
+    Returns: a dict mapping each algorithm to a bytes digest
 
     Raises:
         TypeError if data does not support the buffer interface.
         ValueError if algorithms contains an unknown hash algorithm.
     """
     fobj = BytesIO(data)
     return hash_file(fobj, len(data), algorithms)
 
 
 def hash_git_data(data, git_type, base_algo='sha1'):
     """Hash the given data as a git object of type git_type.
 
     Args:
         data: a bytes object
         git_type: the git object type
         base_algo: the base hashing algorithm used (default: sha1)
 
-    Returns: a dict mapping each algorithm to a hexadecimal digest
+    Returns: the bytes digest of the data hashed as a git object
 
     Raises:
         ValueError if the git_type is unexpected.
     """
 
     git_object_types = {'blob', 'tree', 'commit', 'tag'}
 
     if git_type not in git_object_types:
         raise ValueError('Unexpected git object type %s, expected one of %s' %
                          (git_type, ', '.join(sorted(git_object_types))))
 
     h = _new_git_hash(base_algo, git_type, len(data))
     h.update(data)
 
-    return h.hexdigest()
+    return h.digest()
+
+
+@functools.lru_cache()
+def hash_to_hex(hash):
+    """Return the hex ascii form of a hash given as hex str or bytes"""
+    if not isinstance(hash, str):
+        return binascii.hexlify(hash).decode('ascii')
+    return hash
+
+
+@functools.lru_cache()
+def hash_to_bytes(hash):
+    """Return the raw bytes form of a hash given as hex str or bytes"""
+    if not isinstance(hash, bytes):
+        return bytes.fromhex(hash)
+    return hash
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index e44d894..311e58c 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -1,297 +1,300 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import datetime
 from functools import lru_cache
 
 from . import hashutil
 
 
 @lru_cache()
 def identifier_to_bytes(identifier):
     """Convert a text identifier to bytes.
 
     Args:
         identifier: an identifier, either a 40-char hexadecimal string or a
                     bytes object of length 20
     Returns:
         The length 20 bytestring corresponding to the given identifier
 
     Raises:
         ValueError if the identifier is of an unexpected type or length.
     """
 
     if isinstance(identifier, bytes):
         if len(identifier) != 20:
             raise ValueError(
                 'Wrong length for bytes identifier %s, expected 20' %
                 len(identifier))
         return identifier
 
     if isinstance(identifier, str):
         if len(identifier) != 40:
             raise ValueError(
                 'Wrong length for str identifier %s, expected 40' %
                 len(identifier))
         return bytes.fromhex(identifier)
 
     raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
                      identifier.__class__.__name__)
 
 
 @lru_cache()
 def identifier_to_str(identifier):
     """Convert an identifier to an hexadecimal string.
 
     Args:
         identifier: an identifier, either a 40-char hexadecimal string or a
                     bytes object of length 20
     Returns:
         The length 40 string corresponding to the given identifier, hex encoded
 
     Raises:
         ValueError if the identifier is of an unexpected type or length.
     """
 
     if isinstance(identifier, str):
         if len(identifier) != 40:
             raise ValueError(
                 'Wrong length for str identifier %s, expected 40' %
                 len(identifier))
         return identifier
 
     if isinstance(identifier, bytes):
         if len(identifier) != 20:
             raise ValueError(
                 'Wrong length for bytes identifier %s, expected 20' %
                 len(identifier))
         return binascii.hexlify(identifier).decode()
 
     raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
                      identifier.__class__.__name__)
 
 
 def content_identifier(content):
     """Return the intrinsic identifier for a content.
 
     A content's identifier is the sha1, sha1_git and sha256 checksums of its
     data.
 
     Args:
         content: a content conforming to the Software Heritage schema
 
     Returns:
         A dictionary with all the hashes for the data
 
     Raises:
         KeyError if the content doesn't have a data member.
 
     """
 
     hashes = hashutil.hash_data(
         content['data'],
         {'sha1', 'sha1_git', 'sha256'},
     )
 
     return hashes
 
 
 def _sort_key(entry):
     """The sorting key for tree entries"""
     if entry['type'] == 'dir':
         return entry['name'] + b'/'
     else:
         return entry['name']
 
 
 @lru_cache()
 def _perms_to_bytes(perms):
     """Convert the perms value to its bytes representation"""
     oc = oct(perms)[2:]
     return oc.encode('ascii')
 
 
 def directory_identifier(directory):
     """Return the intrinsic identifier for a directory.
 
     A directory's identifier is the tree sha1 à la git of a directory listing,
     using the following algorithm, which is equivalent to the git algorithm for
     trees:
 
     1. Entries of the directory are sorted using the name (or the name with '/'
     appended for directory entries) as key, in bytes order.
 
     2. For each entry of the directory, the following bytes are output:
         - the octal representation of the permissions for the entry
           (stored in the 'perms' member), which is a representation of the
           entry type:
             b'100644' (int 33188) for files
             b'100755' (int 33261) for executable files
             b'120000' (int 40960) for symbolic links
             b'40000' (int 16384) for directories
             b'160000' (int 57344) for references to revisions
         - an ascii space (b'\x20')
         - the entry's name (as raw bytes), stored in the 'name' member
         - a null byte (b'\x00')
         - the 20 byte long identifier of the object pointed at by the entry,
           stored in the 'target' member:
             for files or executable files: their blob sha1_git
             for symbolic links: the blob sha1_git of a file containing the
                                 link destination
             for directories: their intrinsic identifier
             for revisions: their intrinsic identifier
 
       (Note that there is no separator between entries)
 
     """
 
     components = []
 
     for entry in sorted(directory['entries'], key=_sort_key):
         components.extend([
             _perms_to_bytes(entry['perms']),
             b'\x20',
             entry['name'],
             b'\x00',
             identifier_to_bytes(entry['target']),
         ])
 
-    return hashutil.hash_git_data(b''.join(components), 'tree')
+    return identifier_to_str(hashutil.hash_git_data(b''.join(components),
+                                                    'tree'))
 
 
 def format_date(date):
     """Convert a date object into an UTC timestamp encoded as ascii bytes.
 
     Git stores timestamps as an integer number of seconds since the UNIX epoch.
 
     However, Software Heritage stores timestamps as an integer number of
     microseconds (postgres type "datetime with timezone").
 
     Therefore, we print timestamps with no microseconds as integers, and
     timestamps with microseconds as floating point values.
 
     """
     if isinstance(date, datetime.datetime):
         if date.microsecond == 0:
             date = int(date.timestamp())
         else:
             date = date.timestamp()
         return str(date).encode()
     else:
         if date == int(date):
             date = int(date)
         return str(date).encode()
 
 
 @lru_cache()
 def format_offset(offset):
     """Convert an integer number of minutes into an offset representation.
 
     The offset representation is [+-]hhmm where:
         hh is the number of hours;
         mm is the number of minutes.
 
     A null offset is represented as +0000.
     """
     if offset >= 0:
         sign = '+'
     else:
         sign = '-'
 
     hours = abs(offset) // 60
     minutes = abs(offset) % 60
 
     t = '%s%02d%02d' % (sign, hours, minutes)
     return t.encode()
 
 
 def format_date_offset(date_offset):
     """Format a date-compatible object with its timezone offset.
 
     A date-compatible object is either:
         - a dict with two members
             timestamp: floating point number of seconds since the unix epoch
             offset: (int) number of minutes representing the offset from UTC
         - a datetime.datetime object with a timezone
         - a numeric value (in which case the offset is hardcoded to 0)
     """
 
     # FIXME: move normalization to another module
 
     if isinstance(date_offset, dict):
         date = date_offset['timestamp']
         offset = date_offset['offset']
     elif isinstance(date_offset, datetime.datetime):
         date = date_offset
         utcoffset = date_offset.utcoffset()
         if utcoffset is None:
             raise ValueError('Received a datetime without a timezone')
         seconds_offset = utcoffset.total_seconds()
         if seconds_offset - int(seconds_offset) != 0 or seconds_offset % 60:
             raise ValueError('Offset is not an integer number of minutes')
         offset = int(seconds_offset) // 60
     else:
         date = date_offset
         offset = 0
 
     return b''.join([format_date(date), b' ', format_offset(offset)])
 
 
 def format_author(author):
     return b''.join([author['name'], b' <', author['email'], b'>'])
 
 
 def revision_identifier(revision):
     """Return the intrinsic identifier for a revision.
     """
     components = [
         b'tree ', identifier_to_str(revision['directory']).encode(), b'\n',
     ]
     for parent in revision['parents']:
         if parent:
             components.extend([
                 b'parent ', identifier_to_str(parent).encode(), b'\n',
             ])
 
     components.extend([
         b'author ', format_author(revision['author']),
         b' ', format_date_offset(revision['date']), b'\n',
         b'committer ', format_author(revision['committer']),
         b' ', format_date_offset(revision['committer_date']), b'\n',
         b'\n',
         revision['message'],
     ])
 
-    return hashutil.hash_git_data(b''.join(components), 'commit')
+    return identifier_to_str(hashutil.hash_git_data(b''.join(components),
+                                                    'commit'))
 
 
 def target_type_to_git(target_type):
     """Convert a software heritage target type to a git object type"""
     return {
         'content': b'blob',
         'directory': b'tree',
         'revision': b'commit',
         'release': b'tag',
     }[target_type]
 
 
 def release_identifier(release):
     """Return the intrinsic identifier for a release."""
     components = [
         b'object ', identifier_to_str(release['target']).encode(), b'\n',
         b'type ', target_type_to_git(release['target_type']), b'\n',
         b'tag ', release['name'].encode('utf-8'), b'\n',
     ]
 
     if 'author' in release and release['author']:
         components.extend([
             b'tagger ', format_author(release['author']), b' ',
             format_date_offset(release['date']), b'\n',
         ])
 
     components.extend([b'\n', release['message']])
 
-    return hashutil.hash_git_data(b''.join(components), 'tag')
+    return identifier_to_str(hashutil.hash_git_data(b''.join(components),
+                                                    'tag'))
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
index 45c55f8..79cdc9e 100644
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -1,75 +1,111 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
+import tempfile
 import unittest
 
 from nose.tools import istest
 
 from swh.model import hashutil
 
 
 class Hashutil(unittest.TestCase):
     def setUp(self):
         self.data = b'1984\n'
         self.hex_checksums = {
             'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
             'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d',
             'sha256': '26602113b4b9afd9d55466b08580d3c2'
                       '4a9b50ee5b5866c0d91fab0e65907311',
         }
 
-        self.git_checksums = {
+        self.checksums = {
+            type: bytes.fromhex(cksum)
+            for type, cksum in self.hex_checksums.items()
+        }
+
+        self.git_hex_checksums = {
             'blob': self.hex_checksums['sha1_git'],
             'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
             'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
             'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
         }
 
+        self.git_checksums = {
+            type: bytes.fromhex(cksum)
+            for type, cksum in self.git_hex_checksums.items()
+        }
+
     @istest
     def hash_data(self):
         checksums = hashutil.hash_data(self.data)
-        self.assertEqual(checksums, self.hex_checksums)
+        self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_data_unknown_hash(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_data(self.data, ['unknown-hash'])
 
         self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
         self.assertIn('unknown-hash', cm.exception.args[0])
 
     @istest
     def hash_git_data(self):
         checksums = {
             git_type: hashutil.hash_git_data(self.data, git_type)
             for git_type in self.git_checksums
         }
 
         self.assertEqual(checksums, self.git_checksums)
 
     @istest
     def hash_git_data_unknown_git_type(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_git_data(self.data, 'unknown-git-type')
 
         self.assertIn('Unexpected git object type', cm.exception.args[0])
         self.assertIn('unknown-git-type', cm.exception.args[0])
 
     @istest
     def hash_file(self):
         fobj = io.BytesIO(self.data)
 
         checksums = hashutil.hash_file(fobj, length=len(self.data))
-        self.assertEqual(checksums, self.hex_checksums)
+        self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_file_missing_length(self):
         fobj = io.BytesIO(self.data)
 
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_file(fobj, algorithms=['sha1_git'])
 
         self.assertIn('Missing length', cm.exception.args[0])
+
+    @istest
+    def hash_path(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.write(self.data)
+            f.flush()  # make the data visible to hash_path's own open()
+            hashes = hashutil.hash_path(f.name)
+
+        self.assertEqual(self.checksums, hashes)
+
+    @istest
+    def hash_to_hex(self):
+        for algo in self.checksums:
+            hex_digest = self.hex_checksums[algo]
+            raw_digest = self.checksums[algo]
+            self.assertEqual(hashutil.hash_to_hex(hex_digest), hex_digest)
+            self.assertEqual(hashutil.hash_to_hex(raw_digest), hex_digest)
+
+    @istest
+    def hash_to_bytes(self):
+        for algo in self.checksums:
+            hex_digest = self.hex_checksums[algo]
+            raw_digest = self.checksums[algo]
+            self.assertEqual(hashutil.hash_to_bytes(hex_digest), raw_digest)
+            self.assertEqual(hashutil.hash_to_bytes(raw_digest), raw_digest)
diff --git a/swh/model/validators.py b/swh/model/validators.py
index cb2e277..ea64b40 100644
--- a/swh/model/validators.py
+++ b/swh/model/validators.py
@@ -1,80 +1,76 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import binascii
-
 from .exceptions import ValidationError, NON_FIELD_ERRORS
 from . import fields, hashutil
 
 
 def validate_content(content):
     """Validate that a content has the correct schema.
 
     Args: a content (dictionary) to validate."""
 
     def validate_content_status(status):
         return fields.validate_enum(status, {'absent', 'visible', 'hidden'})
 
     def validate_keys(content):
         hashes = {'sha1', 'sha1_git', 'sha256'}
         errors = []
 
         out = True
         if content['status'] == 'absent':
             try:
                 out = out and fields.validate_all_keys(content, {'reason',
                                                                  'origin'})
             except ValidationError as e:
                 errors.append(e)
             try:
                 out = out and fields.validate_any_key(content, hashes)
             except ValidationError as e:
                 errors.append(e)
         else:
             try:
                 out = out and fields.validate_all_keys(content, hashes)
             except ValidationError as e:
                 errors.append(e)
 
         if errors:
             raise ValidationError(errors)
 
         return out
 
     def validate_hashes(content):
         errors = []
         if 'data' in content:
             hashes = hashutil.hash_data(content['data'])
             for hash_type, computed_hash in hashes.items():
                 if hash_type not in content:
                     continue
-                content_hash = content[hash_type]
-                if isinstance(content_hash, bytes):
-                    content_hash = binascii.hexlify(content_hash).decode()
+                content_hash = hashutil.hash_to_bytes(content[hash_type])
                 if content_hash != computed_hash:
                     errors.append(ValidationError(
                         'hash mismatch in content for hash %(hash)s',
                         params={'hash': hash_type},
                         code='content-hash-mismatch',
                     ))
             if errors:
                 raise ValidationError(errors)
 
         return True
 
     content_schema = {
         'sha1': (False, fields.validate_sha1),
         'sha1_git': (False, fields.validate_sha1_git),
         'sha256': (False, fields.validate_sha256),
         'status': (True, validate_content_status),
         'length': (True, fields.validate_int),
         'ctime': (True, fields.validate_datetime),
         'reason': (False, fields.validate_str),
         'origin': (False, fields.validate_int),
         'data': (False, fields.validate_bytes),
         NON_FIELD_ERRORS: [validate_keys, validate_hashes],
     }
 
     return fields.validate_against_schema('content', content_schema, content)