diff --git a/swh/loader/git/git.py b/swh/loader/git/git.py
index 25f2427..be119dd 100644
--- a/swh/loader/git/git.py
+++ b/swh/loader/git/git.py
@@ -1,225 +1,231 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import glob
 import logging
 import os
 import subprocess
 import time
 
 import pygit2
 
 from datetime import datetime
 from pygit2 import GIT_REF_OID
 from pygit2 import GIT_OBJ_COMMIT, GIT_OBJ_TREE, GIT_SORT_TOPOLOGICAL
 from enum import Enum
 
 from swh.core import hashutil
 from swh.loader.git.data import swhrepo
 from swh.loader.git.storage import storage
 
 
 class DirectoryTypeEntry(Enum):
     """Types of git objects.
     """
     file = 'file'
     directory = 'directory'
 
 
 def date_format(d):
     """d is expected to be a datetime object.
     """
     return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple())
 
 
 def now():
     """Cheat time values."""
     return date_format(datetime.utcnow())
 
 
 def timestamp_to_string(timestamp):
     """Convert a timestamp to string.
     """
     return date_format(datetime.utcfromtimestamp(timestamp))
 
+
 def list_objects_from_packfile_index(packfile_index):
     """List the objects indexed by this packfile"""
     input_file = open(packfile_index, 'rb')
     with subprocess.Popen(
         ['/usr/bin/git', 'show-index'],
         stdin=input_file,
         stdout=subprocess.PIPE,
     ) as process:
         for line in process.stdout.readlines():
-            obj_id = line.decode('utf-8', 'ignore').split()[1]
-            yield obj_id
+            # git show-index returns the line as:
+            # (<object offset> <object id>)
+            line_components = line.split()
+            yield line_components[1].decode('utf-8')
 
+
 def list_objects(repo):
     """List the objects in a given repository"""
     objects_dir = os.path.join(repo.path, 'objects')
+    # Git hashes are 40 characters long
     objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38)
 
     packfile_dir = os.path.join(objects_dir, 'pack')
 
     if os.path.isdir(packfile_dir):
         for packfile_index in os.listdir(packfile_dir):
             if not packfile_index.endswith('.idx'):
                 # Not an index file
                 continue
             packfile_index_path = os.path.join(packfile_dir, packfile_index)
             yield from list_objects_from_packfile_index(packfile_index_path)
 
     for object_file in glob.glob(objects_glob):
+        # Rebuild the object id as the last two components of the path
         yield ''.join(object_file.split(os.path.sep)[-2:])
 
 
 HASH_ALGORITHMS=['sha1', 'sha256']
 
 
 def parse(repo_path):
     """Given a repository path, parse and return a memory model of such
     repository."""
     def read_signature(signature):
         return '%s <%s>' % (signature.name, signature.email)
 
     def treewalk(repo, tree):
         """Walk a tree with the same implementation as `os.path`.
         Returns: tree, trees, blobs
         """
         trees, blobs, dir_entries = [], [], []
         for tree_entry in tree:
             if swh_repo.already_visited(tree_entry.hex):
                 logging.debug('tree_entry %s already visited, skipped' % tree_entry.hex)
                 continue
 
             obj = repo.get(tree_entry.oid)
             if obj is None:  # or obj.type == GIT_OBJ_COMMIT:
                 logging.warn('skip submodule-commit %s' % tree_entry.hex)
                 continue  # submodule!
 
             if obj.type == GIT_OBJ_TREE:
                 logging.debug('found tree %s' % tree_entry.hex)
                 nature = DirectoryTypeEntry.directory.value
                 trees.append(tree_entry)
             else:
                 logging.debug('found content %s' % tree_entry.hex)
                 data = obj.data
                 nature = DirectoryTypeEntry.file.value
                 hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
                 blobs.append({'id': obj.hex,
                               'type': storage.Type.content,
                               'content-sha1': hashes['sha1'],
                               'content-sha256': hashes['sha256'],
                               'content': data,
                               'size': obj.size})
 
             dir_entries.append({'name': tree_entry.name,
                                 'type': storage.Type.directory_entry,
                                 'target-sha1': obj.hex,
                                 'nature': nature,
                                 'perms': tree_entry.filemode,
                                 'atime': None,
                                 'mtime': None,
                                 'ctime': None,
                                 'parent': tree.hex})
 
         yield tree, dir_entries, trees, blobs
         for tree_entry in trees:
             for x in treewalk(repo, repo[tree_entry.oid]):
                 yield x
 
     def walk_tree(repo, swh_repo, rev):
         """Walk the rev revision's directories.
         """
         if swh_repo.already_visited(rev.hex):
             logging.debug('commit %s already visited, skipped' % rev.hex)
             return swh_repo
 
         for dir_root, dir_entries, _, contents_ref in treewalk(repo, rev.tree):
             for content_ref in contents_ref:
                 swh_repo.add_content(content_ref)
 
             swh_repo.add_directory({'id': dir_root.hex,
                                     'type': storage.Type.directory,
                                     'entries': dir_entries})
 
         revision_parent_sha1s = list(map(str, rev.parent_ids))
 
         author = {'name': rev.author.name,
                   'email': rev.author.email,
                   'type': storage.Type.person}
         committer = {'name': rev.committer.name,
                      'email': rev.committer.email,
                      'type': storage.Type.person}
 
         swh_repo.add_revision({'id': rev.hex,
                                'type': storage.Type.revision,
                                'date': timestamp_to_string(rev.commit_time),
                                'directory': rev.tree.hex,
                                'message': rev.message,
                                'committer': committer,
                                'author': author,
                                'parent-sha1s': revision_parent_sha1s
                                })
 
         swh_repo.add_person(read_signature(rev.author), author)
         swh_repo.add_person(read_signature(rev.committer), committer)
 
         return swh_repo
 
     def walk_revision_from(repo, swh_repo, head_rev):
         """Walk the rev history log from head_rev.
         - repo is the current repository
         - rev is the latest rev to start from.
         """
         for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL):
             swh_repo = walk_tree(repo, swh_repo, rev)
 
         return swh_repo
 
     repo = pygit2.Repository(repo_path)
 
     # memory model
     swh_repo = swhrepo.SWHRepo()
 
     # add origin
     origin = {'type': 'git',
               'url': 'file://' + repo.path}
     swh_repo.add_origin(origin)
 
     # add references and crawl them
     for ref_name in repo.listall_references():
         logging.info('walk reference %s' % ref_name)
         ref = repo.lookup_reference(ref_name)
 
         head_rev = repo[ref.target] \
                        if ref.type is GIT_REF_OID \
                        else ref.peel(GIT_OBJ_COMMIT)  # noqa
 
         if isinstance(head_rev, pygit2.Tag):
             head_start = head_rev.get_object()
             taggerSig = head_rev.tagger
             author = {'name': taggerSig.name,
                       'email': taggerSig.email,
                       'type': storage.Type.person}
             release = {'id': head_rev.hex,
                        'type': storage.Type.release,
                        'revision': head_rev.target.hex,
                        'name': ref_name,
                        'date': now(),  # FIXME: find the tag's date
                        'author': author,
                        'comment': head_rev.message}
 
             swh_repo.add_release(release)
             swh_repo.add_person(read_signature(taggerSig), author)
         else:
             swh_repo.add_occurrence({'id': head_rev.hex,
                                      'revision': head_rev.hex,
                                      'reference': ref_name,
                                      'url-origin': origin['url'],
                                      'type': storage.Type.occurrence})
             head_start = head_rev
 
         # crawl commits and trees
         walk_revision_from(repo, swh_repo, head_start)
 
     return swh_repo
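
Note: a minimal usage sketch for the rewritten helper, not part of the patch; the repository path below is an assumption, and /usr/bin/git must be available as the function expects.

    # Hypothetical example: list object ids from a bare clone's packfile
    # indexes using the patched helper. /tmp/example.git is an assumed path.
    import glob
    import os

    from swh.loader.git.git import list_objects_from_packfile_index

    pack_dir = '/tmp/example.git/objects/pack'
    for index_path in glob.glob(os.path.join(pack_dir, '*.idx')):
        for obj_id in list_objects_from_packfile_index(index_path):
            print(obj_id)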