diff --git a/debian/control b/debian/control
index 6876512..49926cb 100644
--- a/debian/control
+++ b/debian/control
@@ -1,31 +1,31 @@
 Source: swh-loader-git
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python (>= 2),
                python3-all,
                python3-click,
                python3-dulwich (>= 0.18.7~),
                python3-nose,
                python3-retrying,
                python3-setuptools,
                python3-swh.core (>= 0.0.7~),
-               python3-swh.loader.core (>= 0.0.22),
+               python3-swh.loader.core (>= 0.0.28),
                python3-swh.model (>= 0.0.15~),
                python3-swh.scheduler (>= 0.0.14~),
                python3-swh.storage (>= 0.0.83~),
                python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/
 
 Package: python3-swh.loader.git
 Architecture: all
 Depends: python3-swh.core (>= 0.0.7~),
-         python3-swh.loader.core (>= 0.0.22~),
+         python3-swh.loader.core (>= 0.0.28~),
          python3-swh.model (>= 0.0.15~),
          python3-swh.scheduler (>= 0.0.14~),
          python3-swh.storage (>= 0.0.83~),
          ${misc:Depends},
          ${python3:Depends}
 Description: Software Heritage Git loader
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 187f8ca..8cc4f47 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 0.0.7
-swh.loader.core >= 0.0.22
+swh.loader.core >= 0.0.28
 swh.model >= 0.0.15
 swh.scheduler >= 0.0.14
 swh.storage >= 0.0.83
diff --git a/swh/loader/git/base.py b/swh/loader/git/base.py
deleted file mode 100644
index c135a68..0000000
--- a/swh/loader/git/base.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import abc
-import os
-
-from swh.loader.core.loader import SWHLoader
-
-
-class BaseLoader(SWHLoader):
-    """This base class is a pattern for loaders.
-
-    The external calling convention is as such:
-
-    - instantiate the class once (loads storage and the configuration)
-    - for each origin, call load with the origin-specific arguments (for
-      instance, an origin URL).
- - load calls several methods that must be implemented in subclasses: - - - prepare(\*args, \**kwargs) prepares the loader for the new origin - - get_origin gets the origin object associated to the current loader - - fetch_data downloads the necessary data from the origin - - get_{contents,directories,revisions,releases,occurrences} retrieve each - kind of object from the origin - - has\_* checks whether there are some objects to load for that object type - - get_fetch_history_result retrieves the data to insert in the - fetch_history table once the load was successful - - cleanup cleans up an eventual state installed for computations - - eventful returns whether the load was eventful or not - - """ - DEFAULT_CONFIG = { - 'storage': ('dict', { - 'cls': 'remote', - 'args': { - 'url': 'http://localhost:5002/' - }, - }), - 'send_contents': ('bool', True), - 'send_directories': ('bool', True), - 'send_revisions': ('bool', True), - 'send_releases': ('bool', True), - 'send_occurrences': ('bool', True), - - 'save_data': ('bool', False), - 'save_data_path': ('str', ''), - - 'content_packet_size': ('int', 10000), - 'content_packet_size_bytes': ('int', 1024 * 1024 * 1024), - 'directory_packet_size': ('int', 25000), - 'revision_packet_size': ('int', 100000), - 'release_packet_size': ('int', 100000), - 'occurrence_packet_size': ('int', 100000), - } - - def __init__(self): - super().__init__(logging_class='swh.loader.git.BulkLoader') - - # Make sure the config is sane - if self.config['save_data']: - path = self.config['save_data_path'] - os.stat(path) - if not os.access(path, os.R_OK | os.W_OK): - raise PermissionError("Permission denied: %r" % path) - - self.visit_date = None # possibly overridden in self.prepare method - - @abc.abstractmethod - def has_contents(self): - """Checks whether we need to load contents""" - pass - - def get_contents(self): - """Get the contents that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_directories(self): - """Checks whether we need to load directories""" - pass - - def get_directories(self): - """Get the directories that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_revisions(self): - """Checks whether we need to load revisions""" - - def get_revisions(self): - """Get the revisions that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_releases(self): - """Checks whether we need to load releases""" - return True - - def get_releases(self): - """Get the releases that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_occurrences(self): - """Checks whether we need to load occurrences""" - pass - - def get_occurrences(self): - """Get the occurrences that need to be loaded""" - raise NotImplementedError - - def get_fetch_history_result(self): - """Return the data to store in fetch_history for the current loader""" - raise NotImplementedError - - def eventful(self): - """Whether the load was eventful""" - raise NotImplementedError - - def save_data(self): - """Save the data associated to the current load""" - raise NotImplementedError - - def get_save_data_path(self): - """The path to which we save the data""" - if not hasattr(self, '__save_data_path'): - origin_id = self.origin_id - year = str(self.visit_date.year) - - path = os.path.join( - self.config['save_data_path'], - "%04d" % (origin_id % 10000), - "%08d" % origin_id, - year, - ) - - os.makedirs(path, exist_ok=True) - self.__save_data_path = path - - return 
self.__save_data_path - - def cleanup(self): - """Clean up an eventual state installed for computations. - Nothing specific for the loader-git is needed. - - """ - pass - - def store_data(self): - """Store data fetched from the git repository. - - """ - if self.config['save_data']: - self.save_data() - - if self.config['send_contents'] and self.has_contents(): - self.send_batch_contents(self.get_contents()) - if self.config['send_directories'] and self.has_directories(): - self.send_batch_directories(self.get_directories()) - if self.config['send_revisions'] and self.has_revisions(): - self.send_batch_revisions(self.get_revisions()) - if self.config['send_releases'] and self.has_releases(): - self.send_batch_releases(self.get_releases()) - if self.config['send_occurrences'] and self.has_occurrences(): - self.send_batch_occurrences(self.get_occurrences()) diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py index 785ee4a..b7a2720 100644 --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -1,295 +1,299 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import dulwich.repo import os import shutil from dulwich.errors import ObjectFormatException, EmptyFileException from collections import defaultdict from swh.model import hashutil -from . import base, converters, utils +from swh.loader.core.loader import SWHStatelessLoader +from . import converters, utils -class GitLoader(base.BaseLoader): +class GitLoader(SWHStatelessLoader): """Load a git repository from a directory. """ CONFIG_BASE_FILENAME = 'loader/git-loader' + def __init__(self, config=None): + super().__init__(logging_class='swh.loader.git.Loader', config=config) + def prepare(self, origin_url, directory, visit_date): self.origin_url = origin_url self.origin = self.get_origin() self.repo = dulwich.repo.Repo(directory) self.visit_date = visit_date def get_origin(self): """Get the origin that is currently being loaded""" return converters.origin_url_to_origin(self.origin_url) def iter_objects(self): object_store = self.repo.object_store for pack in object_store.packs: objs = list(pack.index.iterentries()) objs.sort(key=lambda x: x[1]) for sha, offset, crc32 in objs: yield hashutil.hash_to_bytehex(sha) yield from object_store._iter_loose_objects() yield from object_store._iter_alternate_objects() def _check(self, obj): """Check the object's repository representation. If any errors in check exists, an ObjectFormatException is raised. Args: obj (object): Dulwich object read from the repository. """ obj.check() from dulwich.objects import Commit, Tag try: # For additional checks on dulwich objects with date # for now, only checks on *time if isinstance(obj, Commit): commit_time = obj._commit_time utils.check_date_time(commit_time) author_time = obj._author_time utils.check_date_time(author_time) elif isinstance(obj, Tag): tag_time = obj._tag_time utils.check_date_time(tag_time) except Exception as e: raise ObjectFormatException(e) def get_object(self, oid): """Given an object id, return the object if it is found and not malformed in some way. 
Args: oid (bytes): the object's identifier Returns: The object if found without malformation """ try: # some errors are raised when reading the object obj = self.repo[oid] # some we need to check ourselves self._check(obj) except KeyError: _id = oid.decode('utf-8') self.log.warn('object %s not found, skipping' % _id, extra={ 'swh_type': 'swh_loader_git_missing_object', 'swh_object_id': _id, 'origin_id': self.origin_id, }) return None except ObjectFormatException: _id = oid.decode('utf-8') self.log.warn('object %s malformed, skipping' % _id, extra={ 'swh_type': 'swh_loader_git_missing_object', 'swh_object_id': _id, 'origin_id': self.origin_id, }) return None except EmptyFileException: _id = oid.decode('utf-8') self.log.warn('object %s corrupted (empty file), skipping' % _id, extra={ 'swh_type': 'swh_loader_git_missing_object', 'swh_object_id': _id, 'origin_id': self.origin_id, }) else: return obj def fetch_data(self): """Fetch the data from the data source""" type_to_ids = defaultdict(list) for oid in self.iter_objects(): obj = self.get_object(oid) if not obj: continue type_name = obj.type_name type_to_ids[type_name].append(oid) self.type_to_ids = type_to_ids def has_contents(self): """Checks whether we need to load contents""" return bool(self.type_to_ids[b'blob']) def get_content_ids(self): """Get the content identifiers from the git repository""" for oid in self.type_to_ids[b'blob']: yield converters.dulwich_blob_to_content_id(self.repo[oid]) def get_contents(self): """Get the contents that need to be loaded""" max_content_size = self.config['content_size_limit'] missing_contents = set(self.storage.content_missing( self.get_content_ids(), 'sha1_git')) for oid in missing_contents: yield converters.dulwich_blob_to_content( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): """Checks whether we need to load directories""" return bool(self.type_to_ids[b'tree']) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tree']) def get_directories(self): """Get the directories that need to be loaded""" missing_dirs = set(self.storage.directory_missing( sorted(self.get_directory_ids()))) for oid in missing_dirs: yield converters.dulwich_tree_to_directory( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log) def has_revisions(self): """Checks whether we need to load revisions""" return bool(self.type_to_ids[b'commit']) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'commit']) def get_revisions(self): """Get the revisions that need to be loaded""" missing_revs = set(self.storage.revision_missing( sorted(self.get_revision_ids()))) for oid in missing_revs: yield converters.dulwich_commit_to_revision( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log) def has_releases(self): """Checks whether we need to load releases""" return bool(self.type_to_ids[b'tag']) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tag']) def get_releases(self): """Get the releases that need to be loaded""" missing_rels = set(self.storage.release_missing( sorted(self.get_release_ids()))) for oid in missing_rels: yield converters.dulwich_tag_to_release( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log) 
def has_occurrences(self): """Checks whether we need to load occurrences""" return True def get_occurrences(self): """Get the occurrences that need to be loaded""" origin_id = self.origin_id visit = self.visit ref_objs = ((refs, target, self.get_object(target)) for refs, target in self.repo.refs.as_dict().items() if self.get_object(target)) for ref, target, obj in ref_objs: target_type_name = obj.type_name target_type = converters.DULWICH_TYPES[target_type_name] yield { 'branch': ref, 'origin': origin_id, 'target': hashutil.bytehex_to_hash(target), 'target_type': target_type, 'visit': visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history for the current loader""" return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), 'occurrences': len(self.repo.refs.allkeys()), } def save_data(self): """We already have the data locally, no need to save it""" pass def eventful(self): """Whether the load was eventful""" return True class GitLoaderFromArchive(GitLoader): """Load a git repository from an archive. """ def project_name_from_archive(self, archive_path): """Compute the project name from the archive's path. """ return os.path.basename(os.path.dirname(archive_path)) def prepare(self, origin_url, archive_path, visit_date): """1. Uncompress the archive in temporary location. 2. Prepare as the GitLoader does 3. Load as GitLoader does """ project_name = self.project_name_from_archive(archive_path) self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( project_name, archive_path) self.log.info('Project %s - Uncompressing archive %s at %s' % ( origin_url, os.path.basename(archive_path), self.repo_path)) super().prepare(origin_url, self.repo_path, visit_date) def cleanup(self): """Cleanup the temporary location (if it exists). """ if self.temp_dir and os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) self.log.info('Project %s - Done injecting %s' % ( self.origin_url, self.repo_path)) if __name__ == '__main__': import logging import sys logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) loader = GitLoader() origin_url = sys.argv[1] directory = sys.argv[2] visit_date = datetime.datetime.now(tz=datetime.timezone.utc) print(loader.load(origin_url, directory, visit_date)) diff --git a/swh/loader/git/updater.py b/swh/loader/git/updater.py index bb02b54..2782e88 100644 --- a/swh/loader/git/updater.py +++ b/swh/loader/git/updater.py @@ -1,480 +1,481 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO import datetime import logging import os import pickle import sys from collections import defaultdict import dulwich.client from dulwich.object_store import ObjectStoreGraphWalker from dulwich.pack import PackData, PackInflater from urllib.parse import urlparse from swh.model import hashutil - -from . import base, converters +from swh.loader.core.loader import SWHStatelessLoader +from . 
import converters class SWHRepoRepresentation: """Repository representation for a Software Heritage origin.""" def __init__(self, storage, origin_id, occurrences=None): self.storage = storage self._parents_cache = {} self._type_cache = {} if origin_id: self.heads = set(self._cache_heads(origin_id, occurrences)) else: self.heads = set() def _fill_parents_cache(self, commits): """When querying for a commit's parents, we fill the cache to a depth of 1000 commits.""" root_revs = self._encode_for_storage(commits) for rev, parents in self.storage.revision_shortlog(root_revs, 1000): rev_id = hashutil.hash_to_bytehex(rev) if rev_id not in self._parents_cache: self._parents_cache[rev_id] = [ hashutil.hash_to_bytehex(parent) for parent in parents ] for rev in commits: if rev not in self._parents_cache: self._parents_cache[rev] = [] def _cache_heads(self, origin_id, occurrences): """Return all the known head commits for `origin_id`""" if not occurrences: occurrences = self.storage.occurrence_get(origin_id) return self._decode_from_storage( occurrence['target'] for occurrence in occurrences ) def get_parents(self, commit): """get the parent commits for `commit`""" # Prime the parents cache if not self._parents_cache and self.heads: self._fill_parents_cache(self.heads) if commit not in self._parents_cache: self._fill_parents_cache([commit]) return self._parents_cache[commit] def get_heads(self): return self.heads @staticmethod def _encode_for_storage(objects): return [hashutil.bytehex_to_hash(object) for object in objects] @staticmethod def _decode_from_storage(objects): return set(hashutil.hash_to_bytehex(object) for object in objects) def graph_walker(self): return ObjectStoreGraphWalker(self.get_heads(), self.get_parents) @staticmethod def filter_unwanted_refs(refs): """Filter the unwanted references from refs""" ret = {} for ref, val in refs.items(): if ref.endswith(b'^{}'): # Peeled refs make the git protocol explode continue elif ref.startswith(b'refs/pull/') and ref.endswith(b'/merge'): # We filter-out auto-merged GitHub pull requests continue else: ret[ref] = val return ret def determine_wants(self, refs): """Filter the remote references to figure out which ones Software Heritage needs. """ if not refs: return [] # Find what objects Software Heritage has refs = self.find_remote_ref_types_in_swh(refs) # Cache the objects found in swh as existing heads for target in refs.values(): if target['target_type'] is not None: self.heads.add(target['target']) ret = set() for target in self.filter_unwanted_refs(refs).values(): if target['target_type'] is None: # The target doesn't exist in Software Heritage, let's retrieve # it. ret.add(target['target']) return list(ret) def get_stored_objects(self, objects): return self.storage.object_find_by_sha1_git( self._encode_for_storage(objects)) def find_remote_ref_types_in_swh(self, remote_refs): """Parse the remote refs information and list the objects that exist in Software Heritage. 
""" all_objs = set(remote_refs.values()) - set(self._type_cache) type_by_id = {} for id, objs in self.get_stored_objects(all_objs).items(): id = hashutil.hash_to_bytehex(id) if objs: type_by_id[id] = objs[0]['type'] self._type_cache.update(type_by_id) ret = {} for ref, id in remote_refs.items(): ret[ref] = { 'target': id, 'target_type': self._type_cache.get(id), } return ret -class BulkUpdater(base.BaseLoader): +class BulkUpdater(SWHStatelessLoader): """A bulk loader for a git repository""" CONFIG_BASE_FILENAME = 'loader/git-updater' ADDITIONAL_CONFIG = { 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), } - def __init__(self, repo_representation=SWHRepoRepresentation): + def __init__(self, repo_representation=SWHRepoRepresentation, config=None): """Initialize the bulk updater. Args: repo_representation: swh's repository representation which is in charge of filtering between known and remote data. """ - super().__init__() + super().__init__(logging_class='swh.loader.git.BulkLoader', + config=config) self.repo_representation = repo_representation def fetch_pack_from_origin(self, origin_url, base_origin_id, base_occurrences, do_activity): """Fetch a pack from the origin""" pack_buffer = BytesIO() base_repo = self.repo_representation(self.storage, base_origin_id, base_occurrences) parsed_uri = urlparse(origin_url) path = parsed_uri.path if not path.endswith('.git'): path += '.git' client = dulwich.client.TCPGitClient(parsed_uri.netloc, thin_packs=False) size_limit = self.config['pack_size_bytes'] def do_pack(data, pack_buffer=pack_buffer, limit=size_limit, origin_url=origin_url): cur_size = pack_buffer.tell() would_write = len(data) if cur_size + would_write > limit: raise IOError('Pack file too big for repository %s, ' 'limit is %d bytes, current size is %d, ' 'would write %d' % (origin_url, limit, cur_size, would_write)) pack_buffer.write(data) remote_refs = client.fetch_pack(path.encode('ascii'), base_repo.determine_wants, base_repo.graph_walker(), do_pack, progress=do_activity) if remote_refs: local_refs = base_repo.find_remote_ref_types_in_swh(remote_refs) else: local_refs = remote_refs = {} pack_buffer.flush() pack_size = pack_buffer.tell() pack_buffer.seek(0) return { 'remote_refs': base_repo.filter_unwanted_refs(remote_refs), 'local_refs': local_refs, 'pack_buffer': pack_buffer, 'pack_size': pack_size, } def list_pack(self, pack_data, pack_size): id_to_type = {} type_to_ids = defaultdict(set) inflater = self.get_inflater() for obj in inflater: type, id = obj.type_name, obj.id id_to_type[id] = type type_to_ids[type].add(id) return id_to_type, type_to_ids def prepare(self, origin_url, base_url=None): self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) origin = converters.origin_url_to_origin(origin_url) base_origin = converters.origin_url_to_origin(base_url) base_occurrences = [] base_origin_id = origin_id = None db_origin = self.storage.origin_get(origin) if db_origin: base_origin_id = origin_id = db_origin['id'] if origin_id: base_occurrences = self.storage.occurrence_get(origin_id) if base_url and not base_occurrences: base_origin = self.storage.origin_get(base_origin) if base_origin: base_origin_id = base_origin['id'] base_occurrences = self.storage.occurrence_get(base_origin_id) self.base_occurrences = list(sorted(base_occurrences, key=lambda occ: occ['branch'])) self.base_origin_id = base_origin_id self.origin = origin def get_origin(self): return self.origin def fetch_data(self): def do_progress(msg): sys.stderr.buffer.write(msg) sys.stderr.flush() fetch_info = 
self.fetch_pack_from_origin( self.origin['url'], self.base_origin_id, self.base_occurrences, do_progress) self.pack_buffer = fetch_info['pack_buffer'] self.pack_size = fetch_info['pack_size'] self.remote_refs = fetch_info['remote_refs'] self.local_refs = fetch_info['local_refs'] origin_url = self.origin['url'] self.log.info('Listed %d refs for repo %s' % ( len(self.remote_refs), origin_url), extra={ 'swh_type': 'git_repo_list_refs', 'swh_repo': origin_url, 'swh_num_refs': len(self.remote_refs), }) # We want to load the repository, walk all the objects id_to_type, type_to_ids = self.list_pack(self.pack_buffer, self.pack_size) self.id_to_type = id_to_type self.type_to_ids = type_to_ids def save_data(self): """Store a pack for archival""" write_size = 8192 pack_dir = self.get_save_data_path() pack_name = "%s.pack" % self.visit_date.isoformat() refs_name = "%s.refs" % self.visit_date.isoformat() with open(os.path.join(pack_dir, pack_name), 'xb') as f: while True: r = self.pack_buffer.read(write_size) if not r: break f.write(r) self.pack_buffer.seek(0) with open(os.path.join(pack_dir, refs_name), 'xb') as f: pickle.dump(self.remote_refs, f) def get_inflater(self): """Reset the pack buffer and get an object inflater from it""" self.pack_buffer.seek(0) return PackInflater.for_pack_data( PackData.from_file(self.pack_buffer, self.pack_size)) def has_contents(self): return bool(self.type_to_ids[b'blob']) def get_content_ids(self): """Get the content identifiers from the git repository""" for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue yield converters.dulwich_blob_to_content_id(raw_obj) def get_contents(self): """Format the blobs from the git repository as swh contents""" max_content_size = self.config['content_size_limit'] missing_contents = set(self.storage.content_missing( self.get_content_ids(), 'sha1_git')) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue if raw_obj.sha().digest() not in missing_contents: continue yield converters.dulwich_blob_to_content( raw_obj, log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): return bool(self.type_to_ids[b'tree']) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tree']) def get_directories(self): """Format the trees as swh directories""" missing_dirs = set(self.storage.directory_missing( sorted(self.get_directory_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tree': continue if raw_obj.sha().digest() not in missing_dirs: continue yield converters.dulwich_tree_to_directory(raw_obj, log=self.log) def has_revisions(self): return bool(self.type_to_ids[b'commit']) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'commit']) def get_revisions(self): """Format commits as swh revisions""" missing_revs = set(self.storage.revision_missing( sorted(self.get_revision_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'commit': continue if raw_obj.sha().digest() not in missing_revs: continue yield converters.dulwich_commit_to_revision(raw_obj, log=self.log) def has_releases(self): return bool(self.type_to_ids[b'tag']) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tag']) def 
get_releases(self): """Retrieve all the release objects from the git repository""" missing_rels = set(self.storage.release_missing( sorted(self.get_release_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tag': continue if raw_obj.sha().digest() not in missing_rels: continue yield converters.dulwich_tag_to_release(raw_obj, log=self.log) def has_occurrences(self): return bool(self.remote_refs) def get_occurrences(self): origin_id = self.origin_id visit = self.visit ret = [] for ref in self.remote_refs: ret_ref = self.local_refs[ref].copy() ret_ref.update({ 'branch': ref, 'origin': origin_id, 'visit': visit, }) if not ret_ref['target_type']: target_type = self.id_to_type[ret_ref['target']] ret_ref['target_type'] = converters.DULWICH_TYPES[target_type] ret_ref['target'] = hashutil.bytehex_to_hash(ret_ref['target']) ret.append(ret_ref) return ret def get_fetch_history_result(self): return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), 'occurrences': len(self.remote_refs), } def eventful(self): """The load was eventful if the current occurrences are different to the ones we retrieved at the beginning of the run""" current_occurrences = list(sorted( self.storage.occurrence_get(self.origin_id), key=lambda occ: occ['branch'], )) return self.base_occurrences != current_occurrences if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) bulkupdater = BulkUpdater() origin_url = sys.argv[1] base_url = origin_url if len(sys.argv) > 2: base_url = sys.argv[2] print(bulkupdater.load(origin_url, base_url))
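
For reference, both entry points can be driven directly, mirroring the __main__ blocks in the patch. A minimal sketch, assuming a storage service reachable through the default loader configuration; the repository URLs and the local clone path are made up:

    import datetime

    from swh.loader.git.loader import GitLoader
    from swh.loader.git.updater import BulkUpdater

    visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

    # Directory loader: origin URL, path of a local clone, visit date.
    print(GitLoader().load('git://example.org/repo.git', '/tmp/repo',
                           visit_date))

    # Remote updater: origin URL, plus an optional base origin whose known
    # occurrences seed the graph walker (defaults to the origin URL itself).
    print(BulkUpdater().load('git://example.org/repo.git',
                             'git://example.org/repo-fork.git'))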
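
GitLoaderFromArchive follows the same calling convention, except that prepare() takes the path of an archive to uncompress instead of a checkout directory, and the project name is derived from the archive's parent directory. A sketch under that assumption, with a hypothetical archive path:

    import datetime

    from swh.loader.git.loader import GitLoaderFromArchive

    GitLoaderFromArchive().load(
        'git://example.org/project.git',         # origin URL
        '/srv/archives/project/project.tar.gz',  # uncompressed to a temp dir
        datetime.datetime.now(tz=datetime.timezone.utc))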
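
On the updater side, SWHRepoRepresentation.filter_unwanted_refs() drops peeled refs (which make the git protocol explode) and GitHub's auto-merged pull-request refs before deciding what to fetch. A small worked example of that filtering, using made-up object ids:

    from swh.loader.git.updater import SWHRepoRepresentation

    refs = {
        b'refs/heads/master':  b'aa' * 20,
        b'refs/tags/v1.0':     b'bb' * 20,
        b'refs/tags/v1.0^{}':  b'cc' * 20,   # peeled tag: filtered out
        b'refs/pull/42/merge': b'dd' * 20,   # GitHub auto-merge: filtered out
        b'refs/pull/42/head':  b'ee' * 20,   # pull request head: kept
    }

    kept = SWHRepoRepresentation.filter_unwanted_refs(refs)
    assert set(kept) == {b'refs/heads/master', b'refs/tags/v1.0',
                         b'refs/pull/42/head'}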