diff --git a/swh/storage/archiver/director.py b/swh/storage/archiver/director.py
index a9f630f7..e0101e0a 100644
--- a/swh/storage/archiver/director.py
+++ b/swh/storage/archiver/director.py
@@ -1,287 +1,296 @@
 # Copyright (C) 2015-2016  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 import click
 import time
 
 from swh.core import hashutil, config
 from swh.objstorage import PathSlicingObjStorage
 from swh.objstorage.api.client import RemoteObjStorage
 from swh.scheduler.celery_backend.config import app
 
 from . import tasks  # NOQA
 from .storage import ArchiverStorage
 
 
 DEFAULT_CONFIG = {
     'objstorage_type': ('str', 'local_storage'),
     'objstorage_path': ('str', '/tmp/swh-storage/objects'),
     'objstorage_slicing': ('str', '0:2/2:4/4:6'),
     'objstorage_url': ('str', 'http://localhost:5003/'),
 
     'batch_max_size': ('int', 50),
     'archival_max_age': ('int', 3600),
     'retention_policy': ('int', 2),
     'asynchronous': ('bool', True),
 
     'dbconn': ('str', 'dbname=softwareheritage-archiver-dev user=guest')
 }
 
 task_name = 'swh.storage.archiver.tasks.SWHArchiverTask'
 
 logger = logging.getLogger()
 
 
 class ArchiverDirector():
     """Process the files in order to know which one is needed as backup.
 
     The archiver director processes the files in the local storage in order
     to know which one needs archival and it delegates this task to
     archiver workers.
     Attributes:
         master_objstorage: the local storage of the master server.
         master_objstorage_args (dict): arguments of the master objstorage
             initialization.
 
         archiver_storage: a wrapper for archiver db operations.
         db_conn_archiver: Either a libpq connection string,
                 or a psycopg2 connection for the archiver db.
 
         slave_objstorages: Iterable of remote obj storages to the slaves
             servers used for backup.
         config: Archiver_configuration. A dictionary that must contain
                 the following keys:
 
                 objstorage_type (str): type of objstorage used (local_storage
                     or remote_storage).
                     If the storage is local, the arguments keys must be present
                         objstorage_path (str): master's objstorage path
                         objstorage_slicing (str): masters's objstorage slicing
                     Otherwise, if it's a remote objstorage, the keys must be:
                         objstorage_url (str): url of the remote objstorage
 
                 batch_max_size (int): The number of content items that can be
                     given to the same archiver worker.
                 archival_max_age (int): Delay given to the worker to copy all
                     the files in a given batch.
                 retention_policy (int): Required number of copies for the
                     content to be considered safe.
                 asynchronous (boolean): Indicate whenever the archival should
                     run in asynchronous mode or not.
     """
 
     def __init__(self, db_conn_archiver, config):
         """ Constructor of the archiver director.
 
         Args:
             db_conn_archiver: Either a libpq connection string,
                 or a psycopg2 connection for the archiver db.
             config: Archiver_configuration. A dictionary that must contain
                 the following keys:
 
                 objstorage_type (str): type of objstorage used
                     (local_objstorage or remote_objstorage).
                     If the storage is local, the arguments keys must be present
                         objstorage_path (str): master's objstorage path
                         objstorage_slicing (str): masters's objstorage slicing
                     Otherwise, if it's a remote objstorage, the keys must be:
                         objstorage_url (str): url of the remote objstorage
 
                 batch_max_size (int): The number of content items that can be
                     given to the same archiver worker.
                 archival_max_age (int): Delay given to the worker to copy all
                     the files in a given batch.
                 retention_policy (int): Required number of copies for the
                     content to be considered safe.
                 asynchronous (boolean): Indicate whenever the archival should
                     run in asynchronous mode or not.
         """
         # Get the slave storages
         self.db_conn_archiver = db_conn_archiver
         self.archiver_storage = ArchiverStorage(db_conn_archiver)
         self.slave_objstorages = {
             id: url
             for id, url
             in self.archiver_storage.archive_ls()
         }
         # Check that there is enough backup servers for the retention policy
         if config['retention_policy'] > len(self.slave_objstorages) + 1:
             raise ValueError(
                 "Can't have a retention policy of %d with %d backup servers"
                 % (config['retention_policy'], len(self.slave_objstorages))
             )
 
         # Get the master storage that contains content to be archived
         if config['objstorage_type'] == 'local_objstorage':
             master_objstorage_args = {
                 'root': config['objstorage_path'],
                 'slicing': config['objstorage_slicing']
             }
             master_objstorage = PathSlicingObjStorage(
                 **master_objstorage_args
             )
         elif config['objstorage_type'] == 'remote_objstorage':
             master_objstorage_args = {'base_url': config['objstorage_url']}
             master_objstorage = RemoteObjStorage(**master_objstorage_args)
         else:
             raise ValueError(
                 'Unknow objstorage class `%s`' % config['objstorage_type']
             )
         self.master_objstorage = master_objstorage
         self.master_objstorage_args = master_objstorage_args
 
         # Keep the full configuration
         self.config = config
 
     def run(self):
         """ Run the archiver director.
 
         The archiver director will check all the contents of the archiver
         database and do the required backup jobs.
         """
         if self.config['asynchronous']:
             run_fn = self.run_async_worker
         else:
             run_fn = self.run_sync_worker
 
         for batch in self.get_unarchived_content():
             run_fn(batch)
 
     def run_async_worker(self, batch):
         """ Produce a worker that will be added to the task queue.
         """
         task = app.tasks[task_name]
         task.delay(batch,
                    archiver_args=self.db_conn_archiver,
                    master_objstorage_args=self.master_objstorage_args,
                    slave_objstorages=self.slave_objstorages,
                    config=self.config)
 
     def run_sync_worker(self, batch):
         """ Run synchronously a worker on the given batch.
         """
         task = app.tasks[task_name]
         task(batch,
              archiver_args=self.db_conn_archiver,
              master_objstorage_args=self.master_objstorage_args,
              slave_objstorages=self.slave_objstorages,
              config=self.config)
 
     def get_unarchived_content(self):
         """ Get contents that need to be archived.
 
         Yields:
             A batch of contents. Batches are dictionaries which associates
             a content id to the data about servers that contains it or not.
 
             {'id1':
                 {'present': [('slave1', 'slave1_url')],
                  'missing': [('slave2', 'slave2_url'),
                              ('slave3', 'slave3_url')]
                 },
              'id2':
                 {'present': [],
                  'missing': [
                      ('slave1', 'slave1_url'),
                      ('slave2', 'slave2_url'),
                      ('slave3', 'slave3_url')
                  ]}
             }
 
             Where keys (idX) are sha1 of the content and (slaveX, slaveX_url)
             are ids and urls of the storage slaves.
 
             At least all the content that don't have enough copies on the
             backups servers are distributed into these batches.
         """
         contents = {}
         # Get the archives
-        archives = [(k, v) for k, v in self.archiver_storage.archive_ls()]
+        archives = dict(self.archiver_storage.archive_ls())
         # Get all the contents referenced into the archiver tables
-        for content_id, copies in self.archiver_storage.content_archive_get():
-            content_id = r'\x' + hashutil.hash_to_hex(content_id)
-            data = {'present': [], 'missing': []}
-            # For each archive server, check the current content status
-            for archive_id, archive_url in archives:
-                if archive_id not in copies:
-                    data['missing'].append((archive_id, archive_url))
-                else:
-                    copy = copies[archive_id]
-                    vstatus = self.get_virtual_status(copy['status'],
-                                                      copy['mtime'])
-                    data[vstatus].append((archive_id, archive_url))
-
-            contents[content_id] = data
-
-            if len(contents) >= self.config['batch_max_size']:
-                yield contents
-                contents = {}
+        known = set(archives)  # hoisted: identical for every content
+        last_object = b''
+        while True:
+            # Fetch the next page, resuming after the last content id seen.
+            page = list(
+                self.archiver_storage.content_archive_get_copies(last_object))
+            if not page:
+                break
+
+            for content_id, present, ongoing in page:
+                last_object = content_id
+                # Skip copies on archives that archive_ls() did not list:
+                # the url lookup below would raise KeyError for them.
+                present = known & set(present)
+                data = {'present': present,
+                        'missing': known - present - set(ongoing)}
+                for archive_id in known & set(ongoing):
+                    status = self.get_virtual_status('ongoing',
+                                                     ongoing[archive_id])
+                    data[status].add(archive_id)
+
+                contents[r'\x%s' % hashutil.hash_to_hex(content_id)] = {
+                    k: [(a, archives[a]) for a in v] for k, v in data.items()}
+
+                if len(contents) >= self.config['batch_max_size']:
+                    yield contents
+                    contents = {}
 
         if len(contents) > 0:
             yield contents
 
     def get_virtual_status(self, status, mtime):
         """ Compute the virtual presence of a content.
 
         If the status is ongoing but the time is not elasped, the archiver
         consider it will be present in the futur, and so consider it as
         present.
         However, if the time is elasped, the copy may have failed, so consider
         the content as missing.
 
         Arguments:
             status (string): One of ('present', 'missing', 'ongoing'). The
                 status of the content.
             mtime (datetime): Time at which the content have been updated for
                 the last time.
 
         Returns:
             The virtual status of the studied content, which is 'present' or
             'missing'.
 
         Raises:
             ValueError: if the status is not one 'present', 'missing'
                 or 'ongoing'
         """
         if status in ('present', 'missing'):
             return status
 
         # If the status is 'ongoing' but there is still time, another worker
         # may still be on the task.
         if status == 'ongoing':
             elapsed = int(time.time()) - mtime
             if elapsed <= self.config['archival_max_age']:
                 return 'present'
             else:
                 return 'missing'
         else:
             raise ValueError("status must be either 'present', 'missing' "
                              "or 'ongoing'")
 
 
 @click.command()
 @click.argument('config-path', required=1)
 @click.option('--dbconn', default=DEFAULT_CONFIG['dbconn'][1],
               help="Connection string for the archiver database")
 @click.option('--async/--sync', default=DEFAULT_CONFIG['asynchronous'][1],
               help="Indicates if the archiver should run asynchronously")
 def launch(config_path, dbconn, async):
     # The configuration have following priority :
     # command line > file config > default config
     cl_config = {
         'dbconn': dbconn,
         'asynchronous': async
     }
     conf = config.read(config_path, DEFAULT_CONFIG)
     conf.update(cl_config)
     # Create connection data and run the archiver.
     archiver = ArchiverDirector(conf['dbconn'], conf)
     logger.info("Starting an archival at", time.time())
     archiver.run()
 
 
 if __name__ == '__main__':
     launch()
diff --git a/swh/storage/archiver/storage.py b/swh/storage/archiver/storage.py
index eb5f031c..9a62f67a 100644
--- a/swh/storage/archiver/storage.py
+++ b/swh/storage/archiver/storage.py
@@ -1,73 +1,93 @@
 # Copyright (C) 2016  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import psycopg2
 
 from ..common import db_transaction_generator, db_transaction
 from ..db import Db
 from ..exc import StorageDBError
 
 
 class ArchiverStorage():
     """SWH Archiver storage proxy, encompassing DB
 
     """
     def __init__(self, db_conn):
         """
         Args:
             db_conn: either a libpq connection string, or a psycopg2 connection
 
         """
         try:
             if isinstance(db_conn, psycopg2.extensions.connection):
                 self.db = Db(db_conn)
             else:
                 self.db = Db.connect(db_conn)
         except psycopg2.OperationalError as e:
             raise StorageDBError(e)
 
     @db_transaction_generator
     def archive_ls(self, cur=None):
         """ Get all the archives registered on the server.
 
         Yields:
             a tuple (server_id, server_url) for each archive server.
         """
         yield from self.db.archive_ls(cur)
 
     @db_transaction_generator
     def content_archive_get(self, content=None, cur=None):
         """ Get the archival status of a content in a specific server.
 
         Retreive from the database the archival status of the given content
         in the given archive server.
 
         Args:
             content: the sha1 of the content. May be None for any id.
             archive: the database id of the server we're looking into
                 may be None for any server.
 
         Yields:
             A tuple (content_id, server_id, archival status, mtime, tzinfo).
         """
         yield from self.db.content_archive_get(content, cur)
 
+    @db_transaction_generator
+    def content_archive_get_copies(self, previous_content=None, limit=1000,
+                                   cur=None):
+        """Yield the copies of `limit` contents after `previous_content`.
+
+        Args:
+            previous_content: sha1 of the last content already retrieved;
+                None starts from the beginning.
+            limit: maximum number of contents to return; None returns
+                everything (slow).
+            cur: database cursor, passed through to the db layer.
+
+        Yields:
+            (content_id, present_copies, ongoing_copies) tuples, where
+            ongoing_copies is a dict mapping copy to mtime.
+        """
+        copies = self.db.content_archive_get_copies(
+            previous_content, limit, cur)
+        yield from copies
+
     @db_transaction
     def content_archive_update(self, content_id, archive_id,
                                new_status=None, cur=None):
         """ Update the status of an archive content and set its mtime to
 
         Change the mtime of an archived content for the given archive and set
         it's mtime to the current time.
 
         Args:
             content_id (str): content sha1
             archive_id (str): name of the archive
             new_status (str): one of 'missing', 'present' or 'ongoing'.
                 this status will replace the previous one. If not given,
                 the function only change the mtime of the content for the
                 given archive.
         """
         self.db.content_archive_update(content_id, archive_id, new_status, cur)
diff --git a/swh/storage/db.py b/swh/storage/db.py
index 03614346..e2cb01ce 100644
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1,719 +1,769 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import datetime
 import functools
 import json
 import psycopg2
 import psycopg2.extras
 import tempfile
 import time
 
 from contextlib import contextmanager
 
 TMP_CONTENT_TABLE = 'tmp_content'
 
 
 psycopg2.extras.register_uuid()
 
 
 def stored_procedure(stored_proc):
     """decorator to execute remote stored procedure, specified as argument
 
     Generally, the body of the decorated function should be empty. If it is
     not, the stored procedure will be executed first; the function body then.
 
     """
     def wrap(meth):
         @functools.wraps(meth)
         def _meth(self, *args, **kwargs):
             cur = kwargs.get('cur', None)
             self._cursor(cur).execute('SELECT %s()' % stored_proc)
             meth(self, *args, **kwargs)
         return _meth
     return wrap
 
 
 def jsonize(value):
     """Convert a value to a psycopg2 JSON object if necessary"""
     if isinstance(value, dict):
         return psycopg2.extras.Json(value)
 
     return value
 
 
 def entry_to_bytes(entry):
     """Convert an entry coming from the database to bytes"""
     if isinstance(entry, memoryview):
         return entry.tobytes()
     if isinstance(entry, list):
         return [entry_to_bytes(value) for value in entry]
     return entry
 
 
 def line_to_bytes(line):
     """Convert a line coming from the database to bytes"""
     if isinstance(line, dict):
         return {k: entry_to_bytes(v) for k, v in line.items()}
     return line.__class__(entry_to_bytes(entry) for entry in line)
 
 
 def cursor_to_bytes(cursor):
     """Yield all the data from a cursor as bytes"""
     yield from (line_to_bytes(line) for line in cursor)
 
 
 class Db:
     """Proxy to the SWH DB, with wrappers around stored procedures
 
     """
 
     @classmethod
     def connect(cls, *args, **kwargs):
         """factory method to create a DB proxy
 
         Accepts all arguments of psycopg2.connect; only some specific
         possibilities are reported below.
 
         Args:
             connstring: libpq2 connection string
 
         """
         conn = psycopg2.connect(*args, **kwargs)
         return cls(conn)
 
     def _cursor(self, cur_arg):
         """get a cursor: from cur_arg if given, or a fresh one otherwise
 
         meant to avoid boilerplate if/then/else in methods that proxy stored
         procedures
 
         """
         if cur_arg is not None:
             return cur_arg
         # elif self.cur is not None:
         #     return self.cur
         else:
             return self.conn.cursor()
 
     def __init__(self, conn):
         """create a DB proxy
 
         Args:
             conn: psycopg2 connection to the SWH DB
 
         """
         self.conn = conn
 
     @contextmanager
     def transaction(self):
         """context manager to execute within a DB transaction
 
         Yields:
             a psycopg2 cursor
 
         """
         with self.conn.cursor() as cur:
             try:
                 yield cur
                 self.conn.commit()
             except:
                 if not self.conn.closed:
                     self.conn.rollback()
                 raise
 
     def mktemp(self, tblname, cur=None):
         self._cursor(cur).execute('SELECT swh_mktemp(%s)', (tblname,))
 
     def mktemp_dir_entry(self, entry_type, cur=None):
         self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)',
                                   (('directory_entry_%s' % entry_type),))
 
     @stored_procedure('swh_mktemp_revision')
     def mktemp_revision(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_release')
     def mktemp_release(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_occurrence_history')
     def mktemp_occurrence_history(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_entity_lister')
     def mktemp_entity_lister(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_entity_history')
     def mktemp_entity_history(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_bytea')
     def mktemp_bytea(self, cur=None): pass
 
     def copy_to(self, items, tblname, columns, cur=None, item_cb=None):
         def escape(data):
             if data is None:
                 return ''
             if isinstance(data, bytes):
                 return '\\x%s' % binascii.hexlify(data).decode('ascii')
             elif isinstance(data, str):
                 return '"%s"' % data.replace('"', '""')
             elif isinstance(data, datetime.datetime):
                 # We escape twice to make sure the string generated by
                 # isoformat gets escaped
                 return escape(data.isoformat())
             elif isinstance(data, dict):
                 return escape(json.dumps(data))
             elif isinstance(data, list):
                 return escape("{%s}" % ','.join(escape(d) for d in data))
             elif isinstance(data, psycopg2.extras.Range):
                 # We escape twice here too, so that we make sure
                 # everything gets passed to copy properly
                 return escape(
                     '%s%s,%s%s' % (
                         '[' if data.lower_inc else '(',
                         '-infinity' if data.lower_inf else escape(data.lower),
                         'infinity' if data.upper_inf else escape(data.upper),
                         ']' if data.upper_inc else ')',
                     )
                 )
             else:
                 # We don't escape here to make sure we pass literals properly
                 return str(data)
         with tempfile.TemporaryFile('w+') as f:
             for d in items:
                 if item_cb is not None:
                     item_cb(d)
                 line = [escape(d.get(k)) for k in columns]
                 f.write(','.join(line))
                 f.write('\n')
             f.seek(0)
             self._cursor(cur).copy_expert('COPY %s (%s) FROM STDIN CSV' % (
                 tblname, ', '.join(columns)), f)
 
     @stored_procedure('swh_content_add')
     def content_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_directory_add')
     def directory_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_skipped_content_add')
     def skipped_content_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_revision_add')
     def revision_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_release_add')
     def release_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_occurrence_history_add')
     def occurrence_history_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_entity_history_add')
     def entity_history_add_from_temp(self, cur=None): pass
 
     def store_tmp_bytea(self, ids, cur=None):
         """Store the given identifiers in a new tmp_bytea table"""
         cur = self._cursor(cur)
 
         self.mktemp_bytea(cur)
         self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea',
                      ['id'], cur)
 
     def content_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute("""SELECT sha1, sha1_git, sha256
                        FROM swh_content_missing()""")
 
         yield from cursor_to_bytes(cur)
 
     def content_missing_per_sha1_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute("""SELECT *
                        FROM swh_content_missing_per_sha1()""")
 
         yield from cursor_to_bytes(cur)
 
     def skipped_content_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute("""SELECT sha1, sha1_git, sha256
                        FROM swh_skipped_content_missing()""")
 
         yield from cursor_to_bytes(cur)
 
     def occurrence_get(self, origin_id, cur=None):
         """Retrieve latest occurrence's information by origin_id.
 
         """
         cur = self._cursor(cur)
 
         cur.execute("""SELECT origin, branch, target, target_type,
                               (select max(date) from origin_visit
                                where origin=%s) as date
                        FROM occurrence
                        WHERE origin=%s
                     """,
                     (origin_id, origin_id))
 
         yield from cursor_to_bytes(cur)
 
     def content_find(self, sha1=None, sha1_git=None, sha256=None, cur=None):
         """Find the content optionally on a combination of the following
         checksums sha1, sha1_git or sha256.
 
         Args:
             sha1: sha1 content
             git_sha1: the sha1 computed `a la git` sha1 of the content
             sha256: sha256 content
 
         Returns:
             The triplet (sha1, sha1_git, sha256) if found or None.
 
         """
         cur = self._cursor(cur)
 
         cur.execute("""SELECT sha1, sha1_git, sha256, length, ctime, status
                        FROM swh_content_find(%s, %s, %s)
                        LIMIT 1""", (sha1, sha1_git, sha256))
 
         content = line_to_bytes(cur.fetchone())
         if set(content) == {None}:
             return None
         else:
             return content
 
     def content_find_occurrence(self, sha1, cur=None):
         """Find one content's occurrence.
 
         Args:
             sha1: sha1 content
             cur: cursor to use
 
         Returns:
             One occurrence for that particular sha1
 
         """
         cur = self._cursor(cur)
 
         cur.execute("""SELECT origin_type, origin_url, branch, target, target_type, path
                        FROM swh_content_find_occurrence(%s)
                        LIMIT 1""",
                     (sha1, ))
 
         return line_to_bytes(cur.fetchone())
 
     def directory_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('''SELECT id, file_entries, dir_entries, rev_entries
                        FROM swh_directory_get()''')
         yield from cursor_to_bytes(cur)
 
     def directory_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_directory_missing()')
         yield from cursor_to_bytes(cur)
 
     def directory_walk_one(self, directory, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_directory_walk_one(%s)', (directory,))
         yield from cursor_to_bytes(cur)
 
     def directory_walk(self, directory, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_directory_walk(%s)', (directory,))
         yield from cursor_to_bytes(cur)
 
     def revision_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute('SELECT id FROM swh_revision_missing() as r(id)')
 
         yield from cursor_to_bytes(cur)
 
     revision_add_cols = [
         'id', 'date', 'date_offset', 'date_neg_utc_offset', 'committer_date',
         'committer_date_offset', 'committer_date_neg_utc_offset', 'type',
         'directory', 'message', 'author_fullname', 'author_name',
         'author_email', 'committer_fullname', 'committer_name',
         'committer_email', 'metadata', 'synthetic',
     ]
 
     revision_get_cols = revision_add_cols + [
         'author_id', 'committer_id', 'parents']
 
     origin_visit_get_cols = [
         'origin', 'visit', 'date'
     ]
 
     def origin_visit_get(self, origin_id, cur=None):
         """Retrieve occurrence's history information by origin_id.
 
         Args:
             origin_id: The occurrence's origin
 
         Yields:
             The occurrence's history visits
 
         """
         cur = self._cursor(cur)
 
         cur.execute(
             'SELECT origin, visit, date FROM origin_visit where origin=%s',
             (origin_id, ))
 
         yield from cursor_to_bytes(cur)
 
     def revision_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         query = 'SELECT %s FROM swh_revision_get()' % (
             ', '.join(self.revision_get_cols))
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def revision_log(self, root_revisions, limit=None, cur=None):
         cur = self._cursor(cur)
 
         query = """SELECT %s
                    FROM swh_revision_log(%%s, %%s)
                 """ % ', '.join(self.revision_get_cols)
 
         cur.execute(query, (root_revisions, limit))
         yield from cursor_to_bytes(cur)
 
     revision_shortlog_cols = ['id', 'parents']
 
     def revision_shortlog(self, root_revisions, limit=None, cur=None):
         cur = self._cursor(cur)
 
         query = """SELECT %s
                    FROM swh_revision_list(%%s, %%s)
                 """ % ', '.join(self.revision_shortlog_cols)
 
         cur.execute(query, (root_revisions, limit))
         yield from cursor_to_bytes(cur)
 
     def release_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT id FROM swh_release_missing() as r(id)')
         yield from cursor_to_bytes(cur)
 
     object_find_by_sha1_git_cols = ['sha1_git', 'type', 'id', 'object_id']
 
     def object_find_by_sha1_git(self, ids, cur=None):
         cur = self._cursor(cur)
 
         self.store_tmp_bytea(ids, cur)
         query = 'select %s from swh_object_find_by_sha1_git()' % (
             ', '.join(self.object_find_by_sha1_git_cols)
         )
         cur.execute(query)
 
         yield from cursor_to_bytes(cur)
 
     def stat_counters(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_stat_counters()')
         yield from cur
 
     fetch_history_cols = ['origin', 'date', 'status', 'result', 'stdout',
                           'stderr', 'duration']
 
     def create_fetch_history(self, fetch_history, cur=None):
         """Create a fetch_history entry with the data in fetch_history"""
         cur = self._cursor(cur)
         query = '''INSERT INTO fetch_history (%s)
                    VALUES (%s) RETURNING id''' % (
             ','.join(self.fetch_history_cols),
             ','.join(['%s'] * len(self.fetch_history_cols))
         )
         cur.execute(query, [fetch_history.get(col) for col in
                             self.fetch_history_cols])
 
         return cur.fetchone()[0]
 
     def get_fetch_history(self, fetch_history_id, cur=None):
         """Fetch one fetch_history row as a dict.
 
         Args:
             fetch_history_id: value matched against the ``id`` column.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             None when no row is found; otherwise a dict holding ``id``
             (the requested id) plus one key per name in
             ``fetch_history_cols``.
 
         """
         db_cursor = self._cursor(cur)
         columns = self.fetch_history_cols
 
         select = '''SELECT %s FROM fetch_history WHERE id=%%s''' % (
             ', '.join(columns))
         db_cursor.execute(select, (fetch_history_id,))
         row = db_cursor.fetchone()
 
         if not row:
             return None
 
         entry = {'id': fetch_history_id}
         for name, value in zip(columns, row):
             entry[name] = value
         return entry
 
     def update_fetch_history(self, fetch_history, cur=None):
         """Overwrite a fetch_history row with the values of a dict.
 
         Args:
             fetch_history (dict): new values keyed by the names listed in
                 ``fetch_history_cols``, plus ``id`` which selects the row.
                 Each value goes through ``jsonize``; a missing key is
                 looked up as None.
             cur: optional cursor, handed to ``self._cursor``.
 
         """
         db_cursor = self._cursor(cur)
 
         assignments = ','.join(
             '%s=%%s' % name for name in self.fetch_history_cols)
         statement = '''UPDATE fetch_history
                    SET %s
                    WHERE id=%%s''' % assignments
 
         # One value per assignment, then the id for the WHERE clause.
         values = [jsonize(fetch_history.get(name))
                   for name in self.fetch_history_cols + ['id']]
         db_cursor.execute(statement, values)
 
     # Columns common to the two entity column lists defined below.
     base_entity_cols = ['uuid', 'parent', 'name', 'type',
                         'description', 'homepage', 'active',
                         'generated', 'lister_metadata',
                         'metadata']
 
     # Common columns extended with the extra columns of each list.
     entity_cols = base_entity_cols + ['last_seen', 'last_id']
     entity_history_cols = base_entity_cols + ['id', 'validity']
 
     def origin_add(self, type, url, cur=None):
         """Insert a new origin and return the new identifier.
 
         Args:
             type: value stored in the ``type`` column of the origin.
             url: value stored in the ``url`` column of the origin.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             the ``id`` value returned by the INSERT statement.
 
         """
         # Resolve the cursor the way the sibling methods do; using ``cur``
         # directly failed whenever the default ``cur=None`` was in effect.
         cur = self._cursor(cur)
 
         insert = """INSERT INTO origin (type, url) values (%s, %s)
                     RETURNING id"""
 
         cur.execute(insert, (type, url))
         return cur.fetchone()[0]
 
     def origin_get_with(self, type, url, cur=None):
         """Look an origin up by its type and url.
 
         Args:
             type: value matched against the ``type`` column.
             url: value matched against the ``url`` column.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             the (id, type, url, lister, project) row passed through
             ``line_to_bytes``, or None when no row matches.
 
         """
         db_cursor = self._cursor(cur)
 
         lookup = """SELECT id, type, url, lister, project
                    FROM origin
                    WHERE type=%s AND url=%s"""
 
         db_cursor.execute(lookup, (type, url))
         row = db_cursor.fetchone()
         return line_to_bytes(row) if row else None
 
     def origin_get(self, id, cur=None):
         """Look an origin up by its identifier.
 
         Args:
             id: value matched against the ``id`` column.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             the (id, type, url, lister, project) row passed through
             ``line_to_bytes``, or None when no row matches.
 
         """
         db_cursor = self._cursor(cur)
         db_cursor.execute(
             "SELECT id, type, url, lister, project FROM origin WHERE id=%s",
             (id,))
 
         row = db_cursor.fetchone()
         if not row:
             return None
         return line_to_bytes(row)
 
     # Columns written by person_add(); person_get() selects the same
     # columns followed by ``id``.
     person_cols = ['fullname', 'name', 'email']
     person_get_cols = person_cols + ['id']
 
     def person_add(self, person, cur=None):
         """Insert one row in the person table.
 
         Args:
             person (dict): must hold a value for every name listed in
                 ``person_cols``; a missing key raises KeyError.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             The id returned by the INSERT for the new person.
 
         """
         db_cursor = self._cursor(cur)
         columns = self.person_cols
 
         column_list = ', '.join(columns)
         placeholders = ', '.join(['%s'] * len(columns))
         insert_person = '''\
         INSERT INTO person(%s)
         VALUES (%s)
         RETURNING id''' % (column_list, placeholders)
 
         values = [person[name] for name in columns]
         db_cursor.execute(insert_person, values)
         return db_cursor.fetchone()[0]
 
     def person_get(self, ids, cur=None):
         """Retrieve the persons identified by the list of ids.
 
         Args:
             ids: iterable of person ids; it is consumed once.
             cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             rows holding the columns of ``person_get_cols``, passed through
             ``cursor_to_bytes``. Nothing is yielded when ``ids`` is empty.
 
         """
         # Materialise the ids once: the tuple is what gets bound to the
         # ``IN %s`` placeholder.
         wanted = tuple(ids)
         if not wanted:
             # An empty tuple would render as "IN ()", which is not valid
             # SQL; no row can match anyway, so skip the query.
             return
 
         cur = self._cursor(cur)
 
         query = """SELECT %s
                    FROM person
                    WHERE id IN %%s""" % ', '.join(self.person_get_cols)
 
         cur.execute(query, (wanted,))
         yield from cursor_to_bytes(cur)
 
     # Release columns; the *_get_cols variant, used by the release_get_*
     # queries, carries the extra ``author_id`` column.
     release_add_cols = [
         'id', 'target', 'target_type', 'date', 'date_offset',
         'date_neg_utc_offset', 'name', 'comment', 'synthetic',
         'author_fullname', 'author_name', 'author_email',
     ]
     release_get_cols = release_add_cols + ['author_id']
 
     def release_get_from_temp(self, cur=None):
         """Yield the rows returned by ``swh_release_get()``.
 
         Args:
             cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             rows holding the columns of ``release_get_cols``, passed
             through ``cursor_to_bytes``.
 
         """
         db_cursor = self._cursor(cur)
         columns = ', '.join(self.release_get_cols)
         select = '''
         SELECT %s
             FROM swh_release_get()
         ''' % columns
         db_cursor.execute(select)
         yield from cursor_to_bytes(db_cursor)
 
     def release_get_by(self,
                        origin_id,
                        limit=None,
                        cur=None):
         """Yield the releases ``swh_release_get_by`` returns for an origin.
 
         Args:
             - origin_id: The origin to look for.
             - limit: value bound to the LIMIT clause of the query.
             - cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             rows holding the columns of ``release_get_cols``, passed
             through ``cursor_to_bytes``.
 
         """
         db_cursor = self._cursor(cur)
         columns = ', '.join(self.release_get_cols)
         select = """
         SELECT %s
             FROM swh_release_get_by(%%s)
             LIMIT %%s
         """ % columns
         db_cursor.execute(select, (origin_id, limit))
         yield from cursor_to_bytes(db_cursor)
 
     def revision_get_by(self,
                         origin_id,
                         branch_name,
                         datetime,
                         limit=None,
                         cur=None):
         """Yield the revisions matching an occurrence criterion.
 
         Args:
             - origin_id: The origin to look for
             - branch_name: the branch name to look for; a non-empty str
               is encoded to utf-8 bytes before being sent.
             - datetime: the lower bound of timerange to look for.
               The upper bound being now.
             - limit: limit number of results to return
             - cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             rows holding the columns of ``revision_get_cols``, passed
             through ``cursor_to_bytes``.
 
         """
         db_cursor = self._cursor(cur)
 
         branch = branch_name
         if isinstance(branch, str) and branch:
             branch = branch.encode('utf-8')
 
         columns = ', '.join(self.revision_get_cols)
         select = '''
         SELECT %s
             FROM swh_revision_get_by(%%s, %%s, %%s)
             LIMIT %%s
         ''' % columns
 
         db_cursor.execute(select, (origin_id, branch, datetime, limit))
         yield from cursor_to_bytes(db_cursor)
 
     def directory_entry_get_by_path(self, directory, paths, cur=None):
         """Retrieve a directory entry by path.
 
         Args:
             directory: first argument given to
                 ``swh_find_directory_entry_by_path``.
             paths: second argument given to
                 ``swh_find_directory_entry_by_path``.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             the (dir_id, type, target, name, perms, status, sha1, sha1_git,
             sha256) row passed through ``line_to_bytes``, or None when the
             query yields no row or a row holding only NULLs.
 
         """
         cur = self._cursor(cur)
         cur.execute("""SELECT dir_id, type, target, name, perms, status, sha1,
                        sha1_git, sha256
                        FROM swh_find_directory_entry_by_path(%s, %s)""",
                     (directory, paths))
 
         data = cur.fetchone()
         # fetchone() gives None when there is no row; guard against it
         # before inspecting the columns, as the sibling getters do.
         if data is None or set(data) == {None}:
             return None
         return line_to_bytes(data)
 
     def entity_get(self, uuid, cur=None):
         """Yield the rows ``swh_entity_get`` returns for a uuid.
 
         Per the original description, this is the entity together with
         its parent hierarchy chain.
 
         Args:
             uuid: argument given to ``swh_entity_get``.
             cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             rows holding the columns of ``entity_cols``, passed through
             ``cursor_to_bytes``.
 
         """
         db_cursor = self._cursor(cur)
         columns = ', '.join(self.entity_cols)
         select = """SELECT %s
                        FROM swh_entity_get(%%s)""" % columns
         db_cursor.execute(select, (uuid, ))
         yield from cursor_to_bytes(db_cursor)
 
     def entity_get_one(self, uuid, cur=None):
         """Fetch the single row of the entity table matching a uuid.
 
         Args:
             uuid: value matched against the ``uuid`` column.
             cur: optional cursor, handed to ``self._cursor``.
 
         Returns:
             the row holding the columns of ``entity_cols``, passed through
             ``line_to_bytes``, or None when no row matches.
 
         """
         db_cursor = self._cursor(cur)
         columns = ', '.join(self.entity_cols)
         select = """SELECT %s
                        FROM entity
                        WHERE uuid = %%s""" % columns
         db_cursor.execute(select, (uuid, ))
 
         row = db_cursor.fetchone()
         return line_to_bytes(row) if row else None
 
     def archive_ls(self, cur=None):
         """List the rows of the ``archive`` table.
 
         Args:
             cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             a tuple (server_id, server_url) for each archive server,
             passed through ``cursor_to_bytes``.
         """
         db_cursor = self._cursor(cur)
         listing = """SELECT id, url
                     FROM archive
                     """
         db_cursor.execute(listing)
         yield from cursor_to_bytes(db_cursor)
 
     def content_archive_get(self, content=None, cur=None):
         """ Get the archival status of one content, or of every content.
 
         Retrieve from the database the ``copies`` value recorded in the
         ``content_archive`` table for the given content. Without a content,
         every row of the table is returned, ordered by content_id.
 
         Args:
             content: the sha1 of the content. May be None for all contents.
             cur: optional cursor, handed to ``self._cursor``.
 
         Yields:
             A tuple (content_id, copies_json).
         """
         query = """SELECT content_id, copies
                FROM content_archive
                """
         if content is not None:
             # Bind the id as a query parameter instead of interpolating it
             # in the SQL text: the driver does the quoting, which works for
             # bytes as well as str and cannot be abused for injection.
             query += "WHERE content_id=%s"
             params = (content,)
         else:
             query += 'ORDER BY content_id'
             params = None
 
         cur = self._cursor(cur)
         cur.execute(query, params)
         yield from cursor_to_bytes(cur)
 
+    def content_archive_get_copies(self, previous_content=None, limit=1000,
+                                   cur=None):
+        """Get the list of copies for `limit` contents starting after
+           `previous_content`.
+
+        Args:
+            previous_content: sha1 of the last content retrieved. May be None
+                              to start at the beginning.
+            limit: number of contents to retrieve. Can be None to retrieve all
+                   objects (will be slow).
+            cur: optional cursor, handed to ``self._cursor``.
+
+        Yields:
+            A tuple (content_id, present_copies, ongoing_copies), where
+            ongoing_copies is a dict mapping copy to mtime.
+
+        """
+
+        # One row per content: the keys of ``copies`` whose status is
+        # 'present', the keys whose status is 'ongoing', and the mtime of
+        # each ongoing copy. Every array is ordered by key.
+        query = """SELECT content_id,
+                          array(
+                            SELECT key
+                            FROM jsonb_each(copies)
+                            WHERE value->>'status' = 'present'
+                            ORDER BY key
+                          ) AS present,
+                          array(
+                            SELECT key
+                            FROM jsonb_each(copies)
+                            WHERE value->>'status' = 'ongoing'
+                            ORDER BY key
+                          ) AS ongoing,
+                          array(
+                            SELECT value->'mtime'
+                            FROM jsonb_each(copies)
+                            WHERE value->>'status' = 'ongoing'
+                            ORDER BY key
+                          ) AS ongoing_mtime
+                   FROM content_archive
+                   WHERE content_id > %s
+                   ORDER BY content_id
+                   LIMIT %s
+        """
+
+        # The query needs a lower bound; an empty byte string stands for
+        # "start at the beginning".
+        if previous_content is None:
+            previous_content = b''
+
+        cur = self._cursor(cur)
+        cur.execute(query, (previous_content, limit))
+        # ``ongoing`` and ``ongoing_mtime`` share the same filter and
+        # ordering, so they pair up position by position.
+        for content_id, present, ongoing, mtimes in cursor_to_bytes(cur):
+            yield (content_id, present, dict(zip(ongoing, mtimes)))
+
     def content_archive_update(self, content_id, archive_id,
                                new_status=None, cur=None):
         """ Update the status of an archived content and set its mtime to now.
 
         Change the entry of the given archive in the ``copies`` column of
         the content and set its mtime to the current time.
 
         Args:
             content_id (str): content sha1
             archive_id (str): name of the archive
             new_status (str): one of 'missing', 'present' or 'ongoing'.
                 this status will replace the previous one. If not given,
                 the function only changes the mtime of the content for the
                 given archive.
             cur: optional cursor, handed to ``self._cursor``.
         """
         # Local import: nothing else in this block needs the module.
         import json
 
         now = int(time.time())
         if new_status is not None:
             # Replace the whole entry of this archive.
             path = [archive_id]
             value = json.dumps({'status': new_status, 'mtime': now})
         else:
             # Only touch the mtime field of this archive's entry.
             path = [archive_id, 'mtime']
             value = json.dumps(now)
 
         # Every value is bound as a parameter. The previous string-built
         # query lacked the content_id argument in the mtime-only case and
         # always failed there; it also left all quoting to the caller.
         query = """UPDATE content_archive
                    SET copies=jsonb_set(copies, %s, %s::jsonb)
                    WHERE content_id=%s"""
 
         cur = self._cursor(cur)
         cur.execute(query, (path, value, content_id))