diff --git a/PKG-INFO b/PKG-INFO
index a2dd0464..7c158c87 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,216 +1,218 @@
 Metadata-Version: 2.1
 Name: swh.storage
-Version: 0.10.3
+Version: 0.10.4
 Summary: Software Heritage storage manager
 Home-page: https://forge.softwareheritage.org/diffusion/DSTO/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-storage
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-storage/
 Description: swh-storage
         ===========
         
         Abstraction layer over the archive, allowing to access all stored source code
         artifacts as well as their metadata.
         
         See the
         [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html)
         for more details.
         
         ## Quick start
         
         ### Dependencies
         
         Python tests for this module include tests that cannot be run without a local
         Postgresql database, so you need the Postgresql server executable on your
         machine (no need to have a running Postgresql server). They also expect a
         cassandra server.
         
         #### Debian-like host
         
         ```
         $ sudo apt install libpq-dev postgresql-11 cassandra
         ```
         
         #### Non Debian-like host
         
-        The tests expects `/usr/sbin/cassandra` to exist.
+        The tests expect the path to `cassandra` to be either specified through the
+        environment variable `SWH_CASSANDRA_BIN` or left unspecified, in which case
+        it is looked up at `/usr/sbin/cassandra`.
         
         Optionally, you can avoid running the cassandra tests.
         
         ```
         (swh) :~/swh-storage$ tox -- -m 'not cassandra'
         ```
         
         ### Installation
         
         It is strongly recommended to use a virtualenv. In the following, we
         consider you work in a virtualenv named `swh`. See the
         [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup)
         for a more details on how to setup a working environment.
         
         
         You can install the package directly from
         [pypi](https://pypi.org/p/swh.storage):
         
         ```
         (swh) :~$ pip install swh.storage
         [...]
         ```
         
         Or from sources:
         
         ```
         (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git
         [...]
         (swh) :~$ cd swh-storage
         (swh) :~/swh-storage$ pip install .
         [...]
         ```
         
         Then you can check it's properly installed:
         ```
         (swh) :~$ swh storage --help
         Usage: swh storage [OPTIONS] COMMAND [ARGS]...
         
           Software Heritage Storage tools.
         
         Options:
           -h, --help  Show this message and exit.
         
         Commands:
           rpc-serve  Software Heritage Storage RPC server.
         ```
         
         
         ## Tests
         
         The best way of running Python tests for this module is to use
         [tox](https://tox.readthedocs.io/).
         
         ```
         (swh) :~$ pip install tox
         ```
         
         ### tox
         
         From the sources directory, simply use tox:
         
         ```
         (swh) :~/swh-storage$ tox
         [...]
         ========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ==========
         _______________________________ summary ________________________________
           flake8: commands succeeded
           py3: commands succeeded
           congratulations :)
         ```
         
         ## Development
         
         The storage server can be locally started. It requires a configuration file and
         a running Postgresql database.
         
         ### Sample configuration
         
         A typical configuration `storage.yml` file is:
         
         ```
         storage:
           cls: local
           args:
             db: "dbname=softwareheritage-dev user=<user> password=<pwd>"
             objstorage:
               cls: pathslicing
               args:
                 root: /tmp/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
         which means, this uses:
         
         - a local storage instance whose db connection is to
           `softwareheritage-dev` local instance,
         
         - the objstorage uses a local objstorage instance whose:
         
           - `root` path is /tmp/swh-storage,
         
           - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of
             the content (sha1) which will be stored on disk at first level
             with the first 2 hex characters, the second level with the next 2
             hex characters and the third level with the next 2 hex
             characters. And finally the complete hash file holding the raw
             content. For example: 00062f8bd330715c4f819373653d97b3cd34394c
             will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c
         
         Note that the `root` path should exist on disk before starting the server.
         
         
         ### Starting the storage server
         
         If the python package has been properly installed (e.g. in a virtual env), you
         should be able to use the command:
         
         ```
         (swh) :~/swh-storage$ swh storage rpc-serve storage.yml
         ```
         
         This runs a local swh-storage api at 5002 port.
         
         ```
         (swh) :~/swh-storage$ curl http://127.0.0.1:5002
         <html>
         <head><title>Software Heritage storage server</title></head>
         <body>
         <p>You have reached the
         <a href="https://www.softwareheritage.org/">Software Heritage</a>
         storage server.<br />
         See its
         <a href="https://docs.softwareheritage.org/devel/swh-storage/">documentation
         and API</a> for more information</p>
         ```
         
         ### And then what?
         
         In your upper layer
         ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/),
         [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/),
         etc...), you can define a remote storage with this snippet of yaml
         configuration.
         
         ```
         storage:
           cls: remote
           args:
             url: http://localhost:5002/
         ```
         
         You could directly define a local storage with the following snippet:
         
         ```
         storage:
           cls: local
           args:
             db: service=swh-dev
             objstorage:
               cls: pathslicing
               args:
                 root: /home/storage/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 Provides-Extra: schemata
 Provides-Extra: journal
diff --git a/README.md b/README.md
index 08b52b67..25b65f1b 100644
--- a/README.md
+++ b/README.md
@@ -1,192 +1,194 @@
 swh-storage
 ===========
 
 Abstraction layer over the archive, allowing to access all stored source code
 artifacts as well as their metadata.
 
 See the
 [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html)
 for more details.
 
 ## Quick start
 
 ### Dependencies
 
 Python tests for this module include tests that cannot be run without a local
 Postgresql database, so you need the Postgresql server executable on your
 machine (no need to have a running Postgresql server). They also expect a
 cassandra server.
 
 #### Debian-like host
 
 ```
 $ sudo apt install libpq-dev postgresql-11 cassandra
 ```
 
 #### Non Debian-like host
 
-The tests expects `/usr/sbin/cassandra` to exist.
+The tests expect the path to `cassandra` to be either specified through the
+environment variable `SWH_CASSANDRA_BIN` or left unspecified, in which case
+it is looked up at `/usr/sbin/cassandra`.
 
 Optionally, you can avoid running the cassandra tests.
 
 ```
 (swh) :~/swh-storage$ tox -- -m 'not cassandra'
 ```
 
 ### Installation
 
 It is strongly recommended to use a virtualenv. In the following, we
 consider you work in a virtualenv named `swh`. See the
 [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup)
 for a more details on how to setup a working environment.
 
 
 You can install the package directly from
 [pypi](https://pypi.org/p/swh.storage):
 
 ```
 (swh) :~$ pip install swh.storage
 [...]
 ```
 
 Or from sources:
 
 ```
 (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git
 [...]
 (swh) :~$ cd swh-storage
 (swh) :~/swh-storage$ pip install .
 [...]
 ```
 
 Then you can check it's properly installed:
 ```
 (swh) :~$ swh storage --help
 Usage: swh storage [OPTIONS] COMMAND [ARGS]...
 
   Software Heritage Storage tools.
 
 Options:
   -h, --help  Show this message and exit.
 
 Commands:
   rpc-serve  Software Heritage Storage RPC server.
 ```
 
 
 ## Tests
 
 The best way of running Python tests for this module is to use
 [tox](https://tox.readthedocs.io/).
 
 ```
 (swh) :~$ pip install tox
 ```
 
 ### tox
 
 From the sources directory, simply use tox:
 
 ```
 (swh) :~/swh-storage$ tox
 [...]
 ========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ==========
 _______________________________ summary ________________________________
   flake8: commands succeeded
   py3: commands succeeded
   congratulations :)
 ```
 
 ## Development
 
 The storage server can be locally started. It requires a configuration file and
 a running Postgresql database.
 
 ### Sample configuration
 
 A typical configuration `storage.yml` file is:
 
 ```
 storage:
   cls: local
   args:
     db: "dbname=softwareheritage-dev user=<user> password=<pwd>"
     objstorage:
       cls: pathslicing
       args:
         root: /tmp/swh-storage/
         slicing: 0:2/2:4/4:6
 ```
 
 which means, this uses:
 
 - a local storage instance whose db connection is to
   `softwareheritage-dev` local instance,
 
 - the objstorage uses a local objstorage instance whose:
 
   - `root` path is /tmp/swh-storage,
 
   - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of
     the content (sha1) which will be stored on disk at first level
     with the first 2 hex characters, the second level with the next 2
     hex characters and the third level with the next 2 hex
     characters. And finally the complete hash file holding the raw
     content. For example: 00062f8bd330715c4f819373653d97b3cd34394c
     will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c
 
 Note that the `root` path should exist on disk before starting the server.
 
 
 ### Starting the storage server
 
 If the python package has been properly installed (e.g. in a virtual env), you
 should be able to use the command:
 
 ```
 (swh) :~/swh-storage$ swh storage rpc-serve storage.yml
 ```
 
 This runs a local swh-storage api at 5002 port.
 
 ```
 (swh) :~/swh-storage$ curl http://127.0.0.1:5002
 <html>
 <head><title>Software Heritage storage server</title></head>
 <body>
 <p>You have reached the
 <a href="https://www.softwareheritage.org/">Software Heritage</a>
 storage server.<br />
 See its
 <a href="https://docs.softwareheritage.org/devel/swh-storage/">documentation
 and API</a> for more information</p>
 ```
 
 ### And then what?
 
 In your upper layer
 ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/),
 [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/),
 etc...), you can define a remote storage with this snippet of yaml
 configuration.
 
 ```
 storage:
   cls: remote
   args:
     url: http://localhost:5002/
 ```
 
 You could directly define a local storage with the following snippet:
 
 ```
 storage:
   cls: local
   args:
     db: service=swh-dev
     objstorage:
       cls: pathslicing
       args:
         root: /home/storage/swh-storage/
         slicing: 0:2/2:4/4:6
 ```
diff --git a/sql/upgrades/158.sql b/sql/upgrades/158.sql
index 0c8e0849..f57f07f0 100644
--- a/sql/upgrades/158.sql
+++ b/sql/upgrades/158.sql
@@ -1,76 +1,76 @@
 -- SWH DB schema upgrade
 -- from_version: 157
 -- to_version: 158
 -- description: Add the extra_headers column in the revision table
 
 -- latest schema version
 insert into dbversion(version, release, description)
       values(158, now(), 'Work Still In Progress');
 
 -- Adapt the revision table for the new extra_headers column
-alter table revision add column (extra_headers bytea[][]);
+alter table revision add column extra_headers bytea[][];
 
 -- Adapt the revision_entry type for the new extra_headers attribute
-alter type revision_entry add attribute (extra_headers bytea[][]);
+alter type revision_entry add attribute extra_headers bytea[][];
 
 -- Create entries in revision from tmp_revision
 create or replace function swh_revision_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_revision();
 
     insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
     select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
     from tmp_revision t
     left join person a on a.fullname = t.author_fullname
     left join person c on c.fullname = t.committer_fullname;
     return;
 end
 $$;
 
 -- "git style" revision log. Similar to swh_revision_list(), but returning all
 -- information associated to each revision, and expanding authors/committers
 create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
     returns setof revision_entry
     language sql
     stable
 as $$
     select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
            r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
            r.type, r.directory, r.message,
            a.id, a.fullname, a.name, a.email,
            c.id, c.fullname, c.name, c.email,
-           r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id
+           r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers
     from swh_revision_list(root_revisions, num_revs) as t
     left join revision r on t.id = r.id
     left join person a on a.id = r.author
     left join person c on c.id = r.committer;
 $$;
 
 create or replace function swh_revision_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof revision_entry
     language sql
     stable
 as $$
     with revs as (
         select * from revision
         where object_id > min_excl and object_id <= max_incl
     )
     select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
            r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
            r.type, r.directory, r.message,
-           a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers,
+           a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
            array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
-               as parents, r.object_id
+               as parents, r.object_id, r.extra_headers
     from revs r
     left join person a on a.id = r.author
     left join person c on c.id = r.committer
     order by r.object_id;
 $$;
 
 -- TODO: add the migration magic query...
diff --git a/swh.storage.egg-info/PKG-INFO b/swh.storage.egg-info/PKG-INFO
index a2dd0464..7c158c87 100644
--- a/swh.storage.egg-info/PKG-INFO
+++ b/swh.storage.egg-info/PKG-INFO
@@ -1,216 +1,218 @@
 Metadata-Version: 2.1
 Name: swh.storage
-Version: 0.10.3
+Version: 0.10.4
 Summary: Software Heritage storage manager
 Home-page: https://forge.softwareheritage.org/diffusion/DSTO/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-storage
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-storage/
 Description: swh-storage
         ===========
         
         Abstraction layer over the archive, allowing to access all stored source code
         artifacts as well as their metadata.
         
         See the
         [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html)
         for more details.
         
         ## Quick start
         
         ### Dependencies
         
         Python tests for this module include tests that cannot be run without a local
         Postgresql database, so you need the Postgresql server executable on your
         machine (no need to have a running Postgresql server). They also expect a
         cassandra server.
         
         #### Debian-like host
         
         ```
         $ sudo apt install libpq-dev postgresql-11 cassandra
         ```
         
         #### Non Debian-like host
         
-        The tests expects `/usr/sbin/cassandra` to exist.
+        The tests expect the path to `cassandra` to be either specified through the
+        environment variable `SWH_CASSANDRA_BIN` or left unspecified, in which case
+        it is looked up at `/usr/sbin/cassandra`.
         
         Optionally, you can avoid running the cassandra tests.
         
         ```
         (swh) :~/swh-storage$ tox -- -m 'not cassandra'
         ```
         
         ### Installation
         
         It is strongly recommended to use a virtualenv. In the following, we
         consider you work in a virtualenv named `swh`. See the
         [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup)
         for a more details on how to setup a working environment.
         
         
         You can install the package directly from
         [pypi](https://pypi.org/p/swh.storage):
         
         ```
         (swh) :~$ pip install swh.storage
         [...]
         ```
         
         Or from sources:
         
         ```
         (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git
         [...]
         (swh) :~$ cd swh-storage
         (swh) :~/swh-storage$ pip install .
         [...]
         ```
         
         Then you can check it's properly installed:
         ```
         (swh) :~$ swh storage --help
         Usage: swh storage [OPTIONS] COMMAND [ARGS]...
         
           Software Heritage Storage tools.
         
         Options:
           -h, --help  Show this message and exit.
         
         Commands:
           rpc-serve  Software Heritage Storage RPC server.
         ```
         
         
         ## Tests
         
         The best way of running Python tests for this module is to use
         [tox](https://tox.readthedocs.io/).
         
         ```
         (swh) :~$ pip install tox
         ```
         
         ### tox
         
         From the sources directory, simply use tox:
         
         ```
         (swh) :~/swh-storage$ tox
         [...]
         ========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ==========
         _______________________________ summary ________________________________
           flake8: commands succeeded
           py3: commands succeeded
           congratulations :)
         ```
         
         ## Development
         
         The storage server can be locally started. It requires a configuration file and
         a running Postgresql database.
         
         ### Sample configuration
         
         A typical configuration `storage.yml` file is:
         
         ```
         storage:
           cls: local
           args:
             db: "dbname=softwareheritage-dev user=<user> password=<pwd>"
             objstorage:
               cls: pathslicing
               args:
                 root: /tmp/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
         which means, this uses:
         
         - a local storage instance whose db connection is to
           `softwareheritage-dev` local instance,
         
         - the objstorage uses a local objstorage instance whose:
         
           - `root` path is /tmp/swh-storage,
         
           - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of
             the content (sha1) which will be stored on disk at first level
             with the first 2 hex characters, the second level with the next 2
             hex characters and the third level with the next 2 hex
             characters. And finally the complete hash file holding the raw
             content. For example: 00062f8bd330715c4f819373653d97b3cd34394c
             will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c
         
         Note that the `root` path should exist on disk before starting the server.
         
         
         ### Starting the storage server
         
         If the python package has been properly installed (e.g. in a virtual env), you
         should be able to use the command:
         
         ```
         (swh) :~/swh-storage$ swh storage rpc-serve storage.yml
         ```
         
         This runs a local swh-storage api at 5002 port.
         
         ```
         (swh) :~/swh-storage$ curl http://127.0.0.1:5002
         <html>
         <head><title>Software Heritage storage server</title></head>
         <body>
         <p>You have reached the
         <a href="https://www.softwareheritage.org/">Software Heritage</a>
         storage server.<br />
         See its
         <a href="https://docs.softwareheritage.org/devel/swh-storage/">documentation
         and API</a> for more information</p>
         ```
         
         ### And then what?
         
         In your upper layer
         ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/),
         [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/),
         etc...), you can define a remote storage with this snippet of yaml
         configuration.
         
         ```
         storage:
           cls: remote
           args:
             url: http://localhost:5002/
         ```
         
         You could directly define a local storage with the following snippet:
         
         ```
         storage:
           cls: local
           args:
             db: service=swh-dev
             objstorage:
               cls: pathslicing
               args:
                 root: /home/storage/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 Provides-Extra: schemata
 Provides-Extra: journal
diff --git a/swh/storage/pytest_plugin.py b/swh/storage/pytest_plugin.py
index 1b010923..a4640a8c 100644
--- a/swh/storage/pytest_plugin.py
+++ b/swh/storage/pytest_plugin.py
@@ -1,208 +1,211 @@
 # Copyright (C) 2019-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import glob
 
 from os import path, environ
 from typing import Dict, Union
 
 import pytest
 
 import swh.storage
 
 from pytest_postgresql import factories
 from pytest_postgresql.janitor import DatabaseJanitor, psycopg2, Version
 
 from swh.core.utils import numfile_sortkey as sortkey
 from swh.storage import get_storage
 from swh.storage.tests.storage_data import data
 
 
 SQL_DIR = path.join(path.dirname(swh.storage.__file__), "sql")
 
 environ["LC_ALL"] = "C.UTF-8"
 
 DUMP_FILES = path.join(SQL_DIR, "*.sql")
 
 
 @pytest.fixture
 def swh_storage_backend_config(postgresql_proc, swh_storage_postgresql):
+    """Basic pg storage configuration with no journal collaborator
+    (to avoid pulling optional dependency on clients of this fixture)
+
+    """
     yield {
         "cls": "local",
         "db": "postgresql://{user}@{host}:{port}/{dbname}".format(
             host=postgresql_proc.host,
             port=postgresql_proc.port,
             user="postgres",
             dbname="tests",
         ),
         "objstorage": {"cls": "memory", "args": {}},
-        "journal_writer": {"cls": "memory",},
     }
 
 
 @pytest.fixture
 def swh_storage(swh_storage_backend_config):
     return get_storage(cls="validate", storage=swh_storage_backend_config)
 
 
 # the postgres_fact factory fixture below is mostly a copy of the code
 # from pytest-postgresql. We need a custom version here to be able to
 # specify our version of the DBJanitor we use.
 def postgresql_fact(process_fixture_name, db_name=None, dump_files=DUMP_FILES):
     @pytest.fixture
     def postgresql_factory(request):
         """
         Fixture factory for PostgreSQL.
 
         :param FixtureRequest request: fixture request object
         :rtype: psycopg2.connection
         :returns: postgresql client
         """
         config = factories.get_config(request)
         if not psycopg2:
             raise ImportError("No module named psycopg2. Please install it.")
         proc_fixture = request.getfixturevalue(process_fixture_name)
 
         # _, config = try_import('psycopg2', request)
         pg_host = proc_fixture.host
         pg_port = proc_fixture.port
         pg_user = proc_fixture.user
         pg_options = proc_fixture.options
         pg_db = db_name or config["dbname"]
         with SwhDatabaseJanitor(
             pg_user,
             pg_host,
             pg_port,
             pg_db,
             proc_fixture.version,
             dump_files=dump_files,
         ):
             connection = psycopg2.connect(
                 dbname=pg_db,
                 user=pg_user,
                 host=pg_host,
                 port=pg_port,
                 options=pg_options,
             )
             yield connection
             connection.close()
 
     return postgresql_factory
 
 
 swh_storage_postgresql = postgresql_fact("postgresql_proc")
 
 
 # This version of the DatabaseJanitor implement a different setup/teardown
 # behavior than than the stock one: instead of dropping, creating and
 # initializing the database for each test, it create and initialize the db only
 # once, then it truncate the tables. This is needed to have acceptable test
 # performances.
 class SwhDatabaseJanitor(DatabaseJanitor):
     def __init__(
         self,
         user: str,
         host: str,
         port: str,
         db_name: str,
         version: Union[str, float, Version],
         dump_files: str = DUMP_FILES,
     ) -> None:
         super().__init__(user, host, port, db_name, version)
         self.dump_files = sorted(glob.glob(dump_files), key=sortkey)
 
     def db_setup(self):
         with psycopg2.connect(
             dbname=self.db_name, user=self.user, host=self.host, port=self.port,
         ) as cnx:
             with cnx.cursor() as cur:
                 for fname in self.dump_files:
                     with open(fname) as fobj:
                         sql = fobj.read().replace("concurrently", "").strip()
                         if sql:
                             cur.execute(sql)
             cnx.commit()
 
     def db_reset(self):
         with psycopg2.connect(
             dbname=self.db_name, user=self.user, host=self.host, port=self.port,
         ) as cnx:
             with cnx.cursor() as cur:
                 cur.execute(
                     "SELECT table_name FROM information_schema.tables "
                     "WHERE table_schema = %s",
                     ("public",),
                 )
                 tables = set(table for (table,) in cur.fetchall())
                 for table in tables:
                     cur.execute("truncate table %s cascade" % table)
 
                 cur.execute(
                     "SELECT sequence_name FROM information_schema.sequences "
                     "WHERE sequence_schema = %s",
                     ("public",),
                 )
                 seqs = set(seq for (seq,) in cur.fetchall())
                 for seq in seqs:
                     cur.execute("ALTER SEQUENCE %s RESTART;" % seq)
             cnx.commit()
 
     def init(self):
         with self.cursor() as cur:
             cur.execute(
                 "SELECT COUNT(1) FROM pg_database WHERE datname=%s;", (self.db_name,)
             )
             db_exists = cur.fetchone()[0] == 1
             if db_exists:
                 cur.execute(
                     "UPDATE pg_database SET datallowconn=true " "WHERE datname = %s;",
                     (self.db_name,),
                 )
 
         if db_exists:
             self.db_reset()
         else:
             with self.cursor() as cur:
                 cur.execute('CREATE DATABASE "{}";'.format(self.db_name))
             self.db_setup()
 
     def drop(self):
         pid_column = "pid"
         with self.cursor() as cur:
             cur.execute(
                 "UPDATE pg_database SET datallowconn=false " "WHERE datname = %s;",
                 (self.db_name,),
             )
             cur.execute(
                 "SELECT pg_terminate_backend(pg_stat_activity.{})"
                 "FROM pg_stat_activity "
                 "WHERE pg_stat_activity.datname = %s;".format(pid_column),
                 (self.db_name,),
             )
 
 
 @pytest.fixture
 def sample_data() -> Dict:
     """Pre-defined sample storage object data to manipulate
 
     Returns:
         Dict of data (keys: content, directory, revision, release, person,
         origin)
 
     """
     return {
         "content": [data.cont, data.cont2],
         "content_metadata": [data.cont3],
         "skipped_content": [data.skipped_cont, data.skipped_cont2],
         "person": [data.person],
         "directory": [data.dir2, data.dir],
         "revision": [data.revision, data.revision2, data.revision3],
         "release": [data.release, data.release2, data.release3],
         "snapshot": [data.snapshot],
         "origin": [data.origin, data.origin2],
         "fetcher": [data.metadata_fetcher],
         "authority": [data.metadata_authority],
         "origin_metadata": [data.origin_metadata, data.origin_metadata2],
     }
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
index e244ebf1..fa177309 100644
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -1,950 +1,950 @@
 -- Return the SHA1 digest of the given text, encoded as an hexadecimal string.
 -- Declared strict, so a NULL argument yields NULL without running the body.
 create or replace function hash_sha1(text)
        returns text
 as $$
    select encode(digest($1, 'sha1'), 'hex')
 $$ language sql strict immutable;
 
 comment on function hash_sha1(text) is 'Compute SHA1 hash as text';
 
 -- create a temporary table called tmp_TBLNAME, mimicking existing table
 -- TBLNAME (same columns and defaults), minus its object_id column if it has
 -- one.
 --
 -- The table is only created when it does not exist yet, and its rows are
 -- deleted at the end of each transaction (on commit delete rows).
 --
 -- Args:
 --     tblname: name of the table to mimic
 create or replace function swh_mktemp(tblname regclass)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
 	create temporary table if not exists tmp_%1$I
 	    (like %1$I including defaults)
 	    on commit delete rows;
       alter table tmp_%1$I drop column if exists object_id;
 	', tblname);
     return;
 end
 $$;
 
 -- create a temporary table for directory entries called tmp_TBLNAME,
 -- mimicking existing table TBLNAME with an extra dir_id (sha1_git)
 -- column, and dropping the id column.
 --
 -- This is used to create the tmp_directory_entry_<foo> tables.
 --
 -- The table is only created when it does not exist yet, and its rows are
 -- deleted at the end of each transaction (on commit delete rows).
 --
 -- Args:
 --     tblname: name of the table to mimic
 create or replace function swh_mktemp_dir_entry(tblname regclass)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
 	create temporary table if not exists tmp_%1$I
 	    (like %1$I including defaults, dir_id sha1_git)
 	    on commit delete rows;
         alter table tmp_%1$I drop column if exists id;
 	', tblname);
     return;
 end
 $$;
 
 -- create a temporary table for revisions called tmp_revision,
 -- mimicking existing table revision, replacing the foreign keys to
 -- people (author, committer) with fullname, name and email fields.
 -- The object_id column is dropped as well.
 --
 -- The table is only created when it does not exist yet, and its rows are
 -- deleted at the end of each transaction (on commit delete rows).
 create or replace function swh_mktemp_revision()
     returns void
     language sql
 as $$
     create temporary table if not exists tmp_revision (
         like revision including defaults,
         author_fullname bytea,
         author_name bytea,
         author_email bytea,
         committer_fullname bytea,
         committer_name bytea,
         committer_email bytea
     ) on commit delete rows;
     alter table tmp_revision drop column if exists author;
     alter table tmp_revision drop column if exists committer;
     alter table tmp_revision drop column if exists object_id;
 $$;
 
 -- create a temporary table for releases called tmp_release,
 -- mimicking existing table release, replacing the foreign key to
 -- people (author) with fullname, name and email fields.
 -- The object_id column is dropped as well.
 --
 -- The table is only created when it does not exist yet, and its rows are
 -- deleted at the end of each transaction (on commit delete rows).
 create or replace function swh_mktemp_release()
     returns void
     language sql
 as $$
     create temporary table if not exists tmp_release (
         like release including defaults,
         author_fullname bytea,
         author_name bytea,
         author_email bytea
     ) on commit delete rows;
     alter table tmp_release drop column if exists author;
     alter table tmp_release drop column if exists object_id;
 $$;
 
 -- create a temporary table for the branches of a snapshot, called
 -- tmp_snapshot_branch. Only name is mandatory: target and target_type may be
 -- NULL. The table is only created when it does not exist yet, and its rows
 -- are deleted at the end of each transaction (on commit delete rows).
 create or replace function swh_mktemp_snapshot_branch()
     returns void
     language sql
 as $$
   create temporary table if not exists tmp_snapshot_branch (
       name bytea not null,
       target bytea,
       target_type snapshot_target
   ) on commit delete rows;
 $$;
 
 -- a content signature is a set of cryptographic checksums that we use to
 -- uniquely identify content, for the purpose of verifying if we already have
 -- some content or not during content injection
 --
 -- This is the row type returned by swh_skipped_content_missing().
 create type content_signature as (
     sha1       sha1,
     sha1_git   sha1_git,
     sha256     sha256,
     blake2s256 blake2s256
 );
 
 
 -- check which entries of tmp_skipped_content are missing from skipped_content
 --
 -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
 -- 2. call this function
 --
 -- An entry is considered present when a skipped_content row has the same
 -- sha1, sha1_git and sha256; blake2s256 is returned but not compared. The
 -- comparison uses "is not distinct from", so two NULL hashes match each
 -- other.
 create or replace function swh_skipped_content_missing()
     returns setof content_signature
     language plpgsql
 as $$
 begin
     return query
 	select sha1, sha1_git, sha256, blake2s256 from tmp_skipped_content t
 	where not exists
 	(select 1 from skipped_content s where
 	    s.sha1 is not distinct from t.sha1 and
 	    s.sha1_git is not distinct from t.sha1_git and
 	    s.sha256 is not distinct from t.sha256);
     return;
 end
 $$;
 
 
 -- add tmp_content entries to content, skipping duplicates
 --
 -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
 -- 2. call this function
 --
 -- Only duplicates within tmp_content are collapsed (select distinct); rows
 -- already present in content are not filtered out by this function.
 -- NOTE(review): presumably the caller removes already-known contents from
 -- tmp_content beforehand -- confirm.
 create or replace function swh_content_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into content (sha1, sha1_git, sha256, blake2s256, length, status, ctime)
         select distinct sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content;
     return;
 end
 $$;
 
 
 -- add tmp_skipped_content entries to skipped_content, skipping duplicates
 --
 -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
 -- 2. call this function
 --
 -- Only the entries reported by swh_skipped_content_missing() are inserted.
 -- NULL hashes are replaced by '' on both sides of the "in" comparison, since
 -- a plain row comparison involving NULL would never match.
 create or replace function swh_skipped_content_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into skipped_content (sha1, sha1_git, sha256, blake2s256, length, status, reason, origin)
         select distinct sha1, sha1_git, sha256, blake2s256, length, status, reason, origin
 	from tmp_skipped_content
 	where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in (
             select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')
             from swh_skipped_content_missing()
         );
         -- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
         -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
         -- the extra swh_skipped_content_missing() query here.
     return;
 end
 $$;
 
 -- Update content entries from temporary table.
 -- (columns are potential new columns added to the schema, this cannot be empty)
 --
 -- For every row of tmp_content whose sha1 matches a row of content, the
 -- columns listed in columns_update are copied from tmp_content to content.
 --
 -- Args:
 --     columns_update: names of the content columns to update
 --
 -- Raises an exception when columns_update is NULL or empty.
 create or replace function swh_content_update(columns_update text[])
     returns void
     language plpgsql
 as $$
 declare
    query text;
    tmp_array text[];
 begin
     -- array_length() returns NULL (not 0) for an empty or NULL array, so its
     -- result must be coalesced for this guard to ever fire.
     if coalesce(array_length(columns_update, 1), 0) = 0 then
         raise exception 'Please, provide the list of column names to update.';
     end if;
 
     -- Build the "col=t.col" assignments of the update statement.
     -- NOTE(review): column names are interpolated verbatim (%s), so callers
     -- must only pass trusted identifiers.
     tmp_array := array(select format('%1$s=t.%1$s', unnest) from unnest(columns_update));
 
     query = format('update content set %s
                     from tmp_content t where t.sha1 = content.sha1',
                     array_to_string(tmp_array, ', '));
 
     execute query;
 
     return;
 end
 $$;
 
 comment on function swh_content_update(text[]) IS 'Update existing content''s columns';
 
 
 -- Kind of a directory entry. The value is also used as the suffix of the
 -- directory_entry_<kind> and tmp_directory_entry_<kind> table names.
 create type directory_entry_type as enum('file', 'dir', 'rev');
 
 
 -- Add tmp_directory_entry_* entries to directory_entry_* and directory,
 -- skipping duplicates in directory_entry_*.  This is a generic function that
 -- works on all kind of directory entries.
 --
 -- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_*'), 1 COPY to
 -- tmp_directory_entry_*, 2. call this function
 --
 -- Assumption: this function is used in the same transaction that inserts the
 -- context directory in table "directory".
 --
 -- Args:
 --     typ: kind of entries to process; selects the directory_entry_<typ> and
 --         tmp_directory_entry_<typ> tables, and the <typ>_entries column of
 --         tmp_directory
 create or replace function swh_directory_entry_add(typ directory_entry_type)
     returns void
     language plpgsql
 as $$
 begin
     -- Step 1: insert the (target, name, perms) triples that are not known yet.
     execute format('
     insert into directory_entry_%1$s (target, name, perms)
     select distinct t.target, t.name, t.perms
     from tmp_directory_entry_%1$s t
     where not exists (
     select 1
     from directory_entry_%1$s i
     where t.target = i.target and t.name = i.name and t.perms = i.perms)
    ', typ);
 
     -- Step 2: for each directory, gather the ids of its entries and store
     -- them in the matching <typ>_entries column of tmp_directory.
     execute format('
     with new_entries as (
 	select t.dir_id, array_agg(i.id) as entries
 	from tmp_directory_entry_%1$s t
 	inner join directory_entry_%1$s i
 	using (target, name, perms)
 	group by t.dir_id
     )
     update tmp_directory as d
     set %1$s_entries = new_entries.entries
     from new_entries
     where d.id = new_entries.dir_id
     ', typ);
 
     return;
 end
 $$;
 
 -- Insert the data from tmp_directory, tmp_directory_entry_file,
 -- tmp_directory_entry_dir, tmp_directory_entry_rev into their final
 -- tables.
 --
 -- Prerequisites:
 --  directory ids in tmp_directory
 --  entries in tmp_directory_entry_{file,dir,rev}
 --
 -- Entries are added first, which also fills the *_entries columns of
 -- tmp_directory; only then are the directories whose id is not in directory
 -- yet copied from tmp_directory.
 create or replace function swh_directory_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_directory_entry_add('file');
     perform swh_directory_entry_add('dir');
     perform swh_directory_entry_add('rev');
 
     insert into directory
     select * from tmp_directory t
     where not exists (
         select 1 from directory d
 	where d.id = t.id);
 
     return;
 end
 $$;
 
 -- a directory listing entry with all the metadata
 --
 -- can be used to list a directory, and retrieve all the data in one go.
 --
 -- swh_directory_walk_one() only fills status, sha1, sha1_git, sha256 and
 -- length for entries of type file; they are NULL for dir and rev entries.
 create type directory_entry as
 (
   dir_id   sha1_git,     -- id of the parent directory
   type     directory_entry_type,  -- type of entry
   target   sha1_git,     -- id of target
   name     unix_path,    -- path name, relative to containing dir
   perms    file_perms,   -- unix-like permissions
   status   content_status,  -- visible or absent
   sha1     sha1,            -- content's sha1 if type is file
   sha1_git sha1_git,        -- content's sha1 git if type is file
   sha256   sha256,          -- content's sha256 if type is file
   length   bigint           -- content length if type is file
 );
 
 
 -- List a single level of directory walked_dir_id
 -- FIXME: order by name is not correct. For git, we need to order by
 -- lexicographic order but as if a trailing / is present in directory
 -- name
 --
 -- The dir_entries, file_entries and rev_entries arrays of the directory are
 -- unnested and joined with their respective directory_entry_* table; file
 -- entries are additionally joined with content on sha1_git. All joins are
 -- left joins, so an entry id (or a content) that cannot be found still
 -- yields a row, with the missing fields set to NULL.
 create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
     returns setof directory_entry
     language sql
     stable
 as $$
     with dir as (
 	select id as dir_id, dir_entries, file_entries, rev_entries
 	from directory
 	where id = walked_dir_id),
     ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
     ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
     ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
     (select dir_id, 'dir'::directory_entry_type as type,
             e.target, e.name, e.perms, NULL::content_status,
             NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
      from ls_d
      left join directory_entry_dir e on ls_d.entry_id = e.id)
     union
     (select dir_id, 'file'::directory_entry_type as type,
             e.target, e.name, e.perms, c.status,
             c.sha1, c.sha1_git, c.sha256, c.length
      from ls_f
      left join directory_entry_file e on ls_f.entry_id = e.id
      left join content c on e.target = c.sha1_git)
     union
     (select dir_id, 'rev'::directory_entry_type as type,
             e.target, e.name, e.perms, NULL::content_status,
             NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
      from ls_r
      left join directory_entry_rev e on ls_r.entry_id = e.id)
     order by name;
 $$;
 
 -- List recursively the revision directory arborescence
 --
 -- Starts from the entries of walked_dir_id, then repeatedly lists the
 -- target of every entry of type 'dir' with swh_directory_walk_one(). The
 -- name of each nested entry is prefixed with the name of its parent entry
 -- and a '/', so names are paths relative to walked_dir_id.
 create or replace function swh_directory_walk(walked_dir_id sha1_git)
     returns setof directory_entry
     language sql
     stable
 as $$
     with recursive entries as (
         select dir_id, type, target, name, perms, status, sha1, sha1_git,
                sha256, length
         from swh_directory_walk_one(walked_dir_id)
         union all
         select dir_id, type, target, (dirname || '/' || name)::unix_path as name,
                perms, status, sha1, sha1_git, sha256, length
         from (select (swh_directory_walk_one(dirs.target)).*, dirs.name as dirname
               from (select target, name from entries where type = 'dir') as dirs) as with_parent
     )
     select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256, length
     from entries
 $$;
 
 -- Find a directory entry by its path
 --
 -- Args:
 --     walked_dir_id: id of the directory the lookup starts from
 --     dir_or_content_path: path to look up, one array element per path
 --         component
 --
 -- Every component but the last must match an entry of type 'dir'; the last
 -- one may match an entry of any type. Returns NULL as soon as a component
 -- cannot be found. On success, the name of the returned entry is replaced
 -- by the full path (components joined with '/').
 --
 -- NOTE(review): with an empty dir_or_content_path, array_upper() returns
 -- NULL and the loop bound is NULL -- confirm callers never pass an empty
 -- path.
 create or replace function swh_find_directory_entry_by_path(
     walked_dir_id sha1_git,
     dir_or_content_path bytea[])
     returns directory_entry
     language plpgsql
 as $$
 declare
     end_index integer;
     paths bytea default '';
     path bytea;
     res bytea[];
     r record;
 begin
     end_index := array_upper(dir_or_content_path, 1);
     -- res[i] is the id of the directory in which component i is looked up
     res[1] := walked_dir_id;
 
     for i in 1..end_index
     loop
         path := dir_or_content_path[i];
         -- concatenate path for patching the name in the result record (if we found it)
         if i = 1 then
             paths = path;
         else
             paths := paths || '/' || path;  -- concatenate paths
         end if;
 
         if i <> end_index then
             -- intermediate component: it has to be a directory
             select *
             from swh_directory_walk_one(res[i] :: sha1_git)
             where name=path
             and type = 'dir'
             limit 1 into r;
         else
             -- last component: any type of entry is accepted
             select *
             from swh_directory_walk_one(res[i] :: sha1_git)
             where name=path
             limit 1 into r;
         end if;
 
         -- find the path
         if r is null then
            return null;
         else
             -- store the next dir to lookup the next local path from
             res[i+1] := r.target;
         end if;
     end loop;
 
     -- at this moment, r is the result. Patch its 'name' with the full path before returning it.
     r.name := paths;
     return r;
 end
 $$;
 
 -- List all revision IDs starting from a given revision, going back in time
 --
 -- TODO ordering: should be breadth-first right now (what do we want?)
 -- TODO ordering: ORDER BY parent_rank somewhere?
 --
 -- Args:
 --     root_revisions: ids of the revisions to start from; ids that are not
 --         in the revision table are ignored
 --     num_revs: maximum number of revisions to return (NULL: no limit)
 --
 -- Ancestors are collected through revision_history; the recursive "union"
 -- removes duplicates, so each revision is listed once. Each row also
 -- carries the parents of the revision, ordered by parent_rank.
 create or replace function swh_revision_list(root_revisions bytea[], num_revs bigint default NULL)
     returns table (id sha1_git, parents bytea[])
     language sql
     stable
 as $$
     with recursive full_rev_list(id) as (
         (select id from revision where id = ANY(root_revisions))
         union
         (select h.parent_id
          from revision_history as h
          join full_rev_list on h.id = full_rev_list.id)
     ),
     rev_list as (select id from full_rev_list limit num_revs)
     select rev_list.id as id,
            array(select rh.parent_id::bytea
                  from revision_history rh
                  where rh.id = rev_list.id
                  order by rh.parent_rank
                 ) as parent
     from rev_list;
 $$;
 
 
 -- Detailed entry for a revision
 --
 -- This is the row type returned by swh_revision_log() and
 -- swh_revision_list_by_object_id(). Both are SQL functions whose select
 -- list is mapped positionally onto these fields, so the field order here
 -- and the column order of those queries must be kept in sync.
 create type revision_entry as
 (
   id                             sha1_git,
   date                           timestamptz,
   date_offset                    smallint,
   date_neg_utc_offset            boolean,
   committer_date                 timestamptz,
   committer_date_offset          smallint,
   committer_date_neg_utc_offset  boolean,
   type                           revision_type,
   directory                      sha1_git,
   message                        bytea,
   author_id                      bigint,
   author_fullname                bytea,
   author_name                    bytea,
   author_email                   bytea,
   committer_id                   bigint,
   committer_fullname             bytea,
   committer_name                 bytea,
   committer_email                bytea,
   metadata                       jsonb,
   synthetic                      boolean,
-  extra_headers                  bytea[][],
   parents                        bytea[],
-  object_id                      bigint
+  object_id                      bigint,
+  extra_headers                  bytea[][]
 );
 
 
 -- "git style" revision log. Similar to swh_revision_list(), but returning all
 -- information associated to each revision, and expanding authors/committers
 --
 -- The select list is mapped positionally onto the fields of revision_entry,
 -- so its column order must follow the field order of that type.
 -- Revisions listed by swh_revision_list() are left-joined with revision and
 -- person, so missing authors or committers yield NULL fields.
 create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
     returns setof revision_entry
     language sql
     stable
 as $$
     select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
            r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
            r.type, r.directory, r.message,
            a.id, a.fullname, a.name, a.email,
            c.id, c.fullname, c.name, c.email,
-           r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id
+           r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers
     from swh_revision_list(root_revisions, num_revs) as t
     left join revision r on t.id = r.id
     left join person a on a.id = r.author
     left join person c on c.id = r.committer;
 $$;
 
 
 -- Detailed entry for a release
 --
 -- This is the row type returned by swh_release_list_by_object_id(), whose
 -- select list is mapped positionally onto these fields.
 create type release_entry as
 (
   id                   sha1_git,
   target               sha1_git,
   target_type          object_type,
   date                 timestamptz,
   date_offset          smallint,
   date_neg_utc_offset  boolean,
   name                 bytea,
   comment              bytea,
   synthetic            boolean,
   author_id            bigint,
   author_fullname      bytea,
   author_name          bytea,
   author_email         bytea,
   object_id            bigint
 );
 
 -- Create entries in person from tmp_revision
 --
 -- Authors and committers of tmp_revision are gathered together; those whose
 -- fullname is not in person yet are inserted. When several candidates share
 -- a fullname, a single one is kept (distinct on (fullname), without any
 -- explicit ordering).
 create or replace function swh_person_add_from_revision()
     returns void
     language plpgsql
 as $$
 begin
     with t as (
         select author_fullname as fullname, author_name as name, author_email as email from tmp_revision
     union
         select committer_fullname as fullname, committer_name as name, committer_email as email from tmp_revision
     ) insert into person (fullname, name, email)
     select distinct on (fullname) fullname, name, email from t
     where not exists (
         select 1
         from person p
         where t.fullname = p.fullname
     );
     return;
 end
 $$;
 
 
 -- Create entries in revision from tmp_revision
 --
 -- Missing persons are created first, then authors and committers are
 -- resolved to person ids through their fullname. No check is made against
 -- rows already present in revision: every row of tmp_revision is inserted.
 create or replace function swh_revision_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_revision();
 
     insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
     select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
     from tmp_revision t
     left join person a on a.fullname = t.author_fullname
     left join person c on c.fullname = t.committer_fullname;
     return;
 end
 $$;
 
 
 -- Create entries in person from tmp_release
 --
 -- Authors of tmp_release with a non-NULL fullname that is not in person yet
 -- are inserted. When several candidates share a fullname, a single one is
 -- kept (distinct on (fullname), without any explicit ordering).
 create or replace function swh_person_add_from_release()
     returns void
     language plpgsql
 as $$
 begin
     with t as (
         select distinct author_fullname as fullname, author_name as name, author_email as email from tmp_release
         where author_fullname is not null
     ) insert into person (fullname, name, email)
     select distinct on (fullname) fullname, name, email from t
     where not exists (
         select 1
         from person p
         where t.fullname = p.fullname
     );
     return;
 end
 $$;
 
 
 -- Create entries in release from tmp_release
 --
 -- Missing persons are created first, then authors are resolved to person
 -- ids through their fullname (NULL when the release has no author).
 -- Releases whose id is already in release are skipped.
 create or replace function swh_release_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_release();
 
     insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
       select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
         from tmp_release t
         left join person a on a.fullname = t.author_fullname
         where not exists (select 1 from release where t.id = release.id);
     return;
 end
 $$;
 
 
 -- add a new origin_visit for the origin of url origin_url at date.
 --
 -- The new visit id is the highest visit id known for that origin plus one
 -- (1 for the first visit of an origin).
 --
 -- Returns the new visit id.
 create or replace function swh_origin_visit_add(origin_url text, date timestamptz, type text)
     returns bigint
     language sql
 as $$
   with origin_id as (
     select id
     from origin
     where url = origin_url
   ), last_known_visit as (
     select coalesce(max(visit), 0) as visit
     from origin_visit
     where origin = (select id from origin_id)
   )
   insert into origin_visit (origin, date, type, visit)
   values ((select id from origin_id), date, type,
           (select visit from last_known_visit) + 1)
   returning visit;
 $$;
 
 -- Create snapshot snapshot_id from the branches stored in
 -- tmp_snapshot_branch.
 --
 -- Nothing is inserted when the snapshot already exists. Otherwise the
 -- snapshot row is created, unknown branches are added to snapshot_branch,
 -- and the snapshot is linked to all its branches through snapshot_branches.
 -- tmp_snapshot_branch is truncated in both cases.
 create or replace function swh_snapshot_add(snapshot_id sha1_git)
   returns void
   language plpgsql
 as $$
 declare
   snapshot_object_id snapshot.object_id%type;
 begin
   select object_id from snapshot where id = snapshot_id into snapshot_object_id;
   if snapshot_object_id is null then
      insert into snapshot (id) values (snapshot_id) returning object_id into snapshot_object_id;
      -- The "not exists" test relies on "=", which never matches branches
      -- whose target or target_type is NULL; those are left to
      -- "on conflict do nothing".
      -- NOTE(review): presumably a unique constraint on snapshot_branch, not
      -- visible here, covers that case -- confirm.
      insert into snapshot_branch (name, target_type, target)
        select name, target_type, target from tmp_snapshot_branch tmp
        where not exists (
          select 1
          from snapshot_branch sb
          where sb.name = tmp.name
            and sb.target = tmp.target
            and sb.target_type = tmp.target_type
        )
        on conflict do nothing;
      -- Link the snapshot to its branches: branches with a target are matched
      -- on (name, target, target_type), branches without target on name only.
      insert into snapshot_branches (snapshot_id, branch_id)
      select snapshot_object_id, sb.object_id as branch_id
        from tmp_snapshot_branch tmp
        join snapshot_branch sb
        using (name, target, target_type)
        where tmp.target is not null and tmp.target_type is not null
      union
      select snapshot_object_id, sb.object_id as branch_id
        from tmp_snapshot_branch tmp
        join snapshot_branch sb
        using (name)
        where tmp.target is null and tmp.target_type is null
          and sb.target is null and sb.target_type is null;
   end if;
   truncate table tmp_snapshot_branch;
 end;
 $$;
 
 -- A branch of a snapshot; row type returned by swh_snapshot_get_by_id().
 create type snapshot_result as (
   snapshot_id  sha1_git,
   name         bytea,
   target       bytea,
   target_type  snapshot_target
 );
 
 -- List the branches of a snapshot, ordered by name.
 --
 -- Args:
 --     id: id of the snapshot
 --     branches_from: only return branches whose name is greater than or
 --         equal to this value
 --     branches_count: maximum number of branches to return (NULL: no limit)
 --     target_types: only return branches whose target type is in this array
 --         (NULL: no filtering)
 create or replace function swh_snapshot_get_by_id(id sha1_git,
     branches_from bytea default '', branches_count bigint default null,
     target_types snapshot_target[] default NULL)
   returns setof snapshot_result
   language sql
   stable
 as $$
   -- with small limits, the "naive" version of this query can degenerate into
   -- using the deduplication index on snapshot_branch (name, target,
   -- target_type); The planner happily scans several hundred million rows.
 
   -- Do the query in two steps: first pull the relevant branches for the given
   -- snapshot (filtering them by type), then do the limiting. This two-step
   -- process guides the planner into using the proper index.
   with filtered_snapshot_branches as (
     select swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
       from snapshot_branches
       inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
       where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
         and (target_types is null or target_type = any(target_types))
       order by name
   )
   select snapshot_id, name, target, target_type
     from filtered_snapshot_branches
     where name >= branches_from
     order by name limit branches_count;
 $$;
 
 -- Number of branches of a snapshot for a given target type; row type
 -- returned by swh_snapshot_count_branches().
 create type snapshot_size as (
   target_type snapshot_target,
   count bigint
 );
 
 -- Count the branches of snapshot id, grouped by target type. All branches
 -- are taken into account: swh_snapshot_get_by_id() is called with its
 -- default arguments (no name lower bound, no limit, no type filter).
 create or replace function swh_snapshot_count_branches(id sha1_git)
   returns setof snapshot_size
   language sql
   stable
 as $$
   SELECT target_type, count(name)
   from swh_snapshot_get_by_id(swh_snapshot_count_branches.id)
   group by target_type;
 $$;
 
 -- Absolute path: directory reference + complete path relative to it
 --
 -- This is the row type returned by swh_content_find_directory().
 create type content_dir as (
     directory  sha1_git,
     path       unix_path
 );
 
 
 -- Find the containing directory of a given content, specified by sha1
 -- (note: *not* sha1_git).
 --
 -- Return a pair (dir_id, path) where path is a UNIX path that, from the
 -- directory root, reach down to a file with the desired content. Return NULL
 -- if no match is found.
 --
 -- In case of multiple paths (i.e., pretty much always), an arbitrary one is
 -- chosen: each step of the walk keeps a single candidate (limit 1), and the
 -- deepest step reached is returned.
 create or replace function swh_content_find_directory(content_id sha1)
     returns content_dir
     language sql
     stable
 as $$
     with recursive path as (
 	-- Recursively build a path from the requested content to a root
 	-- directory. Each iteration returns a pair (dir_id, filename) where
 	-- filename is relative to dir_id. Stops when no parent directory can
 	-- be found.
 	(select dir.id as dir_id, dir_entry_f.name as name, 0 as depth
 	 from directory_entry_file as dir_entry_f
 	 join content on content.sha1_git = dir_entry_f.target
 	 join directory as dir on dir.file_entries @> array[dir_entry_f.id]
 	 where content.sha1 = content_id
 	 limit 1)
 	union all
 	(select dir.id as dir_id,
 		(dir_entry_d.name || '/' || path.name)::unix_path as name,
 		path.depth + 1
 	 from path
 	 join directory_entry_dir as dir_entry_d on dir_entry_d.target = path.dir_id
 	 join directory as dir on dir.dir_entries @> array[dir_entry_d.id]
 	 limit 1)
     )
     select dir_id, name from path order by depth desc limit 1;
 $$;
 
 -- Find the visit of origin closest to date visit_date
 -- Among visits sharing the same date, the largest visit id is preferred.
 --
 -- Args:
 --     origin_url: url of the origin whose visits are searched
 --     visit_date: reference date (defaults to the current time)
 --
 -- Returns the matching origin_visit row, or no row when the origin has no
 -- visit.
 create or replace function swh_visit_find_by_date(origin_url text, visit_date timestamptz default NOW())
     returns setof origin_visit
     language plpgsql
     stable
 as $$
 declare
   origin_id bigint;
 begin
   select id into origin_id from origin where url=origin_url;
   return query
   -- Pick the closest visit on each side of visit_date, then keep the one
   -- with the smallest time distance. The "interval" alias has to label the
   -- date difference (not the visit column): otherwise the final ordering
   -- compares visit ids instead of distances.
   with closest_two_visits as ((
     select ov, (date - visit_date) as interval, visit
     from origin_visit ov
     where ov.origin = origin_id
           and ov.date >= visit_date
     order by ov.date asc, ov.visit desc
     limit 1
   ) union (
     select ov, (visit_date - date) as interval, visit
     from origin_visit ov
     where ov.origin = origin_id
           and ov.date < visit_date
     order by ov.date desc, ov.visit desc
     limit 1
   )) select (ov).* from closest_two_visits order by interval, visit limit 1;
 end
 $$;
 
 -- Object listing by object_id
 
 -- List the contents whose object_id is in the range ]min_excl, max_incl]
 -- (lower bound excluded, upper bound included), ordered by object_id.
 create or replace function swh_content_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof content
     language sql
     stable
 as $$
     select * from content
     where object_id > min_excl and object_id <= max_incl
     order by object_id;
 $$;
 
 -- List the revisions whose object_id is in the range ]min_excl, max_incl]
 -- (lower bound excluded, upper bound included), ordered by object_id, with
 -- authors and committers expanded and parents ordered by parent_rank.
 --
 -- The select list is mapped positionally onto the fields of revision_entry,
 -- so its column order must follow the field order of that type.
 create or replace function swh_revision_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof revision_entry
     language sql
     stable
 as $$
     with revs as (
         select * from revision
         where object_id > min_excl and object_id <= max_incl
     )
     select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
            r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
            r.type, r.directory, r.message,
-           a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers,
+           a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
            array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
-               as parents, r.object_id
+               as parents, r.object_id, r.extra_headers
     from revs r
     left join person a on a.id = r.author
     left join person c on c.id = r.committer
     order by r.object_id;
 $$;
 
 -- List the releases whose object_id is in the range ]min_excl, max_incl]
 -- (lower bound excluded, upper bound included), ordered by object_id, with
 -- authors expanded (NULL author fields when the release has no author).
 create or replace function swh_release_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof release_entry
     language sql
     stable
 as $$
     with rels as (
         select * from release
         where object_id > min_excl and object_id <= max_incl
     )
     select r.id, r.target, r.target_type, r.date, r.date_offset, r.date_neg_utc_offset, r.name, r.comment,
            r.synthetic, p.id as author_id, p.fullname as author_fullname, p.name as author_name, p.email as author_email, r.object_id
     from rels r
     left join person p on p.id = r.author
     order by r.object_id;
 $$;
 
 
 -- simple counter mapping a textual label to an integer value
 --
 -- This is the row type returned by swh_stat_counters().
 create type counter as (
     label  text,
     value  bigint
 );
 
 -- return statistics about the number of tuples in various SWH tables
 --
 -- Note: the returned values are read from the object_counts table, so they
 -- are only as fresh as the last update of that table (see
 -- swh_update_counter() and swh_update_counters_from_buckets()).
 create or replace function swh_stat_counters()
     returns setof counter
     language sql
     stable
 as $$
     select object_type as label, value as value
     from object_counts
     where object_type in (
         'content',
         'directory',
         'directory_entry_dir',
         'directory_entry_file',
         'directory_entry_rev',
         'origin',
         'origin_visit',
         'person',
         'release',
         'revision',
         'revision_history',
         'skipped_content',
         'snapshot'
     );
 $$;
 
 -- Store the exact row count of a table in object_counts.
 --
 -- The table is counted with count(*); the matching object_counts row is
 -- created, or updated when it already exists, and last_update is set to the
 -- current time.
 --
 -- Args:
 --     object_type: name of the table to count, also used as the
 --         object_counts key
 create or replace function swh_update_counter(object_type text)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
 	insert into object_counts
     (value, last_update, object_type)
   values
     ((select count(*) from %1$I), NOW(), %1$L)
   on conflict (object_type) do update set
     value = excluded.value,
     last_update = excluded.last_update',
   object_type);
     return;
 end;
 $$;
 
 -- Refresh the count of a single bucket of object_counts_bucketed.
 --
 -- The bucket with the oldest last_update is selected (buckets that were
 -- never updated are treated as updated one month ago). A counting query is
 -- built for it, restricted to [bucket_start, bucket_end[ on the identifier
 -- column when those bounds are set, and its result is stored in the bucket
 -- together with the current time.
 --
 -- NOTE(review): %I quotes its argument as an identifier, so a NULL
 -- identifier would presumably produce count("*") rather than count(*) --
 -- confirm that identifier is never NULL.
 create or replace function swh_update_counter_bucketed()
     returns void
     language plpgsql
 as $$
 declare
   query text;
   line_to_update int;
   new_value bigint;
 begin
   select
     object_counts_bucketed.line,
     format(
       'select count(%I) from %I where %s',
       coalesce(identifier, '*'),
       object_type,
       coalesce(
         concat_ws(
           ' and ',
           case when bucket_start is not null then
             format('%I >= %L', identifier, bucket_start) -- lower bound condition, inclusive
           end,
           case when bucket_end is not null then
             format('%I < %L', identifier, bucket_end) -- upper bound condition, exclusive
           end
         ),
         'true'
       )
     )
     from object_counts_bucketed
     order by coalesce(last_update, now() - '1 month'::interval) asc
     limit 1
     into line_to_update, query;
 
   execute query into new_value;
 
   update object_counts_bucketed
     set value = new_value,
         last_update = now()
     where object_counts_bucketed.line = line_to_update;
 
 END
 $$;
 
 -- Trigger function: propagate bucketed counts to object_counts.
 --
 -- For each object type whose buckets all have a value, the bucket values
 -- are summed and written to object_counts, along with the most recent
 -- bucket last_update. Object types with at least one bucket that has no
 -- value yet are skipped, and so are rows whose value would not change.
 create or replace function swh_update_counters_from_buckets()
   returns trigger
   language plpgsql
 as $$
 begin
 with to_update as (
   select object_type, sum(value) as value, max(last_update) as last_update
   from object_counts_bucketed ob1
   where not exists (
     select 1 from object_counts_bucketed ob2
     where ob1.object_type = ob2.object_type
     and value is null
     )
   group by object_type
 ) update object_counts
   set
     value = to_update.value,
     last_update = to_update.last_update
   from to_update
   where
     object_counts.object_type = to_update.object_type
     and object_counts.value != to_update.value;
 return null;
 end
 $$;
 
 -- Refresh object_counts from the buckets, but only when the inserted or
 -- updated bucket has a line number that is a multiple of 256, so the
 -- aggregation does not run on every bucket update.
 create trigger update_counts_from_bucketed
   after insert or update
   on object_counts_bucketed
   for each row
   when (NEW.line % 256 = 0)
   execute procedure swh_update_counters_from_buckets();
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
index 40905d5d..634afa4b 100644
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -1,74 +1,82 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import pytest
 import multiprocessing.util
 
 from hypothesis import settings
 
 try:
     import pytest_cov.embed
 except ImportError:
     pytest_cov = None
 
 from swh.model.tests.generate_testdata import gen_contents, gen_origins
 from swh.model.model import (
     Content,
     Directory,
     Origin,
     OriginVisit,
     Release,
     Revision,
     SkippedContent,
     Snapshot,
 )
 
 
 # Maps an object type name to the swh.model ``from_dict`` constructor that
 # builds the corresponding model object from a plain dict.
 OBJECT_FACTORY = {
     "content": Content.from_dict,
     "directory": Directory.from_dict,
     "origin": Origin.from_dict,
     "origin_visit": OriginVisit.from_dict,
     "release": Release.from_dict,
     "revision": Revision.from_dict,
     "skipped_content": SkippedContent.from_dict,
     "snapshot": Snapshot.from_dict,
 }
 
 
 # define tests profile. Full documentation is at:
 # https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
 # Both profiles share the same deadline; "slow" only runs more examples.
 settings.register_profile("fast", max_examples=5, deadline=5000)
 settings.register_profile("slow", max_examples=20, deadline=5000)
 
 
 if pytest_cov is not None:
     # pytest_cov + multiprocessing can cause a segmentation fault when starting
     # the child process <https://forge.softwareheritage.org/P706>; so we're
     # removing pytest-coverage's hook that runs when a child process starts.
     # This means code run in child processes won't be counted in the coverage
     # report, but this is not an issue because the only code that runs only in
     # child processes is the RPC server.
     for (key, value) in multiprocessing.util._afterfork_registry.items():
         if value is pytest_cov.embed.multiprocessing_start:
             # deleting while iterating is fine here only because we leave
             # the loop immediately afterwards
             del multiprocessing.util._afterfork_registry[key]
             break
     else:
         # reached only when the loop ended without `break`, i.e. the hook
         # was not found in the registry
         assert False, "missing pytest_cov.embed.multiprocessing_start?"
 
 
 @pytest.fixture
 def swh_contents(swh_storage):
     """Generate 20 contents, load them into ``swh_storage`` and return them.

     Contents whose status is "absent" are stored as skipped contents; all
     the others are stored as regular contents.
     """
     contents = gen_contents(n=20)

     present = [item for item in contents if item["status"] != "absent"]
     swh_storage.content_add(present)

     absent = [item for item in contents if item["status"] == "absent"]
     swh_storage.skipped_content_add(absent)

     return contents
 
 
 @pytest.fixture
 def swh_origins(swh_storage):
     """Generate 100 origins, add them to ``swh_storage`` and return them."""
     generated = gen_origins(n=100)
     swh_storage.origin_add(generated)
     return generated
+
+
+@pytest.fixture
+def swh_storage_backend_config(swh_storage_backend_config):
+    """Override of the same-named fixture: yield the inherited backend
+    configuration with an in-memory journal writer added, so that the
+    storage is tested with its journal writer collaborator enabled.
+
+    """
+    yield {**swh_storage_backend_config, "journal_writer": {"cls": "memory",}}
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
index 181af743..f6d1c9ef 100644
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -1,391 +1,395 @@
 # Copyright (C) 2018-2019  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from collections import namedtuple
 import datetime
 import os
 import signal
 import socket
 import subprocess
 import time
 
 import pytest
 
 from swh.storage import get_storage
 from swh.storage.cassandra import create_keyspace
 from swh.storage.cassandra.schema import TABLES, HASH_ALGORITHMS
 
 from swh.storage.tests.test_storage import TestStorage as _TestStorage
 from swh.storage.tests.test_storage import (
     TestStorageGeneratedData as _TestStorageGeneratedData,
 )
 
 from .storage_data import data
 
 
 # Template of the cassandra.yaml written for the test cluster. It is rendered
 # with str.format() and expects three fields: data_dir, storage_port and
 # native_transport_port.
 # NOTE(review): commitlog_sync_period_in_ms appears twice below (1000000,
 # then 100000); which one wins depends on the YAML loader -- confirm which
 # value is intended and drop the other.
 CONFIG_TEMPLATE = """
 data_file_directories:
     - {data_dir}/data
 commitlog_directory: {data_dir}/commitlog
 hints_directory: {data_dir}/hints
 saved_caches_directory: {data_dir}/saved_caches
 
 commitlog_sync: periodic
 commitlog_sync_period_in_ms: 1000000
 partitioner: org.apache.cassandra.dht.Murmur3Partitioner
 endpoint_snitch: SimpleSnitch
 seed_provider:
     - class_name: org.apache.cassandra.locator.SimpleSeedProvider
       parameters:
           - seeds: "127.0.0.1"
 
 storage_port: {storage_port}
 native_transport_port: {native_transport_port}
 start_native_transport: true
 listen_address: 127.0.0.1
 
 enable_user_defined_functions: true
 
 # speed-up by disabling period saving to disk
 key_cache_save_period: 0
 row_cache_save_period: 0
 trickle_fsync: false
 commitlog_sync_period_in_ms: 100000
 """
 
 
 def free_port():
     """Return a TCP port number that was free on 127.0.0.1 at call time.
 
     The port is chosen by the OS (bind to port 0) and released before
     returning, so another process may still grab it before the caller
     gets to use it.
     """
     # the context manager closes the socket even if bind() raises
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
         sock.bind(("127.0.0.1", 0))
         return sock.getsockname()[1]
 
 
 def wait_for_peer(addr, port, timeout=20):
     """Poll until a TCP connection to ``(addr, port)`` is accepted.
 
     Args:
         addr: host to connect to
         port: TCP port to connect to
         timeout: how long to keep retrying, in seconds
 
     Returns:
         True as soon as one connection attempt succeeds, False if every
         attempt was refused until the timeout elapsed.
     """
     wait_until = time.time() + timeout
     while time.time() < wait_until:
         # one fresh socket per attempt; the context manager closes it
         # whether the connection succeeded or was refused
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
             try:
                 sock.connect((addr, port))
             except ConnectionRefusedError:
                 pass
             else:
                 return True
         # peer not listening yet: back off briefly before retrying
         time.sleep(0.1)
     return False
 
 
 # Session-scoped fixture: writes a cassandra.yaml into temporary directories,
 # starts a Cassandra process on freshly picked ports, yields
 # (["127.0.0.1"], native_transport_port) once that port accepts connections,
 # and kills the process group at teardown.
 @pytest.fixture(scope="session")
 def cassandra_cluster(tmpdir_factory):
     cassandra_conf = tmpdir_factory.mktemp("cassandra_conf")
     cassandra_data = tmpdir_factory.mktemp("cassandra_data")
     cassandra_log = tmpdir_factory.mktemp("cassandra_log")
     native_transport_port = free_port()
     storage_port = free_port()
     jmx_port = free_port()
 
     with open(str(cassandra_conf.join("cassandra.yaml")), "w") as fd:
         fd.write(
             CONFIG_TEMPLATE.format(
                 data_dir=str(cassandra_data),
                 storage_port=storage_port,
                 native_transport_port=native_transport_port,
             )
         )
 
     # with the logging variable set, the child inherits our stdout/stderr
     # (None); otherwise its output is discarded
-    if os.environ.get("LOG_CASSANDRA"):
+    if os.environ.get("SWH_CASSANDRA_LOG"):
         stdout = stderr = None
     else:
         stdout = stderr = subprocess.DEVNULL
+
     # path of the cassandra executable; overridable through the environment
+    cassandra_bin = os.environ.get("SWH_CASSANDRA_BIN", "/usr/sbin/cassandra")
     proc = subprocess.Popen(
         [
-            "/usr/sbin/cassandra",
+            cassandra_bin,
             "-Dcassandra.config=file://%s/cassandra.yaml" % cassandra_conf,
             "-Dcassandra.logdir=%s" % cassandra_log,
             "-Dcassandra.jmx.local.port=%d" % jmx_port,
             "-Dcassandra-foreground=yes",
         ],
         # own session, hence own process group: lets teardown kill the whole
         # group with killpg() below
         start_new_session=True,
         # NOTE(review): env= replaces the child's whole environment, so
         # variables such as PATH or JAVA_HOME are not forwarded -- confirm
         # the launcher does not need them, in particular with a custom
         # SWH_CASSANDRA_BIN.
         env={
             "MAX_HEAP_SIZE": "300M",
             "HEAP_NEWSIZE": "50M",
             "JVM_OPTS": "-Xlog:gc=error:file=%s/gc.log" % cassandra_log,
         },
         stdout=stdout,
         stderr=stderr,
     )
 
     running = wait_for_peer("127.0.0.1", native_transport_port)
 
     if running:
         yield (["127.0.0.1"], native_transport_port)
 
     # dump the debug log on startup failure, or whenever logging was requested
-    if not running or os.environ.get("LOG_CASSANDRA"):
-        with open(str(cassandra_log.join("debug.log"))) as fd:
-            print(fd.read())
+    if not running or os.environ.get("SWH_CASSANDRA_LOG"):
+        debug_log_path = str(cassandra_log.join("debug.log"))
+        if os.path.exists(debug_log_path):
+            with open(debug_log_path) as fd:
+                print(fd.read())
 
     # NOTE(review): raising here skips the killpg() below, so a process that
     # merely failed to open its port in time is not killed by this fixture;
     # also, the process is never wait()ed for after SIGKILL -- confirm whether
     # the teardown should run in a try/finally.
     if not running:
         raise Exception("cassandra process stopped unexpectedly.")
 
     pgrp = os.getpgid(proc.pid)
     os.killpg(pgrp, signal.SIGKILL)
 
 
 class RequestHandler:
     """Debugging helper that prints the query carried by a request."""
 
     def on_request(self, rf):
         """Print an empty line followed by ``rf.message.query``.
 
         Messages without a ``query`` attribute are ignored.
         """
         if not hasattr(rf.message, "query"):
             return
         print()
         print(rf.message.query)
 
 
 @pytest.fixture(scope="session")
 def keyspace(cassandra_cluster):
     """Create a keyspace with a random 20-hex-digit name on the test
     cluster and return that name.
     """
     hosts, port = cassandra_cluster
     name = os.urandom(10).hex()
     create_keyspace(hosts, name, port)
     return name
 
 
 # Tests are executed using the imported classes (TestStorage and
 # TestStorageGeneratedData), with the swh_storage_backend_config fixture
 # overridden below.
 
 
 @pytest.fixture
 def swh_storage_backend_config(cassandra_cluster, keyspace):
     """Yield a cassandra storage configuration bound to the test cluster
     and keyspace, then truncate every table once the test is over.
     """
     hosts, port = cassandra_cluster
 
     storage_config = {
         "cls": "cassandra",
         "hosts": hosts,
         "port": port,
         "keyspace": keyspace,
         "journal_writer": {"cls": "memory"},
         "objstorage": {"cls": "memory", "args": {}},
     }
 
     yield storage_config
 
     # teardown: open a fresh storage on the same config to wipe all tables
     storage = get_storage(**storage_config)
 
     for table_name in TABLES:
         storage._cql_runner._session.execute('TRUNCATE TABLE "%s"' % table_name)
 
     storage._cql_runner._cluster.shutdown()
 
 
 @pytest.mark.cassandra
 class TestCassandraStorage(_TestStorage):
     """Runs the generic storage tests against the cassandra backend, adds
     tests for Murmur3 token collisions, and skips the tests the backend
     does not support.
     """
 
     def test_content_add_murmur3_collision(self, swh_storage, mocker):
         """The Murmur3 token is used as link from index tables to the main
         table; and non-matching contents with colliding murmur3-hash
         are filtered-out when reading the main table.
         This test checks that content_add still adds a content when the
         token lookup only returns a different, colliding content.
         """
         # counts the calls made to both mocked _cql_runner methods
         called = 0
 
         # always return a token
         def mock_cgtfsh(algo, hash_):
             nonlocal called
             called += 1
             assert algo in ("sha1", "sha1_git")
             return [123456]
 
         mocker.patch.object(
             swh_storage.storage._cql_runner,
             "content_get_tokens_from_single_hash",
             mock_cgtfsh,
         )
 
         # For all tokens, always return data.cont
         Row = namedtuple("Row", HASH_ALGORITHMS)
 
         def mock_cgft(token):
             nonlocal called
             called += 1
             return [Row(**{algo: data.cont[algo] for algo in HASH_ALGORITHMS})]
 
         mocker.patch.object(
             swh_storage.storage._cql_runner, "content_get_from_token", mock_cgft
         )
 
         # data.cont2 differs from the data.cont row returned by the mock
         actual_result = swh_storage.content_add([data.cont2])
 
         assert called == 4
         assert actual_result == {
             "content:add": 1,
             "content:add:bytes": data.cont2["length"],
         }
 
     def test_content_get_metadata_murmur3_collision(self, swh_storage, mocker):
         """The Murmur3 token is used as link from index tables to the main
         table; and non-matching contents with colliding murmur3-hash
         are filtered-out when reading the main table.
         This test checks that content_get_metadata filters out the
         colliding content.
         """
         # counts the calls made to both mocked _cql_runner methods
         called = 0
 
         # always return a token
         def mock_cgtfsh(algo, hash_):
             nonlocal called
             called += 1
             assert algo in ("sha1", "sha1_git")
             return [123456]
 
         mocker.patch.object(
             swh_storage.storage._cql_runner,
             "content_get_tokens_from_single_hash",
             mock_cgtfsh,
         )
 
         # For all tokens, always return data.cont and data.cont2
         cols = list(set(data.cont) - {"data"})
         Row = namedtuple("Row", cols + ["ctime"])
 
         def mock_cgft(token):
             nonlocal called
             called += 1
             return [
                 Row(ctime=42, **{col: cont[col] for col in cols})
                 for cont in [data.cont, data.cont2]
             ]
 
         mocker.patch.object(
             swh_storage.storage._cql_runner, "content_get_from_token", mock_cgft
         )
 
         expected_cont = data.cont.copy()
         del expected_cont["data"]
 
         actual_result = swh_storage.content_get_metadata([data.cont["sha1"]])
 
         assert called == 2
 
         # but data.cont2 should be filtered out
         assert actual_result == {data.cont["sha1"]: [expected_cont]}
 
     def test_content_find_murmur3_collision(self, swh_storage, mocker):
         """The Murmur3 token is used as link from index tables to the main
         table; and non-matching contents with colliding murmur3-hash
         are filtered-out when reading the main table.
         This test checks that content_find filters out the colliding
         content.
         """
         # counts the calls made to both mocked _cql_runner methods
         called = 0
 
         # always return a token
         def mock_cgtfsh(algo, hash_):
             nonlocal called
             called += 1
             assert algo in ("sha1", "sha1_git")
             return [123456]
 
         mocker.patch.object(
             swh_storage.storage._cql_runner,
             "content_get_tokens_from_single_hash",
             mock_cgtfsh,
         )
 
         # For all tokens, always return data.cont and data.cont2
         cols = list(set(data.cont) - {"data"})
         Row = namedtuple("Row", cols + ["ctime"])
 
         def mock_cgft(token):
             nonlocal called
             called += 1
             return [
                 Row(ctime=datetime.datetime.now(), **{col: cont[col] for col in cols})
                 for cont in [data.cont, data.cont2]
             ]
 
         mocker.patch.object(
             swh_storage.storage._cql_runner, "content_get_from_token", mock_cgft
         )
 
         expected_cont = data.cont.copy()
         del expected_cont["data"]
 
         actual_result = swh_storage.content_find({"sha1": data.cont["sha1"]})
 
         assert called == 2
 
         # but data.cont2 should be filtered out
         # (ctime is set to now() by the mock above, so it is dropped before
         # comparing)
         del actual_result[0]["ctime"]
         assert actual_result == [expected_cont]
 
     @pytest.mark.skip("content_update is not yet implemented for Cassandra")
     def test_content_update(self):
         pass
 
     @pytest.mark.skip(
         'The "person" table of the pgsql is a legacy thing, and not '
         "supported by the cassandra backend."
     )
     def test_person_fullname_unicity(self):
         pass
 
     @pytest.mark.skip(
         'The "person" table of the pgsql is a legacy thing, and not '
         "supported by the cassandra backend."
     )
     def test_person_get(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_count(self):
         pass
 
 
 @pytest.mark.cassandra
 class TestCassandraStorageGeneratedData(_TestStorageGeneratedData):
     """Runs the generated-data storage tests against the cassandra backend.
 
     Every method below overrides an inherited test with a skipped no-op,
     because the corresponding feature is marked as not supported by
     Cassandra.
     """
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_count(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_get_range(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_get_range_from_zero(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_generate_content_get_range_limit(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_generate_content_get_range_no_limit(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_generate_content_get_range(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_generate_content_get_range_empty(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_generate_content_get_range_limit_none(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_generate_content_get_range_full(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_count_with_visit_no_visits(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_count_with_visit_with_visits_and_snapshot(self):
         pass
 
     @pytest.mark.skip("Not supported by Cassandra")
     def test_origin_count_with_visit_with_visits_no_snapshot(self):
         pass
diff --git a/tox.ini b/tox.ini
index e82bc0ab..cf5cbabd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,41 +1,42 @@
 [tox]
 envlist=black,flake8,mypy,py3
 
 [testenv]
 extras =
   testing
 deps =
   pytest-cov
   dev: ipdb
 passenv =
-  LOG_CASSANDRA
+  SWH_CASSANDRA_BIN
+  SWH_CASSANDRA_LOG
 commands =
   pytest \
     !slow: --hypothesis-profile=fast \
     slow:  --hypothesis-profile=slow \
          --cov={envsitepackagesdir}/swh/storage \
          {envsitepackagesdir}/swh/storage \
          --doctest-modules \
          --cov-branch {posargs}
 
 [testenv:black]
 skip_install = true
 deps =
   black
 commands =
   {envpython} -m black --check swh
 
 [testenv:flake8]
 skip_install = true
 deps =
   flake8
 commands =
   {envpython} -m flake8
 
 [testenv:mypy]
 extras =
   testing
 deps =
   mypy
 commands =
   mypy swh