diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,10 +1,137 @@
-.. _swh-py-template:
+.. _swh-provenance:
 
-Software Heritage - Python module template
-==========================================
+Software Heritage Provenance
+============================
 
-Python module template, used as skeleton to create new modules.
+A provenance index database based on the Software Heritage Archive. This is an
+implementation of the paper `Software Provenance Tracking at the Scale of
+Public Source Code`_ published in `Empirical Software Engineering`_.
 
+This provenance index database is a tool to help answer the question "where
+does this source code artifact come from?", which the main Software Heritage
+Archive cannot easily answer.
+
+
+Quick Start
+-----------
+
+Database creation
+~~~~~~~~~~~~~~~~~
+
+Create a provenance index database (in this example we use pifpaf_ to easily
+set up a test PostgreSQL database; adapt the example below to your PostgreSQL
+setup):
+
+.. code-block:: shell
+
+  eval $(pifpaf run postgresql)
+  swh db create -d provdb provenance
+  swh db init-admin -d provdb provenance
+  swh db init -d provdb provenance
+
+The provenance index DB has 2 feature flags, so there are 4 possible flavors. Feature flags are:
+
+- ``with-path`` / ``without-path``: whether the provenance index database will store file paths,
+- ``normalized`` / ``denormalized``: whether or not the main relation tables are normalized (see below).
+
+So the possible flavors are:
+
+- ``with-path``
+- ``without-path``
+- ``with-path-denormalized``
+- ``without-path-denormalized``
+
+Filling the provenance index database
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This step requires access to the Software Heritage Archive to retrieve the
+actual data from the Archive.
+
+It currently also needs an input CSV file of revisions and origins to insert in
+the provenance database.
+
+Examples of such files are available in the `provenance public dataset`_.
+
+.. _`provenance public dataset`: https://annex.softwareheritage.org/public/dataset/provenance
+
+.. code-block:: shell
+
+  wget https://annex.softwareheritage.org/public/dataset/provenance/sample_10k.csv.bz2
+  bunzip2 sample_10k.csv.bz2
+
+You need a configuration file, like:
+
+.. code-block:: shell
+
+  cat config.yaml
+  provenance:
+    storage:
+      cls: local
+      db:
+        host: /tmp/tmpifn2ov_j
+        port: 9824
+        dbname: provdb
+    archive:
+      cls: api
+      storage:
+        cls: remote
+        url: http://storage:5002/
+
+Note that you need access to the internal API of a :ref:`swh-storage
+<swh-storage>` instance (here the machine named ``storage``) for this.
+
+Then you can feed the provenance index database using:
+
+.. code-block:: shell
+
+  swh provenance -C config.yaml iter-revisions sample_10k.csv
+
+
+This may take a while to complete.
+
+Querying the provenance index database
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using the same config file, you may look for the first known occurrence of a file content:
+
+.. code-block:: shell
+
+  swh provenance -C config.yaml find-first 8a54694c92c944fcb06d73c17743ac72444a5b72
+  swh:1:cnt:8a54694c92c944fcb06d73c17743ac72444a5b72, swh:1:rev:6193fae0668d082d90207f6c9f33d6e8c98dd04a, 2008-10-06 18:32:23+00:00, None, lua/effects/bloodstream/init.lua
+
+
+Or all the known occurrences:
+
+.. code-block:: shell
+
+  swh provenance -C config.yaml find-all 8a54694c92c944fcb06d73c17743ac72444a5b72
+  swh:1:cnt:8a54694c92c944fcb06d73c17743ac72444a5b72, swh:1:rev:6193fae0668d082d90207f6c9f33d6e8c98dd04a, 2008-10-06 18:32:23+00:00, None, lua/effects/bloodstream/init.lua
+  swh:1:cnt:8a54694c92c944fcb06d73c17743ac72444a5b72, swh:1:rev:f0a5078eed8808323b93ed09cddb003dbe2a85e4, 2008-10-06 18:32:23+00:00, None, trunk/lua/effects/bloodstream/init.lua
+  [...]
+
+
+(De)normalized database
+-----------------------
+
+For some relation tables (like ``content_in_revision``, which stores, for each
+content object, the revisions in which it has been found), the default data
+schema is to store one row for each relation.
+
+For a big database, this can have a significant cost in terms of storage.
+
+So it is possible to store these relations using an array as destination column
+(the ``revision`` column in the case of the ``content_in_revision`` table).
+
+This can drastically reduce the database storage size, possibly at the price of
+a slight performance hit.
+
+Warning: the denormalized version of the database is still under test and
+validation. Do not use for serious work.
+
+
+.. _`Empirical Software Engineering`: http://link.springer.com/journal/10664
+.. _`Software Provenance Tracking at the Scale of Public Source Code`: http://dx.doi.org/10.1007/s10664-020-09828-5
+.. _pifpaf: https://github.com/jd/pifpaf
 
 .. toctree::
    :maxdepth: 2
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -21,8 +21,7 @@
 
 # All generic config code should reside in swh.core.config
 CONFIG_ENVVAR = "SWH_CONFIG_FILENAME"
-DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml")
-DEFAULT_PATH = os.environ.get(CONFIG_ENVVAR, DEFAULT_CONFIG_PATH)
+DEFAULT_PATH = os.environ.get(CONFIG_ENVVAR, None)
 
 DEFAULT_CONFIG: Dict[str, Any] = {
     "provenance": {
@@ -47,21 +46,17 @@
 }
 
 
-CONFIG_FILE_HELP = f"""Configuration file:
-
-\b
-The CLI option or the environment variable will fail if invalid.
-CLI option is checked first.
-Then, environment variable {CONFIG_ENVVAR} is checked.
-Then, if cannot load the default path, a set of default values are used.
-Default config path is {DEFAULT_CONFIG_PATH}.
-Default config values are:
+CONFIG_FILE_HELP = f"""
+\b Configuration can be loaded from a yaml file given either as --config-file
+option or the {CONFIG_ENVVAR} environment variable. If no configuration file
+is specified, use the following default configuration::
 
 \b
 {yaml.dump(DEFAULT_CONFIG)}"""
-PROVENANCE_HELP = f"""Software Heritage Scanner tools.
+PROVENANCE_HELP = f"""Software Heritage provenance index database tools
 
-{CONFIG_FILE_HELP}"""
+{CONFIG_FILE_HELP}
+"""
 
 
 @swh_cli_group.group(
@@ -71,7 +66,7 @@
     "-C",
     "--config-file",
     default=None,
-    type=click.Path(exists=False, dir_okay=False, path_type=str),
+    type=click.Path(exists=True, dir_okay=False, path_type=str),
     help="""YAML configuration file.""",
 )
 @click.option(
@@ -83,17 +78,20 @@
 )
 @click.pass_context
 def cli(ctx: click.core.Context, config_file: Optional[str], profile: str) -> None:
-    if config_file is None and config.config_exists(DEFAULT_PATH):
+    if (
+        config_file is None
+        and DEFAULT_PATH is not None
+        and config.config_exists(DEFAULT_PATH)
+    ):
         config_file = DEFAULT_PATH
 
     if config_file is None:
         conf = DEFAULT_CONFIG
     else:
         # read_raw_config do not fail on ENOENT
-        if not config.config_exists(config_file):
+        if not os.path.exists(config_file):
             raise FileNotFoundError(config_file)
-        conf = config.read_raw_config(config.config_basepath(config_file))
-        conf = config.merge_configs(DEFAULT_CONFIG, conf)
+        conf = yaml.safe_load(open(config_file, "rb"))
 
     ctx.ensure_object(dict)
     ctx.obj["config"] = conf