diff --git a/docker/.gitignore b/docker/.gitignore new file mode 100644 index 0000000..6b8a876 --- /dev/null +++ b/docker/.gitignore @@ -0,0 +1,3 @@ +docker-compose.override.yml +docker-compose.storage-replica.override.yml +tests/swh-docker-compose.logs diff --git a/docker/CODE_OF_CONDUCT.md b/docker/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0ad22b5 --- /dev/null +++ b/docker/CODE_OF_CONDUCT.md @@ -0,0 +1,78 @@ +# Software Heritage Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as Software +Heritage contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, regardless +of age, body size, disability, ethnicity, sex characteristics, gender identity +and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at `conduct@softwareheritage.org`. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an +incident. Further details of specific enforcement policies may be posted +separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/docker/CONTRIBUTORS b/docker/CONTRIBUTORS new file mode 100644 index 0000000..acd099c --- /dev/null +++ b/docker/CONTRIBUTORS @@ -0,0 +1,2 @@ +Archit Agrawal +Rob Guinness diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..af78d10 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,46 @@ +FROM python:3.7 + +RUN export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && apt-get upgrade -y && \ + apt-get install -y \ + libapr1-dev \ + libaprutil1-dev \ + libpq-dev \ + libsvn-dev \ + libsystemd-dev \ + postgresql-client \ + wait-for-it \ + ngrep && \ + apt-get install -y --no-install-recommends \ + r-base-core \ + r-cran-jsonlite && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN useradd -md /srv/softwareheritage -s /bin/bash swh +USER swh + +RUN python3 -m venv /srv/softwareheritage/venv +ENV PATH="/srv/softwareheritage/venv/bin:${PATH}" +RUN pip install --upgrade pip setuptools wheel +RUN pip install 'gunicorn<20' +RUN pip install cassandra-driver + +RUN pip install \ + swh-core[db,http] \ + swh-deposit[server] \ + swh-indexer \ + swh-journal \ + swh-lister \ + swh-loader-core \ + swh-loader-git \ + swh-loader-mercurial \ + swh-loader-svn \ + swh-storage \ + swh-objstorage \ + swh-scheduler \ + swh-vault \ + swh-web + +COPY utils/*.sh /srv/softwareheritage/utils/ +RUN mkdir -p /srv/softwareheritage/objects diff --git a/docker/Makefile b/docker/Makefile new file mode 100644 index 0000000..527b92c --- /dev/null +++ b/docker/Makefile @@ -0,0 +1,3 @@ +.PHONY: check-staged +check-staged: + docker-compose config -q diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..90a9c8c --- /dev/null +++ b/docker/README.md @@ -0,0 +1,679 @@ +# swh-docker-dev + +This repo contains Dockerfiles to allow developers to run a small +Software Heritage instance on their development computer. + +The end goal is to smooth the contributors/developers workflow. Focus +on coding, not configuring! + +WARNING: Running a Software Heritage instance on your machine can consume + quite a bit of resources: if you play a bit too hard (e.g., if you + try to list all GitHub repositories with the corresponding lister), + you may fill your hard drive, and consume a lot of CPU, memory and + network bandwidth. + + +## Dependencies + +This uses docker with docker-compose, so ensure you have a working +docker environment and docker-compose is installed. + +We recommend using the latest version of docker, so please read +https://docs.docker.com/install/linux/docker-ce/debian/ for more details on how +to install docker on your machine. + +On a debian system, docker-compose can be installed from debian repositories. +On a stable (stretch) machine, it is recommended to install the version from +[backports](https://backports.debian.org/Instructions/): + +``` +~$ sudo apt install -t stretch-backports docker-compose +``` + +## Quick start + +First, clone this repository. + +If you already have followed the +[[https://docs.softwareheritage.org/devel/developer-setup.html|developer setup guide]], +then you should already have a copy of the swh-docker-env git repository. 
Use
+it:
+
+```
+~$ cd swh-environment/swh-docker-dev
+```
+
+Otherwise, we suggest creating a `swh-environment`
+directory in which this repo will be cloned, so that you can later run some
+components in docker containers with code overridden from local repositories (see
+[[<#using-docker-setup-development-and-integration-testing>|below]]):
+
+```
+~$ mkdir swh-environment
+~$ cd swh-environment
+~/swh-environment$ git clone https://forge.softwareheritage.org/source/swh-docker-dev.git
+~/swh-environment$ cd swh-docker-dev
+```
+
+Then, start the containers:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose up -d
+[...]
+Creating swh-docker-dev_amqp_1 ... done
+Creating swh-docker-dev_zookeeper_1 ... done
+Creating swh-docker-dev_kafka_1 ... done
+Creating swh-docker-dev_flower_1 ... done
+Creating swh-docker-dev_swh-scheduler-db_1 ... done
+[...]
+```
+
+This will build the docker images and run them.
+Check that everything is running fine with:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose ps
+              Name                            Command               State                                   Ports
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+swh-docker-dev_amqp_1             docker-entrypoint.sh rabbi ...   Up      15671/tcp, 0.0.0.0:5018->15672/tcp, 25672/tcp, 4369/tcp, 5671/tcp, 5672/tcp
+swh-docker-dev_flower_1           flower --broker=amqp://gue ...   Up      0.0.0.0:5555->5555/tcp
+swh-docker-dev_kafka_1            start-kafka.sh                   Up      0.0.0.0:9092->9092/tcp
+swh-docker-dev_swh-deposit-db_1   docker-entrypoint.sh postgres    Up      5432/tcp
+swh-docker-dev_swh-deposit_1      /entrypoint.sh                   Up      0.0.0.0:5006->5006/tcp
+[...]
+```
+
+At the time of writing this guide, the startup of some containers may fail the
+first time due to dependency-related problems. If some containers failed to start,
+just run the `docker-compose up -d` command again.
+
+If a container really refuses to start properly, you can check why using the
+`docker-compose logs` command. For example:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose logs swh-lister
+Attaching to swh-docker-dev_swh-lister_1
+[...]
+swh-lister_1 | Processing /src/swh-scheduler
+swh-lister_1 | Could not install packages due to an EnvironmentError: [('/src/swh-scheduler/.hypothesis/unicodedata/8.0.0/charmap.json.gz', '/tmp/pip-req-build-pm7nsax3/.hypothesis/unicodedata/8.0.0/charmap.json.gz', "[Errno 13] Permission denied: '/src/swh-scheduler/.hypothesis/unicodedata/8.0.0/charmap.json.gz'")]
+swh-lister_1 |
+```
+
+Once all containers are running, you can use the web interface by opening
+http://localhost:5080/ in your web browser.
+
+At this point, the archive is empty and needs to be filled with some content.
+To do so, you can create tasks that will scrape a forge. For example, to inject
+the code from the https://0xacab.org gitlab forge:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \
+    swh scheduler task add list-gitlab-full \
+      -p oneshot url=https://0xacab.org/api/v4
+
+Created 1 tasks
+
+Task 1
+  Next run: just now (2018-12-19 14:58:49+00:00)
+  Interval: 90 days, 0:00:00
+  Type: list-gitlab-full
+  Policy: oneshot
+  Args:
+  Keyword args:
+    url=https://0xacab.org/api/v4
+```
+
+This task will scrape the forge's project list and create subtasks to inject
+each git repository found there.
+
+This will take a bit of time to complete.
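+
+While the listing and loading tasks run, you can follow what the workers are
+doing by tailing their logs (this assumes the `swh-lister` and `swh-loader`
+service names used elsewhere in this guide):
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose logs --tail=20 -f swh-lister swh-loader
+```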
+ +To increase the speed at which git repositories are imported, you can spawn more +`swh-loader-git` workers: + +``` +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + celery status +listers@50ac2185c6c9: OK +loader@b164f9055637: OK +indexer@33bc6067a5b8: OK +vault@c9fef1bbfdc1: OK + +4 nodes online. +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + celery control pool_grow 3 -d loader@b164f9055637 +-> loader@b164f9055637: OK + pool will grow +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + celery inspect -d loader@b164f9055637 stats | grep prefetch_count + "prefetch_count": 4 +``` + +Now there are 4 workers ingesting git repositories. +You can also increase the number of `swh-loader-git` containers: + +``` +~/swh-environment/swh-docker-dev$ docker-compose up -d --scale swh-loader=4 +[...] +Creating swh-docker-dev_swh-loader_2 ... done +Creating swh-docker-dev_swh-loader_3 ... done +Creating swh-docker-dev_swh-loader_4 ... done +``` + +## Updating the docker image + +All containers started by `docker-compose` are bound to a docker image +named `swh/stack` including all the software components of Software Heritage. +When new versions of these components are released, the docker image will not +be automatically updated. In order to update all Software heritage components +to their latest version, the docker image needs to be explicitly rebuilt by +issuing the following command inside the `swh-docker-dev` directory: + +``` +~/swh-environment/swh-docker-dev$ docker build --no-cache -t swh/stack . +``` + +## Details + +This runs the following services on their respectively standard ports, +all of the following services are configured to communicate with each +other: + +- swh-storage-db: a `softwareheritage` instance db that stores the + Merkle DAG, + +- swh-objstorage: Content-addressable object storage, + +- swh-storage: Abstraction layer over the archive, allowing to access + all stored source code artifacts as well as their metadata, + +- swh-web: the swh's web interface over the storage, + +- swh-scheduler: the API service as well as 2 utilities, + the runner and the listener, + +- swh-lister: celery workers dedicated to running lister tasks, + +- swh-loaders: celery workers dedicated to importing/updating source code + content (VCS repos, source packages, etc.), + +- swh-journal: Persistent logger of changes to the archive, with + publish-subscribe support. + +That means, you can start doing the ingestion using those services using the +same setup described in the getting-started starting directly at +https://docs.softwareheritage.org/devel/getting-started.html#step-4-ingest-repositories + +### Exposed Ports + +Several services have their listening ports exposed on the host: + +- amqp: 5072 +- kafka: 5092 +- nginx: 5080 + +And for SWH services: + +- scheduler API: 5008 +- storage API: 5002 +- object storage API: 5003 +- indexer API: 5007 +- web app: 5004 +- deposit app: 5006 + +Beware that these ports are not the same as the ports used from within the +docker network. This means that the same command executed from the host or from +a docker container will not use the same urls to access services. For example, +to use the `celery` utility from the host, you may type: + +``` +~/swh-environment/swh-docker-dev$ CELERY_BROKER_URL=amqp://:5072// celery status +loader@61704103668c: OK +[...] 
+```
+
+To run the same command from within a container:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api celery status
+loader@61704103668c: OK
+[...]
+```
+
+## Managing tasks
+
+One of the main components of the Software Heritage platform is the task system.
+It is used to manage everything related to background processes, like
+discovering new git repositories to import, ingesting them, checking that a known
+repository is up to date, etc.
+
+The task system is based on Celery but uses a custom database-based scheduler.
+
+So the term 'task' may designate either a Celery task or a SWH one
+(i.e. the entity in the database). An unqualified "task" in this
+documentation designates the SWH task.
+
+When a SWH task is ready to be executed, a Celery task is created to handle the
+actual SWH task's job. Note that not all Celery tasks are directly linked to a
+SWH task (some SWH tasks are implemented using a Celery task that spawns Celery
+subtasks).
+
+A (SWH) task can be `recurring` or `oneshot`. `oneshot` tasks are only executed
+once, whereas `recurring` tasks are executed regularly. The scheduling configuration
+of these recurring tasks can be set via the fields `current_interval` and
+`priority` (can be 'high', 'normal' or 'low') of the task database entity.
+
+
+### Inserting a new lister task
+
+To list the content of a source code provider like GitHub or a Debian
+distribution, you can add a new lister task.
+
+This task will (generally) scrape a web page or use a public API to identify
+the list of published software artefacts (git repos, Debian source packages,
+etc.).
+
+Then, for each repository, a new task will be created to ingest this repository
+and keep it up to date.
+
+For example, to add a `oneshot` task that will list git repos on the
+0xacab.org gitlab instance, one can do (from this git repository):
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \
+    swh scheduler task add list-gitlab-full \
+      -p oneshot url=https://0xacab.org/api/v4
+
+Created 1 tasks
+
+Task 12
+  Next run: just now (2018-12-19 14:58:49+00:00)
+  Interval: 90 days, 0:00:00
+  Type: list-gitlab-full
+  Policy: oneshot
+  Args:
+  Keyword args:
+    url=https://0xacab.org/api/v4
+```
+
+This will insert a new task in the scheduler.
To list existing tasks for a +given task type: + +``` +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + swh scheduler task list-pending list-gitlab-full + +Found 1 list-gitlab-full tasks + +Task 12 + Next run: 2 minutes ago (2018-12-19 14:58:49+00:00) + Interval: 90 days, 0:00:00 + Type: list-gitlab-full + Policy: oneshot + Args: + Keyword args: + url=https://0xacab.org/api/v4 +``` + +To list all existing task types: + +``` +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + swh scheduler task-type list + +Known task types: +load-svn-from-archive: + Loading svn repositories from svn dump +load-svn: + Create dump of a remote svn repository, mount it and load it +load-deposit: + Loading deposit archive into swh through swh-loader-tar +check-deposit: + Pre-checking deposit step before loading into swh archive +cook-vault-bundle: + Cook a Vault bundle +load-hg: + Loading mercurial repository swh-loader-mercurial +load-hg-from-archive: + Loading archive mercurial repository swh-loader-mercurial +load-git: + Update an origin of type git +list-github-incremental: + Incrementally list GitHub +list-github-full: + Full update of GitHub repos list +list-debian-distribution: + List a Debian distribution +list-gitlab-incremental: + Incrementally list a Gitlab instance +list-gitlab-full: + Full update of a Gitlab instance's repos list +list-pypi: + Full pypi lister +load-pypi: + Load Pypi origin +index-mimetype: + Mimetype indexer task +index-mimetype-for-range: + Mimetype Range indexer task +index-fossology-license: + Fossology license indexer task +index-fossology-license-for-range: + Fossology license range indexer task +index-origin-head: + Origin Head indexer task +index-revision-metadata: + Revision Metadata indexer task +index-origin-metadata: + Origin Metadata indexer task + +``` + + +### Monitoring activity + +You can monitor the workers activity by connecting to the RabbitMQ console on +`http://localhost:5080/rabbitmq` or the grafana dashboard on +`http://localhost:5080/grafana`. 
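+
+Besides these web dashboards, you can also query the workers from the command
+line. For example, to see which celery tasks are currently being executed, one
+option (reusing the `swh-scheduler-api` container as in the examples above) is
+the standard `celery inspect active` subcommand:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \
+    celery inspect active
+```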
+
+If you cannot see any task being executed, check the logs of the
+`swh-scheduler-runner` service (here is a failure example due to the
+debian lister task not being properly registered on the
+swh-scheduler-runner service):
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose logs --tail=10 swh-scheduler-runner
+Attaching to swh-docker-dev_swh-scheduler-runner_1
+swh-scheduler-runner_1 | "__main__", mod_spec)
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/runpy.py", line 85, in _run_code
+swh-scheduler-runner_1 | exec(code, run_globals)
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/site-packages/swh/scheduler/celery_backend/runner.py", line 107, in
+swh-scheduler-runner_1 | run_ready_tasks(main_backend, main_app)
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/site-packages/swh/scheduler/celery_backend/runner.py", line 81, in run_ready_tasks
+swh-scheduler-runner_1 | task_types[task['type']]['backend_name']
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/site-packages/celery/app/registry.py", line 21, in __missing__
+swh-scheduler-runner_1 | raise self.NotRegistered(key)
+swh-scheduler-runner_1 | celery.exceptions.NotRegistered: 'swh.lister.debian.tasks.DebianListerTask'
+```
+
+
+## Using docker setup development and integration testing
+
+If you hack the code of one or more archive components with a
+virtualenv-based setup as described in the
+[[https://docs.softwareheritage.org/devel/developer-setup.html|developer
+setup guide]], you may want to test your modifications in a working
+Software Heritage instance. The simplest way to achieve this is to use
+this docker-based environment.
+
+If you haven't followed the
+[[https://docs.softwareheritage.org/devel/developer-setup.html|developer setup guide]],
+you must clone the [swh-environment] repo in your `swh-environment`
+directory:
+
+```
+~/swh-environment$ git clone https://forge.softwareheritage.org/source/swh-environment.git .
+```
+
+Note the `.` at the end of this command: we want the git repository to be
+cloned directly in the `~/swh-environment` directory, not in a subdirectory.
+Also note that if you haven't done it yet and you want to hack the source code
+of one or more Software Heritage packages, you really should read the
+[[https://docs.softwareheritage.org/devel/developer-setup.html|developer setup guide]].
+
+From there, we will check out or update all the swh packages:
+
+```
+~/swh-environment$ ./bin/update
+```
+
+### Install a swh package from sources in a container
+
+It is possible to run a docker container with some swh packages installed from
+sources instead of using the latest published packages from PyPI. To do this
+you must write a docker-compose override file (`docker-compose.override.yml`).
+An example is given in the `docker-compose.override.yml.example` file:
+
+``` yaml
+version: '2'
+
+services:
+  swh-objstorage:
+    volumes:
+      - "$HOME/swh-environment/swh-objstorage:/src/swh-objstorage"
+```
+
+The file named `docker-compose.override.yml` will automatically be loaded by
+`docker-compose`.
+
+This example shows the simplest case of the `swh-objstorage` package:
+you just have to mount it in the container in `/src` and the
+entrypoint will ensure every swh-* package found in `/src/` is
+installed (using `pip install -e` so you can easily hack your
+code). If the application you play with has autoreload support, there
+is no need to restart the impacted container.
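+
+The same mechanism works for any other swh package. For example, a possible
+override mounting local checkouts of both `swh-loader-core` and
+`swh-loader-git` into the loader workers could look like this (the paths
+assume your checkouts live in `~/swh-environment`; adjust them to your setup):
+
+``` yaml
+version: '2'
+
+services:
+  swh-loader:
+    volumes:
+      - "$HOME/swh-environment/swh-loader-core:/src/swh-loader-core"
+      - "$HOME/swh-environment/swh-loader-git:/src/swh-loader-git"
+```
+
+After adding a new volume to the override file, run `docker-compose up -d
+swh-loader` again so that the container is recreated and its entrypoint
+reinstalls the newly mounted packages.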
+
+Note: if a container fails to start when using local sources for one or more swh
+packages, it is most probably due to permission problems on cache files. For
+example, if you have executed tests locally (using pytest or tox), you have
+cache files (__pycache__ etc.) that will prevent `pip install` from working
+within the container.
+
+The solution is to clean these files and directories before trying to spawn the
+containers.
+
+```
+~/swh-environment$ find . -type d -name __pycache__ -exec rm -rf {} \;
+~/swh-environment$ find . -type d -name .tox -exec rm -rf {} \;
+~/swh-environment$ find . -type d -name .hypothesis -exec rm -rf {} \;
+```
+
+### Using locally installed swh tools with docker
+
+In all examples above, we have executed swh commands from within a running
+container. Since these swh commands are also available locally in our virtual
+env, we can use them to interact with the swh services running in docker
+containers.
+
+For this, we just need to configure a few environment variables. First, ensure
+your Software Heritage virtualenv is activated (here, using virtualenvwrapper):
+
+```
+~$ workon swh
+(swh) ~/swh-environment$ export SWH_SCHEDULER_URL=http://127.0.0.1:5008/
+(swh) ~/swh-environment$ export CELERY_BROKER_URL=amqp://127.0.0.1:5072/
+```
+
+Now we can use the `celery` command directly to control the celery system
+running in the docker environment:
+
+```
+(swh) ~/swh-environment$ celery status
+vault@c9fef1bbfdc1: OK
+listers@ba66f18e7d02: OK
+indexer@cb14c33cbbfb: OK
+loader@61704103668c: OK
+
+4 nodes online.
+(swh) ~/swh-environment$ celery control -d loader@61704103668c pool_grow 3
+```
+
+And we can use the `swh scheduler` command all the same:
+
+```
+(swh) ~/swh-environment$ swh scheduler task-type list
+Known task types:
+index-fossology-license:
+  Fossology license indexer task
+index-mimetype:
+  Mimetype indexer task
+[...]
+``` + +### Make your life a bit easier + +When you use virtualenvwrapper, you can add postactivation commands: + +``` +(swh) ~/swh-environment$ cat >>$VIRTUAL_ENV/bin/postactivate <<'EOF' +# unfortunately, the interface cmd for the click autocompletion +# depends on the shell +# https://click.palletsprojects.com/en/7.x/bashcomplete/#activation + +shell=$(basename $SHELL) +case "$shell" in + "zsh") + autocomplete_cmd=source_zsh + ;; + *) + autocomplete_cmd=source + ;; +esac + +eval "$(_SWH_COMPLETE=$autocomplete_cmd swh)" +export SWH_SCHEDULER_URL=http://127.0.0.1:5008/ +export CELERY_BROKER_URL=amqp://127.0.0.1:5072/ +export COMPOSE_FILE=~/swh-environment/swh-docker-dev/docker-compose.yml:~/swh-environment/swh-docker-dev/docker-compose.override.yml +alias doco=docker-compose + +function swhclean { + find ~/swh-environment -type d -name __pycache__ -exec rm -rf {} \; + find ~/swh-environment -type d -name .tox -exec rm -rf {} \; + find ~/swh-environment -type d -name .hypothesis -exec rm -rf {} \; +} +EOF +``` + +This postactivate script does: + +- install a shell completion handler for the swh-scheduler command, +- preset a bunch of environment variables + + - `SWH_SCHEDULER_URL` so that you can just run `swh scheduler` against the + scheduler API instance running in docker, without having to specify the + endpoint URL, + + - `CELERY_BROKER` so you can execute the `celery` tool (without cli options) + against the rabbitmq server running in the docker environment, + + - `COMPOSE_FILE` so you can run `docker-compose` from everywhere, + +- create an alias `doco` for `docker-compose` because this is way too + long to type, + +- add a `swhclean` shell function to clean your source directories so that + there is no conflict with docker containers using local swh repositories (see + below). This will delete any `.tox`, `__pycache__` and `.hypothesis` + directory found in your swh-environment directory. + +So now you can easily: + +* Start the SWH platform: + +``` + (swh) ~/swh-environment$ doco up -d + [...] +``` + +* Check celery: + +``` + (swh) ~/swh-environment$ celery status + listers@50ac2185c6c9: OK + loader@b164f9055637: OK + indexer@33bc6067a5b8: OK +``` + +* List task-types: + +``` + (swh) ~/swh-environment$ swh scheduler task-type list + [...] +``` + +* Get more info on a task type: + +``` + (swh) ~/swh-environment$ swh scheduler task-type list -v -t load-hg + Known task types: + load-hg: swh.loader.mercurial.tasks.LoadMercurial + Loading mercurial repository swh-loader-mercurial + interval: 1 day, 0:00:00 [1 day, 0:00:00, 1 day, 0:00:00] + backoff_factor: 1.0 + max_queue_length: 1000 + num_retries: None + retry_delay: None +``` + +* Add a new task: + +``` + (swh) ~/swh-environment$ swh scheduler task add load-hg \ + origin_url=https://hg.logilab.org/master/cubicweb + Created 1 tasks + Task 1 + Next run: just now (2019-02-06 12:36:58+00:00) + Interval: 1 day, 0:00:00 + Type: load-hg + Policy: recurring + Args: + Keyword args: + origin_url: https://hg.logilab.org/master/cubicweb +``` + +* Respawn a task: + +``` + (swh) ~/swh-environment$ swh scheduler task respawn 1 +``` + + +## Starting a kafka-powered replica of the storage + +This repo comes with an optional `docker-compose.storage-replica.yml` +docker compose file that can be used to test the kafka-powered replication +mecanism for the main storage. + +This can be used like: + +``` +~/swh-environment/swh-docker-dev$ docker-compose -f docker-compose.yml -f docker-compose.storage-replica.yml up -d +[...] 
+```
+
+Compared to the original compose file, this will:
+
+- override the swh-storage service to activate the kafka direct writer
+  on swh.journal.objects prefixed topics using the swh.storage.master ID,
+- override the swh-web service to make it use the replica instead of the
+  master storage,
+- start a db for the replica,
+- start a storage service based on this db,
+- start a replayer service that runs the process that listens to kafka to
+  keep the replica in sync.
+
+When using it, you will have a setup in which the master storage is used by
+workers and most other services, whereas the storage replica will be used
+by the web application and should be kept in sync with the master storage
+by kafka.
+
+
+Note that the object storage is not replicated here, only the graph storage.
+
+## Starting the backfiller
+
+The backfiller reads the objects within the range
+[start-object, end-object] from the storage and feeds them to the kafka topics:
+
+```
+(swh) $ docker-compose \
+    -f docker-compose.yml \
+    -f docker-compose.storage-replica.yml \
+    -f docker-compose.storage-replica.override.yml \
+    run \
+    swh-journal-backfiller \
+    snapshot \
+    --start-object 000000 \
+    --end-object 000001 \
+    --dry-run
+```
diff --git a/docker/conf/cassandra.yaml b/docker/conf/cassandra.yaml
new file mode 100644
index 0000000..dc26a6c
--- /dev/null
+++ b/docker/conf/cassandra.yaml
@@ -0,0 +1,1242 @@
+# Cassandra storage config YAML
+
+# NOTE:
+#   See http://wiki.apache.org/cassandra/StorageConfiguration for
+#   full explanations of configuration directives
+# /NOTE
+
+# The name of the cluster. This is mainly used to prevent machines in
+# one logical cluster from joining another.
+cluster_name: 'Test Cluster'
+
+# This defines the number of tokens randomly assigned to this node on the ring
+# The more tokens, relative to other nodes, the larger the proportion of data
+# that this node will store. You probably want all nodes to have the same number
+# of tokens assuming they have equal hardware capability.
+#
+# If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility,
+# and will use the initial_token as described below.
+#
+# Specifying initial_token will override this setting on the node's initial start,
+# on subsequent starts, this setting will apply even if initial token is set.
+#
+# If you already have a cluster with 1 token per node, and wish to migrate to
+# multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
+num_tokens: 256
+
+# Triggers automatic allocation of num_tokens tokens for this node. The allocation
+# algorithm attempts to choose tokens in a way that optimizes replicated load over
+# the nodes in the datacenter for the replication strategy used by the specified
+# keyspace.
+#
+# The load assigned to each node will be close to proportional to its number of
+# vnodes.
+#
+# Only supported with the Murmur3Partitioner.
+# allocate_tokens_for_keyspace: KEYSPACE
+
+# initial_token allows you to specify tokens manually. While you can use it with
+# vnodes (num_tokens > 1, above) -- in which case you should provide a
+# comma-separated list -- it's primarily used when adding nodes to legacy clusters
+# that do not have vnodes enabled.
+# initial_token: + +# See http://wiki.apache.org/cassandra/HintedHandoff +# May either be "true" or "false" to enable globally +hinted_handoff_enabled: true + +# When hinted_handoff_enabled is true, a black list of data centers that will not +# perform hinted handoff +# hinted_handoff_disabled_datacenters: +# - DC1 +# - DC2 + +# this defines the maximum amount of time a dead host will have hints +# generated. After it has been dead this long, new hints for it will not be +# created until it has been seen alive and gone down again. +max_hint_window_in_ms: 10800000 # 3 hours + +# Maximum throttle in KBs per second, per delivery thread. This will be +# reduced proportionally to the number of nodes in the cluster. (If there +# are two nodes in the cluster, each delivery thread will use the maximum +# rate; if there are three, each will throttle to half of the maximum, +# since we expect two nodes to be delivering hints simultaneously.) +hinted_handoff_throttle_in_kb: 1024 + +# Number of threads with which to deliver hints; +# Consider increasing this number when you have multi-dc deployments, since +# cross-dc handoff tends to be slower +max_hints_delivery_threads: 2 + +# Directory where Cassandra should store hints. +# If not set, the default directory is $CASSANDRA_HOME/data/hints. +# hints_directory: /var/lib/cassandra/hints +hints_directory: /var/lib/cassandra/hints + +# How often hints should be flushed from the internal buffers to disk. +# Will *not* trigger fsync. +hints_flush_period_in_ms: 10000 + +# Maximum size for a single hints file, in megabytes. +max_hints_file_size_in_mb: 128 + +# Compression to apply to the hint files. If omitted, hints files +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +#hints_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Maximum throttle in KBs per second, total. This will be +# reduced proportionally to the number of nodes in the cluster. +batchlog_replay_throttle_in_kb: 1024 + +# Authentication backend, implementing IAuthenticator; used to identify users +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator, +# PasswordAuthenticator}. +# +# - AllowAllAuthenticator performs no checks - set it to disable authentication. +# - PasswordAuthenticator relies on username/password pairs to authenticate +# users. It keeps usernames and hashed passwords in system_auth.roles table. +# Please increase system_auth keyspace replication factor if you use this authenticator. +# If using PasswordAuthenticator, CassandraRoleManager must also be used (see below) +authenticator: AllowAllAuthenticator + +# Authorization backend, implementing IAuthorizer; used to limit access/provide permissions +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer, +# CassandraAuthorizer}. +# +# - AllowAllAuthorizer allows any action to any user - set it to disable authorization. +# - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. +authorizer: AllowAllAuthorizer + +# Part of the Authentication & Authorization backend, implementing IRoleManager; used +# to maintain grants and memberships between roles. +# Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager, +# which stores role information in the system_auth keyspace. 
Most functions of the +# IRoleManager require an authenticated login, so unless the configured IAuthenticator +# actually implements authentication, most of this functionality will be unavailable. +# +# - CassandraRoleManager stores role data in the system_auth keyspace. Please +# increase system_auth keyspace replication factor if you use this role manager. +role_manager: CassandraRoleManager + +# Validity period for roles cache (fetching granted roles can be an expensive +# operation depending on the role manager, CassandraRoleManager is one example) +# Granted roles are cached for authenticated sessions in AuthenticatedUser and +# after the period specified here, become eligible for (async) reload. +# Defaults to 2000, set to 0 to disable caching entirely. +# Will be disabled automatically for AllowAllAuthenticator. +roles_validity_in_ms: 2000 + +# Refresh interval for roles cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If roles_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as roles_validity_in_ms. +# roles_update_interval_in_ms: 2000 + +# Validity period for permissions cache (fetching permissions can be an +# expensive operation depending on the authorizer, CassandraAuthorizer is +# one example). Defaults to 2000, set to 0 to disable. +# Will be disabled automatically for AllowAllAuthorizer. +permissions_validity_in_ms: 2000 + +# Refresh interval for permissions cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If permissions_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as permissions_validity_in_ms. +# permissions_update_interval_in_ms: 2000 + +# Validity period for credentials cache. This cache is tightly coupled to +# the provided PasswordAuthenticator implementation of IAuthenticator. If +# another IAuthenticator implementation is configured, this cache will not +# be automatically used and so the following settings will have no effect. +# Please note, credentials are cached in their encrypted form, so while +# activating this cache may reduce the number of queries made to the +# underlying table, it may not bring a significant reduction in the +# latency of individual authentication attempts. +# Defaults to 2000, set to 0 to disable credentials caching. +credentials_validity_in_ms: 2000 + +# Refresh interval for credentials cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If credentials_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as credentials_validity_in_ms. +# credentials_update_interval_in_ms: 2000 + +# The partitioner is responsible for distributing groups of rows (by +# partition key) across nodes in the cluster. You should leave this +# alone for new clusters. The partitioner can NOT be changed without +# reloading all data, so when upgrading you should set this to the +# same partitioner you were already using. +# +# Besides Murmur3Partitioner, partitioners included for backwards +# compatibility include RandomPartitioner, ByteOrderedPartitioner, and +# OrderPreservingPartitioner. 
+# +partitioner: org.apache.cassandra.dht.Murmur3Partitioner + +# Directories where Cassandra should store data on disk. Cassandra +# will spread data evenly across them, subject to the granularity of +# the configured compaction strategy. +# If not set, the default directory is $CASSANDRA_HOME/data/data. +data_file_directories: + - /var/lib/cassandra/data + +# commit log. when running on magnetic HDD, this should be a +# separate spindle than the data directories. +# If not set, the default directory is $CASSANDRA_HOME/data/commitlog. +commitlog_directory: /var/lib/cassandra/commitlog + +# Enable / disable CDC functionality on a per-node basis. This modifies the logic used +# for write path allocation rejection (standard: never reject. cdc: reject Mutation +# containing a CDC-enabled table if at space limit in cdc_raw_directory). +cdc_enabled: false + +# CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the +# segment contains mutations for a CDC-enabled table. This should be placed on a +# separate spindle than the data directories. If not set, the default directory is +# $CASSANDRA_HOME/data/cdc_raw. +# cdc_raw_directory: /var/lib/cassandra/cdc_raw + +# Policy for data disk failures: +# +# die +# shut down gossip and client transports and kill the JVM for any fs errors or +# single-sstable errors, so the node can be replaced. +# +# stop_paranoid +# shut down gossip and client transports even for single-sstable errors, +# kill the JVM for errors during startup. +# +# stop +# shut down gossip and client transports, leaving the node effectively dead, but +# can still be inspected via JMX, kill the JVM for errors during startup. +# +# best_effort +# stop using the failed disk and respond to requests based on +# remaining available sstables. This means you WILL see obsolete +# data at CL.ONE! +# +# ignore +# ignore fatal errors and let requests fail, as in pre-1.2 Cassandra +disk_failure_policy: stop + +# Policy for commit disk failures: +# +# die +# shut down gossip and Thrift and kill the JVM, so the node can be replaced. +# +# stop +# shut down gossip and Thrift, leaving the node effectively dead, but +# can still be inspected via JMX. +# +# stop_commit +# shutdown the commit log, letting writes collect but +# continuing to service reads, as in pre-2.0.5 Cassandra +# +# ignore +# ignore fatal errors and let the batches fail +commit_failure_policy: stop + +# Maximum size of the native protocol prepared statement cache +# +# Valid values are either "auto" (omitting the value) or a value greater 0. +# +# Note that specifying a too large value will result in long running GCs and possbily +# out-of-memory errors. Keep the value at a small fraction of the heap. +# +# If you constantly see "prepared statements discarded in the last minute because +# cache limit reached" messages, the first step is to investigate the root cause +# of these messages and check whether prepared statements are used correctly - +# i.e. use bind markers for variable parts. +# +# Do only change the default value, if you really have more prepared statements than +# fit in the cache. In most cases it is not neccessary to change this value. +# Constantly re-preparing statements is a performance penalty. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +prepared_statements_cache_size_mb: + +# Maximum size of the Thrift prepared statement cache +# +# If you do not use Thrift at all, it is safe to leave this value at "auto". 
+# +# See description of 'prepared_statements_cache_size_mb' above for more information. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +thrift_prepared_statements_cache_size_mb: + +# Maximum size of the key cache in memory. +# +# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the +# minimum, sometimes more. The key cache is fairly tiny for the amount of +# time it saves, so it's worthwhile to use it at large numbers. +# The row cache saves even more time, but must contain the entire row, +# so it is extremely space-intensive. It's best to only use the +# row cache if you have hot rows or static rows. +# +# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache. +key_cache_size_in_mb: 1024 + +# Duration in seconds after which Cassandra should +# save the key cache. Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 14400 or 4 hours. +key_cache_save_period: 14400 + +# Number of keys from the key cache to save +# Disabled by default, meaning all keys are going to be saved +# key_cache_keys_to_save: 100 + +# Row cache implementation class name. Available implementations: +# +# org.apache.cassandra.cache.OHCProvider +# Fully off-heap row cache implementation (default). +# +# org.apache.cassandra.cache.SerializingCacheProvider +# This is the row cache implementation availabile +# in previous releases of Cassandra. +# row_cache_class_name: org.apache.cassandra.cache.OHCProvider + +# Maximum size of the row cache in memory. +# Please note that OHC cache implementation requires some additional off-heap memory to manage +# the map structures and some in-flight memory during operations before/after cache entries can be +# accounted against the cache capacity. This overhead is usually small compared to the whole capacity. +# Do not specify more memory that the system can afford in the worst usual situation and leave some +# headroom for OS block level cache. Do never allow your system to swap. +# +# Default value is 0, to disable row caching. +row_cache_size_in_mb: 0 + +# Duration in seconds after which Cassandra should save the row cache. +# Caches are saved to saved_caches_directory as specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 0 to disable saving the row cache. +row_cache_save_period: 0 + +# Number of keys from the row cache to save. +# Specify 0 (which is the default), meaning all keys are going to be saved +# row_cache_keys_to_save: 100 + +# Maximum size of the counter cache in memory. +# +# Counter cache helps to reduce counter locks' contention for hot counter cells. +# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before +# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration +# of the lock hold, helping with hot counter cell updates, but will not allow skipping +# the read entirely. Only the local (clock, count) tuple of a counter cell is kept +# in memory, not the whole counter, so it's relatively cheap. 
+# +# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache. +# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache. +counter_cache_size_in_mb: + +# Duration in seconds after which Cassandra should +# save the counter cache (keys only). Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Default is 7200 or 2 hours. +counter_cache_save_period: 7200 + +# Number of keys from the counter cache to save +# Disabled by default, meaning all keys are going to be saved +# counter_cache_keys_to_save: 100 + +# saved caches +# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches. +saved_caches_directory: /var/lib/cassandra/saved_caches + + +# commitlog_sync may be either "periodic" or "batch." +# +# When in batch mode, Cassandra won't ack writes until the commit log +# has been fsynced to disk. It will wait +# commitlog_sync_batch_window_in_ms milliseconds between fsyncs. +# This window should be kept short because the writer threads will +# be unable to do extra work while waiting. (You may need to increase +# concurrent_writes for the same reason.) +# +# commitlog_sync: batch +# commitlog_sync_batch_window_in_ms: 2 +# +# the other option is "periodic" where writes may be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period_in_ms +# milliseconds. +commitlog_sync: periodic +commitlog_sync_period_in_ms: 10000 + +# The size of the individual commitlog file segments. A commitlog +# segment may be archived, deleted, or recycled once all the data +# in it (potentially from each columnfamily in the system) has been +# flushed to sstables. +# +# The default size is 32, which is almost always fine, but if you are +# archiving commitlog segments (see commitlog_archiving.properties), +# then you probably want a finer granularity of archiving; 8 or 16 MB +# is reasonable. +# Max mutation size is also configurable via max_mutation_size_in_kb setting in +# cassandra.yaml. The default is half the size commitlog_segment_size_in_mb * 1024. +# This should be positive and less than 2048. +# +# NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must +# be set to at least twice the size of max_mutation_size_in_kb / 1024 +# +commitlog_segment_size_in_mb: 512 +# This is much bigger than the default (32), but the segment size must be +# larger than the largest row we want to write. And we have rows as large +# as 300MB, so... + +# Compression to apply to the commit log. If omitted, the commit log +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +# commitlog_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# any class that implements the SeedProvider interface and has a +# constructor that takes a Map of parameters will do. +seed_provider: + # Addresses of hosts that are deemed contact points. + # Cassandra nodes use this list of hosts to find each other and learn + # the topology of the ring. You must change this if you are running + # multiple nodes! + - class_name: org.apache.cassandra.locator.SimpleSeedProvider + parameters: + # seeds is actually a comma-delimited list of addresses. + # Ex: ",," + - seeds: + +# For workloads with more data than can fit in memory, Cassandra's +# bottleneck will be reads that need to fetch data from +# disk. 
"concurrent_reads" should be set to (16 * number_of_drives) in +# order to allow the operations to enqueue low enough in the stack +# that the OS and drives can reorder them. Same applies to +# "concurrent_counter_writes", since counter writes read the current +# values before incrementing and writing them back. +# +# On the other hand, since writes are almost never IO bound, the ideal +# number of "concurrent_writes" is dependent on the number of cores in +# your system; (8 * number_of_cores) is a good rule of thumb. +concurrent_reads: 32 +concurrent_writes: 32 +concurrent_counter_writes: 32 + +# For materialized view writes, as there is a read involved, so this should +# be limited by the less of concurrent reads or concurrent writes. +concurrent_materialized_view_writes: 32 + +# Maximum memory to use for sstable chunk cache and buffer pooling. +# 32MB of this are reserved for pooling buffers, the rest is used as an +# cache that holds uncompressed sstable chunks. +# Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# file_cache_size_in_mb: 512 + +# Flag indicating whether to allocate on or off heap when the sstable buffer +# pool is exhausted, that is when it has exceeded the maximum memory +# file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request. + +# buffer_pool_use_heap_if_exhausted: true + +# The strategy for optimizing disk read +# Possible values are: +# ssd (for solid state disks, the default) +# spinning (for spinning disks) +# disk_optimization_strategy: ssd + +# Total permitted memory to use for memtables. Cassandra will stop +# accepting writes when the limit is exceeded until a flush completes, +# and will trigger a flush based on memtable_cleanup_threshold +# If omitted, Cassandra will set both to 1/4 the size of the heap. +# memtable_heap_space_in_mb: 2048 +# memtable_offheap_space_in_mb: 2048 + +# memtable_cleanup_threshold is deprecated. The default calculation +# is the only reasonable choice. See the comments on memtable_flush_writers +# for more information. +# +# Ratio of occupied non-flushing memtable size to total permitted size +# that will trigger a flush of the largest memtable. Larger mct will +# mean larger flushes and hence less compaction, but also less concurrent +# flush activity which can make it difficult to keep your disks fed +# under heavy write load. +# +# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1) +# memtable_cleanup_threshold: 0.11 + +# Specify the way Cassandra allocates and manages memtable memory. +# Options are: +# +# heap_buffers +# on heap nio buffers +# +# offheap_buffers +# off heap (direct) nio buffers +# +# offheap_objects +# off heap objects +memtable_allocation_type: heap_buffers + +# Total space to use for commit logs on disk. +# +# If space gets above this value, Cassandra will flush every dirty CF +# in the oldest segment and remove it. So a small total commitlog space +# will tend to cause more flush activity on less-active columnfamilies. +# +# The default value is the smaller of 8192, and 1/4 of the total space +# of the commitlog volume. 
+# +# commitlog_total_space_in_mb: 8192 + +# This sets the number of memtable flush writer threads per disk +# as well as the total number of memtables that can be flushed concurrently. +# These are generally a combination of compute and IO bound. +# +# Memtable flushing is more CPU efficient than memtable ingest and a single thread +# can keep up with the ingest rate of a whole server on a single fast disk +# until it temporarily becomes IO bound under contention typically with compaction. +# At that point you need multiple flush threads. At some point in the future +# it may become CPU bound all the time. +# +# You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation +# metric which should be 0, but will be non-zero if threads are blocked waiting on flushing +# to free memory. +# +# memtable_flush_writers defaults to two for a single data directory. +# This means that two memtables can be flushed concurrently to the single data directory. +# If you have multiple data directories the default is one memtable flushing at a time +# but the flush will use a thread per data directory so you will get two or more writers. +# +# Two is generally enough to flush on a fast disk [array] mounted as a single data directory. +# Adding more flush writers will result in smaller more frequent flushes that introduce more +# compaction overhead. +# +# There is a direct tradeoff between number of memtables that can be flushed concurrently +# and flush size and frequency. More is not better you just need enough flush writers +# to never stall waiting for flushing to free memory. +# +#memtable_flush_writers: 2 + +# Total space to use for change-data-capture logs on disk. +# +# If space gets above this value, Cassandra will throw WriteTimeoutException +# on Mutations including tables with CDC enabled. A CDCCompactor is responsible +# for parsing the raw CDC logs and deleting them when parsing is completed. +# +# The default value is the min of 4096 mb and 1/8th of the total space +# of the drive where cdc_raw_directory resides. +# cdc_total_space_in_mb: 4096 + +# When we hit our cdc_raw limit and the CDCCompactor is either running behind +# or experiencing backpressure, we check at the following interval to see if any +# new space for cdc-tracked tables has been made available. Default to 250ms +# cdc_free_space_check_interval_ms: 250 + +# A fixed memory pool size in MB for for SSTable index summaries. If left +# empty, this will default to 5% of the heap size. If the memory usage of +# all index summaries exceeds this limit, SSTables with low read rates will +# shrink their index summaries in order to meet this limit. However, this +# is a best-effort process. In extreme conditions Cassandra may need to use +# more than this amount of memory. +index_summary_capacity_in_mb: + +# How frequently index summaries should be resampled. This is done +# periodically to redistribute memory from the fixed-size pool to sstables +# proportional their recent read rates. Setting to -1 will disable this +# process, leaving existing index summaries at their current sampling level. +index_summary_resize_interval_in_minutes: 60 + +# Whether to, when doing sequential writing, fsync() at intervals in +# order to force the operating system to flush the dirty +# buffers. Enable this to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSDs; not +# necessarily on platters. 
+trickle_fsync: false +trickle_fsync_interval_in_kb: 10240 + +# TCP port, for commands and data +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +storage_port: 7000 + +# SSL port, for encrypted communication. Unused unless enabled in +# encryption_options +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +ssl_storage_port: 7001 + +# Address or interface to bind to and tell other Cassandra nodes to connect to. +# You _must_ change this if you want multiple nodes to be able to communicate! +# +# Set listen_address OR listen_interface, not both. +# +# Leaving it blank leaves it up to InetAddress.getLocalHost(). This +# will always do the Right Thing _if_ the node is properly configured +# (hostname, name resolution, etc), and the Right Thing is to use the +# address associated with the hostname (it might not be). +# +# Setting listen_address to 0.0.0.0 is always wrong. +# +listen_address: + +# Set listen_address OR listen_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# listen_interface: eth0 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# listen_interface_prefer_ipv6: false + +# Address to broadcast to other Cassandra nodes +# Leaving this blank will set it to the same value as listen_address +broadcast_address: + +# When using multiple physical network interfaces, set this +# to true to listen on broadcast_address in addition to +# the listen_address, allowing nodes to communicate in both +# interfaces. +# Ignore this property if the network configuration automatically +# routes between the public and private networks such as EC2. +# listen_on_broadcast_address: false + +# Internode authentication backend, implementing IInternodeAuthenticator; +# used to allow/disallow connections from peer nodes. +# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator + +# Whether to start the native transport server. +# Please note that the address on which the native transport is bound is the +# same as the rpc_address. The port however is different and specified below. +start_native_transport: true +# port for the CQL native transport to listen for clients on +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +native_transport_port: 9042 +# Enabling native transport encryption in client_encryption_options allows you to either use +# encryption for the standard port or to use a dedicated, additional port along with the unencrypted +# standard native_transport_port. +# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption +# for native_transport_port. Setting native_transport_port_ssl to a different value +# from native_transport_port will use encryption for native_transport_port_ssl while +# keeping native_transport_port unencrypted. +# native_transport_port_ssl: 9142 +# The maximum threads for handling requests when the native transport is used. 
+# This is similar to rpc_max_threads though the default differs slightly (and +# there is no native_transport_min_threads, idle threads will always be stopped +# after 30 seconds). +# native_transport_max_threads: 128 +# +# The maximum size of allowed frame. Frame (requests) larger than this will +# be rejected as invalid. The default is 256MB. If you're changing this parameter, +# you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048. +# native_transport_max_frame_size_in_mb: 256 + +# The maximum number of concurrent client connections. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections: -1 + +# The maximum number of concurrent client connections per source ip. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections_per_ip: -1 + +# Whether to start the thrift rpc server. +start_rpc: false + +# The address or interface to bind the Thrift RPC service and native transport +# server to. +# +# Set rpc_address OR rpc_interface, not both. +# +# Leaving rpc_address blank has the same effect as on listen_address +# (i.e. it will be based on the configured hostname of the node). +# +# Note that unlike listen_address, you can specify 0.0.0.0, but you must also +# set broadcast_rpc_address to a value other than 0.0.0.0. +# +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +rpc_address: 0.0.0.0 + +# Set rpc_address OR rpc_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# rpc_interface: eth1 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# rpc_interface_prefer_ipv6: false + +# port for Thrift to listen for clients on +rpc_port: 9160 + +# RPC address to broadcast to drivers and other Cassandra nodes. This cannot +# be set to 0.0.0.0. If left blank, this will be set to the value of +# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must +# be set. +broadcast_rpc_address: + +# enable or disable keepalive on rpc/native connections +rpc_keepalive: true + +# Cassandra provides two out-of-the-box options for the RPC Server: +# +# sync +# One thread per thrift connection. For a very large number of clients, memory +# will be your limiting factor. On a 64 bit JVM, 180KB is the minimum stack size +# per thread, and that will correspond to your use of virtual memory (but physical memory +# may be limited depending on use of stack space). +# +# hsha +# Stands for "half synchronous, half asynchronous." All thrift clients are handled +# asynchronously using a small number of threads that does not vary with the amount +# of thrift clients (and thus scales well to many clients). The rpc requests are still +# synchronous (one thread per active request). If hsha is selected then it is essential +# that rpc_max_threads is changed from the default value of unlimited. +# +# The default is sync because on Windows hsha is about 30% slower. On Linux, +# sync/hsha performance is about the same, with hsha of course using less memory. 
+# +# Alternatively, can provide your own RPC server by providing the fully-qualified class name +# of an o.a.c.t.TServerFactory that can create an instance of it. +rpc_server_type: sync + +# Uncomment rpc_min|max_thread to set request pool size limits. +# +# Regardless of your choice of RPC server (see above), the number of maximum requests in the +# RPC thread pool dictates how many concurrent requests are possible (but if you are using the sync +# RPC server, it also dictates the number of clients that can be connected at all). +# +# The default is unlimited and thus provides no protection against clients overwhelming the server. You are +# encouraged to set a maximum that makes sense for you in production, but do keep in mind that +# rpc_max_threads represents the maximum number of client requests this server may execute concurrently. +# +# rpc_min_threads: 16 +# rpc_max_threads: 2048 + +# uncomment to set socket buffer sizes on rpc connections +# rpc_send_buff_size_in_bytes: +# rpc_recv_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it it is defined by net.ipv4.tcp_wmem +# See also: +# /proc/sys/net/core/wmem_max +# /proc/sys/net/core/rmem_max +# /proc/sys/net/ipv4/tcp_wmem +# /proc/sys/net/ipv4/tcp_wmem +# and 'man tcp' +# internode_send_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it it is defined by net.ipv4.tcp_wmem +# internode_recv_buff_size_in_bytes: + +# Frame size for thrift (maximum message length). +thrift_framed_transport_size_in_mb: 15 + +# Set to true to have Cassandra create a hard link to each sstable +# flushed or streamed locally in a backups/ subdirectory of the +# keyspace data. Removing these links is the operator's +# responsibility. +incremental_backups: false + +# Whether or not to take a snapshot before each compaction. Be +# careful using this option, since Cassandra won't clean up the +# snapshots for you. Mostly useful if you're paranoid when there +# is a data format change. +snapshot_before_compaction: false + +# Whether or not a snapshot is taken of the data before keyspace truncation +# or dropping of column families. The STRONGLY advised default of true +# should be used to provide data safety. If you set this flag to false, you will +# lose data on truncation or drop. +auto_snapshot: true + +# Granularity of the collation index of rows within a partition. +# Increase if your rows are large, or if you have a very large +# number of rows per partition. The competing goals are these: +# +# - a smaller granularity means more index entries are generated +# and looking up rows withing the partition by collation column +# is faster +# - but, Cassandra will keep the collation index in memory for hot +# rows (as part of the key cache), so a larger granularity means +# you can cache more hot rows +column_index_size_in_kb: 64 + +# Per sstable indexed key cache entries (the collation index in memory +# mentioned above) exceeding this size will not be held on heap. +# This means that only partition information is held on heap and the +# index entries are read from disk. +# +# Note that this size refers to the size of the +# serialized index information and not the size of the partition. 
+column_index_cache_size_in_kb: 2 + +# Number of simultaneous compactions to allow, NOT including +# validation "compactions" for anti-entropy repair. Simultaneous +# compactions can help preserve read performance in a mixed read/write +# workload, by mitigating the tendency of small sstables to accumulate +# during a single long running compactions. The default is usually +# fine and if you experience problems with compaction running too +# slowly or too fast, you should look at +# compaction_throughput_mb_per_sec first. +# +# concurrent_compactors defaults to the smaller of (number of disks, +# number of cores), with a minimum of 2 and a maximum of 8. +# +# If your data directories are backed by SSD, you should increase this +# to the number of cores. +#concurrent_compactors: 1 + +# Throttles compaction to the given total throughput across the entire +# system. The faster you insert data, the faster you need to compact in +# order to keep the sstable count down, but in general, setting this to +# 16 to 32 times the rate you are inserting data is more than sufficient. +# Setting this to 0 disables throttling. Note that this account for all types +# of compaction, including validation compaction. +compaction_throughput_mb_per_sec: 16 + +# When compacting, the replacement sstable(s) can be opened before they +# are completely written, and used in place of the prior sstables for +# any range that has been written. This helps to smoothly transfer reads +# between the sstables, reducing page cache churn and keeping hot rows hot +sstable_preemptive_open_interval_in_mb: 50 + +# Throttles all outbound streaming file transfers on this node to the +# given total throughput in Mbps. This is necessary because Cassandra does +# mostly sequential IO when streaming data during bootstrap or repair, which +# can lead to saturating the network connection and degrading rpc performance. +# When unset, the default is 200 Mbps or 25 MB/s. +# stream_throughput_outbound_megabits_per_sec: 200 + +# Throttles all streaming file transfer between the datacenters, +# this setting allows users to throttle inter dc stream throughput in addition +# to throttling all network stream traffic as configured with +# stream_throughput_outbound_megabits_per_sec +# When unset, the default is 200 Mbps or 25 MB/s +# inter_dc_stream_throughput_outbound_megabits_per_sec: 200 + +# How long the coordinator should wait for read operations to complete +read_request_timeout_in_ms: 50000 +# How long the coordinator should wait for seq or index scans to complete +range_request_timeout_in_ms: 100000 +# How long the coordinator should wait for writes to complete +write_request_timeout_in_ms: 20000 +# How long the coordinator should wait for counter writes to complete +counter_write_request_timeout_in_ms: 50000 +# How long a coordinator should continue to retry a CAS operation +# that contends with other proposals for the same row +cas_contention_timeout_in_ms: 10000 +# How long the coordinator should wait for truncates to complete +# (This can be much longer, because unless auto_snapshot is disabled +# we need to flush first so we can snapshot before removing the data.) +truncate_request_timeout_in_ms: 600000 +# The default timeout for other, miscellaneous operations +request_timeout_in_ms: 100000 + +# How long before a node logs slow queries. Select queries that take longer than +# this timeout to execute, will generate an aggregated log message, so that slow queries +# can be identified. Set this value to zero to disable slow query logging. 
+slow_query_log_timeout_in_ms: 500 + +# Enable operation timeout information exchange between nodes to accurately +# measure request timeouts. If disabled, replicas will assume that requests +# were forwarded to them instantly by the coordinator, which means that +# under overload conditions we will waste that much extra time processing +# already-timed-out requests. +# +# Warning: before enabling this property make sure to ntp is installed +# and the times are synchronized between the nodes. +cross_node_timeout: false + +# Set keep-alive period for streaming +# This node will send a keep-alive message periodically with this period. +# If the node does not receive a keep-alive message from the peer for +# 2 keep-alive cycles the stream session times out and fail +# Default value is 300s (5 minutes), which means stalled stream +# times out in 10 minutes by default +# streaming_keep_alive_period_in_secs: 300 + +# phi value that must be reached for a host to be marked down. +# most users should never need to adjust this. +# phi_convict_threshold: 8 + +# endpoint_snitch -- Set this to a class that implements +# IEndpointSnitch. The snitch has two functions: +# +# - it teaches Cassandra enough about your network topology to route +# requests efficiently +# - it allows Cassandra to spread replicas around your cluster to avoid +# correlated failures. It does this by grouping machines into +# "datacenters" and "racks." Cassandra will do its best not to have +# more than one replica on the same "rack" (which may not actually +# be a physical location) +# +# CASSANDRA WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH +# ONCE DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. +# This means that if you start with the default SimpleSnitch, which +# locates every node on "rack1" in "datacenter1", your only options +# if you need to add another datacenter are GossipingPropertyFileSnitch +# (and the older PFS). From there, if you want to migrate to an +# incompatible snitch like Ec2Snitch you can do it by adding new nodes +# under Ec2Snitch (which will locate them in a new "datacenter") and +# decommissioning the old ones. +# +# Out of the box, Cassandra provides: +# +# SimpleSnitch: +# Treats Strategy order as proximity. This can improve cache +# locality when disabling read repair. Only appropriate for +# single-datacenter deployments. +# +# GossipingPropertyFileSnitch +# This should be your go-to snitch for production use. The rack +# and datacenter for the local node are defined in +# cassandra-rackdc.properties and propagated to other nodes via +# gossip. If cassandra-topology.properties exists, it is used as a +# fallback, allowing migration from the PropertyFileSnitch. +# +# PropertyFileSnitch: +# Proximity is determined by rack and data center, which are +# explicitly configured in cassandra-topology.properties. +# +# Ec2Snitch: +# Appropriate for EC2 deployments in a single Region. Loads Region +# and Availability Zone information from the EC2 API. The Region is +# treated as the datacenter, and the Availability Zone as the rack. +# Only private IPs are used, so this will not work across multiple +# Regions. +# +# Ec2MultiRegionSnitch: +# Uses public IPs as broadcast_address to allow cross-region +# connectivity. (Thus, you should set seed addresses to the public +# IP as well.) You will need to open the storage_port or +# ssl_storage_port on the public IP firewall. (For intra-Region +# traffic, Cassandra will switch to the private IP after +# establishing a connection.) 
+# +# RackInferringSnitch: +# Proximity is determined by rack and data center, which are +# assumed to correspond to the 3rd and 2nd octet of each node's IP +# address, respectively. Unless this happens to match your +# deployment conventions, this is best used as an example of +# writing a custom Snitch class and is provided in that spirit. +# +# You can use a custom Snitch by setting this to the full class name +# of the snitch, which will be assumed to be on your classpath. +endpoint_snitch: SimpleSnitch + +# controls how often to perform the more expensive part of host score +# calculation +dynamic_snitch_update_interval_in_ms: 100 +# controls how often to reset all host scores, allowing a bad host to +# possibly recover +dynamic_snitch_reset_interval_in_ms: 600000 +# if set greater than zero and read_repair_chance is < 1.0, this will allow +# 'pinning' of replicas to hosts in order to increase cache capacity. +# The badness threshold will control how much worse the pinned host has to be +# before the dynamic snitch will prefer other replicas over it. This is +# expressed as a double which represents a percentage. Thus, a value of +# 0.2 means Cassandra would continue to prefer the static snitch values +# until the pinned host was 20% worse than the fastest. +dynamic_snitch_badness_threshold: 0.1 + +# request_scheduler -- Set this to a class that implements +# RequestScheduler, which will schedule incoming client requests +# according to the specific policy. This is useful for multi-tenancy +# with a single Cassandra cluster. +# NOTE: This is specifically for requests from the client and does +# not affect inter node communication. +# org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place +# org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of +# client requests to a node with a separate queue for each +# request_scheduler_id. The scheduler is further customized by +# request_scheduler_options as described below. +request_scheduler: org.apache.cassandra.scheduler.NoScheduler + +# Scheduler Options vary based on the type of scheduler +# +# NoScheduler +# Has no options +# +# RoundRobin +# throttle_limit +# The throttle_limit is the number of in-flight +# requests per client. Requests beyond +# that limit are queued up until +# running requests can complete. +# The value of 80 here is twice the number of +# concurrent_reads + concurrent_writes. +# default_weight +# default_weight is optional and allows for +# overriding the default which is 1. +# weights +# Weights are optional and will default to 1 or the +# overridden default_weight. The weight translates into how +# many requests are handled during each turn of the +# RoundRobin, based on the scheduler id. +# +# request_scheduler_options: +# throttle_limit: 80 +# default_weight: 5 +# weights: +# Keyspace1: 1 +# Keyspace2: 5 + +# request_scheduler_id -- An identifier based on which to perform +# the request scheduling. Currently the only valid option is keyspace. +# request_scheduler_id: keyspace + +# Enable or disable inter-node encryption +# JVM defaults for supported SSL socket protocols and cipher suites can +# be replaced using custom encryption options. This is not recommended +# unless you have policies in place that dictate certain settings, or +# need to disable vulnerable ciphers or protocols in case the JVM cannot +# be updated. 
+# FIPS compliant settings can be configured at JVM level and should not +# involve changing encryption settings here: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html +# *NOTE* No custom encryption options are enabled at the moment +# The available internode options are : all, none, dc, rack +# +# If set to dc cassandra will encrypt the traffic between the DCs +# If set to rack cassandra will encrypt the traffic between the racks +# +# The passwords used in these options must match the passwords used when generating +# the keystore and truststore. For instructions on generating these files, see: +# http://download.oracle.com/javase/6/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# +server_encryption_options: + internode_encryption: none + keystore: conf/.keystore + keystore_password: cassandra + truststore: conf/.truststore + truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # store_type: JKS + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + # require_client_auth: false + # require_endpoint_verification: false + +# enable or disable client/server encryption. +client_encryption_options: + enabled: false + # If enabled and optional is set to true encrypted and unencrypted connections are handled. + optional: false + keystore: conf/.keystore + keystore_password: cassandra + # require_client_auth: false + # Set trustore and truststore_password if require_client_auth is true + # truststore: conf/.truststore + # truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # store_type: JKS + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + +# internode_compression controls whether traffic between nodes is +# compressed. +# Can be: +# +# all +# all traffic is compressed +# +# dc +# traffic between different datacenters is compressed +# +# none +# nothing is compressed. +internode_compression: dc + +# Enable or disable tcp_nodelay for inter-dc communication. +# Disabling it will result in larger (but fewer) network packets being sent, +# reducing overhead from the TCP protocol itself, at the cost of increasing +# latency if you block for cross-datacenter responses. +inter_dc_tcp_nodelay: false + +# TTL for different trace types used during logging of the repair process. +tracetype_query_ttl: 86400 +tracetype_repair_ttl: 604800 + +# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level +# This threshold can be adjusted to minimize logging if necessary +# gc_log_threshold_in_ms: 200 + +# If unset, all GC Pauses greater than gc_log_threshold_in_ms will log at +# INFO level +# UDFs (user defined functions) are disabled by default. +# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. +enable_user_defined_functions: true + +# Enables scripted UDFs (JavaScript UDFs). +# Java UDFs are always enabled, if enable_user_defined_functions is true. +# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider. +# This option has no effect, if enable_user_defined_functions is false. 
+enable_scripted_user_defined_functions: false + +# Enables materialized view creation on this node. +# Materialized views are considered experimental and are not recommended for production use. +enable_materialized_views: true + +# The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation. +# Lowering this value on Windows can provide much tighter latency and better throughput, however +# some virtualized environments may see a negative performance impact from changing this setting +# below their system default. The sysinternals 'clockres' tool can confirm your system's default +# setting. +windows_timer_interval: 1 + + +# Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from +# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by +# the "key_alias" is the only key that will be used for encrypt opertaions; previously used keys +# can still (and should!) be in the keystore and will be used on decrypt operations +# (to handle the case of key rotation). +# +# It is strongly recommended to download and install Java Cryptography Extension (JCE) +# Unlimited Strength Jurisdiction Policy Files for your version of the JDK. +# (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html) +# +# Currently, only the following file types are supported for transparent data encryption, although +# more are coming in future cassandra releases: commitlog, hints +transparent_data_encryption_options: + enabled: false + chunk_length_kb: 64 + cipher: AES/CBC/PKCS5Padding + key_alias: testing:1 + # CBC IV length for AES needs to be 16 bytes (which is also the default size) + # iv_length: 16 + key_provider: + - class_name: org.apache.cassandra.security.JKSKeyProvider + parameters: + - keystore: conf/.keystore + keystore_password: cassandra + store_type: JCEKS + key_password: cassandra + + +##################### +# SAFETY THRESHOLDS # +##################### + +# When executing a scan, within or across a partition, we need to keep the +# tombstones seen in memory so we can return them to the coordinator, which +# will use them to make sure other replicas also know about the deleted rows. +# With workloads that generate a lot of tombstones, this can cause performance +# problems and even exaust the server heap. +# (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) +# Adjust the thresholds here if you understand the dangers and want to +# scan more tombstones anyway. These thresholds may also be adjusted at runtime +# using the StorageService mbean. +tombstone_warn_threshold: 1000 +tombstone_failure_threshold: 100000 + +# Log WARN on any multiple-partition batch size exceeding this value. 5kb per batch by default. +# Caution should be taken on increasing the size of this threshold as it can lead to node instability. +batch_size_warn_threshold_in_kb: 5 + +# Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default. 
+batch_size_fail_threshold_in_kb: 50 + +# Log WARN on any batches not of type LOGGED than span across more partitions than this limit +unlogged_batch_across_partitions_warn_threshold: 10 + +# Log a warning when compacting partitions larger than this value +compaction_large_partition_warning_threshold_mb: 100 + +# GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level +# Adjust the threshold based on your application throughput requirement +# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level +gc_warn_threshold_in_ms: 1000 + +# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption +# early. Any value size larger than this threshold will result into marking an SSTable +# as corrupted. This should be positive and less than 2048. +# max_value_size_in_mb: 256 + +# Back-pressure settings # +# If enabled, the coordinator will apply the back-pressure strategy specified below to each mutation +# sent to replicas, with the aim of reducing pressure on overloaded replicas. +back_pressure_enabled: false +# The back-pressure strategy applied. +# The default implementation, RateBasedBackPressure, takes three arguments: +# high ratio, factor, and flow type, and uses the ratio between incoming mutation responses and outgoing mutation requests. +# If below high ratio, outgoing mutations are rate limited according to the incoming rate decreased by the given factor; +# if above high ratio, the rate limiting is increased by the given factor; +# such factor is usually best configured between 1 and 10, use larger values for a faster recovery +# at the expense of potentially more dropped mutations; +# the rate limiting is applied according to the flow type: if FAST, it's rate limited at the speed of the fastest replica, +# if SLOW at the speed of the slowest one. +# New strategies can be added. Implementors need to implement org.apache.cassandra.net.BackpressureStrategy and +# provide a public constructor accepting a Map. +back_pressure_strategy: + - class_name: org.apache.cassandra.net.RateBasedBackPressure + parameters: + - high_ratio: 0.90 + factor: 5 + flow: FAST + +# Coalescing Strategies # +# Coalescing multiples messages turns out to significantly boost message processing throughput (think doubling or more). +# On bare metal, the floor for packet processing throughput is high enough that many applications won't notice, but in +# virtualized environments, the point at which an application can be bound by network packet processing can be +# surprisingly low compared to the throughput of task processing that is possible inside a VM. It's not that bare metal +# doesn't benefit from coalescing messages, it's that the number of packets a bare metal network interface can process +# is sufficient for many applications such that no load starvation is experienced even without coalescing. +# There are other benefits to coalescing network messages that are harder to isolate with a simple metric like messages +# per second. By coalescing multiple tasks together, a network thread can process multiple messages for the cost of one +# trip to read from a socket, and all the task submission work can be done at the same time reducing context switching +# and increasing cache friendliness of network message processing. +# See CASSANDRA-8692 for details. + +# Strategy to use for coalescing messages in OutboundTcpConnection. +# Can be fixed, movingaverage, timehorizon, disabled (default). 
+# You can also specify a subclass of CoalescingStrategies.CoalescingStrategy by name. +# otc_coalescing_strategy: DISABLED + +# How many microseconds to wait for coalescing. For fixed strategy this is the amount of time after the first +# message is received before it will be sent with any accompanying messages. For moving average this is the +# maximum amount of time that will be waited as well as the interval at which messages must arrive on average +# for coalescing to be enabled. +# otc_coalescing_window_us: 200 + +# Do not try to coalesce messages if we already got that many messages. This should be more than 2 and less than 128. +# otc_coalescing_enough_coalesced_messages: 8 + +# How many milliseconds to wait between two expiration runs on the backlog (queue) of the OutboundTcpConnection. +# Expiration is done if messages are piling up in the backlog. Droppable messages are expired to free the memory +# taken by expired messages. The interval should be between 0 and 1000, and in most installations the default value +# will be appropriate. A smaller value could potentially expire messages slightly sooner at the expense of more CPU +# time and queue contention while iterating the backlog of messages. +# An interval of 0 disables any wait time, which is the behavior of former Cassandra versions. +# +# otc_backlog_expiration_interval_ms: 200 diff --git a/docker/conf/deposit.yml b/docker/conf/deposit.yml new file mode 100644 index 0000000..7b0cabf --- /dev/null +++ b/docker/conf/deposit.yml @@ -0,0 +1,17 @@ +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008 + +allowed_hosts: + - swh-deposit + +private: + secret_key: prod-in-docker + db: + host: swh-deposit-db + port: 5432 + name: swh-deposit + user: postgres + password: testpassword + media_root: /tmp/swh-deposit/uploads diff --git a/docker/conf/grafana/dashboards/task-processing.json b/docker/conf/grafana/dashboards/task-processing.json new file mode 100644 index 0000000..a1cc4b1 --- /dev/null +++ b/docker/conf/grafana/dashboards/task-processing.json @@ -0,0 +1,373 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 2, + "id": 18, + "iteration": 1551112370226, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/uncaught/", + "color": "#bf1b00" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(swh_task_called_count{worker=~\"$worker\"}[$interval])) by (task)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + }, + { + "expr": "sum(rate(swh_task_failure_count{worker=~\"$worker\"}[$interval])) by (task)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": 
"{{task}} uncaught exceptions", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task counts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "tasks per second", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#c15c17", + "colorScale": "sqrt", + "colorScheme": "interpolateViridis", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "Prometheus", + "description": "Each square's color represents the number of tasks that completed within that duration range.", + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 9 + }, + "heatmap": {}, + "hideTimeOverride": false, + "highlightCards": true, + "id": 2, + "legend": { + "show": true + }, + "links": [], + "repeat": "task", + "repeatDirection": "v", + "scopedVars": { + "task": { + "selected": true, + "text": "swh.loader.git.tasks.UpdateGitRepository", + "value": "swh.loader.git.tasks.UpdateGitRepository" + } + }, + "targets": [ + { + "expr": "sum(increase(swh_task_duration_seconds_bucket{task=~\"$task\",worker=~\"$worker\"}[$interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "$task durations", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": null, + "transparent": false, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "refresh": false, + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 100, + "auto_min": "2m", + "current": { + "text": "auto", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "2m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "swh.loader.git.tasks.UpdateGitRepository", + "value": 
"swh.loader.git.tasks.UpdateGitRepository" + }, + "datasource": "Prometheus", + "definition": "label_values(swh_task_called_count, task)", + "hide": 0, + "includeAll": true, + "label": "Task name", + "multi": false, + "name": "task", + "options": [], + "query": "label_values(swh_task_called_count, task)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(swh_task_called_count{task=~\"$task\"}, worker)", + "hide": 0, + "includeAll": true, + "label": "Worker", + "multi": true, + "name": "worker", + "options": [], + "query": "label_values(swh_task_called_count{task=~\"$task\"}, worker)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Worker task processing", + "uid": "b_xh3f9ik", + "version": 8 +} diff --git a/docker/conf/grafana/provisioning/dashboards/all.yaml b/docker/conf/grafana/provisioning/dashboards/all.yaml new file mode 100644 index 0000000..aa79647 --- /dev/null +++ b/docker/conf/grafana/provisioning/dashboards/all.yaml @@ -0,0 +1,6 @@ +- name: 'default' # name of this dashboard configuration (not dashboard itself) + org_id: 1 # id of the org to hold the dashboard + folder: '' # name of the folder to put the dashboard (http://docs.grafana.org/v5.0/reference/dashboard_folders/) + type: 'file' # type of dashboard description (json files) + options: + folder: '/var/lib/grafana/dashboards' # where dashboards are diff --git a/docker/conf/grafana/provisioning/datasources/prometheus.yaml b/docker/conf/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 0000000..289aaf1 --- /dev/null +++ b/docker/conf/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,11 @@ +# config file version +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + url: http://localhost:5080/prometheus + access: direct + isDefault: true + version: 1 + editable: false diff --git a/docker/conf/indexer.yml b/docker/conf/indexer.yml new file mode 100644 index 0000000..dd2133b --- /dev/null +++ b/docker/conf/indexer.yml @@ -0,0 +1,31 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ +objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ +indexer_storage: + cls: remote + args: + url: http://swh-idx-storage:5007/ +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ +celery: + task_broker: amqp://guest:guest@amqp// + task_modules: + - swh.indexer.tasks + task_queues: + - swh.indexer.tasks.ContentFossologyLicense + - swh.indexer.tasks.ContentLanguage + - swh.indexer.tasks.ContentMimetype + - swh.indexer.tasks.ContentRangeFossologyLicense + - swh.indexer.tasks.ContentRangeMimetype + - swh.indexer.tasks.Ctags + - swh.indexer.tasks.OriginHead + - swh.indexer.tasks.OriginMetadata + - swh.indexer.tasks.RecomputeChecksums + - swh.indexer.tasks.RevisionMetadata diff --git a/docker/conf/indexer_journal_client.yml 
b/docker/conf/indexer_journal_client.yml new file mode 100644 index 0000000..91877c9 --- /dev/null +++ b/docker/conf/indexer_journal_client.yml @@ -0,0 +1,11 @@ +journal: + brokers: + - kafka + group_id: swh.indexer.journal_client + +max_messages: 50 + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ diff --git a/docker/conf/indexer_storage.yml b/docker/conf/indexer_storage.yml new file mode 100644 index 0000000..a5eb85c --- /dev/null +++ b/docker/conf/indexer_storage.yml @@ -0,0 +1,4 @@ +indexer_storage: + cls: local + args: + db: postgresql:///?service=swh-indexers diff --git a/docker/conf/journal_backfiller.yml b/docker/conf/journal_backfiller.yml new file mode 100644 index 0000000..268b752 --- /dev/null +++ b/docker/conf/journal_backfiller.yml @@ -0,0 +1,9 @@ +brokers: + - kafka + +final_prefix: swh.journal.objects +client_id: swh.journal.backfiller +object_types: + - content + +storage_dbconn: postgresql:///?service=swh-storage diff --git a/docker/conf/lister.yml b/docker/conf/lister.yml new file mode 100644 index 0000000..0c326f1 --- /dev/null +++ b/docker/conf/lister.yml @@ -0,0 +1,60 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +lister: + cls: local + args: + db: postgresql://postgres@swh-listers-db/swh-listers + +celery: + task_broker: amqp://guest:guest@amqp// + task_modules: + - swh.lister.bitbucket.tasks + - swh.lister.cgit.tasks + - swh.lister.cran.tasks + - swh.lister.debian.tasks + - swh.lister.github.tasks + - swh.lister.gitlab.tasks + - swh.lister.gnu.tasks + - swh.lister.npm.tasks + - swh.lister.packagist.tasks + - swh.lister.phabricator.tasks + - swh.lister.pypi.tasks + task_queues: + - swh.lister.bitbucket.tasks.FullBitBucketRelister + - swh.lister.bitbucket.tasks.IncrementalBitBucketLister + - swh.lister.bitbucket.tasks.RangeBitBucketLister + - swh.lister.bitbucket.tasks.ping + - swh.lister.cgit.tasks.CGitListerTask + - swh.lister.cgit.tasks.ping + - swh.lister.cran.tasks.CRANListerTask + - swh.lister.cran.tasks.ping + - swh.lister.debian.tasks.DebianListerTask + - swh.lister.debian.tasks.ping + - swh.lister.github.tasks.FullGitHubRelister + - swh.lister.github.tasks.IncrementalGitHubLister + - swh.lister.github.tasks.RangeGitHubLister + - swh.lister.github.tasks.ping + - swh.lister.gitlab.tasks.FullGitLabRelister + - swh.lister.gitlab.tasks.IncrementalGitLabLister + - swh.lister.gitlab.tasks.RangeGitLabLister + - swh.lister.gitlab.tasks.ping + - swh.lister.gnu.tasks.GNUListerTask + - swh.lister.gnu.tasks.ping + - swh.lister.npm.tasks.NpmIncrementalListerTask + - swh.lister.npm.tasks.NpmListerTask + - swh.lister.npm.tasks.ping + - swh.lister.packagist.tasks.PackagistListerTask + - swh.lister.packagist.tasks.ping + - swh.lister.phabricator.tasks.FullPhabricatorLister + - swh.lister.phabricator.tasks.IncrementalPhabricatorLister + - swh.lister.phabricator.tasks.ping + - swh.lister.pypi.tasks.PyPIListerTask + - swh.lister.pypi.tasks.ping diff --git a/docker/conf/loader.yml b/docker/conf/loader.yml new file mode 100644 index 0000000..debcc93 --- /dev/null +++ b/docker/conf/loader.yml @@ -0,0 +1,50 @@ +storage: + cls: filter + args: + storage: + cls: buffer + args: + storage: + cls: remote + args: + url: http://swh-storage:5002/ + min_batch_size: + content: 10000 + content_bytes: 104857600 + directory: 1000 + revision: 1000 + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +celery: + task_broker: amqp://guest:guest@amqp// + 
task_modules: + - swh.loader.git.tasks + - swh.loader.mercurial.tasks + - swh.loader.svn.tasks + - swh.deposit.loader.tasks + - swh.loader.package.tasks + + task_queues: + - swh.loader.dir.tasks.LoadDirRepository + - swh.loader.git.tasks.LoadDiskGitRepository + - swh.loader.git.tasks.UncompressAndLoadDiskGitRepository + - swh.loader.git.tasks.UpdateGitRepository + - swh.loader.mercurial.tasks.LoadArchiveMercurial + - swh.loader.mercurial.tasks.LoadMercurial + - swh.loader.package.tasks.LoadArchive + - swh.loader.package.tasks.LoadDebian + - swh.loader.package.tasks.LoadNpm + - swh.loader.package.tasks.LoadPyPI + - swh.loader.svn.tasks.DumpMountAndLoadSvnRepository + - swh.loader.svn.tasks.LoadSvnRepository + - swh.loader.svn.tasks.MountAndLoadSvnRepository + - swh.deposit.loader.tasks.LoadDepositArchiveTsk + - swh.deposit.loader.tasks.ChecksDepositTsk + +lister_db_url: postgresql://postgres@swh-listers-db/swh-listers + +url: 'http://swh-deposit:5006' diff --git a/docker/conf/nginx.conf b/docker/conf/nginx.conf new file mode 100644 index 0000000..a0774ad --- /dev/null +++ b/docker/conf/nginx.conf @@ -0,0 +1,107 @@ +worker_processes 1; + +# Show startup logs on stderr; switch to debug to print, well, debug logs when +# running nginx-debug +error_log /dev/stderr info; + +events { + worker_connections 1024; +} + +http { + include mime.types; + default_type application/octet-stream; + sendfile on; + keepalive_timeout 65; + + # Built-in Docker resolver. Needed to allow on-demand resolution of proxy + # upstreams. + resolver 127.0.0.11 valid=30s; + + server { + listen 5080 default_server; + + # Add a trailing slash to top level requests (e.g. http://localhost:5080/flower) + + rewrite ^/([^/]+)$ /$1/ permanent; + + # In this pile of proxies, all upstreams are set using a variable. This + # makes nginx DNS-resolve the name of the upstream when clients request + # them, rather than on start. This avoids an unstarted container preventing + # nginx from starting. + # + # Variables need to be set as early as possible, as they're statements from + # the rewrite module and `rewrite [...] break;` will prevent these + # statements from being executed. 
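+ # As an illustration of that pattern (a sketch only, using a hypothetical
+ # swh-example service on port 5999; the real location blocks below follow
+ # the same shape):
+ #
+ #   location /example/ {
+ #       set $upstream "http://swh-example:5999";
+ #       rewrite ^/example/(.*)$ /$1 break;
+ #       proxy_pass $upstream;
+ #   }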
+ + location /flower/ { + set $upstream "http://flower:5555"; + + rewrite ^/flower/(.*)$ /$1 break; + proxy_pass $upstream; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + proxy_redirect off; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } + location /rabbitmq/ { + set $upstream "http://amqp:15672"; + + rewrite ^ $request_uri; + rewrite ^/rabbitmq(/.*)$ $1 break; + + proxy_pass $upstream$uri; + } + location /scheduler { + set $upstream "http://swh-scheduler-api:5008"; + + rewrite ^/scheduler/(.*)$ /$1 break; + proxy_pass $upstream; + } + location /storage { + set $upstream "http://swh-storage:5002"; + + rewrite ^/storage/(.*)$ /$1 break; + proxy_pass $upstream; + } + location /indexer-storage { + set $upstream "http://swh-idx-storage:5007"; + + rewrite ^/indexer-storage/(.*)$ /$1 break; + + proxy_pass $upstream; + } + location /deposit { + set $upstream "http://swh-deposit:5006"; + + rewrite ^/deposit/(.*)$ /$1 break; + proxy_pass $upstream; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + proxy_redirect off; + } + location /objstorage { + set $upstream "http://swh-objstorage:5003"; + + rewrite ^/objstorage/(.*)$ /$1 break; + proxy_pass $upstream; + } + location /prometheus { + set $upstream "http://prometheus:9090"; + proxy_pass $upstream; + } + location /grafana { + set $upstream "http://grafana:3000"; + rewrite ^/grafana/(.*)$ /$1 break; + proxy_pass $upstream; + } + location / { + set $upstream "http://swh-web:5004"; + proxy_pass $upstream; + } + } +} diff --git a/docker/conf/objstorage.yml b/docker/conf/objstorage.yml new file mode 100644 index 0000000..dde0323 --- /dev/null +++ b/docker/conf/objstorage.yml @@ -0,0 +1,7 @@ +objstorage: + cls: pathslicing + args: + root: /srv/softwareheritage/objects + slicing: 0:5 + +client_max_size: 1073741824 diff --git a/docker/conf/prometheus-jmx-exporter-cassandra.yml b/docker/conf/prometheus-jmx-exporter-cassandra.yml new file mode 100644 index 0000000..7c256d9 --- /dev/null +++ b/docker/conf/prometheus-jmx-exporter-cassandra.yml @@ -0,0 +1,42 @@ +# see: +# - http://cassandra.apache.org/doc/latest/operating/metrics.html +# - https://blog.pythian.com/step-step-monitoring-cassandra-prometheus-grafana/ + +startDelaySeconds: 0 +hostPort: cassandra-seed:7199 +username: +password: +#jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:1234/jmxrmi +ssl: false +lowercaseOutputName: false +lowercaseOutputLabelNames: false +whitelistObjectNames: ["org.apache.cassandra.metrics:*"] +blacklistObjectNames: [] +rules: +- pattern: org.apache.cassandra.metrics<>(Count|Value) + name: cassandra_$1_$3 + labels: + address: "$2" +- pattern: org.apache.cassandra.metrics<>(Mean) + name: cassandra_$1_$2_$3 +- pattern: org.apache.cassandra.net<>(DownEndpointCount) + name: cassandra_$1_$2 +- pattern: org.apache.cassandra.metrics<>(Count|Mean|95thPercentile) + name: cassandra_$1_$3_$4 + labels: + "$1": "$2" +- pattern: org.apache.cassandra.metrics<>(Count|Mean|95thPercentile) + name: cassandra_$1_$4_$5 + labels: + "keyspace": "$2" + "table": "$3" +- pattern: org.apache.cassandra.metrics<>(Count|Mean|95thPercentile) + name: cassandra_$1_$3_$4 + labels: + "type": "$2" +- pattern: org.apache.cassandra.metrics<>(Count|Value) + name: cassandra_$1_$5 + labels: + "$1": "$4" + "$2": "$3" diff --git a/docker/conf/prometheus-jmx-exporter-logging.properties b/docker/conf/prometheus-jmx-exporter-logging.properties new file mode 100644 index 0000000..0f4c31f --- 
/dev/null +++ b/docker/conf/prometheus-jmx-exporter-logging.properties @@ -0,0 +1,6 @@ +handlers=java.util.logging.ConsoleHandler +# uncomment this to get logs: +#java.util.logging.ConsoleHandler.level=ALL +#io.prometheus.jmx.level=ALL +#io.prometheus.jmx.shaded.io.prometheus.jmx.level=ALL + diff --git a/docker/conf/prometheus-statsd-mapping.yml b/docker/conf/prometheus-statsd-mapping.yml new file mode 100644 index 0000000..a994106 --- /dev/null +++ b/docker/conf/prometheus-statsd-mapping.yml @@ -0,0 +1,27 @@ +defaults: + timer_type: histogram + buckets: + - .005 + - .01 + - .025 + - .05 + - .1 + - .25 + - .5 + - .75 + - 1 + - 2 + - 5 + - 10 + - 15 + - 30 + - 45 + - 60 + - 120 + - 300 + - 600 + - 900 + - 1800 + - 2700 + - 3600 + - 7200 diff --git a/docker/conf/prometheus.yml b/docker/conf/prometheus.yml new file mode 100644 index 0000000..f342c98 --- /dev/null +++ b/docker/conf/prometheus.yml @@ -0,0 +1,22 @@ +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + metrics_path: /prometheus/metrics + + - job_name: statsd-exporter + static_configs: + - targets: + - prometheus-statsd-exporter:9102 + + - job_name: jmx-exporter-cassandra + static_configs: + - targets: + - prometheus-jmx-exporter-cassandra:5556 diff --git a/docker/conf/scheduler.yml b/docker/conf/scheduler.yml new file mode 100644 index 0000000..af1b3f5 --- /dev/null +++ b/docker/conf/scheduler.yml @@ -0,0 +1,8 @@ +scheduler: + cls: local + args: + db: postgresql:///?service=swh-scheduler +celery: + task_broker: amqp://guest:guest@amqp// + broker_transport_options: + max_retries: 1 diff --git a/docker/conf/search.yml b/docker/conf/search.yml new file mode 100644 index 0000000..883f6f2 --- /dev/null +++ b/docker/conf/search.yml @@ -0,0 +1,5 @@ +search: + cls: elasticsearch + args: + hosts: + - elasticsearch:9200 diff --git a/docker/conf/search_journal_client_objects.yml b/docker/conf/search_journal_client_objects.yml new file mode 100644 index 0000000..4248935 --- /dev/null +++ b/docker/conf/search_journal_client_objects.yml @@ -0,0 +1,10 @@ +search: + cls: remote + args: + url: http://swh-search:5010/ +journal: + brokers: + - kafka + group_id: swh.search.journal_client.objects + prefix: swh.journal.objects + diff --git a/docker/conf/storage-replica.yml b/docker/conf/storage-replica.yml new file mode 100644 index 0000000..2412282 --- /dev/null +++ b/docker/conf/storage-replica.yml @@ -0,0 +1,8 @@ +storage: + cls: local + args: + db: postgresql:///?service=swh-storage-replica + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ diff --git a/docker/conf/storage.yml b/docker/conf/storage.yml new file mode 100644 index 0000000..a27b3be --- /dev/null +++ b/docker/conf/storage.yml @@ -0,0 +1,15 @@ +storage: + cls: local + args: + db: postgresql:///?service=swh-storage + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + journal_writer: + cls: kafka + args: + brokers: + - kafka + prefix: swh.journal.objects + client_id: swh.storage.master diff --git a/docker/conf/storage_cassandra.yml b/docker/conf/storage_cassandra.yml new file mode 100644 index 0000000..a4d1ec4 --- /dev/null +++ b/docker/conf/storage_cassandra.yml @@ -0,0 +1,11 @@ +storage: + cls: cassandra + args: + hosts: + - 
cassandra-seed + keyspace: swh + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + diff --git a/docker/conf/vault-api.yml b/docker/conf/vault-api.yml new file mode 100644 index 0000000..b3ec6a3 --- /dev/null +++ b/docker/conf/vault-api.yml @@ -0,0 +1,17 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ +vault: + cls: local + args: + db: postgresql:///?service=swh-vault +cache: + cls: pathslicing + args: + root: /srv/softwareheritage/vault + slicing: 0:5 diff --git a/docker/conf/vault-worker.yml b/docker/conf/vault-worker.yml new file mode 100644 index 0000000..8a195ac --- /dev/null +++ b/docker/conf/vault-worker.yml @@ -0,0 +1,17 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ +vault: + cls: remote + args: + url: http://swh-vault-api:5005/ +celery: + task_broker: amqp://guest:guest@amqp// + task_modules: + - swh.vault.cooking_tasks + task_queues: + - swh.vault.cooking_tasks.SWHBatchCookingTask + - swh.vault.cooking_tasks.SWHCookingTask + +max_bundle_size: 536870912 diff --git a/docker/conf/web-replica.yml b/docker/conf/web-replica.yml new file mode 100644 index 0000000..06ba565 --- /dev/null +++ b/docker/conf/web-replica.yml @@ -0,0 +1,37 @@ +storage: + cls: remote + args: + url: http://swh-storage-replica:5002/ + timeout: 1 + +objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + +indexer_storage: + cls: remote + args: + url: http://swh-idx-storage:5007/ + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +vault: + cls: remote + args: + url: http://swh-vault-api:5005/ + +deposit: + private_api_url: https://swh-deposit:5006/1/private/ + private_api_user: swhworker + private_api_password: '' + +allowed_hosts: + - "*" + +debug: yes + +serve_assets: yes diff --git a/docker/conf/web.yml b/docker/conf/web.yml new file mode 100644 index 0000000..23eb3f8 --- /dev/null +++ b/docker/conf/web.yml @@ -0,0 +1,62 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ + timeout: 1 + +objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + +indexer_storage: + cls: remote + args: + url: http://swh-idx-storage:5007/ + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +vault: + cls: remote + args: + url: http://swh-vault-api:5005/ + +deposit: + private_api_url: https://swh-deposit:5006/1/private/ + private_api_user: swhworker + private_api_password: '' + +allowed_hosts: + - "*" + +debug: yes + +serve_assets: yes + +development_db: /tmp/db.sqlite3 + +throttling: + scopes: + swh_api: + limiter_rate: + default: 120/h + exempted_networks: + - 0.0.0.0/0 + swh_api_origin_visit_latest: + limiter_rate: + default: 700/m + exempted_networks: + - 0.0.0.0/0 + swh_vault_cooking: + limiter_rate: + default: 120/h + exempted_networks: + - 0.0.0.0/0 + swh_save_origin: + limiter_rate: + default: 120/h + exempted_networks: + - 0.0.0.0/0 diff --git a/docker/docker-compose.cassandra.yml b/docker/docker-compose.cassandra.yml new file mode 100644 index 0000000..ae843a4 --- /dev/null +++ b/docker/docker-compose.cassandra.yml @@ -0,0 +1,61 @@ +version: '2' + +services: + cassandra-seed: + # This container starts a Cassandra instance that must be used as the + # contact-point for clients. This container will then make the client + # discover other cassandra containers. + # This container must not be scaled up; scale up th 'cassandra' + # container instead. 
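+ # For example (a usage sketch, assuming the docker-compose.yml and
+ # docker-compose.cassandra.yml files shipped in this directory; the
+ # non-seed 'cassandra' service defined below is the one to scale):
+ #
+ #   docker-compose -f docker-compose.yml -f docker-compose.cassandra.yml \
+ #       up -d --scale cassandra=2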
+ image: cassandra + env_file: + - ./env/cassandra.env + entrypoint: /swh_entrypoint.sh + volumes: + - "./services/cassandra/swh_entrypoint.sh:/swh_entrypoint.sh:ro" + - "./conf/cassandra.yaml:/cassandra.yaml:ro" + + cassandra: + # Additional Cassandra instance(s), which may be scaled up, but not + # down. They will automatically connect to 'cassandra-seed', and + # 'cassandra-seed' will tell clients to connect to these 'cassandra' + # containers to load-balance. + image: cassandra + entrypoint: /swh_entrypoint.sh + volumes: + - "./services/cassandra/swh_entrypoint.sh:/swh_entrypoint.sh:ro" + - "./conf/cassandra.yaml:/cassandra.yaml:ro" + env_file: + - ./env/cassandra.env + + prometheus: + # just to add the dep on the cassandra-jmx-exporter-cassandra + depends_on: + - prometheus-statsd-exporter + - prometheus-jmx-exporter-cassandra + + prometheus-jmx-exporter-cassandra: + image: sscaling/jmx-prometheus-exporter + environment: + JVM_OPTS: "-Djava.util.logging.config.file=/logging.properties" + volumes: + - "./conf/prometheus-jmx-exporter-cassandra.yml:/opt/jmx_exporter/config.yml:ro" + - "./conf/prometheus-jmx-exporter-logging.properties:/logging.properties:ro" + ports: + - "5556:5556" + + swh-storage: + volumes: + # note: you need to be on the cassandra-backend2 branch + - "../swh-storage:/src/swh-storage" + - "./conf/storage_cassandra.yml:/storage.yml:ro" + - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" + depends_on: + - swh-storage-db + - cassandra-seed + - swh-objstorage + - kafka + environment: + CASSANDRA_SEED: cassandra-seed + STORAGE_BACKEND: cassandra + PYTHONUNBUFFERED: 1 diff --git a/docker/docker-compose.override.yml.example b/docker/docker-compose.override.yml.example new file mode 100644 index 0000000..5221796 --- /dev/null +++ b/docker/docker-compose.override.yml.example @@ -0,0 +1,6 @@ +version: '2' + +services: + swh-objstorage: + volumes: + - "/home/ddouard/src/swh-environment/swh-objstorage:/src/swh-objstorage" diff --git a/docker/docker-compose.search.yml b/docker/docker-compose.search.yml new file mode 100644 index 0000000..091bd12 --- /dev/null +++ b/docker/docker-compose.search.yml @@ -0,0 +1,39 @@ +version: '2' + +services: + elasticsearch: + image: elasticsearch:7.3.2 + env_file: + - ./env/elasticsearch.env + ports: + - 9200:9200 + volumes: + - elasticsearch-data:/usr/share/elasticsearch/data + + swh-search: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + ports: + - 5010:5010 + depends_on: + - elasticsearch + environment: + SWH_CONFIG_FILENAME: /search.yml + volumes: + - "./conf/search.yml:/search.yml:ro" + - "./services/swh-search/entrypoint.sh:/entrypoint.sh:ro" + + swh-search-journal-client-objects: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + depends_on: + - swh-search + - kafka + volumes: + - "./conf/search_journal_client_objects.yml:/etc/softwareheritage/search/journal_client_objects.yml:ro" + - "./services/swh-search-journal-client-objects/entrypoint.sh:/entrypoint.sh:ro" + +volumes: + elasticsearch-data: diff --git a/docker/docker-compose.storage-replica.yml b/docker/docker-compose.storage-replica.yml new file mode 100644 index 0000000..2824615 --- /dev/null +++ b/docker/docker-compose.storage-replica.yml @@ -0,0 +1,66 @@ +version: '2' + +services: + # override web app to use the replica + swh-web: + environment: + SWH_CONFIG_FILENAME: /web-replica.yml + volumes: + - "./conf/web-replica.yml:/web-replica.yml:ro" + + # create a dedicated db for the replica + swh-storage-replica-db: + image: postgres:11 + env_file: + - 
./env/storage-db-replica.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + # and an RPC server + swh-storage-replica: + image: swh/stack + build: ./ + depends_on: + - swh-storage-replica-db + - swh-objstorage + env_file: + - ./env/storage-db-replica.env + environment: + SWH_CONFIG_FILENAME: /storage-replica.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/storage-replica.yml:/storage-replica.yml:ro" + - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" + + # and the background process that keeps the replica in sync with the + # main graph + swh-storage-replica-replayer: + image: swh/stack + build: ./ + depends_on: + - swh-storage-replica-db + - swh-objstorage + env_file: + - ./env/storage-db-replica.env + environment: + SWH_CONFIG_FILENAME: /storage-replica.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/storage-replica.yml:/storage-replica.yml:ro" + - "./services/swh-storage-replayer/entrypoint.sh:/entrypoint.sh:ro" + + swh-journal-backfiller: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + environment: + SWH_CONFIG_FILENAME: /journal_backfiller.yml + env_file: + - ./env/storage-db.env + depends_on: + - swh-storage-db + - kafka + volumes: + - "./conf/journal_backfiller.yml:/journal_backfiller.yml:ro" + - "./services/swh-journal-backfiller/entrypoint.sh:/entrypoint.sh:ro" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..44da7d2 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,390 @@ +version: '2' + +services: + + amqp: + image: rabbitmq:3.6-management + ports: + - 5072:5672 + +# flower: +# image: mher/flower +# command: --broker=amqp://guest:guest@amqp:5672// --url_prefix=flower +# ports: +# - 5055:5555 +# depends_on: +# - amqp + + zookeeper: + image: wurstmeister/zookeeper + restart: always + + kafka: + image: wurstmeister/kafka + ports: + - "5092:9092" + env_file: ./env/kafka.env + depends_on: + - zookeeper + + kafka-manager: + image: hlebalbau/kafka-manager:stable + ports: + - "5093:9000" + environment: + ZK_HOSTS: zookeeper:2181 + APPLICATION_SECRET: random-secret + command: -Dpidfile.path=/dev/null + + prometheus: + image: prom/prometheus + depends_on: + - prometheus-statsd-exporter + command: + # Needed for the reverse-proxy + - "--web.external-url=/prometheus" + - "--config.file=/etc/prometheus/prometheus.yml" + volumes: + - "./conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro" + restart: unless-stopped + + prometheus-statsd-exporter: + image: prom/statsd-exporter + command: + - "--statsd.mapping-config=/etc/prometheus/statsd-mapping.yml" + volumes: + - "./conf/prometheus-statsd-mapping.yml:/etc/prometheus/statsd-mapping.yml:ro" + restart: unless-stopped + + grafana: + image: grafana/grafana + restart: unless-stopped + depends_on: + - prometheus + environment: + GF_SERVER_ROOT_URL: http://localhost:5080/grafana + volumes: + - "./conf/grafana/provisioning:/etc/grafana/provisioning:ro" + - "./conf/grafana/dashboards:/var/lib/grafana/dashboards" + + nginx: + image: nginx + volumes: + - "./conf/nginx.conf:/etc/nginx/nginx.conf:ro" + ports: + - 5080:5080 + +# Scheduler + + swh-scheduler-db: + image: postgres:11 + env_file: + - ./env/scheduler-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-scheduler-api: + image: swh/stack + build: ./ + env_file: + - ./env/scheduler-db.env + - ./env/scheduler.env + environment: + SWH_CONFIG_FILENAME: /scheduler.yml + SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml + entrypoint: 
/entrypoint.sh + depends_on: + - swh-scheduler-db + ports: + - 5008:5008 + volumes: + - "./conf/scheduler.yml:/scheduler.yml:ro" + - "./services/swh-scheduler-api/entrypoint.sh:/entrypoint.sh:ro" + + swh-scheduler-listener: + image: swh/stack + build: ./ + env_file: + - ./env/scheduler-db.env + - ./env/scheduler.env + environment: + SWH_CONFIG_FILENAME: /scheduler.yml + SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml + entrypoint: /entrypoint.sh + command: start-listener + depends_on: + - swh-scheduler-api + - amqp + volumes: + - "./conf/scheduler.yml:/scheduler.yml:ro" + - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" + + swh-scheduler-runner: + image: swh/stack + build: ./ + env_file: + - ./env/scheduler-db.env + - ./env/scheduler.env + environment: + SWH_CONFIG_FILENAME: /scheduler.yml + SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml + entrypoint: /entrypoint.sh + command: start-runner -p 10 + depends_on: + - swh-scheduler-api + - amqp + volumes: + - "./conf/scheduler.yml:/scheduler.yml:ro" + - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Graph storage + + swh-storage-db: + image: postgres:11 + env_file: + - ./env/storage-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-storage: + image: swh/stack + build: ./ + ports: + - 5002:5002 + depends_on: + - swh-storage-db + - swh-objstorage + - kafka + env_file: + - ./env/storage-db.env + environment: + SWH_CONFIG_FILENAME: /storage.yml + STORAGE_BACKEND: postgresql + entrypoint: /entrypoint.sh + volumes: + - "./conf/storage.yml:/storage.yml:ro" + - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" + +# Object storage + + swh-objstorage: + build: ./ + image: swh/stack + ports: + - 5003:5003 + environment: + SWH_CONFIG_FILENAME: /objstorage.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/objstorage.yml:/objstorage.yml:ro" + - "./services/swh-objstorage/entrypoint.sh:/entrypoint.sh:ro" + +# Indexer storage + + swh-idx-storage-db: + image: postgres:11 + env_file: + - ./env/indexers-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-idx-storage: + image: swh/stack + build: ./ + ports: + - 5007:5007 + depends_on: + - swh-idx-storage-db + env_file: + - ./env/indexers-db.env + environment: + SWH_CONFIG_FILENAME: /indexer_storage.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/indexer_storage.yml:/indexer_storage.yml:ro" + - "./services/swh-indexer-storage/entrypoint.sh:/entrypoint.sh:ro" + +# Web interface + + swh-web: + build: ./ + image: swh/stack + ports: + - 5004:5004 + depends_on: + - swh-objstorage + - swh-storage + - swh-idx-storage + environment: + VERBOSITY: 3 + DJANGO_SETTINGS_MODULE: swh.web.settings.development + SWH_CONFIG_FILENAME: /web.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/web.yml:/web.yml:ro" + - "./services/swh-web/entrypoint.sh:/entrypoint.sh:ro" + + swh-deposit-db: + image: postgres:11 + env_file: + - ./env/deposit-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-deposit: + image: swh/stack + build: ./ + ports: + - 5006:5006 + depends_on: + - swh-deposit-db + - swh-scheduler-api + environment: + VERBOSITY: 3 + SWH_CONFIG_FILENAME: /deposit.yml + DJANGO_SETTINGS_MODULE: swh.deposit.settings.production + env_file: + - ./env/deposit-db.env + entrypoint: /entrypoint.sh + volumes: + - "./conf/deposit.yml:/deposit.yml:ro" + - "./services/swh-deposit/entrypoint.sh:/entrypoint.sh:ro" + + swh-vault-db: + image: postgres:11 + env_file: + - 
./env/vault-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-vault-api: + image: swh/stack + build: ./ + env_file: + - ./env/vault-db.env + environment: + SWH_CONFIG_FILENAME: /vault-api.yml + command: server + ports: + - 5005:5005 + depends_on: + - swh-vault-db + - swh-objstorage + - swh-storage + - swh-scheduler-api + entrypoint: /entrypoint.sh + volumes: + - "./conf/vault-api.yml:/vault-api.yml:ro" + - "./services/swh-vault/entrypoint.sh:/entrypoint.sh:ro" + + swh-vault-worker: + image: swh/stack + build: ./ + command: worker + environment: + SWH_CONFIG_FILENAME: /cooker.yml + depends_on: + - swh-vault-api + - swh-storage + entrypoint: /entrypoint.sh + volumes: + - "./conf/vault-worker.yml:/cooker.yml:ro" + - "./services/swh-vault/entrypoint.sh:/entrypoint.sh:ro" + + +# Lister Celery workers + + swh-listers-db: + image: postgres:11 + env_file: + - ./env/listers-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-lister: + image: swh/stack + build: ./ + env_file: + - ./env/listers-db.env + - ./env/listers.env + user: swh + environment: + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: 9125 + SWH_WORKER_INSTANCE: listers + SWH_CONFIG_FILENAME: /lister.yml + depends_on: + - swh-listers-db + - swh-scheduler-api + - swh-scheduler-runner + - swh-storage + - amqp + entrypoint: /entrypoint.sh + volumes: + - "./conf/lister.yml:/lister.yml:ro" + - "./services/swh-listers-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Loader Celery workers + + swh-loader: + image: swh/stack + build: ./ + env_file: + - ./env/listers.env + user: swh + environment: + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: 9125 + SWH_WORKER_INSTANCE: loader + SWH_CONFIG_FILENAME: /loader.yml + entrypoint: /entrypoint.sh + depends_on: + - swh-storage + - swh-scheduler-api + - amqp + volumes: + - "./conf/loader.yml:/loader.yml:ro" + - "./services/swh-loaders-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Indexer Celery workers + + swh-indexer: + image: swh/stack + build: ./ + user: swh + env_file: + - ./env/indexers-db.env + - ./env/indexers.env + environment: + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: 9125 + entrypoint: /entrypoint.sh + depends_on: + - swh-scheduler-runner + - swh-idx-storage + - swh-storage + - swh-objstorage + - amqp + volumes: + - "./conf/indexer.yml:/indexer.yml:ro" + - "./services/swh-indexer-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Journal related + + swh-indexer-journal-client: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + depends_on: + - kafka + - swh-storage + - swh-scheduler-api + volumes: + - "./conf/indexer_journal_client.yml:/etc/softwareheritage/indexer/journal_client.yml:ro" + - "./services/swh-indexer-journal-client/entrypoint.sh:/entrypoint.sh:ro" diff --git a/docker/docs/Makefile b/docker/docs/Makefile new file mode 100644 index 0000000..c30c50a --- /dev/null +++ b/docker/docs/Makefile @@ -0,0 +1 @@ +include ../../swh-docs/Makefile.sphinx diff --git a/docker/env/cassandra.env b/docker/env/cassandra.env new file mode 100644 index 0000000..b04b233 --- /dev/null +++ b/docker/env/cassandra.env @@ -0,0 +1,5 @@ +MAX_HEAP_SIZE=1G +HEAP_NEWSIZE=100M +LOCAL_JMX=no +JVM_EXTRA_OPTS=-Dcom.sun.management.jmxremote.authenticate=false + diff --git a/docker/env/deposit-db.env b/docker/env/deposit-db.env new file mode 100644 index 0000000..0a4973d --- /dev/null +++ b/docker/env/deposit-db.env @@ -0,0 +1,5 @@ +PGHOST=swh-deposit-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword 
+POSTGRES_DB=swh-deposit + diff --git a/docker/env/elasticsearch.env b/docker/env/elasticsearch.env new file mode 100644 index 0000000..3392cf9 --- /dev/null +++ b/docker/env/elasticsearch.env @@ -0,0 +1 @@ +discovery.type=single-node diff --git a/docker/env/indexers-db.env b/docker/env/indexers-db.env new file mode 100644 index 0000000..c748508 --- /dev/null +++ b/docker/env/indexers-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-idx-storage-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-indexers \ No newline at end of file diff --git a/docker/env/indexers.env b/docker/env/indexers.env new file mode 100644 index 0000000..5db2f4b --- /dev/null +++ b/docker/env/indexers.env @@ -0,0 +1,5 @@ +CONCURRENCY=4 +MAX_TASKS_PER_CHILD=10 +LOGLEVEL=DEBUG +SWH_WORKER_INSTANCE=indexer +SWH_CONFIG_FILENAME=/indexer.yml diff --git a/docker/env/kafka.env b/docker/env/kafka.env new file mode 100644 index 0000000..1bc0a97 --- /dev/null +++ b/docker/env/kafka.env @@ -0,0 +1,9 @@ +KAFKA_ADVERTISED_HOST_NAME=kafka +KAFKA_ADVERTISED_PORT=9092 +KAFKA_PORT=9092 +KAFKA_LISTENERS=PLAINTEXT://:9092 +KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 +KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 +KAFKA_JMX_OPTS=-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=kafka -Dcom.sun.management.jmxremote.rmi.port=1099 +JMX_PORT=1099 +LOG4J_LOGGER_KAFKA_AUTHORIZER_LOGGER=DEBUG, authorizerAppender diff --git a/docker/env/listers-db.env b/docker/env/listers-db.env new file mode 100644 index 0000000..5146abc --- /dev/null +++ b/docker/env/listers-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-listers-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-listers diff --git a/docker/env/listers.env b/docker/env/listers.env new file mode 100644 index 0000000..0f02e61 --- /dev/null +++ b/docker/env/listers.env @@ -0,0 +1,3 @@ +CONCURRENCY=1 +MAX_TASKS_PER_CHILD=10 +LOGLEVEL=DEBUG diff --git a/docker/env/scheduler-db.env b/docker/env/scheduler-db.env new file mode 100644 index 0000000..8e1de99 --- /dev/null +++ b/docker/env/scheduler-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-scheduler-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-scheduler diff --git a/docker/env/scheduler.env b/docker/env/scheduler.env new file mode 100644 index 0000000..8650403 --- /dev/null +++ b/docker/env/scheduler.env @@ -0,0 +1,3 @@ +SWH_WORKER_INSTANCE=scheduler +LOGLEVEL=INFO +CELERY_BROKER_URL=amqp://amqp// diff --git a/docker/env/storage-db-replica.env b/docker/env/storage-db-replica.env new file mode 100644 index 0000000..c713f3a --- /dev/null +++ b/docker/env/storage-db-replica.env @@ -0,0 +1,4 @@ +PGHOST=swh-storage-replica-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-storage-replica diff --git a/docker/env/storage-db.env b/docker/env/storage-db.env new file mode 100644 index 0000000..65da09b --- /dev/null +++ b/docker/env/storage-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-storage-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-storage diff --git a/docker/env/storage.env b/docker/env/storage.env new file mode 100644 index 0000000..f802fd4 --- /dev/null +++ b/docker/env/storage.env @@ -0,0 +1,5 @@ +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-storage +PGHOST=swh-storage-db +PGUSER=postgres +SWH_CONFIG_FILENAME=/storage.yml \ No newline at end of file diff --git a/docker/env/vault-db.env b/docker/env/vault-db.env new file mode 100644 index 0000000..2adadcf --- /dev/null +++ 
b/docker/env/vault-db.env @@ -0,0 +1,4 @@ +POSTGRES_DB=swh-vault +POSTGRES_PASSWORD=testpassword +PGUSER=postgres +PGHOST=swh-vault-db diff --git a/docker/env/vault.env b/docker/env/vault.env new file mode 100644 index 0000000..2adadcf --- /dev/null +++ b/docker/env/vault.env @@ -0,0 +1,4 @@ +POSTGRES_DB=swh-vault +POSTGRES_PASSWORD=testpassword +PGUSER=postgres +PGHOST=swh-vault-db diff --git a/docker/services/cassandra/swh_entrypoint.sh b/docker/services/cassandra/swh_entrypoint.sh new file mode 100755 index 0000000..8927578 --- /dev/null +++ b/docker/services/cassandra/swh_entrypoint.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# /cassandra.yaml is provided by docker-compose via a bind-mount, but +# we need to copy it because the official entrypoint (docker-entrypoint.sh) +# modifies it. +cp /cassandra.yaml /etc/cassandra/cassandra.yaml +exec docker-entrypoint.sh diff --git a/docker/services/swh-deposit/entrypoint.sh b/docker/services/swh-deposit/entrypoint.sh new file mode 100755 index 0000000..8b6994f --- /dev/null +++ b/docker/services/swh-deposit/entrypoint.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -ex + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +if [ "$1" = 'shell' ] ; then + exec bash -i +else + + wait_pgsql + + echo "Migrating db" + django-admin migrate --settings=${DJANGO_SETTINGS_MODULE} + + swh-deposit admin user exists test || \ + swh-deposit admin user create \ + --username test \ + --password test \ + --provider-url https://softwareheritage.org \ + --domain softwareheritage.org + + echo "starting swh-deposit server" + exec gunicorn --bind 0.0.0.0:5006 \ + --reload \ + --threads 2 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'django.core.wsgi:get_wsgi_application()' +fi diff --git a/docker/services/swh-indexer-journal-client/entrypoint.sh b/docker/services/swh-indexer-journal-client/entrypoint.sh new file mode 100755 index 0000000..18d2811 --- /dev/null +++ b/docker/services/swh-indexer-journal-client/entrypoint.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo "Starting swh-indexer-journal client" + exec wait-for-it kafka:9092 -s --timeout=0 -- swh indexer --config-file /etc/softwareheritage/indexer/journal_client.yml journal-client + ;; +esac + diff --git a/docker/services/swh-indexer-storage/entrypoint.sh b/docker/services/swh-indexer-storage/entrypoint.sh new file mode 100755 index 0000000..ec5d92c --- /dev/null +++ b/docker/services/swh-indexer-storage/entrypoint.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + + wait_pgsql + + echo Setup the database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} indexer + + echo Starting the swh-indexer-storage API server + exec gunicorn --bind 0.0.0.0:5007 \ + --reload \ + --threads 2 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'swh.indexer.storage.api.server:make_app_from_configfile()' + ;; +esac diff --git a/docker/services/swh-indexer-worker/entrypoint.sh b/docker/services/swh-indexer-worker/entrypoint.sh new file mode 100755 index 0000000..dea17ef --- /dev/null +++ b/docker/services/swh-indexer-worker/entrypoint.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source 
/srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo Waiting for RabbitMQ to start + wait-for-it amqp:5672 -s --timeout=0 + + wait_pgsql + + echo Starting swh-indexer worker + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD} \ + -Ofair --loglevel=${LOGLEVEL} --without-gossip \ + --without-mingle \ + --heartbeat-interval 10 \ + --hostname "${SWH_WORKER_INSTANCE}@%h" + ;; +esac diff --git a/docker/services/swh-journal-backfiller/entrypoint.sh b/docker/services/swh-journal-backfiller/entrypoint.sh new file mode 100755 index 0000000..d24bf1b --- /dev/null +++ b/docker/services/swh-journal-backfiller/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + echo "Starting swh-journal-backfiller" + exec wait-for-it kafka:9092 -s --timeout=0 -- swh-journal backfiller $@ + ;; +esac diff --git a/docker/services/swh-listers-worker/entrypoint.sh b/docker/services/swh-listers-worker/entrypoint.sh new file mode 100755 index 0000000..3ab19c6 --- /dev/null +++ b/docker/services/swh-listers-worker/entrypoint.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + + echo Setup ${POSTGRES_DB} database for ${SWH_WORKER_INSTANCE} + if psql -lqt | cut -d \| -f 1 | grep -qw ${POSTGRES_DB}; then + echo Database already exists, nothing to do + else + echo Creating database + createdb ${POSTGRES_DB} + fi + + echo Initialize database + swh lister -C ${SWH_CONFIG_FILENAME} db-init + + echo Register task types in scheduler database + swh scheduler -C ${SWH_CONFIG_FILENAME} task-type register + + echo Waiting for RabbitMQ to start + wait-for-it amqp:5672 -s --timeout=0 + + echo Starting the swh-lister Celery worker for ${SWH_WORKER_INSTANCE} + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD} \ + -Ofair --loglevel=${LOGLEVEL} --without-gossip \ + --without-mingle \ + --heartbeat-interval 10 \ + --hostname "${SWH_WORKER_INSTANCE}@%h" + ;; +esac diff --git a/docker/services/swh-loaders-worker/entrypoint.sh b/docker/services/swh-loaders-worker/entrypoint.sh new file mode 100755 index 0000000..d230f49 --- /dev/null +++ b/docker/services/swh-loaders-worker/entrypoint.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo Waiting for RabbitMQ to start + wait-for-it amqp:5672 -s --timeout=0 + + echo Register task types in scheduler database + swh scheduler -C ${SWH_CONFIG_FILENAME} task-type register + + echo Starting the swh-loader Celery worker for ${SWH_WORKER_INSTANCE} + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD} \ + -Ofair --loglevel=${LOGLEVEL} --without-gossip \ + --without-mingle \ + --heartbeat-interval 10 \ + --hostname "${SWH_WORKER_INSTANCE}@%h" + ;; +esac diff --git 
a/docker/services/swh-objstorage/entrypoint.sh b/docker/services/swh-objstorage/entrypoint.sh new file mode 100755 index 0000000..30fcefc --- /dev/null +++ b/docker/services/swh-objstorage/entrypoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +echo Installed Python packages: +pip list + +if [ "$1" = 'shell' ] ; then + exec bash -i +else + echo Starting the swh-objstorage API server + exec gunicorn --bind 0.0.0.0:5003 \ + --worker-class aiohttp.worker.GunicornWebWorker \ + --log-level DEBUG \ + --threads 4 \ + --workers 2 \ + --reload \ + --timeout 3600 \ + 'swh.objstorage.api.server:make_app_from_configfile()' + +fi diff --git a/docker/services/swh-scheduler-api/entrypoint.sh b/docker/services/swh-scheduler-api/entrypoint.sh new file mode 100755 index 0000000..b0a3381 --- /dev/null +++ b/docker/services/swh-scheduler-api/entrypoint.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + "swh-scheduler") + exec $@ + ;; + *) + wait_pgsql + + echo Setup the swh-scheduler API database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} scheduler + + echo Starting the swh-scheduler API server + exec gunicorn --bind 0.0.0.0:5008 \ + --log-level DEBUG \ + --threads 2 \ + --workers 2 \ + --reload \ + --timeout 3600 \ + 'swh.scheduler.api.server:make_app_from_configfile()' + +esac diff --git a/docker/services/swh-scheduler-worker/entrypoint.sh b/docker/services/swh-scheduler-worker/entrypoint.sh new file mode 100755 index 0000000..1ed0b2b --- /dev/null +++ b/docker/services/swh-scheduler-worker/entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + + echo "Starting the swh-scheduler $1" + exec wait-for-it amqp:5672 -s --timeout=0 -- swh --log-level ${LOGLEVEL} scheduler -C /scheduler.yml $@ + ;; +esac diff --git a/docker/services/swh-search-journal-client-objects/entrypoint.sh b/docker/services/swh-search-journal-client-objects/entrypoint.sh new file mode 100755 index 0000000..aa4ecb9 --- /dev/null +++ b/docker/services/swh-search-journal-client-objects/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo "Starting the swh-search journal client" + exec wait-for-it kafka:9092 -s --timeout=0 -- \ + wait-for-it swh-search:5010 -s --timeout=0 -- \ + swh --log-level DEBUG search --config-file /etc/softwareheritage/search/journal_client_objects.yml journal-client objects + ;; +esac + diff --git a/docker/services/swh-search/entrypoint.sh b/docker/services/swh-search/entrypoint.sh new file mode 100755 index 0000000..2c44fb7 --- /dev/null +++ b/docker/services/swh-search/entrypoint.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo Starting the swh-search API server + wait-for-it elasticsearch:9200 -s --timeout=0 + echo "Waiting for ElasticSearch cluster to be up" + cat << EOF | python3 +import elasticsearch +es = elasticsearch.Elasticsearch(['elasticsearch:9200']) +es.cluster.health(wait_for_status='yellow') +EOF + echo "ElasticSearch cluster is up" + swh
search -C $SWH_CONFIG_FILENAME initialize + exec gunicorn --bind 0.0.0.0:5010 \ + --reload \ + --threads 4 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'swh.search.api.server:make_app_from_configfile()' + ;; +esac diff --git a/docker/services/swh-storage-replayer/entrypoint.sh b/docker/services/swh-storage-replayer/entrypoint.sh new file mode 100755 index 0000000..382d52c --- /dev/null +++ b/docker/services/swh-storage-replayer/entrypoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + + echo Setup the database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} storage + + echo Starting the swh-storage Kafka storage replayer + exec swh-journal replay \ + --broker kafka \ + --prefix swh.journal.objects \ + --consumer-id swh.storage.replica + ;; +esac diff --git a/docker/services/swh-storage/entrypoint.sh b/docker/services/swh-storage/entrypoint.sh new file mode 100755 index 0000000..359880c --- /dev/null +++ b/docker/services/swh-storage/entrypoint.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +if [ "$STORAGE_BACKEND" = "postgresql" ]; then + source /srv/softwareheritage/utils/pgsql.sh + setup_pgsql + +elif [ "$STORAGE_BACKEND" = "cassandra" ]; then + echo Waiting for Cassandra to start + wait-for-it ${CASSANDRA_SEED}:9042 -s --timeout=0 + echo Creating keyspace + cat << EOF | python3 +from swh.storage.cassandra import create_keyspace +create_keyspace(['cassandra-seed'], 'swh') +EOF + +fi + +case "$1" in + "shell") + exec bash -i + ;; + *) + if [ "$STORAGE_BACKEND" = "postgresql" ]; then + wait_pgsql + + echo Setup the database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} storage + fi + + echo Starting the swh-storage API server + exec gunicorn --bind 0.0.0.0:5002 \ + --reload \ + --threads 4 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'swh.storage.api.server:make_app_from_configfile()' + ;; +esac diff --git a/docker/services/swh-vault/entrypoint.sh b/docker/services/swh-vault/entrypoint.sh new file mode 100755 index 0000000..2734bfc --- /dev/null +++ b/docker/services/swh-vault/entrypoint.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + "worker") + echo Starting the swh-vault Celery worker + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY:-1} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD:-10} \ + -Ofair --loglevel=${LOGLEVEL:-INFO} --without-gossip \ + --without-mingle --without-heartbeat \ + --hostname "vault@%h" + ;; + "server") + # ensure the pathslicing root dir for the cache exists + mkdir -p /srv/softwareheritage/vault + + wait_pgsql + + echo Setup the swh-vault API database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init vault \ + --db-name ${POSTGRES_DB} + + echo Starting the swh-vault API server + exec swh vault rpc-serve -C ${SWH_CONFIG_FILENAME} +esac diff --git a/docker/services/swh-web/entrypoint.sh b/docker/services/swh-web/entrypoint.sh new file mode 100755 index 0000000..d33a904 --- /dev/null +++ b/docker/services/swh-web/entrypoint.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +create_admin_script="
+from django.contrib.auth import get_user_model; + +username = 'admin'; +password = 'admin'; +email = 'admin@swh-web.org'; + +User = get_user_model(); + +if not User.objects.filter(username = username).exists(): + User.objects.create_superuser(username, email, password); +" + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo "Migrating db using ${DJANGO_SETTINGS_MODULE}" + django-admin migrate --settings=${DJANGO_SETTINGS_MODULE} + + echo "Creating admin user" + echo "$create_admin_script" | python3 -m swh.web.manage shell + + echo "starting the swh-web server" + exec gunicorn --bind 0.0.0.0:5004 \ + --threads 2 \ + --workers 2 \ + --timeout 3600 \ + 'django.core.wsgi:get_wsgi_application()' +esac diff --git a/docker/tests/run_tests.sh b/docker/tests/run_tests.sh new file mode 100755 index 0000000..effa51c --- /dev/null +++ b/docker/tests/run_tests.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +# Main script to run high level tests on the Software Heritage stack + +# Use a temporary directory as working directory +WORKDIR=/tmp/swh-docker-dev_tests +# Create it if it does not exist +mkdir $WORKDIR 2>/dev/null +# Ensure it is empty before running the tests +rm -rf $WORKDIR/* + +# We want the script to exit at the first encountered error +set -e + +# Get test scripts directory +TEST_SCRIPTS_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) + +# Set the docker-compose.yml file to use +export COMPOSE_FILE=$TEST_SCRIPTS_DIR/../docker-compose.yml + +# Useful global variables +SWH_WEB_API_BASEURL="http://localhost:5004/api/1" +CURRENT_TEST_SCRIPT="" + +# Colored output related variables and functions (only if stdout is a terminal) +if test -t 1; then + GREEN='\033[0;32m' + RED='\033[0;31m' + NC='\033[0m' +else + DOCO_OPTIONS='--no-ansi' +fi + +# Remove previously dumped service logs file if any +rm -f $TEST_SCRIPTS_DIR/swh-docker-compose.logs + +function colored_output { + local msg="$2" + if [ "$CURRENT_TEST_SCRIPT" != "" ]; then + msg="[$CURRENT_TEST_SCRIPT] $msg" + fi + echo -e "${1}${msg}${NC}" +} + +function status_message { + colored_output ${GREEN} "$1" +} + +function error_message { + colored_output ${RED} "$1" +} + +function dump_docker_logs { + error_message "Dumping logs for all services in file $TEST_SCRIPTS_DIR/swh-docker-compose.logs" + docker-compose logs > $TEST_SCRIPTS_DIR/swh-docker-compose.logs +} + +# Exit handler that will get called when this script terminates +function finish { + if [ $? 
-ne 0 ] && [ "$CURRENT_TEST_SCRIPT" != "" ]; then + local SCRIPT_NAME=$CURRENT_TEST_SCRIPT + CURRENT_TEST_SCRIPT="" + error_message "An error occurred when running test script ${SCRIPT_NAME}" + dump_docker_logs + fi + docker-compose $DOCO_OPTIONS down + rm -rf $WORKDIR +} +trap finish EXIT + +# Docker-compose events listener that will be executed in the background +# Parameters: +# $1: PID of parent process +function listen_docker_events { + docker-compose $DOCO_OPTIONS events | while read event + do + service=$(echo $event | cut -d " " -f7 | sed 's/^name=swh-docker-dev_\(.*\)_1)/\1/') + event_type=$(echo $event | cut -d ' ' -f4) + # "docker-compose down" has been called, exiting this child process + if [ "$event_type" = "kill" ] ; then + exit + # a swh service crashed, sending signal to parent process to exit with error + elif [ "$event_type" = "die" ]; then + if [[ "$service" =~ ^swh.* ]]; then + exit_code=$(docker-compose ps | grep $service | awk '{print $4}') + if [ "$exit_code" != "0" ]; then + error_message "Service $service died unexpectedly, exiting" + dump_docker_logs + kill -s SIGUSR1 $1; exit + fi + fi + fi + done +} +trap "exit 1" SIGUSR1 + +declare -A SERVICE_LOGS_NB_LINES_READ + +# Function to wait for a specific string to appear in the logs of a +# given docker-compose service. +# When called multiple times on the same service, only the newly outputted +# logs since the last call will be processed. +# Parameters: +# $1: a timeout value in seconds to stop waiting and exit with error +# $2: docker-compose service name +# $3: the string to look for in the produced logs +function wait_for_service_output { + local nb_lines_to_skip=0 + if [[ -v "SERVICE_LOGS_NB_LINES_READ[$2]" ]]; then + let nb_lines_to_skip=${SERVICE_LOGS_NB_LINES_READ[$2]}+1 + fi + SECONDS=0 + local service_logs=$(docker-compose $DOCO_OPTIONS logs $2 | tail -n +$nb_lines_to_skip) + until echo -ne "$service_logs" | grep -m 1 "$3" >/dev/null ; do + sleep 1; + if (( $SECONDS > $1 )); then + error_message "Could not find pattern \"$3\" in $2 service logs after $1 seconds" + exit 1 + fi + let nb_lines_to_skip+=$(echo -ne "$service_logs" | wc -l) + service_logs=$(docker-compose $DOCO_OPTIONS logs $2 | tail -n +$nb_lines_to_skip) + done + let nb_lines_to_skip+=$(echo -ne "$service_logs" | wc -l) + SERVICE_LOGS_NB_LINES_READ[$2]=$nb_lines_to_skip +} + +# Function to make an HTTP request and get its response. +# It should be used the following way: +# response=$(http_request <method> <url>) +# Parameters: +# $1: http method name (GET, POST, ...) +# $2: request url +function http_request { + local response=$(curl -sS -X $1 $2) + echo $response +} + +# Function to check that an HTTP request ends up with no errors. +# If the HTTP response code is different from 200, an error will +# be raised and the main script will terminate +# Parameters: +# $1: http method name (GET, POST, ...) +# $2: request url +function http_request_check { + curl -sSf -X $1 $2 > /dev/null +} + +# Function to run the content of a script dedicated to test a specific +# part of the Software Heritage stack.
+function run_test_script { + local SCRIPT_NAME=$(basename $1) + status_message "Executing test script $SCRIPT_NAME" + CURRENT_TEST_SCRIPT=$SCRIPT_NAME + source $1 +} + +# Move to work directory +cd $WORKDIR + +# Start the docker-compose event handler as a background process +status_message "Starting docker-compose events listener" +listen_docker_events $$ & + +# Start the docker-compose environment including the full Software Heritage stack +status_message "Starting swh docker-compose environment" +docker-compose $DOCO_OPTIONS up -d + +# Ensure all swh services are up before running tests +status_message "Waiting for swh services to be up" +docker-compose $DOCO_OPTIONS exec -T swh-storage wait-for-it localhost:5002 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-objstorage wait-for-it localhost:5003 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-web wait-for-it localhost:5004 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-vault-api wait-for-it localhost:5005 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-deposit wait-for-it localhost:5006 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-idx-storage wait-for-it localhost:5007 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-scheduler-api wait-for-it localhost:5008 -s --timeout=0 + +# Execute test scripts +for test_script in $TEST_SCRIPTS_DIR/test_*; do + run_test_script ${test_script} + CURRENT_TEST_SCRIPT="" +done diff --git a/docker/tests/test_01_loader_git.sh b/docker/tests/test_01_loader_git.sh new file mode 100644 index 0000000..e907d0f --- /dev/null +++ b/docker/tests/test_01_loader_git.sh @@ -0,0 +1,70 @@ +#!/bin/bash +shopt -s nullglob extglob + +TEST_GIT_REPO_NAME="swh-loader-core" +TEST_GIT_REPO_URL="https://forge.softwareheritage.org/source/${TEST_GIT_REPO_NAME}.git" + +status_message "Scheduling the loading of the git repository located at ${TEST_GIT_REPO_URL}" + +docker-compose $DOCO_OPTIONS exec -T swh-scheduler-api swh scheduler task add load-git repo_url=$TEST_GIT_REPO_URL + +status_message "Waiting for the git loading task to complete" + +wait_for_service_output 300 swh-loader "swh.loader.git.tasks.UpdateGitRepository.*succeeded" + +status_message "The loading task has been successfully executed" + +status_message "Getting all git objects contained in the repository" +git clone $TEST_GIT_REPO_URL +cd $TEST_GIT_REPO_NAME +cd "$(git rev-parse --git-path objects)" +for p in pack/pack-*([0-9a-f]).idx ; do + git show-index < $p | cut -f 2 -d ' ' > $WORKDIR/git_objects +done +for o in [0-9a-f][0-9a-f]/*([0-9a-f]) ; do + echo ${o/\/} >> $WORKDIR/git_objects +done + +declare -ga CONTENTS +declare -ga DIRECTORIES +declare -ga REVISIONS +declare -ga RELEASES + +while IFS='' read -r object || [[ -n "$object" ]]; do + object_type=$(git cat-file -t $object) + if [ "$object_type" = "blob" ]; then + CONTENTS+=($object) + elif [ "$object_type" = "tree" ]; then + DIRECTORIES+=($object) + elif [ "$object_type" = "commit" ]; then + REVISIONS+=($object) + elif [ "$object_type" = "tag" ]; then + RELEASES+=($object) + fi +done < $WORKDIR/git_objects + +status_message "Checking all git objects have been successfully loaded into the archive" + +status_message "Checking contents" +for content in "${CONTENTS[@]}"; do + http_request_check GET ${SWH_WEB_API_BASEURL}/content/sha1_git:$content/ +done +status_message "All contents have been successfully loaded into the archive" + +status_message "Checking directories" +for directory in "${DIRECTORIES[@]}"; do + http_request_check GET 
${SWH_WEB_API_BASEURL}/directory/$directory/ +done +status_message "All directories have been successfully loaded into the archive" + +status_message "Checking revisions" +for revision in "${REVISIONS[@]}"; do + http_request_check GET ${SWH_WEB_API_BASEURL}/revision/$revision/ +done +status_message "All revisions have been successfully loaded into the archive" + +status_message "Checking releases" +for release in "${RELEASES[@]}"; do + http_request_check GET ${SWH_WEB_API_BASEURL}/release/$release/ +done +status_message "All releases have been successfully loaded into the archive" diff --git a/docker/tests/test_02_vault.sh b/docker/tests/test_02_vault.sh new file mode 100644 index 0000000..0f4ee25 --- /dev/null +++ b/docker/tests/test_02_vault.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +directory=${DIRECTORIES[$RANDOM % ${#DIRECTORIES[@]}]} +revision=${REVISIONS[$RANDOM % ${#REVISIONS[@]}]} + +status_message "Requesting the vault to cook a random directory stored in the archive" +http_request_check POST ${SWH_WEB_API_BASEURL}/vault/directory/$directory/ + +status_message "Waiting for the directory cooking task to complete" +wait_for_service_output 300 swh-vault-worker "swh.vault.cooking_tasks.SWHCookingTask.*succeeded" +status_message "The directory cooking task has been successfully executed" + +status_message "Checking that the cooked directory tarball can be downloaded" +http_request_check GET ${SWH_WEB_API_BASEURL}/vault/directory/$directory/raw/ +status_message "The cooked directory tarball is available for download" + +status_message "Requesting the vault to cook a random revision stored in the archive" +http_request_check POST ${SWH_WEB_API_BASEURL}/vault/revision/$revision/gitfast/ + +status_message "Waiting for the revision cooking task to complete" +wait_for_service_output 300 swh-vault-worker "swh.vault.cooking_tasks.SWHCookingTask.*succeeded" +status_message "The revision cooking task has been successfully executed" + +status_message "Checking that the cooked revision tarball can be downloaded" +http_request_check GET ${SWH_WEB_API_BASEURL}/vault/revision/$revision/gitfast/raw/ +status_message "The cooked revision tarball is available for download" diff --git a/docker/utils/pgsql.sh b/docker/utils/pgsql.sh new file mode 100644 index 0000000..6e30e83 --- /dev/null +++ b/docker/utils/pgsql.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +setup_pgsql () { + echo "${PGHOST}:5432:postgres:${PGUSER}:${POSTGRES_PASSWORD}" > ~/.pgpass + echo "${PGHOST}:5432:${POSTGRES_DB}:${PGUSER}:${POSTGRES_PASSWORD}" >> ~/.pgpass + cat > ~/.pg_service.conf <<EOF +[swh] +dbname=${POSTGRES_DB} +host=${PGHOST} +port=5432 +user=${PGUSER} +EOF + chmod 0600 ~/.pgpass +} + +wait_pgsql () { + echo Waiting for postgresql to start + wait-for-it ${PGHOST}:5432 -s --timeout=0 + until psql -c "select 1" > /dev/null 2> /dev/null; do sleep 1; done +} \ No newline at end of file diff --git a/docker/utils/pyutils.sh b/docker/utils/pyutils.sh new file mode 100755 index 0000000..289eb40 --- /dev/null +++ b/docker/utils/pyutils.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +setup_pip () { + echo Using pip from $(which pip) + + if [[ -d /src ]] ; then + for srcrepo in /src/swh-* ; do + pip install $srcrepo + done + fi + + echo Installed Python packages: + pip list +}