diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -626,3 +626,36 @@
 ```
 (swh) ~/swh-environment$ swh-scheduler task respawn 1
 ```
+
+
+## Starting a kafka-powered replica of the storage
+
+This repo comes with an optional `docker-compose.storage-replica.yml`
+docker compose file that can be used to test the kafka-powered replication
+mechanism for the main storage.
+
+This can be used like:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose -f docker-compose.yml -f docker-compose.storage-replica.yml up -d
+[...]
+```
+
+Compared to the original compose file, this will:
+
+- override the swh-storage service to activate the kafka direct writer
+  on swh.journal.objects prefixed topics using the swh.storage.master ID,
+- override the swh-web service to make it use the replica instead of the
+  master storage,
+- start a db for the replica,
+- start a storage service based on this db,
+- start a replayer service that runs the process that listens to kafka to
+  keep the replica in sync.
+
+When using it, you will have a setup in which the master storage is used by
+workers and most other services, whereas the storage replica will be used
+by the web application and should be kept in sync with the master storage
+by kafka.
+
+
+Note that the object storage is not replicated here, only the graph storage.
diff --git a/conf/storage.yml b/conf/storage-replica.yml
copy from conf/storage.yml
copy to conf/storage-replica.yml
--- a/conf/storage.yml
+++ b/conf/storage-replica.yml
@@ -1,7 +1,7 @@
 storage:
   cls: local
   args:
-    db: postgresql:///?service=swh-storage
+    db: postgresql:///?service=swh-storage-replica
     objstorage:
       cls: remote
       args:
diff --git a/conf/storage.yml b/conf/storage.yml
--- a/conf/storage.yml
+++ b/conf/storage.yml
@@ -6,3 +6,10 @@
       cls: remote
       args:
         url: http://swh-objstorage:5003/
+    journal_writer:
+      cls: kafka
+      args:
+        brokers:
+          - kafka
+        prefix: swh.journal.objects
+        client_id: swh.storage.master
diff --git a/conf/web-replica.yml b/conf/web-replica.yml
new file mode 100644
--- /dev/null
+++ b/conf/web-replica.yml
@@ -0,0 +1,43 @@
+# Configuration mounted into the swh-web service by
+# docker-compose.storage-replica.yml: the storage points at the replica.
+storage:
+  cls: remote
+  args:
+    url: http://swh-storage-replica:5002/
+    timeout: 1
+
+objstorage:
+  cls: remote
+  args:
+    url: http://swh-objstorage:5003/
+
+indexer_storage:
+  cls: remote
+  args:
+    url: http://swh-idx-storage:5007/
+
+scheduler:
+  cls: remote
+  args:
+    url: http://swh-scheduler-api:5008/
+
+vault:
+  cls: remote
+  args:
+    url: http://swh-vault-api:5005/
+
+deposit:
+  private_api_url: https://swh-deposit:5006/1/private/
+  private_api_user: swhworker
+  private_api_password: ''
+
+allowed_hosts:
+  - "*"
+
+debug: true
+
+grecaptcha:
+  activated: false
+  site_key: ''
+
+serve_assets: true
diff --git a/docker-compose.storage-replica.yml b/docker-compose.storage-replica.yml
new file mode 100644
--- /dev/null
+++ b/docker-compose.storage-replica.yml
@@ -0,0 +1,54 @@
+version: '2'
+
+services:
+
+  # override web app to use the replica
+  swh-web:
+    environment:
+      SWH_CONFIG_FILENAME: /web-replica.yml
+    volumes:
+      - "./conf/web-replica.yml:/web-replica.yml:ro"
+    depends_on:
+      - swh-storage-replica
+
+  # create a dedicated db for the replica
+  swh-storage-replica-db:
+    image: postgres:11
+    env_file:
+      - ./env/storage-db-replica.env
+    environment:
+      # unset PGHOST as db service crashes otherwise
+      PGHOST:
+
+  # and an RPC server
+  swh-storage-replica:
+    build: ./
+    image: swh/stack
+    depends_on:
+      - swh-storage-replica-db
+      - swh-objstorage
+    env_file:
+      - ./env/storage-db-replica.env
+    environment:
+      SWH_CONFIG_FILENAME: /storage-replica.yml
+    volumes:
+      - "./conf/storage-replica.yml:/storage-replica.yml:ro"
+      - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro"
+
+  # and the background process that keeps the replica in sync with the
+  # main graph
+  swh-storage-replica-replayer:
+    build: ./
+    image: swh/stack
+    depends_on:
+      - swh-storage-replica-db
+      - swh-objstorage
+      # broker the replayer consumes from (--broker in its entrypoint)
+      - kafka
+    env_file:
+      - ./env/storage-db-replica.env
+    environment:
+      SWH_CONFIG_FILENAME: /storage-replica.yml
+    volumes:
+      - "./conf/storage-replica.yml:/storage-replica.yml:ro"
+      - "./services/swh-storage-replayer/entrypoint.sh:/entrypoint.sh:ro"
diff --git a/docker-compose.yml b/docker-compose.yml
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -149,6 +149,7 @@
     depends_on:
       - swh-storage-db
       - swh-objstorage
+      - kafka
     env_file:
       - ./env/storage-db.env
     environment:
diff --git a/env/storage-db-replica.env b/env/storage-db-replica.env
new file mode 100644
--- /dev/null
+++ b/env/storage-db-replica.env
@@ -0,0 +1,4 @@
+PGHOST=swh-storage-replica-db
+PGUSER=postgres
+POSTGRES_PASSWORD=testpassword
+POSTGRES_DB=swh-storage-replica
diff --git a/services/swh-storage-replayer/entrypoint.sh b/services/swh-storage-replayer/entrypoint.sh
new file mode 100755
--- /dev/null
+++ b/services/swh-storage-replayer/entrypoint.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Entrypoint of the storage replayer container: set up the database named
+# by POSTGRES_DB, then exec the Kafka journal replayer.
+
+set -e
+
+# install (in editable mode) every swh package found under /src
+if [[ -d /src ]] ; then
+    for srcrepo in /src/swh-* ; do
+        pushd "$srcrepo"
+        pip install -e .
+        popd
+    done
+fi
+
+echo Installed Python packages:
+pip list
+
+source /swh-utils/pgsql.sh
+
+setup_pgsql
+
+case "$1" in
+    "shell")
+      exec bash -i
+      ;;
+    *)
+      wait_pgsql
+
+      echo Setup the database
+      PGPASSWORD="${POSTGRES_PASSWORD}" swh-db-init storage \
+          --db-name "${POSTGRES_DB}"
+
+      echo Starting the swh-storage Kafka storage replayer
+      exec swh-journal replay --broker kafka --prefix swh.journal.objects \
+          --consumer-id swh.storage.replica
+      ;;
+esac