diff --git a/mirror.yml b/mirror.yml index 2d56a62..58531d5 100644 --- a/mirror.yml +++ b/mirror.yml @@ -1,534 +1,534 @@ version: "3.8" services: memcache: # used by the web app image: memcached deploy: replicas: 1 networks: - - swhtest-mirror + - swh-mirror storage-db: # the main storage database image: postgres:13 deploy: # we want only one replica of this service in the whole cluster replicas: 1 # possible workaround to prevent dropped idle cnx (making pg pool fail to work after a while) endpoint_mode: dnsrr placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.volumes.storage-db == true networks: - - swhtest-mirror + - swh-mirror command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB'] environment: POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password POSTGRES_USER: swh POSTGRES_DB: # unset POSTGRES_DB: we're handling db creation ourselves in the backend volumes: - "storage-db:/var/lib/postgresql/data:rw,Z" secrets: - source: swh-mirror-storage-db-password target: postgres-password uid: '999' mode: 0400 web-db: # the database for the web application image: postgres:13 deploy: # we want only one replica of this service in the whole cluster replicas: 1 endpoint_mode: dnsrr placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.volumes.web-db == true networks: - - swhtest-mirror + - swh-mirror command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB'] environment: POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password POSTGRES_USER: swh POSTGRES_DB: swh-web volumes: - "web-db:/var/lib/postgresql/data:rw,Z" secrets: - source: swh-mirror-web-db-password target: postgres-password uid: '999' mode: 0400 web: # the web app; serves both the web navigation interface and the public web API image: softwareheritage/web:${SWH_IMAGE_TAG:-latest} networks: - - swhtest-mirror + - swh-mirror configs: - source: web target: /etc/softwareheritage/config.yml command: serve environment: PORT: "5004" PGHOST: web-db PGUSER: swh POSTGRES_DB: swh-web depends_on: - web-db - memcache secrets: - source: swh-mirror-web-db-password target: postgres-password mode: 0400 objstorage: # the swh-objstorage backend service; this example configuration uses a simple # filesystem-based pathslicing implementation of the swh-objstorage: see # https://docs.softwareheritage.org/devel/apidoc/swh.objstorage.backends.pathslicing.html image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: # needed to allow actual and dynamic load balancing endpoint_mode: dnsrr # a real life replicas value better be in the 16 to 64 range replicas: 1 placement: # note: if using a local volume, you need to pin the objstorage # instances on the node hosting the volume, eg. the manager, otherwise, # if using a remote/distributed objstorage backend (seaweedfs, cloud, # etc.) 
you want to remove this placement constraint constraints: - node.labels.org.softwareheritage.mirror.volumes.objstorage == true networks: - - swhtest-mirror + - swh-mirror volumes: - "objstorage:/srv/softwareheritage/objects:rw,Z" configs: - source: objstorage target: /etc/softwareheritage/config.yml env_file: - ./env/common-python.env environment: PORT: "5003" command: ["rpc-server", "objstorage"] storage: # the swh-storage backend service; using postgresql (storage-db) as backend image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: # needed to allow actual and dynammic load balancing endpoint_mode: dnsrr # a real life replicas value better be in the 16 to 64 range # however we recommend keeping 1 in this stack deploy file so that # an upgrade of the base image that comes with a database migration script # is upgraded in a consistent way replicas: 1 networks: - - swhtest-mirror + - swh-mirror configs: - source: storage target: /etc/softwareheritage/config.yml environment: PGHOST: storage-db PGUSER: swh POSTGRES_DB: swh-storage FLAVOR: mirror PORT: "5002" env_file: - ./env/common-python.env secrets: - source: swh-mirror-storage-db-password target: postgres-password mode: 0400 command: ["rpc-server", "storage"] depends_on: - storage-db nginx: image: nginx networks: - - swhtest-mirror + - swh-mirror configs: - source: nginx target: /etc/nginx/nginx.conf ports: - "5081:5081/tcp" deploy: mode: global prometheus: image: prom/prometheus networks: - - swhtest-mirror + - swh-mirror depends_on: - prometheus-statsd-exporter command: # Needed for the reverse-proxy - "--web.external-url=/prometheus" - "--config.file=/etc/prometheus/prometheus.yml" configs: - source: prometheus target: /etc/prometheus/prometheus.yml deploy: # we want only one replica of this service in the whole cluster replicas: 1 placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.monitoring == true prometheus-statsd-exporter: image: prom/statsd-exporter networks: - - swhtest-mirror + - swh-mirror command: - "--statsd.mapping-config=/etc/prometheus/statsd-mapping.yml" configs: - source: prometheus-statsd-exporter target: /etc/prometheus/statsd-mapping.yml deploy: # we want only one replica of this service in the whole cluster replicas: 1 placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.monitoring == true grafana: image: grafana/grafana networks: - - swhtest-mirror + - swh-mirror depends_on: - prometheus environment: GF_SERVER_ROOT_URL: http://localhost:5081/grafana configs: - source: grafana-provisioning-datasources-prometheus target: /etc/grafana/provisioning/datasources/prometheus.yaml - source: grafana-provisioning-dashboards-all target: /etc/grafana/provisioning/dashboards/all.yaml - source: grafana-dashboards-backend-stats target: /var/lib/grafana/dashboards/backend-stats.json - source: grafana-dashboards-content-replayer target: /var/lib/grafana/dashboards/content-replayer.json - source: grafana-dashboards-graph-replayer target: /var/lib/grafana/dashboards/graph-replayer.json deploy: # we want only one replica of this service in the whole cluster replicas: 1 placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.monitoring == true ## replayer services redis: image: redis:6.2.6 deploy: # we want only one replica of this service in the whole cluster replicas: 1 placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.volumes.redis == true networks: - - swhtest-mirror + - 
swh-mirror command: - redis-server - --save 60 1 - --loglevel warning volumes: - redis:/data graph-replayer: image: softwareheritage/replayer:${SWH_IMAGE_TAG:-latest} deploy: # do not start replayers by default once the remaining of the stack is # running as expected, bump this value; expected real-life values should # be something in the range [16, 64] (staging) or [16, 256] (production) # depending on your hardware capabilities; note that there is no need of # going above the number of partitions on the kafka cluster (so the 64 # and 254 upper limits depending on the execution environment). replicas: 0 restart_policy: condition: "none" networks: - - swhtest-mirror + - swh-mirror env_file: - ./env/common-python.env environment: STATSD_TAGS: 'role:graph-replayer,hostname:${HOSTNAME}' configs: - source: graph-replayer target: /etc/softwareheritage/config.yml command: - graph-replayer depends_on: - storage - redis content-replayer: image: softwareheritage/replayer:${SWH_IMAGE_TAG:-latest} deploy: # do not start replayers by default; see above replicas: 0 networks: - - swhtest-mirror + - swh-mirror env_file: - ./env/common-python.env environment: STATSD_TAGS: 'role:content-replayer,hostname:${HOSTNAME}' configs: - source: content-replayer target: /etc/softwareheritage/config.yml command: - content-replayer depends_on: - objstorage - redis ## secondary services amqp: image: rabbitmq:3.6-management networks: - - swhtest-mirror + - swh-mirror ports: - 5072:5672 ### vault services vault-db: # the database for the vault rpc server image: postgres:13 deploy: # we want only one replica of this service in the whole cluster replicas: 1 endpoint_mode: dnsrr placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.volumes.web-db == true networks: - - swhtest-mirror + - swh-mirror command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB'] environment: POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password POSTGRES_USER: swh POSTGRES_DB: swh-vault volumes: - "vault-db:/var/lib/postgresql/data:rw,Z" secrets: - source: swh-mirror-vault-db-password target: postgres-password uid: '999' mode: 0400 vault: image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: replicas: 1 networks: - - swhtest-mirror + - swh-mirror env_file: - ./env/common-python.env configs: - source: vault target: /etc/softwareheritage/config.yml environment: PGHOST: vault-db PGUSER: swh POSTGRES_DB: swh-vault PORT: "5005" LOG_LEVEL: DEBUG command: ["rpc-server", "vault"] secrets: - source: swh-mirror-vault-db-password target: postgres-password uid: '999' mode: 0400 vault-worker: image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: replicas: 1 networks: - - swhtest-mirror + - swh-mirror env_file: - ./env/common-python.env - ./env/celery-worker.env environment: SWH_WORKER_INSTANCE: vault LOG_LEVEL: DEBUG configs: - source: vault-worker target: /etc/softwareheritage/config.yml command: - celery-worker # vault do really need someone to talk to via SMTP mailhog: image: mailhog/mailhog networks: - - swhtest-mirror + - swh-mirror ### scheduler services scheduler-db: # the database for the vault rpc server image: postgres:13 deploy: # we want only one replica of this service in the whole cluster replicas: 1 endpoint_mode: dnsrr placement: max_replicas_per_node: 1 constraints: - node.labels.org.softwareheritage.mirror.volumes.web-db == true networks: - - swhtest-mirror + - swh-mirror command: ['-c', 'shared_buffers=4GB', '-c', 
'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB'] environment: POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password POSTGRES_USER: swh POSTGRES_DB: swh-scheduler volumes: - "scheduler-db:/var/lib/postgresql/data:rw,Z" secrets: - source: swh-mirror-scheduler-db-password target: postgres-password uid: '999' mode: 0400 scheduler: image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: replicas: 1 networks: - - swhtest-mirror + - swh-mirror configs: - source: scheduler target: /etc/softwareheritage/config.yml env_file: - ./env/common-python.env environment: PGHOST: scheduler-db PGUSER: swh POSTGRES_DB: swh-scheduler PORT: "5008" command: ["rpc-server", "scheduler"] secrets: - source: swh-mirror-scheduler-db-password target: postgres-password uid: '999' mode: 0400 scheduler-listener: image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: replicas: 1 networks: - - swhtest-mirror + - swh-mirror configs: - source: scheduler target: /etc/softwareheritage/config.yml env_file: - ./env/common-python.env environment: SWH_WORKER_INSTANCE: scheduler PGHOST: scheduler-db PGUSER: swh POSTGRES_DB: swh-scheduler command: ["scheduler", "start-listener"] secrets: - source: swh-mirror-scheduler-db-password target: postgres-password uid: '999' mode: 0400 scheduler-runner: image: softwareheritage/base:${SWH_IMAGE_TAG:-latest} deploy: replicas: 1 networks: - - swhtest-mirror + - swh-mirror configs: - source: scheduler target: /etc/softwareheritage/config.yml env_file: - ./env/common-python.env environment: SWH_WORKER_INSTANCE: scheduler PGHOST: scheduler-db PGUSER: swh POSTGRES_DB: swh-scheduler command: ["scheduler", "start-runner", "--period", "10"] secrets: - source: swh-mirror-scheduler-db-password target: postgres-password uid: '999' mode: 0400 volumes: objstorage: redis: scheduler-db: storage-db: vault-db: web-db: secrets: swh-mirror-storage-db-password: external: true swh-mirror-web-db-password: external: true swh-mirror-vault-db-password: external: true swh-mirror-scheduler-db-password: external: true configs: storage: file: conf/storage.yml objstorage: file: conf/objstorage.yml nginx: file: conf/nginx.conf scheduler: file: conf/scheduler.yml vault: file: conf/vault.yml vault-worker: file: conf/vault-worker.yml web: file: conf/web.yml content-replayer: file: conf/content-replayer.yml graph-replayer: file: conf/graph-replayer.yml prometheus: file: conf/prometheus.yml prometheus-statsd-exporter: file: conf/prometheus-statsd-mapping.yml grafana-provisioning-datasources-prometheus: file: conf/grafana/provisioning/datasources/prometheus.yaml grafana-provisioning-dashboards-all: file: conf/grafana/provisioning/dashboards/all.yaml grafana-dashboards-graph-replayer: file: conf/grafana/dashboards/graph-replayer.json grafana-dashboards-content-replayer: file: conf/grafana/dashboards/content-replayer.json grafana-dashboards-backend-stats: file: conf/grafana/dashboards/backend-stats.json networks: - swhtest-mirror: + swh-mirror: driver: overlay attachable: true diff --git a/tests/README.md b/tests/README.md index 26c05d0..b8ecf30 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,193 +1,193 @@ # mirror stack deployment tests These are a set of tests for the deployment of a full software heritage mirror stack. As of today, only docker swarm based deployment tests are available. 
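For context, the test below deploys the `mirror.yml` stack file found at the root of this repository. Deploying that stack by hand would look roughly like the following sketch; the node labels and the external database-password secrets referenced in the compose file must exist before `docker stack deploy` is run (the stack name `mirror`, the secret values and the image tag are placeholders):

```
# label the node(s) that will host the pinned volumes and the monitoring services
docker node update $HOSTNAME \
    --label-add org.softwareheritage.mirror.volumes.storage-db=true \
    --label-add org.softwareheritage.mirror.volumes.web-db=true \
    --label-add org.softwareheritage.mirror.volumes.objstorage=true \
    --label-add org.softwareheritage.mirror.volumes.redis=true \
    --label-add org.softwareheritage.mirror.monitoring=true

# the postgres passwords are external secrets and must be created beforehand
for secret in swh-mirror-storage-db-password swh-mirror-web-db-password \
              swh-mirror-vault-db-password swh-mirror-scheduler-db-password; do
    echo -n 'change-me' | docker secret create "$secret" -   # placeholder value
done

# deploy the stack with the image tag to test
SWH_IMAGE_TAG=20220805-185133 docker stack deploy -c mirror.yml mirror
```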
## docker swarm deployment tests

This test uses
[pytest-testinfra](https://github.com/pytest-dev/pytest-testinfra) to
orchestrate the deployment and the checks that are made against the replicated
Software Heritage Archive.

The idea of this test is:

- a test dataset is built by loading a few origins in a dedicated swh instance
  (using the swh-environment/docker),
- the gathered objects are pushed in a dedicated set of kafka topics on swh's
  staging kafka broker (swh.test.objects),
- expected statistics for each origin are also computed and pushed in the
  swh.test.objects.stats topic; these statistics are simply the total number of
  objects of each type (content, directory, revision, snapshot, release)
  reachable from that origin.

Then, the test scenario is the following:

1. copy all docker config files and resolve template ones in a temp dir
   (especially conf/graph-replayer.yml.test and conf/content-replayer.yml.test,
   see the mirror_stack fixture in conftest.py),
2. create and deploy a docker stack from the mirror.yml compose file in the tmp
   dir; note that replayer services are not started at this point (their
   replication factor is set to 0 in mirror.yml),
3. wait for all the services to be up,
4. scale the content replayer service to 1, and wait for the service to be up,
5. scale the content replayer service to 4, and wait for services to be up,
6. wait for the content replaying to be done (test replayer services are
   configured with stop_on_eof=true),
7. scale the content replayer to 0,
8. repeat steps 4-7 for the graph-replayer,
9. retrieve expected stats for each origin from a dedicated
   swh.test.objects.stats topic on kafka,
10. compute these stats from the replicated archive; note that this step also
    checks content object hashes from the replicated objstorage,
11. compare computed stats with expected ones,
12. spawn vault (flat) cooking for each origin (latest snapshot's master),
13. wait for the tgz artifacts to be generated by vault-workers,
14. download resulting artifacts and make a few checks on their content.

Obviously, this test heavily depends on the content of ``swh.test.objects``
topics on kafka, thus some tooling is required to manage said test dataset.
These tools are not part of this repo, but will be provided in the
swh-environment git repo (they use the development docker environment).

### Running the test

The test is written using pytest-testinfra, and thus relies on the pytest
execution tool.

Note that for this test to run:

- docker swarm must be enabled,
- it will use dedicated test kafka topics on the staging kafka broker hosted by
  Software Heritage (see the Journal TLS endpoint listed on
  https://docs.softwareheritage.org/sysadm/network-architecture/service-urls.html#public-urls),
- it will require a few environment variables to be set before running the
  test, namely:
  - `SWH_MIRROR_TEST_KAFKA_USERNAME`: login used to access the kafka broker,
  - `SWH_MIRROR_TEST_KAFKA_PASSWORD`: password used to access the kafka broker,
  - `SWH_MIRROR_TEST_KAFKA_BROKER`: URL of the kafka broker (should be the one
    described above),
  - `SWH_MIRROR_TEST_OBJSTORAGE_URL`: the URL of the source object storage used
    for the content replication; it would typically include access credentials,
    e.g. `https://login:password@objstorage.softwareheritage.org/`,
  - `SWH_IMAGE_TAG`: the docker image tag to be tested.

  You can copy the template `env/tests.env.template` to `env/tests.env` to set
  them (see the example below),
- the `softwareheritage/base`, `softwareheritage/web`,
  `softwareheritage/replayer` and `softwareheritage/test` images must be built
  with the proper image tag (`$SWH_IMAGE_TAG`). See the
  `../images/build_images.sh` script to rebuild images if need be.
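For illustration, a filled-in `env/tests.env` might look something like this; the values are taken from the session capture further down, and the kafka password is a placeholder:

```
# env/tests.env -- example values only; the password is a placeholder
SWH_MIRROR_TEST_KAFKA_USERNAME=mirror-test-ro
SWH_MIRROR_TEST_KAFKA_PASSWORD=changeme
SWH_MIRROR_TEST_KAFKA_BROKER=broker1.journal.staging.swh.network:9093
SWH_MIRROR_TEST_OBJSTORAGE_URL=https://login:password@objstorage.softwareheritage.org/
SWH_IMAGE_TAG=20220805-185133
```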
Assuming you have a properly set up environment:

```
# check the docker swarm cluster is ok
~/swh-mirror$ docker node ls
ID                            HOSTNAME   STATUS    AVAILABILITY   MANAGER STATUS   ENGINE VERSION
w6uzfpxayyc8l9ksfud7dlq9p *   libra      Ready     Active         Leader           20.10.5+dfsg1

# check images
~/swh-mirror$ echo $SWH_IMAGE_TAG
20220805-185133
~/swh-mirror$ docker image ls -f reference="softwareheritage/*:$SWH_IMAGE_TAG"
REPOSITORY                  TAG               IMAGE ID       CREATED      SIZE
softwareheritage/replayer   20220805-185133   da2d12d57a65   5 days ago   223MB
softwareheritage/test       20220805-185133   cb4449867d3a   5 days ago   682MB
softwareheritage/web        20220805-185133   66c54d5c2611   5 days ago   364MB
softwareheritage/base       20220805-185133   528010e1fc9c   5 days ago   682MB

# check environment variables are set
~/swh-mirror$ env | grep SWH_MIRROR_TEST
SWH_MIRROR_TEST_KAFKA_PASSWORD=
SWH_MIRROR_TEST_KAFKA_BROKER=broker1.journal.staging.swh.network:9093
SWH_MIRROR_TEST_KAFKA_USERNAME=mirror-test-ro
SWH_MIRROR_TEST_OBJSTORAGE_URL=https://:@objstorage.softwareheritage.org/
```

you should be able to execute the test:

```
~/swh-mirror$ pytest
============================== test session starts ==============================
platform linux -- Python 3.9.2, pytest-6.2.5, py-1.9.0, pluggy-1.0.0
rootdir: /home/ddouard/swh/swh-docker
plugins: django-4.5.2, dash-1.18.1, django-test-migrations-1.2.0, forked-1.4.0, redis-2.4.0, requests-mock-1.9.3, Faker-4.18.0, asyncio-0.18.1, xdist-2.1.0, hypothesis-6.4.3, testinfra-6.8.0, postgresql-3.1.3, flask-1.1.0, mock-3.7.0, swh.journal-1.0.1.dev10+gdb9d202, swh.core-2.13
asyncio: mode=legacy
collected 1 item

tests/test_graph_replayer.py .                                            [100%]

=============================== warnings summary ================================
../../.virtualenvs/swh/lib/python3.9/site-packages/pytest_asyncio/plugin.py:191
  /home/ddouard/.virtualenvs/swh/lib/python3.9/site-packages/pytest_asyncio/plugin.py:191: DeprecationWarning: The 'asyncio_mode' default value will change to 'strict' in future, please explicitly use 'asyncio_mode=strict' or 'asyncio_mode=auto' in pytest configuration file.
    config.issue_config_time_warning(LEGACY_MODE, stacklevel=2)

-- Docs: https://docs.pytest.org/en/stable/warnings.html
=================== 1 passed, 1 warning in 923.19s (0:15:23) ====================
```

Note the test takes quite some time to execute, so be patient.

Troubleshooting
===============

### Watch out for stale services

If something goes wrong, you might want to check whether you have any remaining
Docker services set up:

    docker service ls

If you want to shut them all down, you can use:

    docker service rm $(docker service ls --format '{{.Name}}')
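Before removing anything, it can also help to look at why a given service is failing; something along these lines should work (the service name below is only an example, pick one from the `docker service ls` output):

```
# show the tasks of one service, including their full error message
docker service ps --no-trunc swhtest_mirror0_graph-replayer

# and look at its recent logs
docker service logs --tail 100 swhtest_mirror0_graph-replayer
```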
### I want a shell!

To run a shell in an image in the Swarm context, use the following:

-    docker run --network=swhtest_mirror0_swhtest-mirror -ti --env-file env/common-python.env --env STATSD_TAGS="role:content-replayer,hostname:${HOSTNAME}" -v /tmp/pytest-of-lunar/pytest-current/mirrorcurrent/conf/content-replayer.yml:/etc/softwareheritage/config.yml softwareheritage/replayer:20220915-163058 shell
+    docker run --network=swhtest_mirror0_swh-mirror -ti --env-file env/common-python.env --env STATSD_TAGS="role:content-replayer,hostname:${HOSTNAME}" -v /tmp/pytest-of-lunar/pytest-current/mirrorcurrent/conf/content-replayer.yml:/etc/softwareheritage/config.yml softwareheritage/replayer:20220915-163058 shell

### Some containers are never started

If you notice that some containers stay at 0 replicas in `docker service ls`,
it probably means the placement rules for these services, as described in
`mirror.yml`, cannot be fulfilled by the nodes currently part of the swarm.
Most likely, you are missing the labels locating the volumes needed by the
containers. You might want to run:

    docker node update $HOSTNAME \
        --label-add org.softwareheritage.mirror.volumes.storage-db=true \
        --label-add org.softwareheritage.mirror.volumes.web-db=true \
        --label-add org.softwareheritage.mirror.volumes.objstorage=true \
        --label-add org.softwareheritage.mirror.volumes.redis=true

### SWH services keep restarting

If SWH services keep restarting, look at the service logs, but don’t forget to
also look at the logs of the Docker service itself (using
`journalctl -u docker.service`, for example). If you see:

    error="task: non-zero exit (124)"

It means that `wait-for-it` has reached its timeout. You should double-check
the network configuration, including the firewall.

### Failure while checking the Vault service

If the test fails with the following exception:

~~~
>       assert isinstance(tarfilecontent, bytes)
E       assert False
E        +  where False = isinstance({'exception': 'NotFoundExc', 'reason': 'Cooked archive for swh:1:dir:c1695cab57e5bfe64ea4b0900c4575bf7240483d not found.', 'traceback': 'Traceback (most recent call last):\n  File "/usr/lib/python3/dist-packages/rest_framework/views.py", line 492, in dispatch\n    response = handler(request, *args, **kwargs)\n  File "/usr/lib/python3/dist-packages/rest_framework/decorators.py", line 54, in handler\n    return func(*args, **kwargs)\n  File "/usr/lib/python3/dist-pac→

…/swh-mirror/tests/test_graph_replayer.py:423: AssertionError
~~~

It is most likely because of a stale database. Remove the vault volume using:

    docker volume rm swhtest_mirror0_vault-db

-In general, the test has been designed to be run on empty volumes.
\ No newline at end of file
+In general, the test has been designed to be run on empty volumes.
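Since the test is designed to run on empty volumes, a blunt but effective way to get back to a clean slate once the stack has been removed is something like the following (assuming the `swhtest_mirror0_` prefix used in the examples above):

```
# remove every volume left over by the test stack (destructive!)
docker volume rm $(docker volume ls -q --filter name=swhtest_mirror0_)
```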