diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /conf/graph-replayer.yml
 /conf/content-replayer.yml
+/env/tests.env
diff --git a/conf/web.yml b/conf/web.yml
--- a/conf/web.yml
+++ b/conf/web.yml
@@ -1,7 +1,7 @@
 storage:
   cls: remote
   url: http://storage:5002/
-  timeout: 5
+  timeout: 30
 
 objstorage:
   cls: remote
diff --git a/env/tests.env.template b/env/tests.env.template
new file mode 100644
--- /dev/null
+++ b/env/tests.env.template
@@ -0,0 +1,5 @@
+SWH_IMAGE_TAG=20220921-113342
+SWH_MIRROR_TEST_KAFKA_USERNAME=mirror-test-ro
+SWH_MIRROR_TEST_KAFKA_PASSWORD=SOME_INTERESTING_SECRET
+SWH_MIRROR_TEST_KAFKA_BROKER=broker1.journal.staging.swh.network:9093
+SWH_MIRROR_TEST_OBJSTORAGE_URL=https://swh-prod:SOME_INTERESTING_SECRET@objstorage.softwareheritage.org/
diff --git a/images/Dockerfile b/images/Dockerfile
--- a/images/Dockerfile
+++ b/images/Dockerfile
@@ -126,12 +126,3 @@
 COPY replayer/entrypoint.sh /
 
 ENTRYPOINT ["/entrypoint.sh"]
-
-
-###
-# Test image
-###
-FROM swh-base as swh-test
-
-COPY restore_kafka.py /
-#ENTRYPOINT ["python3", "-u", "/restore_kafka.py"]
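Aside: the variables declared in `env/tests.env.template` above are read from `os.environ` by `tests/conftest.py` (further down in this patch), after `pytest-dotenv` has loaded `env/tests.env` as configured in `pyproject.toml` below. A minimal sanity check along those lines could look like the following sketch — it is not part of the test suite, and only assumes the variable names shown in the template:

```python
# Sketch only: fail fast if the test settings from env/tests.env are missing.
from os import environ

REQUIRED = [
    "SWH_IMAGE_TAG",
    "SWH_MIRROR_TEST_KAFKA_USERNAME",
    "SWH_MIRROR_TEST_KAFKA_PASSWORD",
    "SWH_MIRROR_TEST_KAFKA_BROKER",
    "SWH_MIRROR_TEST_OBJSTORAGE_URL",
]
missing = [name for name in REQUIRED if name not in environ]
if missing:
    raise RuntimeError(f"missing test settings: {', '.join(missing)}")
```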
diff --git a/mirror.yml b/mirror.yml
--- a/mirror.yml
+++ b/mirror.yml
@@ -6,6 +6,8 @@
     image: memcached
     deploy:
       replicas: 1
+    networks:
+      - swhtest-mirror
 
   storage-db:
     # the main storage database
@@ -19,6 +21,8 @@
         max_replicas_per_node: 1
         constraints:
           - node.labels.org.softwareheritage.mirror.volumes.storage-db == true
+    networks:
+      - swhtest-mirror
     command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB']
     environment:
       POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password
@@ -44,6 +48,8 @@
         max_replicas_per_node: 1
         constraints:
           - node.labels.org.softwareheritage.mirror.volumes.web-db == true
+    networks:
+      - swhtest-mirror
     command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB']
     environment:
       POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password
@@ -60,6 +66,8 @@
   web:
     # the web app; serves both the web navigation interface and the public web API
     image: softwareheritage/web:${SWH_IMAGE_TAG:-latest}
+    networks:
+      - swhtest-mirror
     configs:
      - source: web
        target: /etc/softwareheritage/config.yml
@@ -94,6 +102,8 @@ # etc.)
     # you want to remove this placement constraint
         constraints:
           - node.labels.org.softwareheritage.mirror.volumes.objstorage == true
+    networks:
+      - swhtest-mirror
     volumes:
       - "objstorage:/srv/softwareheritage/objects:rw,Z"
     configs:
@@ -116,6 +126,8 @@
     # an upgrade of the base image that comes with a database migration script
     # is upgraded in a consistent way
       replicas: 1
+    networks:
+      - swhtest-mirror
     configs:
      - source: storage
        target: /etc/softwareheritage/config.yml
@@ -137,6 +149,8 @@
 
   nginx:
     image: nginx
+    networks:
+      - swhtest-mirror
     configs:
      - source: nginx
        target: /etc/nginx/nginx.conf
@@ -147,6 +161,8 @@
 
   prometheus:
     image: prom/prometheus
+    networks:
+      - swhtest-mirror
     depends_on:
       - prometheus-statsd-exporter
     command:
@@ -166,6 +182,8 @@
 
   prometheus-statsd-exporter:
     image: prom/statsd-exporter
+    networks:
+      - swhtest-mirror
     command:
       - "--statsd.mapping-config=/etc/prometheus/statsd-mapping.yml"
     configs:
@@ -181,6 +199,8 @@
 
   grafana:
     image: grafana/grafana
+    networks:
+      - swhtest-mirror
     depends_on:
       - prometheus
     environment:
@@ -215,6 +235,8 @@
         max_replicas_per_node: 1
         constraints:
           - node.labels.org.softwareheritage.mirror.volumes.redis == true
+    networks:
+      - swhtest-mirror
     command:
       - redis-server
       - --save 60 1
@@ -234,6 +256,8 @@
       replicas: 0
       restart_policy:
         condition: "none"
+    networks:
+      - swhtest-mirror
     env_file:
       - ./env/common-python.env
     environment:
@@ -252,6 +276,8 @@
     deploy:
       # do not start replayers by default; see above
       replicas: 0
+    networks:
+      - swhtest-mirror
     env_file:
       - ./env/common-python.env
     environment:
@@ -269,6 +295,8 @@
 
   amqp:
     image: rabbitmq:3.6-management
+    networks:
+      - swhtest-mirror
     ports:
       - 5072:5672
 
@@ -285,6 +313,8 @@
         max_replicas_per_node: 1
         constraints:
           - node.labels.org.softwareheritage.mirror.volumes.web-db == true
+    networks:
+      - swhtest-mirror
     command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB']
     environment:
       POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password
@@ -302,6 +332,8 @@
     image: softwareheritage/base:${SWH_IMAGE_TAG:-latest}
     deploy:
       replicas: 1
+    networks:
+      - swhtest-mirror
     env_file:
       - ./env/common-python.env
     configs:
@@ -324,6 +356,8 @@
     image: softwareheritage/base:${SWH_IMAGE_TAG:-latest}
     deploy:
       replicas: 1
+    networks:
+      - swhtest-mirror
     env_file:
       - ./env/common-python.env
       - ./env/celery-worker.env
@@ -339,6 +373,8 @@
   # vault do really need someone to talk to via SMTP
   mailhog:
     image: mailhog/mailhog
+    networks:
+      - swhtest-mirror
 
   ### scheduler services
 
@@ -353,6 +389,8 @@
         max_replicas_per_node: 1
         constraints:
           - node.labels.org.softwareheritage.mirror.volumes.web-db == true
+    networks:
+      - swhtest-mirror
     command: ['-c', 'shared_buffers=4GB', '-c', 'effective_cache_size=4GB', '-c', 'random_page_cost=1.5', '-c', 'max_wal_size=4GB']
     environment:
       POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password
@@ -370,6 +408,8 @@
     image: softwareheritage/base:${SWH_IMAGE_TAG:-latest}
     deploy:
       replicas: 1
+    networks:
+      - swhtest-mirror
     configs:
      - source: scheduler
        target: /etc/softwareheritage/config.yml
@@ -391,6 +431,8 @@
     image: softwareheritage/base:${SWH_IMAGE_TAG:-latest}
     deploy:
       replicas: 1
+    networks:
+      - swhtest-mirror
     configs:
      - source: scheduler
        target: /etc/softwareheritage/config.yml
@@ -412,6 +454,8 @@
     image: softwareheritage/base:${SWH_IMAGE_TAG:-latest}
     deploy:
       replicas: 1
+    networks:
+      - swhtest-mirror
     configs:
      - source: scheduler
        target: /etc/softwareheritage/config.yml
@@ -482,3 +526,9 @@
     file: conf/grafana/dashboards/content-replayer.json
   grafana-dashboards-backend-stats:
     file: conf/grafana/dashboards/backend-stats.json
+
+
+networks:
+  swhtest-mirror:
+    driver: overlay
+    attachable: true
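Aside: every service now joins the `swhtest-mirror` overlay network declared at the bottom of `mirror.yml`; making it attachable is what lets the test harness and ad-hoc debug containers (see the `docker run --network=…` trick in `tests/README.md` below) reach the stack. A quick pytest-testinfra check along those lines might look like this — the stack name `swhtest_mirror0` is an assumption, and this test is not part of the suite:

```python
# Rough sanity check: the test network should be an attachable overlay network.
def test_mirror_network_is_attachable(host):
    out = host.check_output(
        "docker network inspect swhtest_mirror0_swhtest-mirror"
        " --format '{{.Driver}} {{.Attachable}}'"
    )
    assert out.strip() == "overlay true"
```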
diff --git a/pyproject.toml b/pyproject.toml
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,3 +10,9 @@
 line_length = 88
 force_sort_within_sections = true
 
+[tool.pytest.ini_options]
+env_files = 'env/tests.env'
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,5 @@
+pytest
+pytest-testinfra
+pytest-dotenv
+requests
+msgpack
diff --git a/tests/README.md b/tests/README.md
--- a/tests/README.md
+++ b/tests/README.md
@@ -76,6 +76,7 @@
   for the content replication; it would typically include access credentials,
   e.g. `https://login:password@objstorage.softwareheritage.org/`,
 - `SWH_IMAGE_TAG`: the docker image tag to be tested.
+  You can copy the template `env/tests.env.template` to `env/tests.env` to set them.
 
 - the `softwareheritage/base`, `softwareheritage/web`,
   `softwareheritage/replayer` and `softwareheritage/test` images must be built
@@ -130,3 +131,63 @@
 ```
 
 Note the test takes quite some time to execute, so be patient.
+
+
+Troubleshooting
+===============
+
+### Watch out for stale services
+
+If something goes wrong, you might want to check whether you have any remaining Docker services set up:
+
+    docker service ls
+
+If you want to shut them all down, you can use:
+
+    docker service rm $(docker service ls --format '{{.Name}}')
+
+### I want a shell!
+
+To run a shell in an image attached to the Swarm network, use the following:
+
+    docker run --network=swhtest_mirror0_swhtest-mirror -ti --env-file env/common-python.env --env STATSD_TAGS="role:content-replayer,hostname:${HOSTNAME}" -v /tmp/pytest-of-lunar/pytest-current/mirrorcurrent/conf/content-replayer.yml:/etc/softwareheritage/config.yml softwareheritage/replayer:20220915-163058 shell
+
+### Some containers are never started
+
+If you notice that some containers stay at 0 replicas in `docker service ls`, it probably means that their placement constraints, as described in `mirror.yml`, cannot be fulfilled by the nodes currently part of the swarm.
+
+Most likely, you are missing the node labels that locate the volumes needed by the containers. You might want to run:
+
+    docker node update $HOSTNAME \
+      --label-add org.softwareheritage.mirror.volumes.storage-db=true \
+      --label-add org.softwareheritage.mirror.volumes.web-db=true \
+      --label-add org.softwareheritage.mirror.volumes.objstorage=true \
+      --label-add org.softwareheritage.mirror.volumes.redis=true
+
+### SWH services keep restarting
+
+If SWH services keep restarting, look at the service logs, but don't forget to also look at the logs of the Docker daemon itself (using `journalctl -u docker.service`, for example).
+
+If you see:
+
+    error="task: non-zero exit (124)"
+
+it means that `wait-for-it` has reached its timeout. You should double-check the network configuration, including the firewall.
+
+### Failure while checking the Vault service
+
+If the test fails with the following exception:
+
+~~~
+> assert isinstance(tarfilecontent, bytes)
+E assert False
+E + where False = isinstance({'exception': 'NotFoundExc', 'reason': 'Cooked archive for swh:1:dir:c1695cab57e5bfe64ea4b0900c4575bf7240483d not found.', 'traceback': 'Traceback (most recent call last):\n File "/usr/lib/python3/dist-packages/rest_framework/views.py", line 492, in dispatch\n response = handler(request, *args, **kwargs)\n File "/usr/lib/python3/dist-packages/rest_framework/decorators.py", line 54, in handler\n return func(*args, **kwargs)\n File "/usr/lib/python3/dist-pac→
+
+…/swh-mirror/tests/test_graph_replayer.py:423: AssertionError
+~~~
+
+It is most likely because of a stale database. Remove the vault volume using:
+
+    docker volume rm swhtest_mirror0_vault-db
+
+In general, the test has been designed to be run on empty volumes.
\ No newline at end of file
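Aside: since the README states the test is designed to run on empty volumes, a pre-flight check could fail early when leftovers from a previous run are still around. This helper is hypothetical (name and `swhtest_` prefix are assumptions) and not part of the suite:

```python
# Hypothetical pre-flight helper: abort if volumes from a previous run remain.
def assert_no_stale_volumes(host, prefix="swhtest_"):
    leftovers = [
        name
        for name in host.check_output(
            "docker volume ls --format '{{.Name}}'"
        ).splitlines()
        if name.startswith(prefix)
    ]
    assert not leftovers, f"stale volumes found: {leftovers}"
```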
diff --git a/tests/conftest.py b/tests/conftest.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,6 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import logging
 from os import chdir, environ
 from pathlib import Path
 from shutil import copy, copytree
@@ -23,6 +24,8 @@
 OBJSTORAGE_URL = environ["SWH_MIRROR_TEST_OBJSTORAGE_URL"]
 WFI_TIMEOUT = 60
 
+LOGGER = logging.getLogger(__name__)
+
 
 def pytest_addoption(parser, pluginmanager):
     parser.addoption(
@@ -59,7 +62,7 @@
     # start the whole cluster
     stack_name = f"swhtest_{tmp_path.name}"
 
-    print("Create missing secrets")
+    LOGGER.info("Create missing secrets")
     existing_secrets = [
         line.strip()
         for line in docker_host.check_output(
@@ -69,11 +72,11 @@
     for srv in ("storage", "web", "vault", "scheduler"):
         secret = f"swh-mirror-{srv}-db-password"
         if secret not in existing_secrets:
-            print("Creating secret {secret}")
+            LOGGER.info(f"Creating secret {secret}")
             docker_host.check_output(
                 f"echo not-so-secret | docker secret create {secret} -"
             )
-    print("Remove config objects (if any)")
+    LOGGER.info("Remove config objects (if any)")
     existing_configs = [
         line.strip()
         for line in docker_host.check_output(
@@ -84,17 +87,17 @@
         if cfg.startswith(f"{stack_name}_"):
             docker_host.check_output(f"docker config rm {cfg}")
 
-    print(f"Deploy docker stack {stack_name}")
+    LOGGER.info(f"Deploy docker stack {stack_name}")
     docker_host.check_output(f"docker stack deploy -c mirror.yml {stack_name}")
 
     yield stack_name
 
     # breakpoint()
    if not request.config.getoption("keep_stack"):
-        print(f"Remove stack {stack_name}")
+        LOGGER.info(f"Remove stack {stack_name}")
         docker_host.check_output(f"docker stack rm {stack_name}")
         # wait for services to be down
-        print(f"Wait for all services of {stack_name} to be down")
+        LOGGER.info(f"Wait for all services of {stack_name} to be down")
         while docker_host.check_output(
             "docker service ls --format {{.Name}} "
             f"--filter label=com.docker.stack.namespace={stack_name}"
@@ -103,8 +106,8 @@
 
         # give a bit of time to docker to sync the state of service<->volumes
         # relations so the next step runs ok
-        time.sleep(1)
-        print(f"Remove volumes of stack {stack_name}")
+        time.sleep(20)
+        LOGGER.info(f"Remove volumes of stack {stack_name}")
         for volume in docker_host.check_output(
             "docker volume ls --format {{.Name}} "
             f"--filter label=com.docker.stack.namespace={stack_name}"
@@ -113,6 +116,6 @@
             try:
                 docker_host.check_output(f"docker volume rm {volume}")
             except AssertionError:
-                print(f"Failed to remove volume {volume}")
+                LOGGER.error(f"Failed to remove volume {volume}")
 
     chdir(cwd)
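Aside: the patch bumps the fixed pause before volume removal from 1 s to 20 s. A possible refinement (not what the patch does) would be to retry the removal instead of sleeping a fixed amount, since Docker may take a variable time to release volumes after `docker stack rm`:

```python
# Alternative sketch, not part of the patch: retry volume removal with backoff.
import time

def remove_volume_with_retry(docker_host, volume, attempts=10, delay=2):
    for _ in range(attempts):
        try:
            # testinfra's check_output raises AssertionError on non-zero exit
            docker_host.check_output(f"docker volume rm {volume}")
            return True
        except AssertionError:
            time.sleep(delay)
    return False
```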
diff --git a/tests/test_graph_replayer.py b/tests/test_graph_replayer.py
--- a/tests/test_graph_replayer.py
+++ b/tests/test_graph_replayer.py
@@ -14,7 +14,7 @@
 import msgpack
 import requests
 
-from .conftest import KAFKA_GROUPID, KAFKA_PASSWORD, KAFKA_USERNAME
+from .conftest import KAFKA_GROUPID, KAFKA_PASSWORD, KAFKA_USERNAME, LOGGER
 
 SERVICES = {
     "{}_content-replayer": "0/0",
@@ -37,8 +37,8 @@
     "{}_scheduler-listener": "1/1",
     "{}_scheduler-runner": "1/1",
 }
-ATTEMPTS = 200
-DELAY = 0.5
+ATTEMPTS = 600
+DELAY = 1
 SCALE = 2
 
 API_URL = "http://127.0.0.1:5081/api/1"
@@ -55,16 +55,16 @@
 
 def check_running_services(host, stack, services):
-    print("Waiting for service", services)
+    LOGGER.info("Waiting for services %s", services)
     mirror_services_ = {}
     for i in range(ATTEMPTS):
         mirror_services = running_services(host, stack)
         mirror_services = {k: v for k, v in mirror_services.items() if k in services}
         if mirror_services == services:
-            print("Got them all!")
+            LOGGER.info("Got them all!")
             break
         if mirror_services != mirror_services_:
-            print("Not yet there", mirror_services)
+            LOGGER.info("Not yet there %s", mirror_services)
             mirror_services_ = mirror_services
         time.sleep(0.5)
     return mirror_services == services
@@ -98,8 +98,7 @@
     try:
         data = get(content["data_url"])
     except Exception as exc:
-        print("Failed loading", content["data_url"])
-        print(exc)
+        LOGGER.error("Failed loading %s", content["data_url"], exc_info=exc)
         raise
     assert len(data) == content["length"]
     assert sha1(data).hexdigest() == content["checksums"]["sha1"]
@@ -278,7 +277,7 @@
     partitions = set()
 
     def on_assign(cons, parts):
-        print("assignment", parts)
+        LOGGER.info("assignment %s", parts)
         for p in parts:
             partitions.add(p.partition)
 
@@ -303,9 +302,12 @@
                 # Proper message
                 k = msgpack.unpackb(msg.key())
                 v = msgpack.unpackb(msg.value())
-                print(
-                    "%% %s [%d] at offset %d with key %s:\n"
-                    % (msg.topic(), msg.partition(), msg.offset(), k)
-                )
+                LOGGER.info(
+                    "%% %s [%d] at offset %d with key %s:\n",
+                    msg.topic(),
+                    msg.partition(),
+                    msg.offset(),
+                    k,
+                )
                 assert k == v["origin"]
                 stats[k] = v
@@ -321,7 +323,7 @@
     # run replayer services
     for service_type in ("content", "graph"):
         service = f"{mirror_stack}_{service_type}-replayer"
-        print(f"Scale {service} to 1")
+        LOGGER.info(f"Scale {service} to 1")
         host.check_output(f"docker service scale -d {service}=1")
         if not check_running_services(host, mirror_stack, {service: "1/1"}):
             breakpoint()
@@ -330,7 +332,7 @@
         )
         assert len(logs) == 1
 
-        print(f"Scale {service} to {SCALE}")
+        LOGGER.info(f"Scale {service} to {SCALE}")
         host.check_output(f"docker service scale -d {service}={SCALE}")
         check_running_services(host, mirror_stack, {service: f"{SCALE}/{SCALE}"})
         logs = wait_for_log_entry(
@@ -339,14 +341,14 @@
         assert len(logs) == SCALE
 
         # wait for the replaying to be done (stop_on_oef is true)
-        print(f"Wait for {service} to be done")
+        LOGGER.info(f"Wait for {service} to be done")
         logs = wait_for_log_entry(host, service, "Done.", SCALE)
         # >= SCALE below because replayer services may have been restarted
         # (once done) before we scale them to 0
         if not (len(logs) >= SCALE):
             breakpoint()
         assert len(logs) >= SCALE
-        print(f"Scale {service} to 0")
+        LOGGER.info(f"Scale {service} to 0")
         check_running_services(host, mirror_stack, {service: f"0/{SCALE}"})
 
     # TODO: check there are no error reported in redis after the replayers are done
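Aside: `wait_for_log_entry` is called in the hunks above but its body is outside this diff. A plausible stand-in, purely for illustration and not the project's actual implementation, would poll the service logs until the expected message has appeared the requested number of times:

```python
# Hypothetical sketch of a log-polling helper (defaults mirror ATTEMPTS/DELAY).
import time

def wait_for_log_entry(host, service, message, occurrences=1, attempts=600, delay=1):
    for _ in range(attempts):
        matches = [
            line
            for line in host.check_output(f"docker service logs {service}").splitlines()
            if message in line
        ]
        if len(matches) >= occurrences:
            return matches
        time.sleep(delay)
    return []
```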
@@ -355,7 +357,7 @@
     if False:
         # check replicated archive is in good shape
         expected_stats = get_expected_stats()
-        print("Check replicated archive")
+        LOGGER.info("Check replicated archive")
         # seems the graph replayer is OK, let's check the archive can tell something
         expected_origins = sorted(expected_stats)
         assert len(origins) == len(expected_origins)
@@ -365,16 +367,16 @@
             timing_stats.clear()
             assert origin == expected["origin"]
             origin_stats, swhids = get_stats(origin)
-            print(origin_stats)
-            print(f"{len(timing_stats)} REQS took {sum(timing_stats)}s")
+            LOGGER.info("%s", origin_stats)
+            LOGGER.info(f"{len(timing_stats)} REQS took {sum(timing_stats)}s")
             assert origin_stats == expected
-            print(f"{origin} is OK")
+            LOGGER.info(f"{origin} is OK")
 
     # test the vault service
     cooks = []
     # first start all the cookings
     for origin in origins:
-        print(f"Cook HEAD for {origin['url']}")
+        LOGGER.info(f"Cook HEAD for {origin['url']}")
         visit = get(
             f"{API_URL}/origin/{origin['url']}/visit/latest/?require_snapshot=true"
         )
@@ -400,7 +402,7 @@
         else:
             breakpoint()
 
-        print(f"Directory is {swhid}")
+        LOGGER.info(f"Directory is {swhid}")
         cook = post(f"{API_URL}/vault/flat/{swhid}/")
         assert cook
         assert cook["status"] in ("new", "pending")
@@ -421,9 +423,9 @@
         assert all(fname.startswith(swhid) for fname in filelist)
         for path in filelist[1:]:
             tarinfo = tarfileobj.getmember(path)
-            expected = get(
-                f"{API_URL}/directory/{quote(path[10:])}"
-            )  # remove the 'swh:1:dir:' part
+            url = f"{API_URL}/directory/{quote(path[10:])}"
+            expected = get(url)  # remove the 'swh:1:dir:' part
+            LOGGER.info(f"Retrieved from storage: {url} → {expected}")
             if expected["type"] == "dir":
                 assert tarinfo.isdir()
             elif expected["type"] == "file":
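Aside: the vault part of the test starts a flat cooking with `post(f"{API_URL}/vault/flat/{swhid}/")` and later fetches the resulting tarball. A hedged sketch of the intermediate polling step, assuming the usual `"done"` terminal status of the vault API (the helper name and parameters are made up for illustration):

```python
# Poll the cooking status of a flat bundle until it is done, or give up.
import time
import requests

API_URL = "http://127.0.0.1:5081/api/1"

def wait_for_cooking(swhid, attempts=600, delay=1):
    for _ in range(attempts):
        status = requests.get(f"{API_URL}/vault/flat/{swhid}/").json()
        if status.get("status") == "done":
            return status
        time.sleep(delay)
    raise TimeoutError(f"cooking of {swhid} did not finish in time")
```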