diff --git a/docker/.gitignore b/docker/.gitignore new file mode 100644 index 0000000..6b8a876 --- /dev/null +++ b/docker/.gitignore @@ -0,0 +1,3 @@ +docker-compose.override.yml +docker-compose.storage-replica.override.yml +tests/swh-docker-compose.logs diff --git a/docker/CODE_OF_CONDUCT.md b/docker/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0ad22b5 --- /dev/null +++ b/docker/CODE_OF_CONDUCT.md @@ -0,0 +1,78 @@ +# Software Heritage Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as Software +Heritage contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, regardless +of age, body size, disability, ethnicity, sex characteristics, gender identity +and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at `conduct@softwareheritage.org`. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an +incident. Further details of specific enforcement policies may be posted +separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/docker/CONTRIBUTORS b/docker/CONTRIBUTORS new file mode 100644 index 0000000..acd099c --- /dev/null +++ b/docker/CONTRIBUTORS @@ -0,0 +1,2 @@ +Archit Agrawal +Rob Guinness diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..af78d10 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,46 @@ +FROM python:3.7 + +RUN export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && apt-get upgrade -y && \ + apt-get install -y \ + libapr1-dev \ + libaprutil1-dev \ + libpq-dev \ + libsvn-dev \ + libsystemd-dev \ + postgresql-client \ + wait-for-it \ + ngrep && \ + apt-get install -y --no-install-recommends \ + r-base-core \ + r-cran-jsonlite && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN useradd -md /srv/softwareheritage -s /bin/bash swh +USER swh + +RUN python3 -m venv /srv/softwareheritage/venv +ENV PATH="/srv/softwareheritage/venv/bin:${PATH}" +RUN pip install --upgrade pip setuptools wheel +RUN pip install 'gunicorn<20' +RUN pip install cassandra-driver + +RUN pip install \ + swh-core[db,http] \ + swh-deposit[server] \ + swh-indexer \ + swh-journal \ + swh-lister \ + swh-loader-core \ + swh-loader-git \ + swh-loader-mercurial \ + swh-loader-svn \ + swh-storage \ + swh-objstorage \ + swh-scheduler \ + swh-vault \ + swh-web + +COPY utils/*.sh /srv/softwareheritage/utils/ +RUN mkdir -p /srv/softwareheritage/objects diff --git a/docker/Makefile b/docker/Makefile new file mode 100644 index 0000000..527b92c --- /dev/null +++ b/docker/Makefile @@ -0,0 +1,3 @@ +.PHONY: check-staged +check-staged: + docker-compose config -q diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..90a9c8c --- /dev/null +++ b/docker/README.md @@ -0,0 +1,679 @@ +# swh-docker-dev + +This repo contains Dockerfiles to allow developers to run a small +Software Heritage instance on their development computer. + +The end goal is to smooth the contributors/developers workflow. Focus +on coding, not configuring! + +WARNING: Running a Software Heritage instance on your machine can consume + quite a bit of resources: if you play a bit too hard (e.g., if you + try to list all GitHub repositories with the corresponding lister), + you may fill your hard drive, and consume a lot of CPU, memory and + network bandwidth. + + +## Dependencies + +This uses docker with docker-compose, so ensure you have a working +docker environment and docker-compose is installed. + +We recommend using the latest version of docker, so please read +https://docs.docker.com/install/linux/docker-ce/debian/ for more details on how +to install docker on your machine. + +On a debian system, docker-compose can be installed from debian repositories. +On a stable (stretch) machine, it is recommended to install the version from +[backports](https://backports.debian.org/Instructions/): + +``` +~$ sudo apt install -t stretch-backports docker-compose +``` + +## Quick start + +First, clone this repository. + +If you already have followed the +[[https://docs.softwareheritage.org/devel/developer-setup.html|developer setup guide]], +then you should already have a copy of the swh-docker-env git repository. 
Use
+it:
+
+```
+~$ cd swh-environment/swh-docker-dev
+```
+
+Otherwise, we suggest creating a `swh-environment`
+directory in which this repo will be cloned, so that you can later run some
+components in docker containers with code overridden from local repositories (see
+[[<#using-docker-setup-development-and-integration-testing>|below]]):
+
+```
+~$ mkdir swh-environment
+~$ cd swh-environment
+~/swh-environment$ git clone https://forge.softwareheritage.org/source/swh-docker-dev.git
+~/swh-environment$ cd swh-docker-dev
+```
+
+Then, start the containers:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose up -d
+[...]
+Creating swh-docker-dev_amqp_1 ... done
+Creating swh-docker-dev_zookeeper_1 ... done
+Creating swh-docker-dev_kafka_1 ... done
+Creating swh-docker-dev_flower_1 ... done
+Creating swh-docker-dev_swh-scheduler-db_1 ... done
+[...]
+```
+
+This will build the docker images and run them.
+Check that everything is running fine with:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose ps
+              Name                            Command               State                                   Ports
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+swh-docker-dev_amqp_1             docker-entrypoint.sh rabbi ...   Up      15671/tcp, 0.0.0.0:5018->15672/tcp, 25672/tcp, 4369/tcp, 5671/tcp, 5672/tcp
+swh-docker-dev_flower_1           flower --broker=amqp://gue ...   Up      0.0.0.0:5555->5555/tcp
+swh-docker-dev_kafka_1            start-kafka.sh                   Up      0.0.0.0:9092->9092/tcp
+swh-docker-dev_swh-deposit-db_1   docker-entrypoint.sh postgres    Up      5432/tcp
+swh-docker-dev_swh-deposit_1      /entrypoint.sh                   Up      0.0.0.0:5006->5006/tcp
+[...]
+```
+
+At the time of writing this guide, the startup of some containers may fail the
+first time due to dependency-related problems. If some containers failed to start,
+just run the `docker-compose up -d` command again.
+
+If a container really refuses to start properly, you can check why using the
+`docker-compose logs` command. For example:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose logs swh-lister
+Attaching to swh-docker-dev_swh-lister_1
+[...]
+swh-lister_1 | Processing /src/swh-scheduler
+swh-lister_1 | Could not install packages due to an EnvironmentError: [('/src/swh-scheduler/.hypothesis/unicodedata/8.0.0/charmap.json.gz', '/tmp/pip-req-build-pm7nsax3/.hypothesis/unicodedata/8.0.0/charmap.json.gz', "[Errno 13] Permission denied: '/src/swh-scheduler/.hypothesis/unicodedata/8.0.0/charmap.json.gz'")]
+swh-lister_1 |
+```
+
+Once all containers are running, you can use the web interface by opening
+http://localhost:5080/ in your web browser.
+
+At this point, the archive is empty and needs to be filled with some content.
+To do so, you can create tasks that will scrape a forge. For example, to inject
+the code from the https://0xacab.org gitlab forge:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \
+    swh scheduler task add list-gitlab-full \
+      -p oneshot url=https://0xacab.org/api/v4
+
+Created 1 tasks
+
+Task 1
+  Next run: just now (2018-12-19 14:58:49+00:00)
+  Interval: 90 days, 0:00:00
+  Type: list-gitlab-full
+  Policy: oneshot
+  Args:
+  Keyword args:
+    url=https://0xacab.org/api/v4
+```
+
+This task will scrape the forge's project list and create subtasks to inject
+each git repository found there.
+
+This will take a bit of time to complete.
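+
+While the listing and loading tasks run, you can follow what the workers are
+doing by tailing their logs (this assumes the `swh-lister` and `swh-loader`
+service names used elsewhere in this guide):
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose logs --tail=20 -f swh-lister swh-loader
+```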
+ +To increase the speed at which git repositories are imported, you can spawn more +`swh-loader-git` workers: + +``` +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + celery status +listers@50ac2185c6c9: OK +loader@b164f9055637: OK +indexer@33bc6067a5b8: OK +vault@c9fef1bbfdc1: OK + +4 nodes online. +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + celery control pool_grow 3 -d loader@b164f9055637 +-> loader@b164f9055637: OK + pool will grow +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + celery inspect -d loader@b164f9055637 stats | grep prefetch_count + "prefetch_count": 4 +``` + +Now there are 4 workers ingesting git repositories. +You can also increase the number of `swh-loader-git` containers: + +``` +~/swh-environment/swh-docker-dev$ docker-compose up -d --scale swh-loader=4 +[...] +Creating swh-docker-dev_swh-loader_2 ... done +Creating swh-docker-dev_swh-loader_3 ... done +Creating swh-docker-dev_swh-loader_4 ... done +``` + +## Updating the docker image + +All containers started by `docker-compose` are bound to a docker image +named `swh/stack` including all the software components of Software Heritage. +When new versions of these components are released, the docker image will not +be automatically updated. In order to update all Software heritage components +to their latest version, the docker image needs to be explicitly rebuilt by +issuing the following command inside the `swh-docker-dev` directory: + +``` +~/swh-environment/swh-docker-dev$ docker build --no-cache -t swh/stack . +``` + +## Details + +This runs the following services on their respectively standard ports, +all of the following services are configured to communicate with each +other: + +- swh-storage-db: a `softwareheritage` instance db that stores the + Merkle DAG, + +- swh-objstorage: Content-addressable object storage, + +- swh-storage: Abstraction layer over the archive, allowing to access + all stored source code artifacts as well as their metadata, + +- swh-web: the swh's web interface over the storage, + +- swh-scheduler: the API service as well as 2 utilities, + the runner and the listener, + +- swh-lister: celery workers dedicated to running lister tasks, + +- swh-loaders: celery workers dedicated to importing/updating source code + content (VCS repos, source packages, etc.), + +- swh-journal: Persistent logger of changes to the archive, with + publish-subscribe support. + +That means, you can start doing the ingestion using those services using the +same setup described in the getting-started starting directly at +https://docs.softwareheritage.org/devel/getting-started.html#step-4-ingest-repositories + +### Exposed Ports + +Several services have their listening ports exposed on the host: + +- amqp: 5072 +- kafka: 5092 +- nginx: 5080 + +And for SWH services: + +- scheduler API: 5008 +- storage API: 5002 +- object storage API: 5003 +- indexer API: 5007 +- web app: 5004 +- deposit app: 5006 + +Beware that these ports are not the same as the ports used from within the +docker network. This means that the same command executed from the host or from +a docker container will not use the same urls to access services. For example, +to use the `celery` utility from the host, you may type: + +``` +~/swh-environment/swh-docker-dev$ CELERY_BROKER_URL=amqp://:5072// celery status +loader@61704103668c: OK +[...] 
+```
+
+To run the same command from within a container:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api celery status
+loader@61704103668c: OK
+[...]
+```
+
+## Managing tasks
+
+One of the main components of the Software Heritage platform is the task system.
+It is used to manage everything related to background processes, like
+discovering new git repositories to import, ingesting them, checking that a known
+repository is up to date, etc.
+
+The task system is based on Celery but uses a custom database-based scheduler.
+
+So the term 'task' may designate either a Celery task or a SWH one
+(i.e. the entity in the database). An unqualified "task" in this
+documentation designates the SWH task.
+
+When a SWH task is ready to be executed, a Celery task is created to handle the
+actual SWH task's job. Note that not all Celery tasks are directly linked to a
+SWH task (some SWH tasks are implemented using a Celery task that spawns Celery
+subtasks).
+
+A (SWH) task can be `recurring` or `oneshot`. `oneshot` tasks are only executed
+once, whereas `recurring` tasks are executed regularly. The scheduling configuration
+of these recurring tasks can be set via the fields `current_interval` and
+`priority` (can be 'high', 'normal' or 'low') of the task database entity.
+
+
+### Inserting a new lister task
+
+To list the content of a source code provider like GitHub or a Debian
+distribution, you can add a new lister task.
+
+This task will (generally) scrape a web page or use a public API to identify
+the list of published software artefacts (git repos, Debian source packages,
+etc.).
+
+Then, for each repository, a new task will be created to ingest this repository
+and keep it up to date.
+
+For example, to add a `oneshot` task that will list git repos on the
+0xacab.org gitlab instance, one can do (from this git repository):
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \
+    swh scheduler task add list-gitlab-full \
+      -p oneshot url=https://0xacab.org/api/v4
+
+Created 1 tasks
+
+Task 12
+  Next run: just now (2018-12-19 14:58:49+00:00)
+  Interval: 90 days, 0:00:00
+  Type: list-gitlab-full
+  Policy: oneshot
+  Args:
+  Keyword args:
+    url=https://0xacab.org/api/v4
+```
+
+This will insert a new task in the scheduler.
To list existing tasks for a +given task type: + +``` +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + swh scheduler task list-pending list-gitlab-full + +Found 1 list-gitlab-full tasks + +Task 12 + Next run: 2 minutes ago (2018-12-19 14:58:49+00:00) + Interval: 90 days, 0:00:00 + Type: list-gitlab-full + Policy: oneshot + Args: + Keyword args: + url=https://0xacab.org/api/v4 +``` + +To list all existing task types: + +``` +~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \ + swh scheduler task-type list + +Known task types: +load-svn-from-archive: + Loading svn repositories from svn dump +load-svn: + Create dump of a remote svn repository, mount it and load it +load-deposit: + Loading deposit archive into swh through swh-loader-tar +check-deposit: + Pre-checking deposit step before loading into swh archive +cook-vault-bundle: + Cook a Vault bundle +load-hg: + Loading mercurial repository swh-loader-mercurial +load-hg-from-archive: + Loading archive mercurial repository swh-loader-mercurial +load-git: + Update an origin of type git +list-github-incremental: + Incrementally list GitHub +list-github-full: + Full update of GitHub repos list +list-debian-distribution: + List a Debian distribution +list-gitlab-incremental: + Incrementally list a Gitlab instance +list-gitlab-full: + Full update of a Gitlab instance's repos list +list-pypi: + Full pypi lister +load-pypi: + Load Pypi origin +index-mimetype: + Mimetype indexer task +index-mimetype-for-range: + Mimetype Range indexer task +index-fossology-license: + Fossology license indexer task +index-fossology-license-for-range: + Fossology license range indexer task +index-origin-head: + Origin Head indexer task +index-revision-metadata: + Revision Metadata indexer task +index-origin-metadata: + Origin Metadata indexer task + +``` + + +### Monitoring activity + +You can monitor the workers activity by connecting to the RabbitMQ console on +`http://localhost:5080/rabbitmq` or the grafana dashboard on +`http://localhost:5080/grafana`. 
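+
+Besides these web dashboards, you can also query the workers from the command
+line. For example, to see which celery tasks are currently being executed, one
+option (reusing the `swh-scheduler-api` container as in the examples above) is
+the standard `celery inspect active` subcommand:
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose exec swh-scheduler-api \
+    celery inspect active
+```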
+
+If you cannot see any task being executed, check the logs of the
+`swh-scheduler-runner` service (here is a failure example due to the
+debian lister task not being properly registered on the
+swh-scheduler-runner service):
+
+```
+~/swh-environment/swh-docker-dev$ docker-compose logs --tail=10 swh-scheduler-runner
+Attaching to swh-docker-dev_swh-scheduler-runner_1
+swh-scheduler-runner_1 | "__main__", mod_spec)
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/runpy.py", line 85, in _run_code
+swh-scheduler-runner_1 | exec(code, run_globals)
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/site-packages/swh/scheduler/celery_backend/runner.py", line 107, in
+swh-scheduler-runner_1 | run_ready_tasks(main_backend, main_app)
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/site-packages/swh/scheduler/celery_backend/runner.py", line 81, in run_ready_tasks
+swh-scheduler-runner_1 | task_types[task['type']]['backend_name']
+swh-scheduler-runner_1 | File "/usr/local/lib/python3.7/site-packages/celery/app/registry.py", line 21, in __missing__
+swh-scheduler-runner_1 | raise self.NotRegistered(key)
+swh-scheduler-runner_1 | celery.exceptions.NotRegistered: 'swh.lister.debian.tasks.DebianListerTask'
+```
+
+
+## Using docker setup development and integration testing
+
+If you hack the code of one or more archive components with a
+virtualenv-based setup as described in the
+[[https://docs.softwareheritage.org/devel/developer-setup.html|developer
+setup guide]], you may want to test your modifications in a working
+Software Heritage instance. The simplest way to achieve this is to use
+this docker-based environment.
+
+If you haven't followed the
+[[https://docs.softwareheritage.org/devel/developer-setup.html|developer setup guide]],
+you must clone the [swh-environment] repo in your `swh-environment`
+directory:
+
+```
+~/swh-environment$ git clone https://forge.softwareheritage.org/source/swh-environment.git .
+```
+
+Note the `.` at the end of this command: we want the git repository to be
+cloned directly in the `~/swh-environment` directory, not in a subdirectory.
+Also note that if you haven't done it yet and you want to hack the source code
+of one or more Software Heritage packages, you really should read the
+[[https://docs.softwareheritage.org/devel/developer-setup.html|developer setup guide]].
+
+From there, we will check out or update all the swh packages:
+
+```
+~/swh-environment$ ./bin/update
+```
+
+### Install a swh package from sources in a container
+
+It is possible to run a docker container with some swh packages installed from
+sources instead of using the latest published packages from PyPI. To do this
+you must write a docker-compose override file (`docker-compose.override.yml`).
+An example is given in the `docker-compose.override.yml.example` file:
+
+``` yaml
+version: '2'
+
+services:
+  swh-objstorage:
+    volumes:
+      - "$HOME/swh-environment/swh-objstorage:/src/swh-objstorage"
+```
+
+The file named `docker-compose.override.yml` will automatically be loaded by
+`docker-compose`.
+
+This example shows the simplest case of the `swh-objstorage` package:
+you just have to mount it in the container in `/src` and the
+entrypoint will ensure every swh-* package found in `/src/` is
+installed (using `pip install -e` so you can easily hack your
+code). If the application you play with has autoreload support, there
+is no need to restart the impacted container.
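+
+The same mechanism works for any other swh package. For example, a possible
+override mounting local checkouts of both `swh-loader-core` and
+`swh-loader-git` into the loader workers could look like this (the paths
+assume your checkouts live in `~/swh-environment`; adjust them to your setup):
+
+``` yaml
+version: '2'
+
+services:
+  swh-loader:
+    volumes:
+      - "$HOME/swh-environment/swh-loader-core:/src/swh-loader-core"
+      - "$HOME/swh-environment/swh-loader-git:/src/swh-loader-git"
+```
+
+After adding a new volume to the override file, run `docker-compose up -d
+swh-loader` again so that the container is recreated and its entrypoint
+reinstalls the newly mounted packages.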
+
+Note: if a container fails to start when using local sources for one or more swh
+packages, it is most probably due to permission problems on cache files. For
+example, if you have executed tests locally (using pytest or tox), you have
+cache files (__pycache__ etc.) that will prevent `pip install` from working
+within the container.
+
+The solution is to clean these files and directories before trying to spawn the
+containers.
+
+```
+~/swh-environment$ find . -type d -name __pycache__ -exec rm -rf {} \;
+~/swh-environment$ find . -type d -name .tox -exec rm -rf {} \;
+~/swh-environment$ find . -type d -name .hypothesis -exec rm -rf {} \;
+```
+
+### Using locally installed swh tools with docker
+
+In all examples above, we have executed swh commands from within a running
+container. Since these swh commands are also available locally in our virtual
+env, we can use them to interact with the swh services running in docker
+containers.
+
+For this, we just need to configure a few environment variables. First, ensure
+your Software Heritage virtualenv is activated (here, using virtualenvwrapper):
+
+```
+~$ workon swh
+(swh) ~/swh-environment$ export SWH_SCHEDULER_URL=http://127.0.0.1:5008/
+(swh) ~/swh-environment$ export CELERY_BROKER_URL=amqp://127.0.0.1:5072/
+```
+
+Now we can use the `celery` command directly to control the celery system
+running in the docker environment:
+
+```
+(swh) ~/swh-environment$ celery status
+vault@c9fef1bbfdc1: OK
+listers@ba66f18e7d02: OK
+indexer@cb14c33cbbfb: OK
+loader@61704103668c: OK
+
+4 nodes online.
+(swh) ~/swh-environment$ celery control -d loader@61704103668c pool_grow 3
+```
+
+And we can use the `swh scheduler` command all the same:
+
+```
+(swh) ~/swh-environment$ swh scheduler task-type list
+Known task types:
+index-fossology-license:
+  Fossology license indexer task
+index-mimetype:
+  Mimetype indexer task
+[...]
+``` + +### Make your life a bit easier + +When you use virtualenvwrapper, you can add postactivation commands: + +``` +(swh) ~/swh-environment$ cat >>$VIRTUAL_ENV/bin/postactivate <<'EOF' +# unfortunately, the interface cmd for the click autocompletion +# depends on the shell +# https://click.palletsprojects.com/en/7.x/bashcomplete/#activation + +shell=$(basename $SHELL) +case "$shell" in + "zsh") + autocomplete_cmd=source_zsh + ;; + *) + autocomplete_cmd=source + ;; +esac + +eval "$(_SWH_COMPLETE=$autocomplete_cmd swh)" +export SWH_SCHEDULER_URL=http://127.0.0.1:5008/ +export CELERY_BROKER_URL=amqp://127.0.0.1:5072/ +export COMPOSE_FILE=~/swh-environment/swh-docker-dev/docker-compose.yml:~/swh-environment/swh-docker-dev/docker-compose.override.yml +alias doco=docker-compose + +function swhclean { + find ~/swh-environment -type d -name __pycache__ -exec rm -rf {} \; + find ~/swh-environment -type d -name .tox -exec rm -rf {} \; + find ~/swh-environment -type d -name .hypothesis -exec rm -rf {} \; +} +EOF +``` + +This postactivate script does: + +- install a shell completion handler for the swh-scheduler command, +- preset a bunch of environment variables + + - `SWH_SCHEDULER_URL` so that you can just run `swh scheduler` against the + scheduler API instance running in docker, without having to specify the + endpoint URL, + + - `CELERY_BROKER` so you can execute the `celery` tool (without cli options) + against the rabbitmq server running in the docker environment, + + - `COMPOSE_FILE` so you can run `docker-compose` from everywhere, + +- create an alias `doco` for `docker-compose` because this is way too + long to type, + +- add a `swhclean` shell function to clean your source directories so that + there is no conflict with docker containers using local swh repositories (see + below). This will delete any `.tox`, `__pycache__` and `.hypothesis` + directory found in your swh-environment directory. + +So now you can easily: + +* Start the SWH platform: + +``` + (swh) ~/swh-environment$ doco up -d + [...] +``` + +* Check celery: + +``` + (swh) ~/swh-environment$ celery status + listers@50ac2185c6c9: OK + loader@b164f9055637: OK + indexer@33bc6067a5b8: OK +``` + +* List task-types: + +``` + (swh) ~/swh-environment$ swh scheduler task-type list + [...] +``` + +* Get more info on a task type: + +``` + (swh) ~/swh-environment$ swh scheduler task-type list -v -t load-hg + Known task types: + load-hg: swh.loader.mercurial.tasks.LoadMercurial + Loading mercurial repository swh-loader-mercurial + interval: 1 day, 0:00:00 [1 day, 0:00:00, 1 day, 0:00:00] + backoff_factor: 1.0 + max_queue_length: 1000 + num_retries: None + retry_delay: None +``` + +* Add a new task: + +``` + (swh) ~/swh-environment$ swh scheduler task add load-hg \ + origin_url=https://hg.logilab.org/master/cubicweb + Created 1 tasks + Task 1 + Next run: just now (2019-02-06 12:36:58+00:00) + Interval: 1 day, 0:00:00 + Type: load-hg + Policy: recurring + Args: + Keyword args: + origin_url: https://hg.logilab.org/master/cubicweb +``` + +* Respawn a task: + +``` + (swh) ~/swh-environment$ swh scheduler task respawn 1 +``` + + +## Starting a kafka-powered replica of the storage + +This repo comes with an optional `docker-compose.storage-replica.yml` +docker compose file that can be used to test the kafka-powered replication +mecanism for the main storage. + +This can be used like: + +``` +~/swh-environment/swh-docker-dev$ docker-compose -f docker-compose.yml -f docker-compose.storage-replica.yml up -d +[...] 
+```
+
+Compared to the original compose file, this will:
+
+- override the swh-storage service to activate the kafka direct writer
+  on swh.journal.objects prefixed topics using the swh.storage.master ID,
+- override the swh-web service to make it use the replica instead of the
+  master storage,
+- start a db for the replica,
+- start a storage service based on this db,
+- start a replayer service that runs the process that listens to kafka to
+  keep the replica in sync.
+
+When using it, you will have a setup in which the master storage is used by
+workers and most other services, whereas the storage replica will be used
+by the web application and should be kept in sync with the master storage
+by kafka.
+
+
+Note that the object storage is not replicated here, only the graph storage.
+
+## Starting the backfiller
+
+The backfiller reads the objects within the range
+[start-object, end-object] from the storage and feeds them to the kafka topics:
+
+```
+(swh) $ docker-compose \
+    -f docker-compose.yml \
+    -f docker-compose.storage-replica.yml \
+    -f docker-compose.storage-replica.override.yml \
+    run \
+    swh-journal-backfiller \
+    snapshot \
+    --start-object 000000 \
+    --end-object 000001 \
+    --dry-run
+```
diff --git a/docker/conf/cassandra.yaml b/docker/conf/cassandra.yaml
new file mode 100644
index 0000000..dc26a6c
--- /dev/null
+++ b/docker/conf/cassandra.yaml
@@ -0,0 +1,1242 @@
+# Cassandra storage config YAML
+
+# NOTE:
+#   See http://wiki.apache.org/cassandra/StorageConfiguration for
+#   full explanations of configuration directives
+# /NOTE
+
+# The name of the cluster. This is mainly used to prevent machines in
+# one logical cluster from joining another.
+cluster_name: 'Test Cluster'
+
+# This defines the number of tokens randomly assigned to this node on the ring
+# The more tokens, relative to other nodes, the larger the proportion of data
+# that this node will store. You probably want all nodes to have the same number
+# of tokens assuming they have equal hardware capability.
+#
+# If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility,
+# and will use the initial_token as described below.
+#
+# Specifying initial_token will override this setting on the node's initial start,
+# on subsequent starts, this setting will apply even if initial token is set.
+#
+# If you already have a cluster with 1 token per node, and wish to migrate to
+# multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
+num_tokens: 256
+
+# Triggers automatic allocation of num_tokens tokens for this node. The allocation
+# algorithm attempts to choose tokens in a way that optimizes replicated load over
+# the nodes in the datacenter for the replication strategy used by the specified
+# keyspace.
+#
+# The load assigned to each node will be close to proportional to its number of
+# vnodes.
+#
+# Only supported with the Murmur3Partitioner.
+# allocate_tokens_for_keyspace: KEYSPACE
+
+# initial_token allows you to specify tokens manually. While you can use it with
+# vnodes (num_tokens > 1, above) -- in which case you should provide a
+# comma-separated list -- it's primarily used when adding nodes to legacy clusters
+# that do not have vnodes enabled.
+# initial_token: + +# See http://wiki.apache.org/cassandra/HintedHandoff +# May either be "true" or "false" to enable globally +hinted_handoff_enabled: true + +# When hinted_handoff_enabled is true, a black list of data centers that will not +# perform hinted handoff +# hinted_handoff_disabled_datacenters: +# - DC1 +# - DC2 + +# this defines the maximum amount of time a dead host will have hints +# generated. After it has been dead this long, new hints for it will not be +# created until it has been seen alive and gone down again. +max_hint_window_in_ms: 10800000 # 3 hours + +# Maximum throttle in KBs per second, per delivery thread. This will be +# reduced proportionally to the number of nodes in the cluster. (If there +# are two nodes in the cluster, each delivery thread will use the maximum +# rate; if there are three, each will throttle to half of the maximum, +# since we expect two nodes to be delivering hints simultaneously.) +hinted_handoff_throttle_in_kb: 1024 + +# Number of threads with which to deliver hints; +# Consider increasing this number when you have multi-dc deployments, since +# cross-dc handoff tends to be slower +max_hints_delivery_threads: 2 + +# Directory where Cassandra should store hints. +# If not set, the default directory is $CASSANDRA_HOME/data/hints. +# hints_directory: /var/lib/cassandra/hints +hints_directory: /var/lib/cassandra/hints + +# How often hints should be flushed from the internal buffers to disk. +# Will *not* trigger fsync. +hints_flush_period_in_ms: 10000 + +# Maximum size for a single hints file, in megabytes. +max_hints_file_size_in_mb: 128 + +# Compression to apply to the hint files. If omitted, hints files +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +#hints_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Maximum throttle in KBs per second, total. This will be +# reduced proportionally to the number of nodes in the cluster. +batchlog_replay_throttle_in_kb: 1024 + +# Authentication backend, implementing IAuthenticator; used to identify users +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator, +# PasswordAuthenticator}. +# +# - AllowAllAuthenticator performs no checks - set it to disable authentication. +# - PasswordAuthenticator relies on username/password pairs to authenticate +# users. It keeps usernames and hashed passwords in system_auth.roles table. +# Please increase system_auth keyspace replication factor if you use this authenticator. +# If using PasswordAuthenticator, CassandraRoleManager must also be used (see below) +authenticator: AllowAllAuthenticator + +# Authorization backend, implementing IAuthorizer; used to limit access/provide permissions +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer, +# CassandraAuthorizer}. +# +# - AllowAllAuthorizer allows any action to any user - set it to disable authorization. +# - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. +authorizer: AllowAllAuthorizer + +# Part of the Authentication & Authorization backend, implementing IRoleManager; used +# to maintain grants and memberships between roles. +# Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager, +# which stores role information in the system_auth keyspace. 
Most functions of the +# IRoleManager require an authenticated login, so unless the configured IAuthenticator +# actually implements authentication, most of this functionality will be unavailable. +# +# - CassandraRoleManager stores role data in the system_auth keyspace. Please +# increase system_auth keyspace replication factor if you use this role manager. +role_manager: CassandraRoleManager + +# Validity period for roles cache (fetching granted roles can be an expensive +# operation depending on the role manager, CassandraRoleManager is one example) +# Granted roles are cached for authenticated sessions in AuthenticatedUser and +# after the period specified here, become eligible for (async) reload. +# Defaults to 2000, set to 0 to disable caching entirely. +# Will be disabled automatically for AllowAllAuthenticator. +roles_validity_in_ms: 2000 + +# Refresh interval for roles cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If roles_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as roles_validity_in_ms. +# roles_update_interval_in_ms: 2000 + +# Validity period for permissions cache (fetching permissions can be an +# expensive operation depending on the authorizer, CassandraAuthorizer is +# one example). Defaults to 2000, set to 0 to disable. +# Will be disabled automatically for AllowAllAuthorizer. +permissions_validity_in_ms: 2000 + +# Refresh interval for permissions cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If permissions_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as permissions_validity_in_ms. +# permissions_update_interval_in_ms: 2000 + +# Validity period for credentials cache. This cache is tightly coupled to +# the provided PasswordAuthenticator implementation of IAuthenticator. If +# another IAuthenticator implementation is configured, this cache will not +# be automatically used and so the following settings will have no effect. +# Please note, credentials are cached in their encrypted form, so while +# activating this cache may reduce the number of queries made to the +# underlying table, it may not bring a significant reduction in the +# latency of individual authentication attempts. +# Defaults to 2000, set to 0 to disable credentials caching. +credentials_validity_in_ms: 2000 + +# Refresh interval for credentials cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If credentials_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as credentials_validity_in_ms. +# credentials_update_interval_in_ms: 2000 + +# The partitioner is responsible for distributing groups of rows (by +# partition key) across nodes in the cluster. You should leave this +# alone for new clusters. The partitioner can NOT be changed without +# reloading all data, so when upgrading you should set this to the +# same partitioner you were already using. +# +# Besides Murmur3Partitioner, partitioners included for backwards +# compatibility include RandomPartitioner, ByteOrderedPartitioner, and +# OrderPreservingPartitioner. 
+# +partitioner: org.apache.cassandra.dht.Murmur3Partitioner + +# Directories where Cassandra should store data on disk. Cassandra +# will spread data evenly across them, subject to the granularity of +# the configured compaction strategy. +# If not set, the default directory is $CASSANDRA_HOME/data/data. +data_file_directories: + - /var/lib/cassandra/data + +# commit log. when running on magnetic HDD, this should be a +# separate spindle than the data directories. +# If not set, the default directory is $CASSANDRA_HOME/data/commitlog. +commitlog_directory: /var/lib/cassandra/commitlog + +# Enable / disable CDC functionality on a per-node basis. This modifies the logic used +# for write path allocation rejection (standard: never reject. cdc: reject Mutation +# containing a CDC-enabled table if at space limit in cdc_raw_directory). +cdc_enabled: false + +# CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the +# segment contains mutations for a CDC-enabled table. This should be placed on a +# separate spindle than the data directories. If not set, the default directory is +# $CASSANDRA_HOME/data/cdc_raw. +# cdc_raw_directory: /var/lib/cassandra/cdc_raw + +# Policy for data disk failures: +# +# die +# shut down gossip and client transports and kill the JVM for any fs errors or +# single-sstable errors, so the node can be replaced. +# +# stop_paranoid +# shut down gossip and client transports even for single-sstable errors, +# kill the JVM for errors during startup. +# +# stop +# shut down gossip and client transports, leaving the node effectively dead, but +# can still be inspected via JMX, kill the JVM for errors during startup. +# +# best_effort +# stop using the failed disk and respond to requests based on +# remaining available sstables. This means you WILL see obsolete +# data at CL.ONE! +# +# ignore +# ignore fatal errors and let requests fail, as in pre-1.2 Cassandra +disk_failure_policy: stop + +# Policy for commit disk failures: +# +# die +# shut down gossip and Thrift and kill the JVM, so the node can be replaced. +# +# stop +# shut down gossip and Thrift, leaving the node effectively dead, but +# can still be inspected via JMX. +# +# stop_commit +# shutdown the commit log, letting writes collect but +# continuing to service reads, as in pre-2.0.5 Cassandra +# +# ignore +# ignore fatal errors and let the batches fail +commit_failure_policy: stop + +# Maximum size of the native protocol prepared statement cache +# +# Valid values are either "auto" (omitting the value) or a value greater 0. +# +# Note that specifying a too large value will result in long running GCs and possbily +# out-of-memory errors. Keep the value at a small fraction of the heap. +# +# If you constantly see "prepared statements discarded in the last minute because +# cache limit reached" messages, the first step is to investigate the root cause +# of these messages and check whether prepared statements are used correctly - +# i.e. use bind markers for variable parts. +# +# Do only change the default value, if you really have more prepared statements than +# fit in the cache. In most cases it is not neccessary to change this value. +# Constantly re-preparing statements is a performance penalty. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +prepared_statements_cache_size_mb: + +# Maximum size of the Thrift prepared statement cache +# +# If you do not use Thrift at all, it is safe to leave this value at "auto". 
+# +# See description of 'prepared_statements_cache_size_mb' above for more information. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +thrift_prepared_statements_cache_size_mb: + +# Maximum size of the key cache in memory. +# +# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the +# minimum, sometimes more. The key cache is fairly tiny for the amount of +# time it saves, so it's worthwhile to use it at large numbers. +# The row cache saves even more time, but must contain the entire row, +# so it is extremely space-intensive. It's best to only use the +# row cache if you have hot rows or static rows. +# +# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache. +key_cache_size_in_mb: 1024 + +# Duration in seconds after which Cassandra should +# save the key cache. Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 14400 or 4 hours. +key_cache_save_period: 14400 + +# Number of keys from the key cache to save +# Disabled by default, meaning all keys are going to be saved +# key_cache_keys_to_save: 100 + +# Row cache implementation class name. Available implementations: +# +# org.apache.cassandra.cache.OHCProvider +# Fully off-heap row cache implementation (default). +# +# org.apache.cassandra.cache.SerializingCacheProvider +# This is the row cache implementation availabile +# in previous releases of Cassandra. +# row_cache_class_name: org.apache.cassandra.cache.OHCProvider + +# Maximum size of the row cache in memory. +# Please note that OHC cache implementation requires some additional off-heap memory to manage +# the map structures and some in-flight memory during operations before/after cache entries can be +# accounted against the cache capacity. This overhead is usually small compared to the whole capacity. +# Do not specify more memory that the system can afford in the worst usual situation and leave some +# headroom for OS block level cache. Do never allow your system to swap. +# +# Default value is 0, to disable row caching. +row_cache_size_in_mb: 0 + +# Duration in seconds after which Cassandra should save the row cache. +# Caches are saved to saved_caches_directory as specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 0 to disable saving the row cache. +row_cache_save_period: 0 + +# Number of keys from the row cache to save. +# Specify 0 (which is the default), meaning all keys are going to be saved +# row_cache_keys_to_save: 100 + +# Maximum size of the counter cache in memory. +# +# Counter cache helps to reduce counter locks' contention for hot counter cells. +# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before +# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration +# of the lock hold, helping with hot counter cell updates, but will not allow skipping +# the read entirely. Only the local (clock, count) tuple of a counter cell is kept +# in memory, not the whole counter, so it's relatively cheap. 
+# +# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache. +# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache. +counter_cache_size_in_mb: + +# Duration in seconds after which Cassandra should +# save the counter cache (keys only). Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Default is 7200 or 2 hours. +counter_cache_save_period: 7200 + +# Number of keys from the counter cache to save +# Disabled by default, meaning all keys are going to be saved +# counter_cache_keys_to_save: 100 + +# saved caches +# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches. +saved_caches_directory: /var/lib/cassandra/saved_caches + + +# commitlog_sync may be either "periodic" or "batch." +# +# When in batch mode, Cassandra won't ack writes until the commit log +# has been fsynced to disk. It will wait +# commitlog_sync_batch_window_in_ms milliseconds between fsyncs. +# This window should be kept short because the writer threads will +# be unable to do extra work while waiting. (You may need to increase +# concurrent_writes for the same reason.) +# +# commitlog_sync: batch +# commitlog_sync_batch_window_in_ms: 2 +# +# the other option is "periodic" where writes may be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period_in_ms +# milliseconds. +commitlog_sync: periodic +commitlog_sync_period_in_ms: 10000 + +# The size of the individual commitlog file segments. A commitlog +# segment may be archived, deleted, or recycled once all the data +# in it (potentially from each columnfamily in the system) has been +# flushed to sstables. +# +# The default size is 32, which is almost always fine, but if you are +# archiving commitlog segments (see commitlog_archiving.properties), +# then you probably want a finer granularity of archiving; 8 or 16 MB +# is reasonable. +# Max mutation size is also configurable via max_mutation_size_in_kb setting in +# cassandra.yaml. The default is half the size commitlog_segment_size_in_mb * 1024. +# This should be positive and less than 2048. +# +# NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must +# be set to at least twice the size of max_mutation_size_in_kb / 1024 +# +commitlog_segment_size_in_mb: 512 +# This is much bigger than the default (32), but the segment size must be +# larger than the largest row we want to write. And we have rows as large +# as 300MB, so... + +# Compression to apply to the commit log. If omitted, the commit log +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +# commitlog_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# any class that implements the SeedProvider interface and has a +# constructor that takes a Map of parameters will do. +seed_provider: + # Addresses of hosts that are deemed contact points. + # Cassandra nodes use this list of hosts to find each other and learn + # the topology of the ring. You must change this if you are running + # multiple nodes! + - class_name: org.apache.cassandra.locator.SimpleSeedProvider + parameters: + # seeds is actually a comma-delimited list of addresses. + # Ex: ",," + - seeds: + +# For workloads with more data than can fit in memory, Cassandra's +# bottleneck will be reads that need to fetch data from +# disk. 
"concurrent_reads" should be set to (16 * number_of_drives) in +# order to allow the operations to enqueue low enough in the stack +# that the OS and drives can reorder them. Same applies to +# "concurrent_counter_writes", since counter writes read the current +# values before incrementing and writing them back. +# +# On the other hand, since writes are almost never IO bound, the ideal +# number of "concurrent_writes" is dependent on the number of cores in +# your system; (8 * number_of_cores) is a good rule of thumb. +concurrent_reads: 32 +concurrent_writes: 32 +concurrent_counter_writes: 32 + +# For materialized view writes, as there is a read involved, so this should +# be limited by the less of concurrent reads or concurrent writes. +concurrent_materialized_view_writes: 32 + +# Maximum memory to use for sstable chunk cache and buffer pooling. +# 32MB of this are reserved for pooling buffers, the rest is used as an +# cache that holds uncompressed sstable chunks. +# Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# file_cache_size_in_mb: 512 + +# Flag indicating whether to allocate on or off heap when the sstable buffer +# pool is exhausted, that is when it has exceeded the maximum memory +# file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request. + +# buffer_pool_use_heap_if_exhausted: true + +# The strategy for optimizing disk read +# Possible values are: +# ssd (for solid state disks, the default) +# spinning (for spinning disks) +# disk_optimization_strategy: ssd + +# Total permitted memory to use for memtables. Cassandra will stop +# accepting writes when the limit is exceeded until a flush completes, +# and will trigger a flush based on memtable_cleanup_threshold +# If omitted, Cassandra will set both to 1/4 the size of the heap. +# memtable_heap_space_in_mb: 2048 +# memtable_offheap_space_in_mb: 2048 + +# memtable_cleanup_threshold is deprecated. The default calculation +# is the only reasonable choice. See the comments on memtable_flush_writers +# for more information. +# +# Ratio of occupied non-flushing memtable size to total permitted size +# that will trigger a flush of the largest memtable. Larger mct will +# mean larger flushes and hence less compaction, but also less concurrent +# flush activity which can make it difficult to keep your disks fed +# under heavy write load. +# +# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1) +# memtable_cleanup_threshold: 0.11 + +# Specify the way Cassandra allocates and manages memtable memory. +# Options are: +# +# heap_buffers +# on heap nio buffers +# +# offheap_buffers +# off heap (direct) nio buffers +# +# offheap_objects +# off heap objects +memtable_allocation_type: heap_buffers + +# Total space to use for commit logs on disk. +# +# If space gets above this value, Cassandra will flush every dirty CF +# in the oldest segment and remove it. So a small total commitlog space +# will tend to cause more flush activity on less-active columnfamilies. +# +# The default value is the smaller of 8192, and 1/4 of the total space +# of the commitlog volume. 
+# +# commitlog_total_space_in_mb: 8192 + +# This sets the number of memtable flush writer threads per disk +# as well as the total number of memtables that can be flushed concurrently. +# These are generally a combination of compute and IO bound. +# +# Memtable flushing is more CPU efficient than memtable ingest and a single thread +# can keep up with the ingest rate of a whole server on a single fast disk +# until it temporarily becomes IO bound under contention typically with compaction. +# At that point you need multiple flush threads. At some point in the future +# it may become CPU bound all the time. +# +# You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation +# metric which should be 0, but will be non-zero if threads are blocked waiting on flushing +# to free memory. +# +# memtable_flush_writers defaults to two for a single data directory. +# This means that two memtables can be flushed concurrently to the single data directory. +# If you have multiple data directories the default is one memtable flushing at a time +# but the flush will use a thread per data directory so you will get two or more writers. +# +# Two is generally enough to flush on a fast disk [array] mounted as a single data directory. +# Adding more flush writers will result in smaller more frequent flushes that introduce more +# compaction overhead. +# +# There is a direct tradeoff between number of memtables that can be flushed concurrently +# and flush size and frequency. More is not better you just need enough flush writers +# to never stall waiting for flushing to free memory. +# +#memtable_flush_writers: 2 + +# Total space to use for change-data-capture logs on disk. +# +# If space gets above this value, Cassandra will throw WriteTimeoutException +# on Mutations including tables with CDC enabled. A CDCCompactor is responsible +# for parsing the raw CDC logs and deleting them when parsing is completed. +# +# The default value is the min of 4096 mb and 1/8th of the total space +# of the drive where cdc_raw_directory resides. +# cdc_total_space_in_mb: 4096 + +# When we hit our cdc_raw limit and the CDCCompactor is either running behind +# or experiencing backpressure, we check at the following interval to see if any +# new space for cdc-tracked tables has been made available. Default to 250ms +# cdc_free_space_check_interval_ms: 250 + +# A fixed memory pool size in MB for for SSTable index summaries. If left +# empty, this will default to 5% of the heap size. If the memory usage of +# all index summaries exceeds this limit, SSTables with low read rates will +# shrink their index summaries in order to meet this limit. However, this +# is a best-effort process. In extreme conditions Cassandra may need to use +# more than this amount of memory. +index_summary_capacity_in_mb: + +# How frequently index summaries should be resampled. This is done +# periodically to redistribute memory from the fixed-size pool to sstables +# proportional their recent read rates. Setting to -1 will disable this +# process, leaving existing index summaries at their current sampling level. +index_summary_resize_interval_in_minutes: 60 + +# Whether to, when doing sequential writing, fsync() at intervals in +# order to force the operating system to flush the dirty +# buffers. Enable this to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSDs; not +# necessarily on platters. 
+trickle_fsync: false +trickle_fsync_interval_in_kb: 10240 + +# TCP port, for commands and data +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +storage_port: 7000 + +# SSL port, for encrypted communication. Unused unless enabled in +# encryption_options +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +ssl_storage_port: 7001 + +# Address or interface to bind to and tell other Cassandra nodes to connect to. +# You _must_ change this if you want multiple nodes to be able to communicate! +# +# Set listen_address OR listen_interface, not both. +# +# Leaving it blank leaves it up to InetAddress.getLocalHost(). This +# will always do the Right Thing _if_ the node is properly configured +# (hostname, name resolution, etc), and the Right Thing is to use the +# address associated with the hostname (it might not be). +# +# Setting listen_address to 0.0.0.0 is always wrong. +# +listen_address: + +# Set listen_address OR listen_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# listen_interface: eth0 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# listen_interface_prefer_ipv6: false + +# Address to broadcast to other Cassandra nodes +# Leaving this blank will set it to the same value as listen_address +broadcast_address: + +# When using multiple physical network interfaces, set this +# to true to listen on broadcast_address in addition to +# the listen_address, allowing nodes to communicate in both +# interfaces. +# Ignore this property if the network configuration automatically +# routes between the public and private networks such as EC2. +# listen_on_broadcast_address: false + +# Internode authentication backend, implementing IInternodeAuthenticator; +# used to allow/disallow connections from peer nodes. +# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator + +# Whether to start the native transport server. +# Please note that the address on which the native transport is bound is the +# same as the rpc_address. The port however is different and specified below. +start_native_transport: true +# port for the CQL native transport to listen for clients on +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +native_transport_port: 9042 +# Enabling native transport encryption in client_encryption_options allows you to either use +# encryption for the standard port or to use a dedicated, additional port along with the unencrypted +# standard native_transport_port. +# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption +# for native_transport_port. Setting native_transport_port_ssl to a different value +# from native_transport_port will use encryption for native_transport_port_ssl while +# keeping native_transport_port unencrypted. +# native_transport_port_ssl: 9142 +# The maximum threads for handling requests when the native transport is used. 
+# This is similar to rpc_max_threads though the default differs slightly (and +# there is no native_transport_min_threads, idle threads will always be stopped +# after 30 seconds). +# native_transport_max_threads: 128 +# +# The maximum size of allowed frame. Frame (requests) larger than this will +# be rejected as invalid. The default is 256MB. If you're changing this parameter, +# you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048. +# native_transport_max_frame_size_in_mb: 256 + +# The maximum number of concurrent client connections. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections: -1 + +# The maximum number of concurrent client connections per source ip. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections_per_ip: -1 + +# Whether to start the thrift rpc server. +start_rpc: false + +# The address or interface to bind the Thrift RPC service and native transport +# server to. +# +# Set rpc_address OR rpc_interface, not both. +# +# Leaving rpc_address blank has the same effect as on listen_address +# (i.e. it will be based on the configured hostname of the node). +# +# Note that unlike listen_address, you can specify 0.0.0.0, but you must also +# set broadcast_rpc_address to a value other than 0.0.0.0. +# +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +rpc_address: 0.0.0.0 + +# Set rpc_address OR rpc_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# rpc_interface: eth1 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# rpc_interface_prefer_ipv6: false + +# port for Thrift to listen for clients on +rpc_port: 9160 + +# RPC address to broadcast to drivers and other Cassandra nodes. This cannot +# be set to 0.0.0.0. If left blank, this will be set to the value of +# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must +# be set. +broadcast_rpc_address: + +# enable or disable keepalive on rpc/native connections +rpc_keepalive: true + +# Cassandra provides two out-of-the-box options for the RPC Server: +# +# sync +# One thread per thrift connection. For a very large number of clients, memory +# will be your limiting factor. On a 64 bit JVM, 180KB is the minimum stack size +# per thread, and that will correspond to your use of virtual memory (but physical memory +# may be limited depending on use of stack space). +# +# hsha +# Stands for "half synchronous, half asynchronous." All thrift clients are handled +# asynchronously using a small number of threads that does not vary with the amount +# of thrift clients (and thus scales well to many clients). The rpc requests are still +# synchronous (one thread per active request). If hsha is selected then it is essential +# that rpc_max_threads is changed from the default value of unlimited. +# +# The default is sync because on Windows hsha is about 30% slower. On Linux, +# sync/hsha performance is about the same, with hsha of course using less memory. 
+# +# Alternatively, can provide your own RPC server by providing the fully-qualified class name +# of an o.a.c.t.TServerFactory that can create an instance of it. +rpc_server_type: sync + +# Uncomment rpc_min|max_thread to set request pool size limits. +# +# Regardless of your choice of RPC server (see above), the number of maximum requests in the +# RPC thread pool dictates how many concurrent requests are possible (but if you are using the sync +# RPC server, it also dictates the number of clients that can be connected at all). +# +# The default is unlimited and thus provides no protection against clients overwhelming the server. You are +# encouraged to set a maximum that makes sense for you in production, but do keep in mind that +# rpc_max_threads represents the maximum number of client requests this server may execute concurrently. +# +# rpc_min_threads: 16 +# rpc_max_threads: 2048 + +# uncomment to set socket buffer sizes on rpc connections +# rpc_send_buff_size_in_bytes: +# rpc_recv_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it it is defined by net.ipv4.tcp_wmem +# See also: +# /proc/sys/net/core/wmem_max +# /proc/sys/net/core/rmem_max +# /proc/sys/net/ipv4/tcp_wmem +# /proc/sys/net/ipv4/tcp_wmem +# and 'man tcp' +# internode_send_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it it is defined by net.ipv4.tcp_wmem +# internode_recv_buff_size_in_bytes: + +# Frame size for thrift (maximum message length). +thrift_framed_transport_size_in_mb: 15 + +# Set to true to have Cassandra create a hard link to each sstable +# flushed or streamed locally in a backups/ subdirectory of the +# keyspace data. Removing these links is the operator's +# responsibility. +incremental_backups: false + +# Whether or not to take a snapshot before each compaction. Be +# careful using this option, since Cassandra won't clean up the +# snapshots for you. Mostly useful if you're paranoid when there +# is a data format change. +snapshot_before_compaction: false + +# Whether or not a snapshot is taken of the data before keyspace truncation +# or dropping of column families. The STRONGLY advised default of true +# should be used to provide data safety. If you set this flag to false, you will +# lose data on truncation or drop. +auto_snapshot: true + +# Granularity of the collation index of rows within a partition. +# Increase if your rows are large, or if you have a very large +# number of rows per partition. The competing goals are these: +# +# - a smaller granularity means more index entries are generated +# and looking up rows withing the partition by collation column +# is faster +# - but, Cassandra will keep the collation index in memory for hot +# rows (as part of the key cache), so a larger granularity means +# you can cache more hot rows +column_index_size_in_kb: 64 + +# Per sstable indexed key cache entries (the collation index in memory +# mentioned above) exceeding this size will not be held on heap. +# This means that only partition information is held on heap and the +# index entries are read from disk. +# +# Note that this size refers to the size of the +# serialized index information and not the size of the partition. 
+column_index_cache_size_in_kb: 2 + +# Number of simultaneous compactions to allow, NOT including +# validation "compactions" for anti-entropy repair. Simultaneous +# compactions can help preserve read performance in a mixed read/write +# workload, by mitigating the tendency of small sstables to accumulate +# during a single long running compactions. The default is usually +# fine and if you experience problems with compaction running too +# slowly or too fast, you should look at +# compaction_throughput_mb_per_sec first. +# +# concurrent_compactors defaults to the smaller of (number of disks, +# number of cores), with a minimum of 2 and a maximum of 8. +# +# If your data directories are backed by SSD, you should increase this +# to the number of cores. +#concurrent_compactors: 1 + +# Throttles compaction to the given total throughput across the entire +# system. The faster you insert data, the faster you need to compact in +# order to keep the sstable count down, but in general, setting this to +# 16 to 32 times the rate you are inserting data is more than sufficient. +# Setting this to 0 disables throttling. Note that this account for all types +# of compaction, including validation compaction. +compaction_throughput_mb_per_sec: 16 + +# When compacting, the replacement sstable(s) can be opened before they +# are completely written, and used in place of the prior sstables for +# any range that has been written. This helps to smoothly transfer reads +# between the sstables, reducing page cache churn and keeping hot rows hot +sstable_preemptive_open_interval_in_mb: 50 + +# Throttles all outbound streaming file transfers on this node to the +# given total throughput in Mbps. This is necessary because Cassandra does +# mostly sequential IO when streaming data during bootstrap or repair, which +# can lead to saturating the network connection and degrading rpc performance. +# When unset, the default is 200 Mbps or 25 MB/s. +# stream_throughput_outbound_megabits_per_sec: 200 + +# Throttles all streaming file transfer between the datacenters, +# this setting allows users to throttle inter dc stream throughput in addition +# to throttling all network stream traffic as configured with +# stream_throughput_outbound_megabits_per_sec +# When unset, the default is 200 Mbps or 25 MB/s +# inter_dc_stream_throughput_outbound_megabits_per_sec: 200 + +# How long the coordinator should wait for read operations to complete +read_request_timeout_in_ms: 50000 +# How long the coordinator should wait for seq or index scans to complete +range_request_timeout_in_ms: 100000 +# How long the coordinator should wait for writes to complete +write_request_timeout_in_ms: 20000 +# How long the coordinator should wait for counter writes to complete +counter_write_request_timeout_in_ms: 50000 +# How long a coordinator should continue to retry a CAS operation +# that contends with other proposals for the same row +cas_contention_timeout_in_ms: 10000 +# How long the coordinator should wait for truncates to complete +# (This can be much longer, because unless auto_snapshot is disabled +# we need to flush first so we can snapshot before removing the data.) +truncate_request_timeout_in_ms: 600000 +# The default timeout for other, miscellaneous operations +request_timeout_in_ms: 100000 + +# How long before a node logs slow queries. Select queries that take longer than +# this timeout to execute, will generate an aggregated log message, so that slow queries +# can be identified. Set this value to zero to disable slow query logging. 
+slow_query_log_timeout_in_ms: 500 + +# Enable operation timeout information exchange between nodes to accurately +# measure request timeouts. If disabled, replicas will assume that requests +# were forwarded to them instantly by the coordinator, which means that +# under overload conditions we will waste that much extra time processing +# already-timed-out requests. +# +# Warning: before enabling this property make sure to ntp is installed +# and the times are synchronized between the nodes. +cross_node_timeout: false + +# Set keep-alive period for streaming +# This node will send a keep-alive message periodically with this period. +# If the node does not receive a keep-alive message from the peer for +# 2 keep-alive cycles the stream session times out and fail +# Default value is 300s (5 minutes), which means stalled stream +# times out in 10 minutes by default +# streaming_keep_alive_period_in_secs: 300 + +# phi value that must be reached for a host to be marked down. +# most users should never need to adjust this. +# phi_convict_threshold: 8 + +# endpoint_snitch -- Set this to a class that implements +# IEndpointSnitch. The snitch has two functions: +# +# - it teaches Cassandra enough about your network topology to route +# requests efficiently +# - it allows Cassandra to spread replicas around your cluster to avoid +# correlated failures. It does this by grouping machines into +# "datacenters" and "racks." Cassandra will do its best not to have +# more than one replica on the same "rack" (which may not actually +# be a physical location) +# +# CASSANDRA WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH +# ONCE DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. +# This means that if you start with the default SimpleSnitch, which +# locates every node on "rack1" in "datacenter1", your only options +# if you need to add another datacenter are GossipingPropertyFileSnitch +# (and the older PFS). From there, if you want to migrate to an +# incompatible snitch like Ec2Snitch you can do it by adding new nodes +# under Ec2Snitch (which will locate them in a new "datacenter") and +# decommissioning the old ones. +# +# Out of the box, Cassandra provides: +# +# SimpleSnitch: +# Treats Strategy order as proximity. This can improve cache +# locality when disabling read repair. Only appropriate for +# single-datacenter deployments. +# +# GossipingPropertyFileSnitch +# This should be your go-to snitch for production use. The rack +# and datacenter for the local node are defined in +# cassandra-rackdc.properties and propagated to other nodes via +# gossip. If cassandra-topology.properties exists, it is used as a +# fallback, allowing migration from the PropertyFileSnitch. +# +# PropertyFileSnitch: +# Proximity is determined by rack and data center, which are +# explicitly configured in cassandra-topology.properties. +# +# Ec2Snitch: +# Appropriate for EC2 deployments in a single Region. Loads Region +# and Availability Zone information from the EC2 API. The Region is +# treated as the datacenter, and the Availability Zone as the rack. +# Only private IPs are used, so this will not work across multiple +# Regions. +# +# Ec2MultiRegionSnitch: +# Uses public IPs as broadcast_address to allow cross-region +# connectivity. (Thus, you should set seed addresses to the public +# IP as well.) You will need to open the storage_port or +# ssl_storage_port on the public IP firewall. (For intra-Region +# traffic, Cassandra will switch to the private IP after +# establishing a connection.) 
+# +# RackInferringSnitch: +# Proximity is determined by rack and data center, which are +# assumed to correspond to the 3rd and 2nd octet of each node's IP +# address, respectively. Unless this happens to match your +# deployment conventions, this is best used as an example of +# writing a custom Snitch class and is provided in that spirit. +# +# You can use a custom Snitch by setting this to the full class name +# of the snitch, which will be assumed to be on your classpath. +endpoint_snitch: SimpleSnitch + +# controls how often to perform the more expensive part of host score +# calculation +dynamic_snitch_update_interval_in_ms: 100 +# controls how often to reset all host scores, allowing a bad host to +# possibly recover +dynamic_snitch_reset_interval_in_ms: 600000 +# if set greater than zero and read_repair_chance is < 1.0, this will allow +# 'pinning' of replicas to hosts in order to increase cache capacity. +# The badness threshold will control how much worse the pinned host has to be +# before the dynamic snitch will prefer other replicas over it. This is +# expressed as a double which represents a percentage. Thus, a value of +# 0.2 means Cassandra would continue to prefer the static snitch values +# until the pinned host was 20% worse than the fastest. +dynamic_snitch_badness_threshold: 0.1 + +# request_scheduler -- Set this to a class that implements +# RequestScheduler, which will schedule incoming client requests +# according to the specific policy. This is useful for multi-tenancy +# with a single Cassandra cluster. +# NOTE: This is specifically for requests from the client and does +# not affect inter node communication. +# org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place +# org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of +# client requests to a node with a separate queue for each +# request_scheduler_id. The scheduler is further customized by +# request_scheduler_options as described below. +request_scheduler: org.apache.cassandra.scheduler.NoScheduler + +# Scheduler Options vary based on the type of scheduler +# +# NoScheduler +# Has no options +# +# RoundRobin +# throttle_limit +# The throttle_limit is the number of in-flight +# requests per client. Requests beyond +# that limit are queued up until +# running requests can complete. +# The value of 80 here is twice the number of +# concurrent_reads + concurrent_writes. +# default_weight +# default_weight is optional and allows for +# overriding the default which is 1. +# weights +# Weights are optional and will default to 1 or the +# overridden default_weight. The weight translates into how +# many requests are handled during each turn of the +# RoundRobin, based on the scheduler id. +# +# request_scheduler_options: +# throttle_limit: 80 +# default_weight: 5 +# weights: +# Keyspace1: 1 +# Keyspace2: 5 + +# request_scheduler_id -- An identifier based on which to perform +# the request scheduling. Currently the only valid option is keyspace. +# request_scheduler_id: keyspace + +# Enable or disable inter-node encryption +# JVM defaults for supported SSL socket protocols and cipher suites can +# be replaced using custom encryption options. This is not recommended +# unless you have policies in place that dictate certain settings, or +# need to disable vulnerable ciphers or protocols in case the JVM cannot +# be updated. 
+# FIPS compliant settings can be configured at JVM level and should not +# involve changing encryption settings here: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html +# *NOTE* No custom encryption options are enabled at the moment +# The available internode options are : all, none, dc, rack +# +# If set to dc cassandra will encrypt the traffic between the DCs +# If set to rack cassandra will encrypt the traffic between the racks +# +# The passwords used in these options must match the passwords used when generating +# the keystore and truststore. For instructions on generating these files, see: +# http://download.oracle.com/javase/6/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# +server_encryption_options: + internode_encryption: none + keystore: conf/.keystore + keystore_password: cassandra + truststore: conf/.truststore + truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # store_type: JKS + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + # require_client_auth: false + # require_endpoint_verification: false + +# enable or disable client/server encryption. +client_encryption_options: + enabled: false + # If enabled and optional is set to true encrypted and unencrypted connections are handled. + optional: false + keystore: conf/.keystore + keystore_password: cassandra + # require_client_auth: false + # Set trustore and truststore_password if require_client_auth is true + # truststore: conf/.truststore + # truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # store_type: JKS + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + +# internode_compression controls whether traffic between nodes is +# compressed. +# Can be: +# +# all +# all traffic is compressed +# +# dc +# traffic between different datacenters is compressed +# +# none +# nothing is compressed. +internode_compression: dc + +# Enable or disable tcp_nodelay for inter-dc communication. +# Disabling it will result in larger (but fewer) network packets being sent, +# reducing overhead from the TCP protocol itself, at the cost of increasing +# latency if you block for cross-datacenter responses. +inter_dc_tcp_nodelay: false + +# TTL for different trace types used during logging of the repair process. +tracetype_query_ttl: 86400 +tracetype_repair_ttl: 604800 + +# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level +# This threshold can be adjusted to minimize logging if necessary +# gc_log_threshold_in_ms: 200 + +# If unset, all GC Pauses greater than gc_log_threshold_in_ms will log at +# INFO level +# UDFs (user defined functions) are disabled by default. +# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. +enable_user_defined_functions: true + +# Enables scripted UDFs (JavaScript UDFs). +# Java UDFs are always enabled, if enable_user_defined_functions is true. +# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider. +# This option has no effect, if enable_user_defined_functions is false. 
+enable_scripted_user_defined_functions: false + +# Enables materialized view creation on this node. +# Materialized views are considered experimental and are not recommended for production use. +enable_materialized_views: true + +# The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation. +# Lowering this value on Windows can provide much tighter latency and better throughput, however +# some virtualized environments may see a negative performance impact from changing this setting +# below their system default. The sysinternals 'clockres' tool can confirm your system's default +# setting. +windows_timer_interval: 1 + + +# Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from +# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by +# the "key_alias" is the only key that will be used for encrypt opertaions; previously used keys +# can still (and should!) be in the keystore and will be used on decrypt operations +# (to handle the case of key rotation). +# +# It is strongly recommended to download and install Java Cryptography Extension (JCE) +# Unlimited Strength Jurisdiction Policy Files for your version of the JDK. +# (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html) +# +# Currently, only the following file types are supported for transparent data encryption, although +# more are coming in future cassandra releases: commitlog, hints +transparent_data_encryption_options: + enabled: false + chunk_length_kb: 64 + cipher: AES/CBC/PKCS5Padding + key_alias: testing:1 + # CBC IV length for AES needs to be 16 bytes (which is also the default size) + # iv_length: 16 + key_provider: + - class_name: org.apache.cassandra.security.JKSKeyProvider + parameters: + - keystore: conf/.keystore + keystore_password: cassandra + store_type: JCEKS + key_password: cassandra + + +##################### +# SAFETY THRESHOLDS # +##################### + +# When executing a scan, within or across a partition, we need to keep the +# tombstones seen in memory so we can return them to the coordinator, which +# will use them to make sure other replicas also know about the deleted rows. +# With workloads that generate a lot of tombstones, this can cause performance +# problems and even exaust the server heap. +# (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) +# Adjust the thresholds here if you understand the dangers and want to +# scan more tombstones anyway. These thresholds may also be adjusted at runtime +# using the StorageService mbean. +tombstone_warn_threshold: 1000 +tombstone_failure_threshold: 100000 + +# Log WARN on any multiple-partition batch size exceeding this value. 5kb per batch by default. +# Caution should be taken on increasing the size of this threshold as it can lead to node instability. +batch_size_warn_threshold_in_kb: 5 + +# Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default. 
+batch_size_fail_threshold_in_kb: 50 + +# Log WARN on any batches not of type LOGGED than span across more partitions than this limit +unlogged_batch_across_partitions_warn_threshold: 10 + +# Log a warning when compacting partitions larger than this value +compaction_large_partition_warning_threshold_mb: 100 + +# GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level +# Adjust the threshold based on your application throughput requirement +# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level +gc_warn_threshold_in_ms: 1000 + +# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption +# early. Any value size larger than this threshold will result into marking an SSTable +# as corrupted. This should be positive and less than 2048. +# max_value_size_in_mb: 256 + +# Back-pressure settings # +# If enabled, the coordinator will apply the back-pressure strategy specified below to each mutation +# sent to replicas, with the aim of reducing pressure on overloaded replicas. +back_pressure_enabled: false +# The back-pressure strategy applied. +# The default implementation, RateBasedBackPressure, takes three arguments: +# high ratio, factor, and flow type, and uses the ratio between incoming mutation responses and outgoing mutation requests. +# If below high ratio, outgoing mutations are rate limited according to the incoming rate decreased by the given factor; +# if above high ratio, the rate limiting is increased by the given factor; +# such factor is usually best configured between 1 and 10, use larger values for a faster recovery +# at the expense of potentially more dropped mutations; +# the rate limiting is applied according to the flow type: if FAST, it's rate limited at the speed of the fastest replica, +# if SLOW at the speed of the slowest one. +# New strategies can be added. Implementors need to implement org.apache.cassandra.net.BackpressureStrategy and +# provide a public constructor accepting a Map. +back_pressure_strategy: + - class_name: org.apache.cassandra.net.RateBasedBackPressure + parameters: + - high_ratio: 0.90 + factor: 5 + flow: FAST + +# Coalescing Strategies # +# Coalescing multiples messages turns out to significantly boost message processing throughput (think doubling or more). +# On bare metal, the floor for packet processing throughput is high enough that many applications won't notice, but in +# virtualized environments, the point at which an application can be bound by network packet processing can be +# surprisingly low compared to the throughput of task processing that is possible inside a VM. It's not that bare metal +# doesn't benefit from coalescing messages, it's that the number of packets a bare metal network interface can process +# is sufficient for many applications such that no load starvation is experienced even without coalescing. +# There are other benefits to coalescing network messages that are harder to isolate with a simple metric like messages +# per second. By coalescing multiple tasks together, a network thread can process multiple messages for the cost of one +# trip to read from a socket, and all the task submission work can be done at the same time reducing context switching +# and increasing cache friendliness of network message processing. +# See CASSANDRA-8692 for details. + +# Strategy to use for coalescing messages in OutboundTcpConnection. +# Can be fixed, movingaverage, timehorizon, disabled (default). 
+# You can also specify a subclass of CoalescingStrategies.CoalescingStrategy by name. +# otc_coalescing_strategy: DISABLED + +# How many microseconds to wait for coalescing. For fixed strategy this is the amount of time after the first +# message is received before it will be sent with any accompanying messages. For moving average this is the +# maximum amount of time that will be waited as well as the interval at which messages must arrive on average +# for coalescing to be enabled. +# otc_coalescing_window_us: 200 + +# Do not try to coalesce messages if we already got that many messages. This should be more than 2 and less than 128. +# otc_coalescing_enough_coalesced_messages: 8 + +# How many milliseconds to wait between two expiration runs on the backlog (queue) of the OutboundTcpConnection. +# Expiration is done if messages are piling up in the backlog. Droppable messages are expired to free the memory +# taken by expired messages. The interval should be between 0 and 1000, and in most installations the default value +# will be appropriate. A smaller value could potentially expire messages slightly sooner at the expense of more CPU +# time and queue contention while iterating the backlog of messages. +# An interval of 0 disables any wait time, which is the behavior of former Cassandra versions. +# +# otc_backlog_expiration_interval_ms: 200 diff --git a/docker/conf/deposit.yml b/docker/conf/deposit.yml new file mode 100644 index 0000000..7b0cabf --- /dev/null +++ b/docker/conf/deposit.yml @@ -0,0 +1,17 @@ +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008 + +allowed_hosts: + - swh-deposit + +private: + secret_key: prod-in-docker + db: + host: swh-deposit-db + port: 5432 + name: swh-deposit + user: postgres + password: testpassword + media_root: /tmp/swh-deposit/uploads diff --git a/docker/conf/grafana/dashboards/task-processing.json b/docker/conf/grafana/dashboards/task-processing.json new file mode 100644 index 0000000..a1cc4b1 --- /dev/null +++ b/docker/conf/grafana/dashboards/task-processing.json @@ -0,0 +1,373 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 2, + "id": 18, + "iteration": 1551112370226, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/uncaught/", + "color": "#bf1b00" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(swh_task_called_count{worker=~\"$worker\"}[$interval])) by (task)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + }, + { + "expr": "sum(rate(swh_task_failure_count{worker=~\"$worker\"}[$interval])) by (task)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": 
"{{task}} uncaught exceptions", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task counts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "tasks per second", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#c15c17", + "colorScale": "sqrt", + "colorScheme": "interpolateViridis", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "Prometheus", + "description": "Each square's color represents the number of tasks that completed within that duration range.", + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 9 + }, + "heatmap": {}, + "hideTimeOverride": false, + "highlightCards": true, + "id": 2, + "legend": { + "show": true + }, + "links": [], + "repeat": "task", + "repeatDirection": "v", + "scopedVars": { + "task": { + "selected": true, + "text": "swh.loader.git.tasks.UpdateGitRepository", + "value": "swh.loader.git.tasks.UpdateGitRepository" + } + }, + "targets": [ + { + "expr": "sum(increase(swh_task_duration_seconds_bucket{task=~\"$task\",worker=~\"$worker\"}[$interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "$task durations", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": null, + "transparent": false, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "refresh": false, + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 100, + "auto_min": "2m", + "current": { + "text": "auto", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "2m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "swh.loader.git.tasks.UpdateGitRepository", + "value": 
"swh.loader.git.tasks.UpdateGitRepository" + }, + "datasource": "Prometheus", + "definition": "label_values(swh_task_called_count, task)", + "hide": 0, + "includeAll": true, + "label": "Task name", + "multi": false, + "name": "task", + "options": [], + "query": "label_values(swh_task_called_count, task)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(swh_task_called_count{task=~\"$task\"}, worker)", + "hide": 0, + "includeAll": true, + "label": "Worker", + "multi": true, + "name": "worker", + "options": [], + "query": "label_values(swh_task_called_count{task=~\"$task\"}, worker)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Worker task processing", + "uid": "b_xh3f9ik", + "version": 8 +} diff --git a/docker/conf/grafana/provisioning/dashboards/all.yaml b/docker/conf/grafana/provisioning/dashboards/all.yaml new file mode 100644 index 0000000..aa79647 --- /dev/null +++ b/docker/conf/grafana/provisioning/dashboards/all.yaml @@ -0,0 +1,6 @@ +- name: 'default' # name of this dashboard configuration (not dashboard itself) + org_id: 1 # id of the org to hold the dashboard + folder: '' # name of the folder to put the dashboard (http://docs.grafana.org/v5.0/reference/dashboard_folders/) + type: 'file' # type of dashboard description (json files) + options: + folder: '/var/lib/grafana/dashboards' # where dashboards are diff --git a/docker/conf/grafana/provisioning/datasources/prometheus.yaml b/docker/conf/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 0000000..289aaf1 --- /dev/null +++ b/docker/conf/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,11 @@ +# config file version +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + url: http://localhost:5080/prometheus + access: direct + isDefault: true + version: 1 + editable: false diff --git a/docker/conf/indexer.yml b/docker/conf/indexer.yml new file mode 100644 index 0000000..dd2133b --- /dev/null +++ b/docker/conf/indexer.yml @@ -0,0 +1,31 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ +objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ +indexer_storage: + cls: remote + args: + url: http://swh-idx-storage:5007/ +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ +celery: + task_broker: amqp://guest:guest@amqp// + task_modules: + - swh.indexer.tasks + task_queues: + - swh.indexer.tasks.ContentFossologyLicense + - swh.indexer.tasks.ContentLanguage + - swh.indexer.tasks.ContentMimetype + - swh.indexer.tasks.ContentRangeFossologyLicense + - swh.indexer.tasks.ContentRangeMimetype + - swh.indexer.tasks.Ctags + - swh.indexer.tasks.OriginHead + - swh.indexer.tasks.OriginMetadata + - swh.indexer.tasks.RecomputeChecksums + - swh.indexer.tasks.RevisionMetadata diff --git a/docker/conf/indexer_journal_client.yml 
b/docker/conf/indexer_journal_client.yml new file mode 100644 index 0000000..91877c9 --- /dev/null +++ b/docker/conf/indexer_journal_client.yml @@ -0,0 +1,11 @@ +journal: + brokers: + - kafka + group_id: swh.indexer.journal_client + +max_messages: 50 + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ diff --git a/docker/conf/indexer_storage.yml b/docker/conf/indexer_storage.yml new file mode 100644 index 0000000..a5eb85c --- /dev/null +++ b/docker/conf/indexer_storage.yml @@ -0,0 +1,4 @@ +indexer_storage: + cls: local + args: + db: postgresql:///?service=swh-indexers diff --git a/docker/conf/journal_backfiller.yml b/docker/conf/journal_backfiller.yml new file mode 100644 index 0000000..268b752 --- /dev/null +++ b/docker/conf/journal_backfiller.yml @@ -0,0 +1,9 @@ +brokers: + - kafka + +final_prefix: swh.journal.objects +client_id: swh.journal.backfiller +object_types: + - content + +storage_dbconn: postgresql:///?service=swh-storage diff --git a/docker/conf/lister.yml b/docker/conf/lister.yml new file mode 100644 index 0000000..0c326f1 --- /dev/null +++ b/docker/conf/lister.yml @@ -0,0 +1,60 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +lister: + cls: local + args: + db: postgresql://postgres@swh-listers-db/swh-listers + +celery: + task_broker: amqp://guest:guest@amqp// + task_modules: + - swh.lister.bitbucket.tasks + - swh.lister.cgit.tasks + - swh.lister.cran.tasks + - swh.lister.debian.tasks + - swh.lister.github.tasks + - swh.lister.gitlab.tasks + - swh.lister.gnu.tasks + - swh.lister.npm.tasks + - swh.lister.packagist.tasks + - swh.lister.phabricator.tasks + - swh.lister.pypi.tasks + task_queues: + - swh.lister.bitbucket.tasks.FullBitBucketRelister + - swh.lister.bitbucket.tasks.IncrementalBitBucketLister + - swh.lister.bitbucket.tasks.RangeBitBucketLister + - swh.lister.bitbucket.tasks.ping + - swh.lister.cgit.tasks.CGitListerTask + - swh.lister.cgit.tasks.ping + - swh.lister.cran.tasks.CRANListerTask + - swh.lister.cran.tasks.ping + - swh.lister.debian.tasks.DebianListerTask + - swh.lister.debian.tasks.ping + - swh.lister.github.tasks.FullGitHubRelister + - swh.lister.github.tasks.IncrementalGitHubLister + - swh.lister.github.tasks.RangeGitHubLister + - swh.lister.github.tasks.ping + - swh.lister.gitlab.tasks.FullGitLabRelister + - swh.lister.gitlab.tasks.IncrementalGitLabLister + - swh.lister.gitlab.tasks.RangeGitLabLister + - swh.lister.gitlab.tasks.ping + - swh.lister.gnu.tasks.GNUListerTask + - swh.lister.gnu.tasks.ping + - swh.lister.npm.tasks.NpmIncrementalListerTask + - swh.lister.npm.tasks.NpmListerTask + - swh.lister.npm.tasks.ping + - swh.lister.packagist.tasks.PackagistListerTask + - swh.lister.packagist.tasks.ping + - swh.lister.phabricator.tasks.FullPhabricatorLister + - swh.lister.phabricator.tasks.IncrementalPhabricatorLister + - swh.lister.phabricator.tasks.ping + - swh.lister.pypi.tasks.PyPIListerTask + - swh.lister.pypi.tasks.ping diff --git a/docker/conf/loader.yml b/docker/conf/loader.yml new file mode 100644 index 0000000..debcc93 --- /dev/null +++ b/docker/conf/loader.yml @@ -0,0 +1,50 @@ +storage: + cls: filter + args: + storage: + cls: buffer + args: + storage: + cls: remote + args: + url: http://swh-storage:5002/ + min_batch_size: + content: 10000 + content_bytes: 104857600 + directory: 1000 + revision: 1000 + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +celery: + task_broker: amqp://guest:guest@amqp// + 
task_modules: + - swh.loader.git.tasks + - swh.loader.mercurial.tasks + - swh.loader.svn.tasks + - swh.deposit.loader.tasks + - swh.loader.package.tasks + + task_queues: + - swh.loader.dir.tasks.LoadDirRepository + - swh.loader.git.tasks.LoadDiskGitRepository + - swh.loader.git.tasks.UncompressAndLoadDiskGitRepository + - swh.loader.git.tasks.UpdateGitRepository + - swh.loader.mercurial.tasks.LoadArchiveMercurial + - swh.loader.mercurial.tasks.LoadMercurial + - swh.loader.package.tasks.LoadArchive + - swh.loader.package.tasks.LoadDebian + - swh.loader.package.tasks.LoadNpm + - swh.loader.package.tasks.LoadPyPI + - swh.loader.svn.tasks.DumpMountAndLoadSvnRepository + - swh.loader.svn.tasks.LoadSvnRepository + - swh.loader.svn.tasks.MountAndLoadSvnRepository + - swh.deposit.loader.tasks.LoadDepositArchiveTsk + - swh.deposit.loader.tasks.ChecksDepositTsk + +lister_db_url: postgresql://postgres@swh-listers-db/swh-listers + +url: 'http://swh-deposit:5006' diff --git a/docker/conf/nginx.conf b/docker/conf/nginx.conf new file mode 100644 index 0000000..a0774ad --- /dev/null +++ b/docker/conf/nginx.conf @@ -0,0 +1,107 @@ +worker_processes 1; + +# Show startup logs on stderr; switch to debug to print, well, debug logs when +# running nginx-debug +error_log /dev/stderr info; + +events { + worker_connections 1024; +} + +http { + include mime.types; + default_type application/octet-stream; + sendfile on; + keepalive_timeout 65; + + # Built-in Docker resolver. Needed to allow on-demand resolution of proxy + # upstreams. + resolver 127.0.0.11 valid=30s; + + server { + listen 5080 default_server; + + # Add a trailing slash to top level requests (e.g. http://localhost:5080/flower) + + rewrite ^/([^/]+)$ /$1/ permanent; + + # In this pile of proxies, all upstreams are set using a variable. This + # makes nginx DNS-resolve the name of the upstream when clients request + # them, rather than on start. This avoids an unstarted container preventing + # nginx from starting. + # + # Variables need to be set as early as possible, as they're statements from + # the rewrite module and `rewrite [...] break;` will prevent these + # statements from being executed. 
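+ # As an illustration of that pattern (a sketch only, using a hypothetical
+ # swh-example service on port 5999; the real location blocks below follow
+ # the same shape):
+ #
+ #   location /example/ {
+ #       set $upstream "http://swh-example:5999";
+ #       rewrite ^/example/(.*)$ /$1 break;
+ #       proxy_pass $upstream;
+ #   }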
+ + location /flower/ { + set $upstream "http://flower:5555"; + + rewrite ^/flower/(.*)$ /$1 break; + proxy_pass $upstream; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + proxy_redirect off; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } + location /rabbitmq/ { + set $upstream "http://amqp:15672"; + + rewrite ^ $request_uri; + rewrite ^/rabbitmq(/.*)$ $1 break; + + proxy_pass $upstream$uri; + } + location /scheduler { + set $upstream "http://swh-scheduler-api:5008"; + + rewrite ^/scheduler/(.*)$ /$1 break; + proxy_pass $upstream; + } + location /storage { + set $upstream "http://swh-storage:5002"; + + rewrite ^/storage/(.*)$ /$1 break; + proxy_pass $upstream; + } + location /indexer-storage { + set $upstream "http://swh-idx-storage:5007"; + + rewrite ^/indexer-storage/(.*)$ /$1 break; + + proxy_pass $upstream; + } + location /deposit { + set $upstream "http://swh-deposit:5006"; + + rewrite ^/deposit/(.*)$ /$1 break; + proxy_pass $upstream; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + proxy_redirect off; + } + location /objstorage { + set $upstream "http://swh-objstorage:5003"; + + rewrite ^/objstorage/(.*)$ /$1 break; + proxy_pass $upstream; + } + location /prometheus { + set $upstream "http://prometheus:9090"; + proxy_pass $upstream; + } + location /grafana { + set $upstream "http://grafana:3000"; + rewrite ^/grafana/(.*)$ /$1 break; + proxy_pass $upstream; + } + location / { + set $upstream "http://swh-web:5004"; + proxy_pass $upstream; + } + } +} diff --git a/docker/conf/objstorage.yml b/docker/conf/objstorage.yml new file mode 100644 index 0000000..dde0323 --- /dev/null +++ b/docker/conf/objstorage.yml @@ -0,0 +1,7 @@ +objstorage: + cls: pathslicing + args: + root: /srv/softwareheritage/objects + slicing: 0:5 + +client_max_size: 1073741824 diff --git a/docker/conf/prometheus-jmx-exporter-cassandra.yml b/docker/conf/prometheus-jmx-exporter-cassandra.yml new file mode 100644 index 0000000..7c256d9 --- /dev/null +++ b/docker/conf/prometheus-jmx-exporter-cassandra.yml @@ -0,0 +1,42 @@ +# see: +# - http://cassandra.apache.org/doc/latest/operating/metrics.html +# - https://blog.pythian.com/step-step-monitoring-cassandra-prometheus-grafana/ + +startDelaySeconds: 0 +hostPort: cassandra-seed:7199 +username: +password: +#jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:1234/jmxrmi +ssl: false +lowercaseOutputName: false +lowercaseOutputLabelNames: false +whitelistObjectNames: ["org.apache.cassandra.metrics:*"] +blacklistObjectNames: [] +rules: +- pattern: org.apache.cassandra.metrics<>(Count|Value) + name: cassandra_$1_$3 + labels: + address: "$2" +- pattern: org.apache.cassandra.metrics<>(Mean) + name: cassandra_$1_$2_$3 +- pattern: org.apache.cassandra.net<>(DownEndpointCount) + name: cassandra_$1_$2 +- pattern: org.apache.cassandra.metrics<>(Count|Mean|95thPercentile) + name: cassandra_$1_$3_$4 + labels: + "$1": "$2" +- pattern: org.apache.cassandra.metrics<>(Count|Mean|95thPercentile) + name: cassandra_$1_$4_$5 + labels: + "keyspace": "$2" + "table": "$3" +- pattern: org.apache.cassandra.metrics<>(Count|Mean|95thPercentile) + name: cassandra_$1_$3_$4 + labels: + "type": "$2" +- pattern: org.apache.cassandra.metrics<>(Count|Value) + name: cassandra_$1_$5 + labels: + "$1": "$4" + "$2": "$3" diff --git a/docker/conf/prometheus-jmx-exporter-logging.properties b/docker/conf/prometheus-jmx-exporter-logging.properties new file mode 100644 index 0000000..0f4c31f --- 
/dev/null +++ b/docker/conf/prometheus-jmx-exporter-logging.properties @@ -0,0 +1,6 @@ +handlers=java.util.logging.ConsoleHandler +# uncomment this to get logs: +#java.util.logging.ConsoleHandler.level=ALL +#io.prometheus.jmx.level=ALL +#io.prometheus.jmx.shaded.io.prometheus.jmx.level=ALL + diff --git a/docker/conf/prometheus-statsd-mapping.yml b/docker/conf/prometheus-statsd-mapping.yml new file mode 100644 index 0000000..a994106 --- /dev/null +++ b/docker/conf/prometheus-statsd-mapping.yml @@ -0,0 +1,27 @@ +defaults: + timer_type: histogram + buckets: + - .005 + - .01 + - .025 + - .05 + - .1 + - .25 + - .5 + - .75 + - 1 + - 2 + - 5 + - 10 + - 15 + - 30 + - 45 + - 60 + - 120 + - 300 + - 600 + - 900 + - 1800 + - 2700 + - 3600 + - 7200 diff --git a/docker/conf/prometheus.yml b/docker/conf/prometheus.yml new file mode 100644 index 0000000..f342c98 --- /dev/null +++ b/docker/conf/prometheus.yml @@ -0,0 +1,22 @@ +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + metrics_path: /prometheus/metrics + + - job_name: statsd-exporter + static_configs: + - targets: + - prometheus-statsd-exporter:9102 + + - job_name: jmx-exporter-cassandra + static_configs: + - targets: + - prometheus-jmx-exporter-cassandra:5556 diff --git a/docker/conf/scheduler.yml b/docker/conf/scheduler.yml new file mode 100644 index 0000000..af1b3f5 --- /dev/null +++ b/docker/conf/scheduler.yml @@ -0,0 +1,8 @@ +scheduler: + cls: local + args: + db: postgresql:///?service=swh-scheduler +celery: + task_broker: amqp://guest:guest@amqp// + broker_transport_options: + max_retries: 1 diff --git a/docker/conf/search.yml b/docker/conf/search.yml new file mode 100644 index 0000000..883f6f2 --- /dev/null +++ b/docker/conf/search.yml @@ -0,0 +1,5 @@ +search: + cls: elasticsearch + args: + hosts: + - elasticsearch:9200 diff --git a/docker/conf/search_journal_client_objects.yml b/docker/conf/search_journal_client_objects.yml new file mode 100644 index 0000000..4248935 --- /dev/null +++ b/docker/conf/search_journal_client_objects.yml @@ -0,0 +1,10 @@ +search: + cls: remote + args: + url: http://swh-search:5010/ +journal: + brokers: + - kafka + group_id: swh.search.journal_client.objects + prefix: swh.journal.objects + diff --git a/docker/conf/storage-replica.yml b/docker/conf/storage-replica.yml new file mode 100644 index 0000000..2412282 --- /dev/null +++ b/docker/conf/storage-replica.yml @@ -0,0 +1,8 @@ +storage: + cls: local + args: + db: postgresql:///?service=swh-storage-replica + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ diff --git a/docker/conf/storage.yml b/docker/conf/storage.yml new file mode 100644 index 0000000..a27b3be --- /dev/null +++ b/docker/conf/storage.yml @@ -0,0 +1,15 @@ +storage: + cls: local + args: + db: postgresql:///?service=swh-storage + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + journal_writer: + cls: kafka + args: + brokers: + - kafka + prefix: swh.journal.objects + client_id: swh.storage.master diff --git a/docker/conf/storage_cassandra.yml b/docker/conf/storage_cassandra.yml new file mode 100644 index 0000000..a4d1ec4 --- /dev/null +++ b/docker/conf/storage_cassandra.yml @@ -0,0 +1,11 @@ +storage: + cls: cassandra + args: + hosts: + - 
cassandra-seed + keyspace: swh + objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + diff --git a/docker/conf/vault-api.yml b/docker/conf/vault-api.yml new file mode 100644 index 0000000..b3ec6a3 --- /dev/null +++ b/docker/conf/vault-api.yml @@ -0,0 +1,17 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ +vault: + cls: local + args: + db: postgresql:///?service=swh-vault +cache: + cls: pathslicing + args: + root: /srv/softwareheritage/vault + slicing: 0:5 diff --git a/docker/conf/vault-worker.yml b/docker/conf/vault-worker.yml new file mode 100644 index 0000000..8a195ac --- /dev/null +++ b/docker/conf/vault-worker.yml @@ -0,0 +1,17 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ +vault: + cls: remote + args: + url: http://swh-vault-api:5005/ +celery: + task_broker: amqp://guest:guest@amqp// + task_modules: + - swh.vault.cooking_tasks + task_queues: + - swh.vault.cooking_tasks.SWHBatchCookingTask + - swh.vault.cooking_tasks.SWHCookingTask + +max_bundle_size: 536870912 diff --git a/docker/conf/web-replica.yml b/docker/conf/web-replica.yml new file mode 100644 index 0000000..06ba565 --- /dev/null +++ b/docker/conf/web-replica.yml @@ -0,0 +1,37 @@ +storage: + cls: remote + args: + url: http://swh-storage-replica:5002/ + timeout: 1 + +objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + +indexer_storage: + cls: remote + args: + url: http://swh-idx-storage:5007/ + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +vault: + cls: remote + args: + url: http://swh-vault-api:5005/ + +deposit: + private_api_url: https://swh-deposit:5006/1/private/ + private_api_user: swhworker + private_api_password: '' + +allowed_hosts: + - "*" + +debug: yes + +serve_assets: yes diff --git a/docker/conf/web.yml b/docker/conf/web.yml new file mode 100644 index 0000000..23eb3f8 --- /dev/null +++ b/docker/conf/web.yml @@ -0,0 +1,62 @@ +storage: + cls: remote + args: + url: http://swh-storage:5002/ + timeout: 1 + +objstorage: + cls: remote + args: + url: http://swh-objstorage:5003/ + +indexer_storage: + cls: remote + args: + url: http://swh-idx-storage:5007/ + +scheduler: + cls: remote + args: + url: http://swh-scheduler-api:5008/ + +vault: + cls: remote + args: + url: http://swh-vault-api:5005/ + +deposit: + private_api_url: https://swh-deposit:5006/1/private/ + private_api_user: swhworker + private_api_password: '' + +allowed_hosts: + - "*" + +debug: yes + +serve_assets: yes + +development_db: /tmp/db.sqlite3 + +throttling: + scopes: + swh_api: + limiter_rate: + default: 120/h + exempted_networks: + - 0.0.0.0/0 + swh_api_origin_visit_latest: + limiter_rate: + default: 700/m + exempted_networks: + - 0.0.0.0/0 + swh_vault_cooking: + limiter_rate: + default: 120/h + exempted_networks: + - 0.0.0.0/0 + swh_save_origin: + limiter_rate: + default: 120/h + exempted_networks: + - 0.0.0.0/0 diff --git a/docker/docker-compose.cassandra.yml b/docker/docker-compose.cassandra.yml new file mode 100644 index 0000000..ae843a4 --- /dev/null +++ b/docker/docker-compose.cassandra.yml @@ -0,0 +1,61 @@ +version: '2' + +services: + cassandra-seed: + # This container starts a Cassandra instance that must be used as the + # contact-point for clients. This container will then make the client + # discover other cassandra containers. + # This container must not be scaled up; scale up th 'cassandra' + # container instead. 
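+ # For example (a usage sketch, assuming the docker-compose.yml and
+ # docker-compose.cassandra.yml files shipped in this directory; the
+ # non-seed 'cassandra' service defined below is the one to scale):
+ #
+ #   docker-compose -f docker-compose.yml -f docker-compose.cassandra.yml \
+ #       up -d --scale cassandra=2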
+ image: cassandra + env_file: + - ./env/cassandra.env + entrypoint: /swh_entrypoint.sh + volumes: + - "./services/cassandra/swh_entrypoint.sh:/swh_entrypoint.sh:ro" + - "./conf/cassandra.yaml:/cassandra.yaml:ro" + + cassandra: + # Additional Cassandra instance(s), which may be scaled up, but not + # down. They will automatically connect to 'cassandra-seed', and + # 'cassandra-seed' will tell clients to connect to these 'cassandra' + # containers to load-balance. + image: cassandra + entrypoint: /swh_entrypoint.sh + volumes: + - "./services/cassandra/swh_entrypoint.sh:/swh_entrypoint.sh:ro" + - "./conf/cassandra.yaml:/cassandra.yaml:ro" + env_file: + - ./env/cassandra.env + + prometheus: + # just to add the dep on the cassandra-jmx-exporter-cassandra + depends_on: + - prometheus-statsd-exporter + - prometheus-jmx-exporter-cassandra + + prometheus-jmx-exporter-cassandra: + image: sscaling/jmx-prometheus-exporter + environment: + JVM_OPTS: "-Djava.util.logging.config.file=/logging.properties" + volumes: + - "./conf/prometheus-jmx-exporter-cassandra.yml:/opt/jmx_exporter/config.yml:ro" + - "./conf/prometheus-jmx-exporter-logging.properties:/logging.properties:ro" + ports: + - "5556:5556" + + swh-storage: + volumes: + # note: you need to be on the cassandra-backend2 branch + - "../swh-storage:/src/swh-storage" + - "./conf/storage_cassandra.yml:/storage.yml:ro" + - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" + depends_on: + - swh-storage-db + - cassandra-seed + - swh-objstorage + - kafka + environment: + CASSANDRA_SEED: cassandra-seed + STORAGE_BACKEND: cassandra + PYTHONUNBUFFERED: 1 diff --git a/docker/docker-compose.override.yml.example b/docker/docker-compose.override.yml.example new file mode 100644 index 0000000..5221796 --- /dev/null +++ b/docker/docker-compose.override.yml.example @@ -0,0 +1,6 @@ +version: '2' + +services: + swh-objstorage: + volumes: + - "/home/ddouard/src/swh-environment/swh-objstorage:/src/swh-objstorage" diff --git a/docker/docker-compose.search.yml b/docker/docker-compose.search.yml new file mode 100644 index 0000000..091bd12 --- /dev/null +++ b/docker/docker-compose.search.yml @@ -0,0 +1,39 @@ +version: '2' + +services: + elasticsearch: + image: elasticsearch:7.3.2 + env_file: + - ./env/elasticsearch.env + ports: + - 9200:9200 + volumes: + - elasticsearch-data:/usr/share/elasticsearch/data + + swh-search: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + ports: + - 5010:5010 + depends_on: + - elasticsearch + environment: + SWH_CONFIG_FILENAME: /search.yml + volumes: + - "./conf/search.yml:/search.yml:ro" + - "./services/swh-search/entrypoint.sh:/entrypoint.sh:ro" + + swh-search-journal-client-objects: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + depends_on: + - swh-search + - kafka + volumes: + - "./conf/search_journal_client_objects.yml:/etc/softwareheritage/search/journal_client_objects.yml:ro" + - "./services/swh-search-journal-client-objects/entrypoint.sh:/entrypoint.sh:ro" + +volumes: + elasticsearch-data: diff --git a/docker/docker-compose.storage-replica.yml b/docker/docker-compose.storage-replica.yml new file mode 100644 index 0000000..2824615 --- /dev/null +++ b/docker/docker-compose.storage-replica.yml @@ -0,0 +1,66 @@ +version: '2' + +services: + # override web app to use the replica + swh-web: + environment: + SWH_CONFIG_FILENAME: /web-replica.yml + volumes: + - "./conf/web-replica.yml:/web-replica.yml:ro" + + # create a dedicated db for the replica + swh-storage-replica-db: + image: postgres:11 + env_file: + - 
./env/storage-db-replica.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + # and an RPC server + swh-storage-replica: + image: swh/stack + build: ./ + depends_on: + - swh-storage-replica-db + - swh-objstorage + env_file: + - ./env/storage-db-replica.env + environment: + SWH_CONFIG_FILENAME: /storage-replica.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/storage-replica.yml:/storage-replica.yml:ro" + - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" + + # and the background process that keeps the replica in sync with the + # main graph + swh-storage-replica-replayer: + image: swh/stack + build: ./ + depends_on: + - swh-storage-replica-db + - swh-objstorage + env_file: + - ./env/storage-db-replica.env + environment: + SWH_CONFIG_FILENAME: /storage-replica.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/storage-replica.yml:/storage-replica.yml:ro" + - "./services/swh-storage-replayer/entrypoint.sh:/entrypoint.sh:ro" + + swh-journal-backfiller: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + environment: + SWH_CONFIG_FILENAME: /journal_backfiller.yml + env_file: + - ./env/storage-db.env + depends_on: + - swh-storage-db + - kafka + volumes: + - "./conf/journal_backfiller.yml:/journal_backfiller.yml:ro" + - "./services/swh-journal-backfiller/entrypoint.sh:/entrypoint.sh:ro" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..44da7d2 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,390 @@ +version: '2' + +services: + + amqp: + image: rabbitmq:3.6-management + ports: + - 5072:5672 + +# flower: +# image: mher/flower +# command: --broker=amqp://guest:guest@amqp:5672// --url_prefix=flower +# ports: +# - 5055:5555 +# depends_on: +# - amqp + + zookeeper: + image: wurstmeister/zookeeper + restart: always + + kafka: + image: wurstmeister/kafka + ports: + - "5092:9092" + env_file: ./env/kafka.env + depends_on: + - zookeeper + + kafka-manager: + image: hlebalbau/kafka-manager:stable + ports: + - "5093:9000" + environment: + ZK_HOSTS: zookeeper:2181 + APPLICATION_SECRET: random-secret + command: -Dpidfile.path=/dev/null + + prometheus: + image: prom/prometheus + depends_on: + - prometheus-statsd-exporter + command: + # Needed for the reverse-proxy + - "--web.external-url=/prometheus" + - "--config.file=/etc/prometheus/prometheus.yml" + volumes: + - "./conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro" + restart: unless-stopped + + prometheus-statsd-exporter: + image: prom/statsd-exporter + command: + - "--statsd.mapping-config=/etc/prometheus/statsd-mapping.yml" + volumes: + - "./conf/prometheus-statsd-mapping.yml:/etc/prometheus/statsd-mapping.yml:ro" + restart: unless-stopped + + grafana: + image: grafana/grafana + restart: unless-stopped + depends_on: + - prometheus + environment: + GF_SERVER_ROOT_URL: http://localhost:5080/grafana + volumes: + - "./conf/grafana/provisioning:/etc/grafana/provisioning:ro" + - "./conf/grafana/dashboards:/var/lib/grafana/dashboards" + + nginx: + image: nginx + volumes: + - "./conf/nginx.conf:/etc/nginx/nginx.conf:ro" + ports: + - 5080:5080 + +# Scheduler + + swh-scheduler-db: + image: postgres:11 + env_file: + - ./env/scheduler-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-scheduler-api: + image: swh/stack + build: ./ + env_file: + - ./env/scheduler-db.env + - ./env/scheduler.env + environment: + SWH_CONFIG_FILENAME: /scheduler.yml + SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml + entrypoint: 
/entrypoint.sh + depends_on: + - swh-scheduler-db + ports: + - 5008:5008 + volumes: + - "./conf/scheduler.yml:/scheduler.yml:ro" + - "./services/swh-scheduler-api/entrypoint.sh:/entrypoint.sh:ro" + + swh-scheduler-listener: + image: swh/stack + build: ./ + env_file: + - ./env/scheduler-db.env + - ./env/scheduler.env + environment: + SWH_CONFIG_FILENAME: /scheduler.yml + SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml + entrypoint: /entrypoint.sh + command: start-listener + depends_on: + - swh-scheduler-api + - amqp + volumes: + - "./conf/scheduler.yml:/scheduler.yml:ro" + - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" + + swh-scheduler-runner: + image: swh/stack + build: ./ + env_file: + - ./env/scheduler-db.env + - ./env/scheduler.env + environment: + SWH_CONFIG_FILENAME: /scheduler.yml + SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml + entrypoint: /entrypoint.sh + command: start-runner -p 10 + depends_on: + - swh-scheduler-api + - amqp + volumes: + - "./conf/scheduler.yml:/scheduler.yml:ro" + - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Graph storage + + swh-storage-db: + image: postgres:11 + env_file: + - ./env/storage-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-storage: + image: swh/stack + build: ./ + ports: + - 5002:5002 + depends_on: + - swh-storage-db + - swh-objstorage + - kafka + env_file: + - ./env/storage-db.env + environment: + SWH_CONFIG_FILENAME: /storage.yml + STORAGE_BACKEND: postgresql + entrypoint: /entrypoint.sh + volumes: + - "./conf/storage.yml:/storage.yml:ro" + - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" + +# Object storage + + swh-objstorage: + build: ./ + image: swh/stack + ports: + - 5003:5003 + environment: + SWH_CONFIG_FILENAME: /objstorage.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/objstorage.yml:/objstorage.yml:ro" + - "./services/swh-objstorage/entrypoint.sh:/entrypoint.sh:ro" + +# Indexer storage + + swh-idx-storage-db: + image: postgres:11 + env_file: + - ./env/indexers-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-idx-storage: + image: swh/stack + build: ./ + ports: + - 5007:5007 + depends_on: + - swh-idx-storage-db + env_file: + - ./env/indexers-db.env + environment: + SWH_CONFIG_FILENAME: /indexer_storage.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/indexer_storage.yml:/indexer_storage.yml:ro" + - "./services/swh-indexer-storage/entrypoint.sh:/entrypoint.sh:ro" + +# Web interface + + swh-web: + build: ./ + image: swh/stack + ports: + - 5004:5004 + depends_on: + - swh-objstorage + - swh-storage + - swh-idx-storage + environment: + VERBOSITY: 3 + DJANGO_SETTINGS_MODULE: swh.web.settings.development + SWH_CONFIG_FILENAME: /web.yml + entrypoint: /entrypoint.sh + volumes: + - "./conf/web.yml:/web.yml:ro" + - "./services/swh-web/entrypoint.sh:/entrypoint.sh:ro" + + swh-deposit-db: + image: postgres:11 + env_file: + - ./env/deposit-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-deposit: + image: swh/stack + build: ./ + ports: + - 5006:5006 + depends_on: + - swh-deposit-db + - swh-scheduler-api + environment: + VERBOSITY: 3 + SWH_CONFIG_FILENAME: /deposit.yml + DJANGO_SETTINGS_MODULE: swh.deposit.settings.production + env_file: + - ./env/deposit-db.env + entrypoint: /entrypoint.sh + volumes: + - "./conf/deposit.yml:/deposit.yml:ro" + - "./services/swh-deposit/entrypoint.sh:/entrypoint.sh:ro" + + swh-vault-db: + image: postgres:11 + env_file: + - 
./env/vault-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-vault-api: + image: swh/stack + build: ./ + env_file: + - ./env/vault-db.env + environment: + SWH_CONFIG_FILENAME: /vault-api.yml + command: server + ports: + - 5005:5005 + depends_on: + - swh-vault-db + - swh-objstorage + - swh-storage + - swh-scheduler-api + entrypoint: /entrypoint.sh + volumes: + - "./conf/vault-api.yml:/vault-api.yml:ro" + - "./services/swh-vault/entrypoint.sh:/entrypoint.sh:ro" + + swh-vault-worker: + image: swh/stack + build: ./ + command: worker + environment: + SWH_CONFIG_FILENAME: /cooker.yml + depends_on: + - swh-vault-api + - swh-storage + entrypoint: /entrypoint.sh + volumes: + - "./conf/vault-worker.yml:/cooker.yml:ro" + - "./services/swh-vault/entrypoint.sh:/entrypoint.sh:ro" + + +# Lister Celery workers + + swh-listers-db: + image: postgres:11 + env_file: + - ./env/listers-db.env + environment: + # unset PGHOST as db service crashes otherwise + PGHOST: + + swh-lister: + image: swh/stack + build: ./ + env_file: + - ./env/listers-db.env + - ./env/listers.env + user: swh + environment: + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: 9125 + SWH_WORKER_INSTANCE: listers + SWH_CONFIG_FILENAME: /lister.yml + depends_on: + - swh-listers-db + - swh-scheduler-api + - swh-scheduler-runner + - swh-storage + - amqp + entrypoint: /entrypoint.sh + volumes: + - "./conf/lister.yml:/lister.yml:ro" + - "./services/swh-listers-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Loader Celery workers + + swh-loader: + image: swh/stack + build: ./ + env_file: + - ./env/listers.env + user: swh + environment: + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: 9125 + SWH_WORKER_INSTANCE: loader + SWH_CONFIG_FILENAME: /loader.yml + entrypoint: /entrypoint.sh + depends_on: + - swh-storage + - swh-scheduler-api + - amqp + volumes: + - "./conf/loader.yml:/loader.yml:ro" + - "./services/swh-loaders-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Indexer Celery workers + + swh-indexer: + image: swh/stack + build: ./ + user: swh + env_file: + - ./env/indexers-db.env + - ./env/indexers.env + environment: + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: 9125 + entrypoint: /entrypoint.sh + depends_on: + - swh-scheduler-runner + - swh-idx-storage + - swh-storage + - swh-objstorage + - amqp + volumes: + - "./conf/indexer.yml:/indexer.yml:ro" + - "./services/swh-indexer-worker/entrypoint.sh:/entrypoint.sh:ro" + +# Journal related + + swh-indexer-journal-client: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + depends_on: + - kafka + - swh-storage + - swh-scheduler-api + volumes: + - "./conf/indexer_journal_client.yml:/etc/softwareheritage/indexer/journal_client.yml:ro" + - "./services/swh-indexer-journal-client/entrypoint.sh:/entrypoint.sh:ro" diff --git a/docker/docs/Makefile b/docker/docs/Makefile new file mode 100644 index 0000000..c30c50a --- /dev/null +++ b/docker/docs/Makefile @@ -0,0 +1 @@ +include ../../swh-docs/Makefile.sphinx diff --git a/docker/env/cassandra.env b/docker/env/cassandra.env new file mode 100644 index 0000000..b04b233 --- /dev/null +++ b/docker/env/cassandra.env @@ -0,0 +1,5 @@ +MAX_HEAP_SIZE=1G +HEAP_NEWSIZE=100M +LOCAL_JMX=no +JVM_EXTRA_OPTS=-Dcom.sun.management.jmxremote.authenticate=false + diff --git a/docker/env/deposit-db.env b/docker/env/deposit-db.env new file mode 100644 index 0000000..0a4973d --- /dev/null +++ b/docker/env/deposit-db.env @@ -0,0 +1,5 @@ +PGHOST=swh-deposit-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword 
+POSTGRES_DB=swh-deposit + diff --git a/docker/env/elasticsearch.env b/docker/env/elasticsearch.env new file mode 100644 index 0000000..3392cf9 --- /dev/null +++ b/docker/env/elasticsearch.env @@ -0,0 +1 @@ +discovery.type=single-node diff --git a/docker/env/indexers-db.env b/docker/env/indexers-db.env new file mode 100644 index 0000000..c748508 --- /dev/null +++ b/docker/env/indexers-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-idx-storage-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-indexers \ No newline at end of file diff --git a/docker/env/indexers.env b/docker/env/indexers.env new file mode 100644 index 0000000..5db2f4b --- /dev/null +++ b/docker/env/indexers.env @@ -0,0 +1,5 @@ +CONCURRENCY=4 +MAX_TASKS_PER_CHILD=10 +LOGLEVEL=DEBUG +SWH_WORKER_INSTANCE=indexer +SWH_CONFIG_FILENAME=/indexer.yml diff --git a/docker/env/kafka.env b/docker/env/kafka.env new file mode 100644 index 0000000..1bc0a97 --- /dev/null +++ b/docker/env/kafka.env @@ -0,0 +1,9 @@ +KAFKA_ADVERTISED_HOST_NAME=kafka +KAFKA_ADVERTISED_PORT=9092 +KAFKA_PORT=9092 +KAFKA_LISTENERS=PLAINTEXT://:9092 +KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 +KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 +KAFKA_JMX_OPTS=-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=kafka -Dcom.sun.management.jmxremote.rmi.port=1099 +JMX_PORT=1099 +LOG4J_LOGGER_KAFKA_AUTHORIZER_LOGGER=DEBUG, authorizerAppender diff --git a/docker/env/listers-db.env b/docker/env/listers-db.env new file mode 100644 index 0000000..5146abc --- /dev/null +++ b/docker/env/listers-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-listers-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-listers diff --git a/docker/env/listers.env b/docker/env/listers.env new file mode 100644 index 0000000..0f02e61 --- /dev/null +++ b/docker/env/listers.env @@ -0,0 +1,3 @@ +CONCURRENCY=1 +MAX_TASKS_PER_CHILD=10 +LOGLEVEL=DEBUG diff --git a/docker/env/scheduler-db.env b/docker/env/scheduler-db.env new file mode 100644 index 0000000..8e1de99 --- /dev/null +++ b/docker/env/scheduler-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-scheduler-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-scheduler diff --git a/docker/env/scheduler.env b/docker/env/scheduler.env new file mode 100644 index 0000000..8650403 --- /dev/null +++ b/docker/env/scheduler.env @@ -0,0 +1,3 @@ +SWH_WORKER_INSTANCE=scheduler +LOGLEVEL=INFO +CELERY_BROKER_URL=amqp://amqp// diff --git a/docker/env/storage-db-replica.env b/docker/env/storage-db-replica.env new file mode 100644 index 0000000..c713f3a --- /dev/null +++ b/docker/env/storage-db-replica.env @@ -0,0 +1,4 @@ +PGHOST=swh-storage-replica-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-storage-replica diff --git a/docker/env/storage-db.env b/docker/env/storage-db.env new file mode 100644 index 0000000..65da09b --- /dev/null +++ b/docker/env/storage-db.env @@ -0,0 +1,4 @@ +PGHOST=swh-storage-db +PGUSER=postgres +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-storage diff --git a/docker/env/storage.env b/docker/env/storage.env new file mode 100644 index 0000000..f802fd4 --- /dev/null +++ b/docker/env/storage.env @@ -0,0 +1,5 @@ +POSTGRES_PASSWORD=testpassword +POSTGRES_DB=swh-storage +PGHOST=swh-storage-db +PGUSER=postgres +SWH_CONFIG_FILENAME=/storage.yml \ No newline at end of file diff --git a/docker/env/vault-db.env b/docker/env/vault-db.env new file mode 100644 index 0000000..2adadcf --- /dev/null +++ 
b/docker/env/vault-db.env @@ -0,0 +1,4 @@ +POSTGRES_DB=swh-vault +POSTGRES_PASSWORD=testpassword +PGUSER=postgres +PGHOST=swh-vault-db diff --git a/docker/env/vault.env b/docker/env/vault.env new file mode 100644 index 0000000..2adadcf --- /dev/null +++ b/docker/env/vault.env @@ -0,0 +1,4 @@ +POSTGRES_DB=swh-vault +POSTGRES_PASSWORD=testpassword +PGUSER=postgres +PGHOST=swh-vault-db diff --git a/docker/services/cassandra/swh_entrypoint.sh b/docker/services/cassandra/swh_entrypoint.sh new file mode 100755 index 0000000..8927578 --- /dev/null +++ b/docker/services/cassandra/swh_entrypoint.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# /cassandra.yaml is provided by docker-compose via a bind-mount, but +# we need to copy it because the official entrypoint (docker-entrypoint.sh) +# modifies it. +cp /cassandra.yaml /etc/cassandra/cassandra.yaml +exec docker-entrypoint.sh diff --git a/docker/services/swh-deposit/entrypoint.sh b/docker/services/swh-deposit/entrypoint.sh new file mode 100755 index 0000000..8b6994f --- /dev/null +++ b/docker/services/swh-deposit/entrypoint.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -ex + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +if [ "$1" = 'shell' ] ; then + exec bash -i +else + + wait_pgsql + + echo "Migrating db" + django-admin migrate --settings=${DJANGO_SETTINGS_MODULE} + + swh-deposit admin user exists test || \ + swh-deposit admin user create \ + --username test \ + --password test \ + --provider-url https://softwareheritage.org \ + --domain softwareheritage.org + + echo "starting swh-deposit server" + exec gunicorn --bind 0.0.0.0:5006 \ + --reload \ + --threads 2 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'django.core.wsgi:get_wsgi_application()' +fi diff --git a/docker/services/swh-indexer-journal-client/entrypoint.sh b/docker/services/swh-indexer-journal-client/entrypoint.sh new file mode 100755 index 0000000..18d2811 --- /dev/null +++ b/docker/services/swh-indexer-journal-client/entrypoint.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo "Starting swh-indexer-journal client" + exec wait-for-it kafka:9092 -s --timeout=0 -- swh indexer --config-file /etc/softwareheritage/indexer/journal_client.yml journal-client + ;; +esac + diff --git a/docker/services/swh-indexer-storage/entrypoint.sh b/docker/services/swh-indexer-storage/entrypoint.sh new file mode 100755 index 0000000..ec5d92c --- /dev/null +++ b/docker/services/swh-indexer-storage/entrypoint.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + + wait_pgsql + + echo Setup the database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} indexer + + echo Starting the swh-indexer-storage API server + exec gunicorn --bind 0.0.0.0:5007 \ + --reload \ + --threads 2 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'swh.indexer.storage.api.server:make_app_from_configfile()' + ;; +esac diff --git a/docker/services/swh-indexer-worker/entrypoint.sh b/docker/services/swh-indexer-worker/entrypoint.sh new file mode 100755 index 0000000..dea17ef --- /dev/null +++ b/docker/services/swh-indexer-worker/entrypoint.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source 
/srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo Waiting for RabbitMQ to start + wait-for-it amqp:5672 -s --timeout=0 + + wait_pgsql + + echo Starting swh-indexer worker + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD} \ + -Ofair --loglevel=${LOGLEVEL} --without-gossip \ + --without-mingle \ + --heartbeat-interval 10 \ + --hostname "${SWH_WORKER_INSTANCE}@%h" + ;; +esac diff --git a/docker/services/swh-journal-backfiller/entrypoint.sh b/docker/services/swh-journal-backfiller/entrypoint.sh new file mode 100755 index 0000000..d24bf1b --- /dev/null +++ b/docker/services/swh-journal-backfiller/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + echo "Starting swh-journal-backfiller" + exec wait-for-it kafka:9092 -s --timeout=0 -- swh-journal backfiller $@ + ;; +esac diff --git a/docker/services/swh-listers-worker/entrypoint.sh b/docker/services/swh-listers-worker/entrypoint.sh new file mode 100755 index 0000000..3ab19c6 --- /dev/null +++ b/docker/services/swh-listers-worker/entrypoint.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + + echo Setup ${POSTGRES_DB} database for ${SWH_WORKER_INSTANCE} + if psql -lqt | cut -d \| -f 1 | grep -qw ${POSTGRES_DB}; then + echo Database already exists, nothing to do + else + echo Creating database + createdb ${POSTGRES_DB} + fi + + echo Initialize database + swh lister -C ${SWH_CONFIG_FILENAME} db-init + + echo Register task types in scheduler database + swh scheduler -C ${SWH_CONFIG_FILENAME} task-type register + + echo Waiting for RabbitMQ to start + wait-for-it amqp:5672 -s --timeout=0 + + echo Starting the swh-lister Celery worker for ${SWH_WORKER_INSTANCE} + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD} \ + -Ofair --loglevel=${LOGLEVEL} --without-gossip \ + --without-mingle \ + --heartbeat-interval 10 \ + --hostname "${SWH_WORKER_INSTANCE}@%h" + ;; +esac diff --git a/docker/services/swh-loaders-worker/entrypoint.sh b/docker/services/swh-loaders-worker/entrypoint.sh new file mode 100755 index 0000000..d230f49 --- /dev/null +++ b/docker/services/swh-loaders-worker/entrypoint.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo Waiting for RabbitMQ to start + wait-for-it amqp:5672 -s --timeout=0 + + echo Register task types in scheduler database + swh scheduler -C ${SWH_CONFIG_FILENAME} task-type register + + echo Starting the swh-loader Celery worker for ${SWH_WORKER_INSTANCE} + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD} \ + -Ofair --loglevel=${LOGLEVEL} --without-gossip \ + --without-mingle \ + --heartbeat-interval 10 \ + --hostname "${SWH_WORKER_INSTANCE}@%h" + ;; +esac diff --git 
a/docker/services/swh-objstorage/entrypoint.sh b/docker/services/swh-objstorage/entrypoint.sh new file mode 100755 index 0000000..30fcefc --- /dev/null +++ b/docker/services/swh-objstorage/entrypoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +echo Installed Python packages: +pip list + +if [ "$1" = 'shell' ] ; then + exec bash -i +else + echo Starting the swh-objstorage API server + exec gunicorn --bind 0.0.0.0:5003 \ + --worker-class aiohttp.worker.GunicornWebWorker \ + --log-level DEBUG \ + --threads 4 \ + --workers 2 \ + --reload \ + --timeout 3600 \ + 'swh.objstorage.api.server:make_app_from_configfile()' + +fi diff --git a/docker/services/swh-scheduler-api/entrypoint.sh b/docker/services/swh-scheduler-api/entrypoint.sh new file mode 100755 index 0000000..b0a3381 --- /dev/null +++ b/docker/services/swh-scheduler-api/entrypoint.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + "swh-scheduler") + exec $@ + ;; + *) + wait_pgsql + + echo Setup the swh-scheduler API database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} scheduler + + echo Starting the swh-scheduler API server + exec gunicorn --bind 0.0.0.0:5008 \ + --log-level DEBUG \ + --threads 2 \ + --workers 2 \ + --reload \ + --timeout 3600 \ + 'swh.scheduler.api.server:make_app_from_configfile()' + +esac diff --git a/docker/services/swh-scheduler-worker/entrypoint.sh b/docker/services/swh-scheduler-worker/entrypoint.sh new file mode 100755 index 0000000..1ed0b2b --- /dev/null +++ b/docker/services/swh-scheduler-worker/entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + + echo "Starting the swh-scheduler $1" + exec wait-for-it amqp:5672 -s --timeout=0 -- swh --log-level ${LOGLEVEL} scheduler -C /scheduler.yml $@ + ;; +esac diff --git a/docker/services/swh-search-journal-client-objects/entrypoint.sh b/docker/services/swh-search-journal-client-objects/entrypoint.sh new file mode 100755 index 0000000..aa4ecb9 --- /dev/null +++ b/docker/services/swh-search-journal-client-objects/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo "Starting the swh-search journal client" + exec wait-for-it kafka:9092 -s --timeout=0 -- \ + wait-for-it swh-search:5010 -s --timeout=0 -- \ + swh --log-level DEBUG search --config-file /etc/softwareheritage/search/journal_client_objects.yml journal-client objects + ;; +esac + diff --git a/docker/services/swh-search/entrypoint.sh b/docker/services/swh-search/entrypoint.sh new file mode 100755 index 0000000..2c44fb7 --- /dev/null +++ b/docker/services/swh-search/entrypoint.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo Starting the swh-search API server + wait-for-it elasticsearch:9200 -s --timeout=0 + echo "Waiting for ElasticSearch cluster to be up" + cat << EOF | python3 +import elasticsearch +es = elasticsearch.Elasticsearch(['elasticsearch:9200']) +es.cluster.health(wait_for_status='yellow') +EOF + echo "ElasticSearch cluster is up" + swh
search -C $SWH_CONFIG_FILENAME initialize + exec gunicorn --bind 0.0.0.0:5010 \ + --reload \ + --threads 4 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'swh.search.api.server:make_app_from_configfile()' + ;; +esac diff --git a/docker/services/swh-storage-replayer/entrypoint.sh b/docker/services/swh-storage-replayer/entrypoint.sh new file mode 100755 index 0000000..382d52c --- /dev/null +++ b/docker/services/swh-storage-replayer/entrypoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + *) + wait_pgsql + + echo Setup the database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} storage + + echo Starting the swh-storage Kafka storage replayer + exec swh-journal replay \ + --broker kafka \ + --prefix swh.journal.objects \ + --consumer-id swh.storage.replica + ;; +esac diff --git a/docker/services/swh-storage/entrypoint.sh b/docker/services/swh-storage/entrypoint.sh new file mode 100755 index 0000000..359880c --- /dev/null +++ b/docker/services/swh-storage/entrypoint.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +if [ "$STORAGE_BACKEND" = "postgresql" ]; then + source /srv/softwareheritage/utils/pgsql.sh + setup_pgsql + +elif [ "$STORAGE_BACKEND" = "cassandra" ]; then + echo Waiting for Cassandra to start + wait-for-it ${CASSANDRA_SEED}:9042 -s --timeout=0 + echo Creating keyspace + cat << EOF | python3 +from swh.storage.cassandra import create_keyspace +create_keyspace(['cassandra-seed'], 'swh') +EOF + +fi + +case "$1" in + "shell") + exec bash -i + ;; + *) + if [ "$STORAGE_BACKEND" = "postgresql" ]; then + wait_pgsql + + echo Setup the database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init \ + --db-name ${POSTGRES_DB} storage + fi + + echo Starting the swh-storage API server + exec gunicorn --bind 0.0.0.0:5002 \ + --reload \ + --threads 4 \ + --workers 2 \ + --log-level DEBUG \ + --timeout 3600 \ + 'swh.storage.api.server:make_app_from_configfile()' + ;; +esac diff --git a/docker/services/swh-vault/entrypoint.sh b/docker/services/swh-vault/entrypoint.sh new file mode 100755 index 0000000..2734bfc --- /dev/null +++ b/docker/services/swh-vault/entrypoint.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +source /srv/softwareheritage/utils/pgsql.sh +setup_pgsql + +case "$1" in + "shell") + exec bash -i + ;; + "worker") + echo Starting the swh-vault Celery worker + exec python -m celery worker \ + --app=swh.scheduler.celery_backend.config.app \ + --pool=prefork --events \ + --concurrency=${CONCURRENCY:-1} \ + --maxtasksperchild=${MAX_TASKS_PER_CHILD:-10} \ + -Ofair --loglevel=${LOGLEVEL:-INFO} --without-gossip \ + --without-mingle --without-heartbeat \ + --hostname "vault@%h" + ;; + "server") + # ensure the pathslicing root dir for the cache exists + mkdir -p /srv/softwareheritage/vault + + wait_pgsql + + echo Setup the swh-vault API database + PGPASSWORD=${POSTGRES_PASSWORD} swh db-init vault \ + --db-name ${POSTGRES_DB} + + echo Starting the swh-vault API server + exec swh vault rpc-serve -C ${SWH_CONFIG_FILENAME} +esac diff --git a/docker/services/swh-web/entrypoint.sh b/docker/services/swh-web/entrypoint.sh new file mode 100755 index 0000000..d33a904 --- /dev/null +++ b/docker/services/swh-web/entrypoint.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +create_admin_script="
+from django.contrib.auth import get_user_model; + +username = 'admin'; +password = 'admin'; +email = 'admin@swh-web.org'; + +User = get_user_model(); + +if not User.objects.filter(username = username).exists(): + User.objects.create_superuser(username, email, password); +" + +source /srv/softwareheritage/utils/pyutils.sh +setup_pip + +case "$1" in + "shell") + exec bash -i + ;; + *) + echo "Migrating db using ${DJANGO_SETTINGS_MODULE}" + django-admin migrate --settings=${DJANGO_SETTINGS_MODULE} + + echo "Creating admin user" + echo "$create_admin_script" | python3 -m swh.web.manage shell + + echo "starting the swh-web server" + exec gunicorn --bind 0.0.0.0:5004 \ + --threads 2 \ + --workers 2 \ + --timeout 3600 \ + 'django.core.wsgi:get_wsgi_application()' +esac diff --git a/docker/tests/run_tests.sh b/docker/tests/run_tests.sh new file mode 100755 index 0000000..effa51c --- /dev/null +++ b/docker/tests/run_tests.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +# Main script to run high level tests on the Software Heritage stack + +# Use a temporary directory as working directory +WORKDIR=/tmp/swh-docker-dev_tests +# Create it if it does not exist +mkdir $WORKDIR 2>/dev/null +# Ensure it is empty before running the tests +rm -rf $WORKDIR/* + +# We want the script to exit at the first encountered error +set -e + +# Get test scripts directory +TEST_SCRIPTS_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) + +# Set the docker-compose.yml file to use +export COMPOSE_FILE=$TEST_SCRIPTS_DIR/../docker-compose.yml + +# Useful global variables +SWH_WEB_API_BASEURL="http://localhost:5004/api/1" +CURRENT_TEST_SCRIPT="" + +# Colored output related variables and functions (only if stdout is a terminal) +if test -t 1; then + GREEN='\033[0;32m' + RED='\033[0;31m' + NC='\033[0m' +else + DOCO_OPTIONS='--no-ansi' +fi + +# Remove previously dumped service logs file if any +rm -f $TEST_SCRIPTS_DIR/swh-docker-compose.logs + +function colored_output { + local msg="$2" + if [ "$CURRENT_TEST_SCRIPT" != "" ]; then + msg="[$CURRENT_TEST_SCRIPT] $msg" + fi + echo -e "${1}${msg}${NC}" +} + +function status_message { + colored_output ${GREEN} "$1" +} + +function error_message { + colored_output ${RED} "$1" +} + +function dump_docker_logs { + error_message "Dumping logs for all services in file $TEST_SCRIPTS_DIR/swh-docker-compose.logs" + docker-compose logs > $TEST_SCRIPTS_DIR/swh-docker-compose.logs +} + +# Exit handler that will get called when this script terminates +function finish { + if [ $? 
-ne 0 ] && [ "$CURRENT_TEST_SCRIPT" != "" ]; then + local SCRIPT_NAME=$CURRENT_TEST_SCRIPT + CURRENT_TEST_SCRIPT="" + error_message "An error occurred when running test script ${SCRIPT_NAME}" + dump_docker_logs + fi + docker-compose $DOCO_OPTIONS down + rm -rf $WORKDIR +} +trap finish EXIT + +# Docker-compose events listener that will be executed in the background +# Parameters: +# $1: PID of parent process +function listen_docker_events { + docker-compose $DOCO_OPTIONS events | while read event + do + service=$(echo $event | cut -d " " -f7 | sed 's/^name=swh-docker-dev_\(.*\)_1)/\1/') + event_type=$(echo $event | cut -d ' ' -f4) + # "docker-compose down" has been called, exiting this child process + if [ "$event_type" = "kill" ] ; then + exit + # a swh service crashed, sending signal to parent process to exit with error + elif [ "$event_type" = "die" ]; then + if [[ "$service" =~ ^swh.* ]]; then + exit_code=$(docker-compose ps | grep $service | awk '{print $4}') + if [ "$exit_code" != "0" ]; then + error_message "Service $service died unexpectedly, exiting" + dump_docker_logs + kill -s SIGUSR1 $1; exit + fi + fi + fi + done +} +trap "exit 1" SIGUSR1 + +declare -A SERVICE_LOGS_NB_LINES_READ + +# Function to wait for a specific string to appear in the logs of a +# given docker-compose service. +# When called multiple times on the same service, only the newly outputted +# logs since the last call will be processed. +# Parameters: +# $1: a timeout value in seconds to stop waiting and exit with error +# $2: docker-compose service name +# $3: the string to look for in the produced logs +function wait_for_service_output { + local nb_lines_to_skip=0 + if [[ -v "SERVICE_LOGS_NB_LINES_READ[$2]" ]]; then + let nb_lines_to_skip=${SERVICE_LOGS_NB_LINES_READ[$2]}+1 + fi + SECONDS=0 + local service_logs=$(docker-compose $DOCO_OPTIONS logs $2 | tail -n +$nb_lines_to_skip) + until echo -ne "$service_logs" | grep -m 1 "$3" >/dev/null ; do + sleep 1; + if (( $SECONDS > $1 )); then + error_message "Could not find pattern \"$3\" in $2 service logs after $1 seconds" + exit 1 + fi + let nb_lines_to_skip+=$(echo -ne "$service_logs" | wc -l) + service_logs=$(docker-compose $DOCO_OPTIONS logs $2 | tail -n +$nb_lines_to_skip) + done + let nb_lines_to_skip+=$(echo -ne "$service_logs" | wc -l) + SERVICE_LOGS_NB_LINES_READ[$2]=$nb_lines_to_skip +} + +# Function to make an HTTP request and get its response. +# It should be used the following way: +# response=$(http_request <method> <url>) +# Parameters: +# $1: http method name (GET, POST, ...) +# $2: request url +function http_request { + local response=$(curl -sS -X $1 $2) + echo $response +} + +# Function to check that an HTTP request ends up with no errors. +# If the HTTP response code is different from 200, an error will +# be raised and the main script will terminate +# Parameters: +# $1: http method name (GET, POST, ...) +# $2: request url +function http_request_check { + curl -sSf -X $1 $2 > /dev/null +} + +# Function to run the content of a script dedicated to test a specific +# part of the Software Heritage stack.
+function run_test_script { + local SCRIPT_NAME=$(basename $1) + status_message "Executing test script $SCRIPT_NAME" + CURRENT_TEST_SCRIPT=$SCRIPT_NAME + source $1 +} + +# Move to work directory +cd $WORKDIR + +# Start the docker-compose event handler as a background process +status_message "Starting docker-compose events listener" +listen_docker_events $$ & + +# Start the docker-compose environment including the full Software Heritage stack +status_message "Starting swh docker-compose environment" +docker-compose $DOCO_OPTIONS up -d + +# Ensure all swh services are up before running tests +status_message "Waiting for swh services to be up" +docker-compose $DOCO_OPTIONS exec -T swh-storage wait-for-it localhost:5002 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-objstorage wait-for-it localhost:5003 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-web wait-for-it localhost:5004 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-vault-api wait-for-it localhost:5005 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-deposit wait-for-it localhost:5006 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-idx-storage wait-for-it localhost:5007 -s --timeout=0 +docker-compose $DOCO_OPTIONS exec -T swh-scheduler-api wait-for-it localhost:5008 -s --timeout=0 + +# Execute test scripts +for test_script in $TEST_SCRIPTS_DIR/test_*; do + run_test_script ${test_script} + CURRENT_TEST_SCRIPT="" +done diff --git a/docker/tests/test_01_loader_git.sh b/docker/tests/test_01_loader_git.sh new file mode 100644 index 0000000..e907d0f --- /dev/null +++ b/docker/tests/test_01_loader_git.sh @@ -0,0 +1,70 @@ +#!/bin/bash +shopt -s nullglob extglob + +TEST_GIT_REPO_NAME="swh-loader-core" +TEST_GIT_REPO_URL="https://forge.softwareheritage.org/source/${TEST_GIT_REPO_NAME}.git" + +status_message "Scheduling the loading of the git repository located at ${TEST_GIT_REPO_URL}" + +docker-compose $DOCO_OPTIONS exec -T swh-scheduler-api swh scheduler task add load-git repo_url=$TEST_GIT_REPO_URL + +status_message "Waiting for the git loading task to complete" + +wait_for_service_output 300 swh-loader "swh.loader.git.tasks.UpdateGitRepository.*succeeded" + +status_message "The loading task has been successfully executed" + +status_message "Getting all git objects contained in the repository" +git clone $TEST_GIT_REPO_URL +cd $TEST_GIT_REPO_NAME +cd "$(git rev-parse --git-path objects)" +for p in pack/pack-*([0-9a-f]).idx ; do + git show-index < $p | cut -f 2 -d ' ' > $WORKDIR/git_objects +done +for o in [0-9a-f][0-9a-f]/*([0-9a-f]) ; do + echo ${o/\/} >> $WORKDIR/git_objects +done + +declare -ga CONTENTS +declare -ga DIRECTORIES +declare -ga REVISIONS +declare -ga RELEASES + +while IFS='' read -r object || [[ -n "$object" ]]; do + object_type=$(git cat-file -t $object) + if [ "$object_type" = "blob" ]; then + CONTENTS+=($object) + elif [ "$object_type" = "tree" ]; then + DIRECTORIES+=($object) + elif [ "$object_type" = "commit" ]; then + REVISIONS+=($object) + elif [ "$object_type" = "tag" ]; then + RELEASES+=($object) + fi +done < $WORKDIR/git_objects + +status_message "Checking all git objects have been successfully loaded into the archive" + +status_message "Checking contents" +for content in "${CONTENTS[@]}"; do + http_request_check GET ${SWH_WEB_API_BASEURL}/content/sha1_git:$content/ +done +status_message "All contents have been successfully loaded into the archive" + +status_message "Checking directories" +for directory in "${DIRECTORIES[@]}"; do + http_request_check GET 
${SWH_WEB_API_BASEURL}/directory/$directory/ +done +status_message "All directories have been successfully loaded into the archive" + +status_message "Checking revisions" +for revision in "${REVISIONS[@]}"; do + http_request_check GET ${SWH_WEB_API_BASEURL}/revision/$revision/ +done +status_message "All revisions have been successfully loaded into the archive" + +status_message "Checking releases" +for release in "${RELEASES[@]}"; do + http_request_check GET ${SWH_WEB_API_BASEURL}/release/$release/ +done +status_message "All releases have been successfully loaded into the archive" diff --git a/docker/tests/test_02_vault.sh b/docker/tests/test_02_vault.sh new file mode 100644 index 0000000..0f4ee25 --- /dev/null +++ b/docker/tests/test_02_vault.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +directory=${DIRECTORIES[$RANDOM % ${#DIRECTORIES[@]}]} +revision=${REVISIONS[$RANDOM % ${#REVISIONS[@]}]} + +status_message "Requesting the vault to cook a random directory stored in the archive" +http_request_check POST ${SWH_WEB_API_BASEURL}/vault/directory/$directory/ + +status_message "Waiting for the directory cooking task to complete" +wait_for_service_output 300 swh-vault-worker "swh.vault.cooking_tasks.SWHCookingTask.*succeeded" +status_message "The directory cooking task has been successfully executed" + +status_message "Checking that the cooked directory tarball can be downloaded" +http_request_check GET ${SWH_WEB_API_BASEURL}/vault/directory/$directory/raw/ +status_message "The cooked directory tarball is available for download" + +status_message "Requesting the vault to cook a random revision stored in the archive" +http_request_check POST ${SWH_WEB_API_BASEURL}/vault/revision/$revision/gitfast/ + +status_message "Waiting for the revision cooking task to complete" +wait_for_service_output 300 swh-vault-worker "swh.vault.cooking_tasks.SWHCookingTask.*succeeded" +status_message "The revision cooking task has been successfully executed" + +status_message "Checking that the cooked revision tarball can be downloaded" +http_request_check GET ${SWH_WEB_API_BASEURL}/vault/revision/$revision/gitfast/raw/ +status_message "The cooked revision tarball is available for download" diff --git a/docker/utils/pgsql.sh b/docker/utils/pgsql.sh new file mode 100644 index 0000000..6e30e83 --- /dev/null +++ b/docker/utils/pgsql.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +setup_pgsql () { + echo "${PGHOST}:5432:postgres:${PGUSER}:${POSTGRES_PASSWORD}" > ~/.pgpass + echo "${PGHOST}:5432:${POSTGRES_DB}:${PGUSER}:${POSTGRES_PASSWORD}" >> ~/.pgpass + cat > ~/.pg_service.conf <<EOF +[swh] +dbname=${POSTGRES_DB} +host=${PGHOST} +port=5432 +user=${PGUSER} +EOF + chmod 0600 ~/.pgpass +} + +wait_pgsql () { + echo Waiting for postgresql to start + wait-for-it ${PGHOST}:5432 -s --timeout=0 + until psql -c "select 1" > /dev/null 2> /dev/null; do sleep 1; done +} \ No newline at end of file diff --git a/docker/utils/pyutils.sh b/docker/utils/pyutils.sh new file mode 100755 index 0000000..289eb40 --- /dev/null +++ b/docker/utils/pyutils.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +setup_pip () { + echo Using pip from $(which pip) + + if [[ -d /src ]] ; then + for srcrepo in /src/swh-* ; do + pip install $srcrepo + done + fi + + echo Installed Python packages: + pip list +}