diff --git a/sysadmin/grid5000/cassandra/03-deploy_nodes.sh b/sysadmin/grid5000/cassandra/03-deploy_nodes.sh index a947f9f..be57a20 100755 --- a/sysadmin/grid5000/cassandra/03-deploy_nodes.sh +++ b/sysadmin/grid5000/cassandra/03-deploy_nodes.sh @@ -1,33 +1,35 @@ #!/usr/bin/env bash -set -eu +set -eux SCRIPT_DIR="$(pwd $(dirname @0))" source "${SCRIPT_DIR}/environment.cfg" echo "########### Nodes:" uniq "${OAR_FILE_NODES}" echo "########### Installing os on nodes" INSTALLED_OS_STAMP="${OAR_JOB_ID}.os.stamp" if [ ! -e "${SCRIPT_DIR}/${INSTALLED_OS_STAMP}" ]; then ${SCRIPT_DIR}/_install_os.sh touch "${SCRIPT_DIR}/${INSTALLED_OS_STAMP}" fi uniq "${OAR_NODE_FILE}" > ${SCRIPT_DIR}/nodes.lst -echo "${CASSANDRA_HOSTS}" | sed 's/ /,/' > ${SCRIPT_DIR}/cassandra_seeds.lst +NODE_COUNT="$(wc -l ${SCRIPT_DIR}/nodes.lst | cut -f1 -d' ')" -time parallel --halt now,fail=1 rsync -avP . "${SSH_USER}"@{}:install < ${SCRIPT_DIR}/nodes.lst +echo "${CASSANDRA_HOSTS}" | sed 's/ /,/g' > ${SCRIPT_DIR}/cassandra_seeds.lst -time parallel --halt now,fail=1 -u ssh ${SSH_OPTIONS} "${SSH_USER}"@{} install/_provision_node.sh < ${SCRIPT_DIR}/nodes.lst +time parallel -j${NODE_COUNT} rsync -avP . "${SSH_USER}"@{}:install < ${SCRIPT_DIR}/nodes.lst + +time parallel -j${NODE_COUNT} -u ssh ${SSH_OPTIONS} "${SSH_USER}"@{} install/_provision_node.sh < ${SCRIPT_DIR}/nodes.lst echo "########### Cassandra installation done" touch ${SCRIPT_DIR}/nodes.installed # The script must not exit to avoid the oar job to be killed echo "########### Sleeping" sleep infinity diff --git a/sysadmin/grid5000/cassandra/Vagrantfile b/sysadmin/grid5000/cassandra/Vagrantfile index 4beb88e..9be1e32 100644 --- a/sysadmin/grid5000/cassandra/Vagrantfile +++ b/sysadmin/grid5000/cassandra/Vagrantfile @@ -1,82 +1,88 @@ # -*- mode: ruby -*- # vi: set ft=ruby : vms = { "cassandra1" => { :ip => "10.168.180.11", :memory => 2048, :cpus => 2, :type => 'cassandra', }, "cassandra2" => { :ip => "10.168.180.12", :memory => 2048, :cpus => 2, :type => 'cassandra', }, "cassandra3" => { :ip => "10.168.180.13", :memory => 2048, :cpus => 2, :type => 'cassandra', }, "swh-storage1" => { :ip => "10.168.180.14", :memory => 1024, :cpus => 2, :type => 'swh-storage', }, + "monitoring1" => { + :ip => "10.168.180.15", + :memory => 1024, + :cpus => 2, + :type => 'monitoring', + }, } # Images/remote configuration $global_debian10_box = "debian10-20210517-1348" $global_debian10_box_url = "https://annex.softwareheritage.org/public/isos/libvirt/debian/swh-debian-10.9-amd64-20210517-1348.qcow2" vms.each { | vm_name, vm_props | Vagrant.configure("2") do |global_config| unless Vagrant.has_plugin?("libvirt") $stderr.puts <<-MSG vagrant-libvirt plugin is required for this. 
To install: `$ sudo apt install vagrant-libvirt MSG exit 1 end global_config.vm.define vm_name do |config| config.vm.box = $global_debian10_box config.vm.box_url = $global_debian10_box_url config.vm.box_check_update = false config.vm.hostname = vm_name config.vm.network :private_network, ip: vm_props[:ip], netmask: "255.255.0.0" config.vm.synced_folder ".", "/vagrant", type: 'nfs', nfs_version: 4 config.vm.provision :ansible do |ansible| ansible.verbose = true ansible.become = true ansible.playbook = "ansible/playbook.yml" ansible.inventory_path = "ansible/hosts.yml" ansible.raw_arguments = [ "-v", "--connection=paramiko", "--private-key=/home/.../.vagrant/machines/.../private_key", "--extra-vars=@.credentials", "--vault-password-file=.vault_password" ] end config.vm.provider :libvirt do |provider| provider.memory = vm_props[:memory] provider.cpus = vm_props[:cpus] provider.driver = 'kvm' if vm_props[:type] == "cassandra" provider.storage :file, :size => '1G' provider.storage :file, :size => '1G' provider.storage :file, :size => '1G' end end end end } diff --git a/sysadmin/grid5000/cassandra/_install_os.sh b/sysadmin/grid5000/cassandra/_install_os.sh index 0dc03a2..5e05fd4 100755 --- a/sysadmin/grid5000/cassandra/_install_os.sh +++ b/sysadmin/grid5000/cassandra/_install_os.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash set -eux # Install the OS -kadeploy3 -e debian10-x64-base -f "${OAR_FILE_NODES}" -k ~/.ssh/id_rsa.pub +kadeploy3 -e debian10-x64-big -f "${OAR_FILE_NODES}" -k ~/.ssh/id_rsa.pub diff --git a/sysadmin/grid5000/cassandra/_provision_node.sh b/sysadmin/grid5000/cassandra/_provision_node.sh index a1234fe..0770351 100755 --- a/sysadmin/grid5000/cassandra/_provision_node.sh +++ b/sysadmin/grid5000/cassandra/_provision_node.sh @@ -1,11 +1,12 @@ #!/usr/bin/env bash set -eux apt update apt install -y ansible cd /root/install/ansible CASSANDRA_SEEDS="$(cat ../cassandra_seeds.lst)" +NODES="$(cat ../nodes.lst | tr '\n' ',')" -ansible-playbook -i hosts.yml -e @/root/install/.credentials --vault-password-file=/root/install/.vault_password -l "$(hostname)" playbook.yml --extra-vars "cassandra_seed_ips=${CASSANDRA_SEEDS}" +ansible-playbook -i hosts.yml -e @/root/install/.credentials --vault-password-file=/root/install/.vault_password -l "$(hostname)" playbook.yml --extra-vars "cassandra_seed_ips=${CASSANDRA_SEEDS}" --extra-vars "nodes=${NODES}" diff --git a/sysadmin/grid5000/cassandra/ansible/_install_prometheus_exporter.yml b/sysadmin/grid5000/cassandra/ansible/_install_prometheus_exporter.yml new file mode 100644 index 0000000..6aa96e3 --- /dev/null +++ b/sysadmin/grid5000/cassandra/ansible/_install_prometheus_exporter.yml @@ -0,0 +1,5 @@ +--- +- name: Install node_exporter package + apt: + name: + - prometheus-node-exporter diff --git a/sysadmin/grid5000/cassandra/ansible/cassandra.yml b/sysadmin/grid5000/cassandra/ansible/cassandra.yml index b2c828f..9e5da53 100644 --- a/sysadmin/grid5000/cassandra/ansible/cassandra.yml +++ b/sysadmin/grid5000/cassandra/ansible/cassandra.yml @@ -1,59 +1,71 @@ --- # - name: "Get public ipv4 address" # set_fact: # cassandra_seed_ips: "{{ansible_facts[item]['ipv4']['address']}}" # with_items: # - "{{cassandra_listen_interface }}" - name: Install cassandra signing key apt_key: url: https://downloads.apache.org/cassandra/KEYS state: present - name: Install cassandra apt repository apt_repository: repo: deb http://downloads.apache.org/cassandra/debian 40x main state: present filename: cassandra.sources - name: Install cassandra packages apt: update_cache: true # force 
an apt update before name: ## TODO: check other version than jdk11 - - openjdk-11-jdk - cassandra - dstat - - smartmontools - facter + - openjdk-11-jdk + - smartmontools - tcpdump +- name: install prometheus node exporter + include: _install_prometheus_exporter.yml + - name: Create datadirs file: state: directory path: "{{ item }}" owner: "cassandra" group: "cassandra" mode: "0755" + recurse: true with_items: - "{{ cassandra_data_dir_base }}" - "{{ cassandra_data_dir_system }}" - "{{ cassandra_data_dir }}" - "{{ cassandra_commitlogs_dir }}" +- name: Download prometheus jmx exporter + get_url: + url: https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.15.0/jmx_prometheus_javaagent-0.15.0.jar + dest: /opt/jmx_prometheus_javaagent-0.15.0.jar + +- name: Download prometheus exporter configuration + get_url: + url: https://raw.githubusercontent.com/prometheus/jmx_exporter/master/example_configs/cassandra.yml + dest: /opt/jmx_exporter.yml + - name: Configure cassandra template: src: "templates/{{item}}" dest: "{{cassandra_config_dir}}/{{item}}" - with_items: [cassandra.yaml, jvm.options] + with_items: [cassandra.yaml, jvm11-server.options] register: cassandra_configuration_files - name: Restart cassandra service service: name: cassandra state: restarted when: cassandra_configuration_files.changed - - # TODO test different read ahead diff --git a/sysadmin/grid5000/cassandra/ansible/files/g5k-setup-docker b/sysadmin/grid5000/cassandra/ansible/files/g5k-setup-docker new file mode 100644 index 0000000..d44a8e7 --- /dev/null +++ b/sysadmin/grid5000/cassandra/ansible/files/g5k-setup-docker @@ -0,0 +1,47 @@ +#! /usr/bin/env ruby + +require 'optparse' + +def main + options = {} + OptionParser.new do |opts| + opts.banner = "Usage: g5k-setup-docker [-it]\nInstall Docker locally and make Docker commands available without user needing to log out" + opts.on('-i', '--install', 'Just install Docker locally (user need to log out)') do |i| + options[:install] = i + end + opts.on('-t', '--tmp', 'Bind /tmp/docker to /var/lib/docker to provide more storage space for pulled images (WARNING : A reboot will erase it)') do |t| + options[:tmp] = t + end + end.parse! 
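
For reference, a brief usage sketch of the g5k-setup-docker helper introduced here; the two invocations map to the -t/--tmp and -i/--install options parsed above, the path is where monitoring.yml further down copies the script on the Vagrant VMs, and the commands assume they are run as root on the target node (host and ordering are illustrative, not part of the change):

# Bind /tmp/docker over /var/lib/docker for extra image space, then install Docker;
# -t is the --tmp option parsed above and the bind mount does not survive a reboot.
/grid5000/code/bin/g5k-setup-docker -t
# Alternatively, install only: -i skips the final chmod o+rw on /var/run/docker.sock,
# so (as the script's banner notes) the user has to log out and back in.
/grid5000/code/bin/g5k-setup-docker -i
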
+ + ret = true + ret &= sh('sudo-g5k').first if Process.euid != 0 + if options[:tmp] + ret &= sh('sudo mkdir /tmp/docker').first + ret &= sh('sudo mkdir /var/lib/docker').first + ret &= sh('sudo mount --bind /tmp/docker /var/lib/docker').first + end + cpu_arch = `uname -m`.strip + # Docker is not officially supported on ppc64 + if cpu_arch == 'ppc64le' + ret &= sh('echo "deb http://packages.grid5000.fr/deb/docker-ce/buster /" | sudo tee /etc/apt/sources.list.d/docker-ce.list').first + ret &= sh('sudo apt-get update && sudo apt-get install -y docker-ce').first + else + ret &= sh('curl -sSL https://get.docker.com/ | sh').first + end + ret &= sh('sudo mkdir -p /etc/docker') + ret &= sh('echo "{ \"registry-mirrors\": [\"http://docker-cache.grid5000.fr\"] }" | sudo tee /etc/docker/daemon.json') + ret &= sh('sudo systemctl restart docker') + ret &= sh('sudo chmod o+rw /var/run/docker.sock').first if !options[:install] + ret +end + +def sh(cmd) + puts(cmd) + output = `#{cmd}`.chomp + status = ($?.exitstatus == 0) + [status, output] +end + +# MAIN PROGRAM +exit(main) diff --git a/sysadmin/grid5000/cassandra/ansible/hosts.yml b/sysadmin/grid5000/cassandra/ansible/hosts.yml index 7845051..068eddb 100644 --- a/sysadmin/grid5000/cassandra/ansible/hosts.yml +++ b/sysadmin/grid5000/cassandra/ansible/hosts.yml @@ -1,126 +1,152 @@ # Global configuration swh-storage: hosts: parasilo-[20:28].rennes.grid5000.fr: paranoia-[1:8].rennes.grid5000.fr: + # paravance-[].rennes.grid5000.fr: # local vagrant hosts swh-storage1: vars: ansible_connection: local journal: brokers: - broker1.journal.softwareheritage.org:9093 - broker2.journal.softwareheritage.org:9093 - broker3.journal.softwareheritage.org:9093 - broker4.journal.softwareheritage.org:9093 consumer: user: swh-vse group: swh-vse-grid5000-1 replayer_count: content: 0 skipped_content: 0 directory: 50 origin: 0 origin_visit: 0 origin_visit_status: 0 release: 0 revision: 0 snapshot: 0 cassandra: hosts: dahu-[1:32].grenoble.grid5000.fr: parasilo-[1:19].rennes.grid5000.fr: # local vagrant hosts cassandra[1:9]: vars: ansible_connection: local cassandra_config_dir: /etc/cassandra cassandra_data_dir_base: /srv/cassandra cassandra_data_dir_system: "{{cassandra_data_dir_base}}/system" cassandra_data_dir: "{{ cassandra_data_dir_base }}/data" cassandra_commitlogs_dir: "{{ cassandra_data_dir_base }}/commitlogs" # Per cluster specificities dahu_cluster_hosts: hosts: dahu[1:32].grenoble.grid5000.fr vars: cassandra_listen_interface: enp24s0f0 parasilo_cluster_hosts: hosts: parasilo-[1:28].rennes.grid5000.fr: - paranoia-[1:8].rennes.grid5000.fr: vars: cassandra_listen_interface: eno1 zfs_pools: commitlogs: disks: - sdf datasets: commitlogs: /srv/cassandra/commitlogs data: disks: - sdb - sdc - sdd - sde datasets: data: /srv/cassandra/data +paravance_cluster_hosts: + hosts: + paravance-[1:78].rennes.grid5000.fr: + vars: + cassandra_listen_interface: eno1 + zfs_pools: + - data: + disks: + - sdb + datasets: + data: /srv/cassandra + +monitoring: + hosts: + monitoring1: + paravance-[1:78].rennes.grid5000.fr: + vars: + ansible_connection: local + install_docker_install_script: false + # Vagrant configuration vagrant_nodes: hosts: cassandra1: ansible_host: 10.168.180.11 ansible_user: vagrant ansible_ssh_private_key_file: .vagrant/machines/cassandra1/libvirt/private_key cassandra2: ansible_host: 10.168.180.12 ansible_user: vagrant ansible_ssh_private_key_file: .vagrant/machines/cassandra2/libvirt/private_key cassandra3: ansible_host: 10.168.180.13 ansible_user: vagrant 
ansible_ssh_private_key_file: .vagrant/machines/cassandra3/libvirt/private_key swh-storage1: ansible_host: 10.168.180.14 ansible_user: vagrant ansible_ssh_private_key_file: .vagrant/machines/swh-storage/libvirt/private_key + monitoring1: + ansible_host: 10.168.180.15 + ansible_user: vagrant + ansible_ssh_private_key_file: .vagrant/machines/monitoring/libvirt/private_key vars: ansible_connection: ssh + install_docker_install_script: true journal: brokers: # staging - broker0.journal.staging.swh.network:9093 consumer: user: swh-vse group: swh-vse-grid5000-1 replayer_count: content: 0 skipped_content: 0 directory: 5 origin: 0 origin_visit: 0 origin_visit_status: 0 release: 0 revision: 0 snapshot: 0 cassandra_listen_interface: eth1 # passed through --extra-vars on grid5000 cassandra_seed_ips: 10.168.180.11,10.168.180.12,10.168.180.13 + nodes: 10.168.180.11,10.168.180.12,10.168.180.13,10.168.180.14,10.168.180.15 zfs_pools: commitlogs: disks: - vdb datasets: commitlogs: /srv/cassandra/commitlogs data: disks: - vdc - vdd datasets: data: /srv/cassandra/data diff --git a/sysadmin/grid5000/cassandra/ansible/monitoring.yml b/sysadmin/grid5000/cassandra/ansible/monitoring.yml new file mode 100644 index 0000000..2539b95 --- /dev/null +++ b/sysadmin/grid5000/cassandra/ansible/monitoring.yml @@ -0,0 +1,45 @@ +--- +- name: Create grid5000 tools directories + file: + state: directory + path: "{{ item }}" + owner: root + group: root + mode: "0755" + with_items: + - /grid5000 + - /grid5000/code + - /grid5000/code/bin + when: install_docker_install_script + +- name: Install docker installation script + copy: + src: "files/g5k-setup-docker" + dest: "/grid5000/code/bin/g5k-setup-docker" + owner: root + group: root + mode: "0755" + when: install_docker_install_script + +- name: Install docker + command: + cmd: "/grid5000/code/bin/g5k-setup-docker" + +- name: Create prometheus data directory + file: + state: directory + path: /tmp/prometheus + owner: nobody + group: nogroup + +- name: install prometheus node exporter + include: _install_prometheus_exporter.yml + +- name: Create prometheus configuration + template: + src: "templates/prometheus/prometheus.yml" + dest: "/etc/prometheus.yml" + +- name: Start prometheus + command: + cmd: "docker run -d -p 9090:9090 -v /etc/prometheus.yml:/etc/prometheus/prometheus.yml -v /tmp/prometheus:/prometheus --name prometheus prom/prometheus" diff --git a/sysadmin/grid5000/cassandra/ansible/playbook.yml b/sysadmin/grid5000/cassandra/ansible/playbook.yml index b8e8a54..a295930 100644 --- a/sysadmin/grid5000/cassandra/ansible/playbook.yml +++ b/sysadmin/grid5000/cassandra/ansible/playbook.yml @@ -1,11 +1,16 @@ --- - name: Install cassandra hosts: cassandra tasks: - include: zfs.yml - include: cassandra.yml - name: Install SWH Storage hosts: swh-storage tasks: - include: swh-storage.yml + +- name: Install Monitoring + hosts: monitoring + tasks: + - include: monitoring.yml diff --git a/sysadmin/grid5000/cassandra/ansible/swh-storage.yml b/sysadmin/grid5000/cassandra/ansible/swh-storage.yml index e270fc1..dd84201 100644 --- a/sysadmin/grid5000/cassandra/ansible/swh-storage.yml +++ b/sysadmin/grid5000/cassandra/ansible/swh-storage.yml @@ -1,120 +1,125 @@ --- - name: Add Backports repository apt_repository: repo: deb http://deb.debian.org/debian/ buster-backports main contrib non-free filename: backports.sources - name: swhstorage group group: name: swhstorage - name: swhstorage user user: name: swhstorage group: swhstorage + home: /var/lib/swhstorage # *big images mount homes via nfs 
so the user creation failed - name: Add SWH repository apt_repository: repo: deb [trusted=yes] https://debian.softwareheritage.org/ buster-swh main state: present filename: cassandra.sources - name: Install packages apt: name: - daemonize - dstat - facter + - prometheus-statsd-exporter - python3 - python3-gunicorn - tcpdump - name: Install packages from backports apt: name: - python3-typing-extensions - gunicorn3 default_release: buster-backports - name: Install swh storage packages apt: name: - python3-swh.storage - python3-swh.journal +- name: install prometheus node exporter + include: _install_prometheus_exporter.yml + - name: Create directories file: state: directory path: "{{ item }}" owner: root group: root mode: "0755" with_items: - /etc/gunicorn - /etc/gunicorn/instances - /run/gunicorn - /run/gunicorn/swh-storage - /etc/softwareheritage - /etc/softwareheritage/storage - /etc/softwareheritage/replayer - name: Create swh-storage directories file: state: directory path: "{{ item }}" owner: swhstorage group: swhstorage mode: "0755" with_items: - /run/gunicorn/swh-storage/ - /run/replayer - name: Configure gunicorn - default service template: src: "templates/gunicorn/gunicorn.service" dest: "/etc/systemd/system/gunicorn.service" - name: Configure gunicorn - log configuration template: src: "templates/gunicorn/logconfig.ini" dest: "/etc/gunicorn/logconfig.ini" - name: swh-storage gunicorn instance configuration template: src: "templates/gunicorn/gunicorn-instance.cfg" dest: "/etc/gunicorn/instances/swh-storage.cfg" - name: swh-storage configuration directories template: src: "templates/swhstorage/storage.yml" dest: "/etc/softwareheritage/storage/storage.yml" - name: swh-storage service configuration template: src: "templates/gunicorn/gunicorn-instance-service.cfg" dest: "/etc/systemd/system/gunicorn-swh-storage.service" # TODO variabilize - name: swh-storage service service: name: gunicorn-swh-storage enabled: true state: started - name: swh-storage init cassandra script template: src: templates/swhstorage/init-cassandra.sh dest: /usr/local/bin/swh-storage-init-cassandra.sh mode: 0755 - name: Configure replayer services include: _configure_replayer_services.yml obj={{ item }} loop: - content - skipped_content - directory - origin - origin_visit - origin_visit_status - release - revision - snapshot diff --git a/sysadmin/grid5000/cassandra/ansible/templates/gunicorn/gunicorn-instance-service.cfg b/sysadmin/grid5000/cassandra/ansible/templates/gunicorn/gunicorn-instance-service.cfg index 63ae9cf..2183a53 100644 --- a/sysadmin/grid5000/cassandra/ansible/templates/gunicorn/gunicorn-instance-service.cfg +++ b/sysadmin/grid5000/cassandra/ansible/templates/gunicorn/gunicorn-instance-service.cfg @@ -1,25 +1,26 @@ [Unit] Description=Gunicorn instance swh-storage ConditionPathExists=/etc/gunicorn/instances/swh-storage.cfg PartOf=gunicorn.service ReloadPropagatedFrom=gunicorn.service Before=gunicorn.service [Service] User=swhstorage Group=swhstorage PIDFile=/run/swh-storage.pid RuntimeDirectory=gunicorn/swh-storage WorkingDirectory=/run/gunicorn/swh-storage Environment=SWH_CONFIG_FILENAME=/etc/softwareheritage/storage/storage.yml Environment=SWH_LOG_TARGET=journal Environment=SWH_MAIN_PACKAGE=swh.storage +Environment=STATSD_PORT=9125 ExecStart=/usr/bin/gunicorn3 -p /run/gunicorn/swh-storage/pidfile -c /etc/gunicorn/instances/swh-storage.cfg swh.storage.api.server:make_app_from_configfile() ExecStop=/bin/kill -TERM $MAINPID ExecReload=/bin/kill -HUP $MAINPID Restart=always RestartSec=10 [Install] 
WantedBy=multi-user.target diff --git a/sysadmin/grid5000/cassandra/ansible/templates/jvm.options b/sysadmin/grid5000/cassandra/ansible/templates/jvm11-server.options similarity index 98% rename from sysadmin/grid5000/cassandra/ansible/templates/jvm.options rename to sysadmin/grid5000/cassandra/ansible/templates/jvm11-server.options index 7e78467..3680e3a 100644 --- a/sysadmin/grid5000/cassandra/ansible/templates/jvm.options +++ b/sysadmin/grid5000/cassandra/ansible/templates/jvm11-server.options @@ -1,103 +1,105 @@ ########################################################################### # jvm11-server.options # # # # See jvm-server.options. This file is specific for Java 11 and newer. # ########################################################################### ################# # GC SETTINGS # ################# ### CMS Settings -XX:+UseConcMarkSweepGC -XX:+CMSParallelRemarkEnabled -XX:SurvivorRatio=8 -XX:MaxTenuringThreshold=1 -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -XX:CMSWaitDuration=10000 -XX:+CMSParallelInitialMarkEnabled -XX:+CMSEdenChunksRecordAlways ## some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541 -XX:+CMSClassUnloadingEnabled ### G1 Settings ## Use the Hotspot garbage-first collector. #-XX:+UseG1GC #-XX:+ParallelRefProcEnabled # ## Have the JVM do less remembered set work during STW, instead ## preferring concurrent GC. Reduces p99.9 latency. #-XX:G1RSetUpdatingPauseTimePercent=5 # ## Main G1GC tunable: lowering the pause target will lower throughput and vise versa. ## 200ms is the JVM default and lowest viable setting ## 1000ms increases throughput. Keep it smaller than the timeouts in cassandra.yaml. #-XX:MaxGCPauseMillis=500 ## Optional G1 Settings # Save CPU time on large (>= 16GB) heaps by delaying region scanning # until the heap is 70% full. The default in Hotspot 8u40 is 40%. #-XX:InitiatingHeapOccupancyPercent=70 # For systems with > 8 cores, the default ParallelGCThreads is 5/8 the number of logical cores. # Otherwise equal to the number of cores when 8 or less. # Machines with > 10 cores should try setting these to <= full cores. #-XX:ParallelGCThreads=16 # By default, ConcGCThreads is 1/4 of ParallelGCThreads. # Setting both to the same value can reduce STW durations. 
#-XX:ConcGCThreads=16 ### JPMS -Djdk.attach.allowAttachSelf=true --add-exports java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/jdk.internal.ref=ALL-UNNAMED --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED --add-exports java.rmi/sun.rmi.registry=ALL-UNNAMED --add-exports java.rmi/sun.rmi.server=ALL-UNNAMED --add-exports java.sql/java.sql=ALL-UNNAMED --add-opens java.base/java.lang.module=ALL-UNNAMED --add-opens java.base/jdk.internal.loader=ALL-UNNAMED --add-opens java.base/jdk.internal.ref=ALL-UNNAMED --add-opens java.base/jdk.internal.reflect=ALL-UNNAMED --add-opens java.base/jdk.internal.math=ALL-UNNAMED --add-opens java.base/jdk.internal.module=ALL-UNNAMED --add-opens java.base/jdk.internal.util.jar=ALL-UNNAMED --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED ### GC logging options -- uncomment to enable # Java 11 (and newer) GC logging options: # See description of https://bugs.openjdk.java.net/browse/JDK-8046148 for details about the syntax # The following is the equivalent to -XX:+PrintGCDetails -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=10M #-Xlog:gc=info,heap*=trace,age*=debug,safepoint=info,promotion*=trace:file=/var/log/cassandra/gc.log:time,uptime,pid,tid,level:filecount=10,filesize=10485760 # Notes for Java 8 migration: # # -XX:+PrintGCDetails maps to -Xlog:gc*:... - i.e. add a '*' after "gc" # -XX:+PrintGCDateStamps maps to decorator 'time' # # -XX:+PrintHeapAtGC maps to 'heap' with level 'trace' # -XX:+PrintTenuringDistribution maps to 'age' with level 'debug' # -XX:+PrintGCApplicationStoppedTime maps to 'safepoint' with level 'info' # -XX:+PrintPromotionFailure maps to 'promotion' with level 'trace' # -XX:PrintFLSStatistics=1 maps to 'freelist' with level 'trace' ### Netty Options # On Java >= 9 Netty requires the io.netty.tryReflectionSetAccessible system property to be set to true to enable # creation of direct buffers using Unsafe. Without it, this falls back to ByteBuffer.allocateDirect which has # inferior performance and risks exceeding MaxDirectMemory -Dio.netty.tryReflectionSetAccessible=true +-javaagent:/opt/jmx_prometheus_javaagent-0.15.0.jar=7070:/opt/jmx_exporter.yml + # The newline in the end of file is intentional diff --git a/sysadmin/grid5000/cassandra/ansible/templates/prometheus/prometheus.yml b/sysadmin/grid5000/cassandra/ansible/templates/prometheus/prometheus.yml new file mode 100644 index 0000000..d63724b --- /dev/null +++ b/sysadmin/grid5000/cassandra/ansible/templates/prometheus/prometheus.yml @@ -0,0 +1,54 @@ +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
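
The Jinja loops just below expand cassandra_seed_ips and nodes into the scrape target lists. Once the prom/prometheus container from monitoring.yml is running, a quick way to confirm the rendered configuration is to query the Prometheus HTTP API from the monitoring host; a rough sketch, assuming jq is available and the 9090 port mapping from the docker run command (the Cassandra IP is one of the Vagrant seed IPs):

# List each scrape target with its job name and health as Prometheus sees it.
curl -s http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | {job: .labels.job, instance: .labels.instance, health: .health}'
# Spot-check one Cassandra JMX exporter directly; port 7070 comes from the
# -javaagent line added to jvm11-server.options above.
curl -s http://10.168.180.11:7070/metrics | head
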
+ + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'cassandra exporter' + + static_configs: + - targets: [ +{% for cassandra in cassandra_seed_ips.split(',') %} + '{{cassandra}}:7070', +{% endfor %} + ] + + - job_name: 'Node exporter' + static_configs: + - targets: [ +{% for node in nodes.split(',') %} + '{{node}}:9100', +{% endfor %} + ] + + - job_name: 'Statsd exporter' + static_configs: + - targets: [ +{% for node in nodes.split(',') %} + '{{node}}:9102', +{% endfor %} + ] diff --git a/sysadmin/grid5000/cassandra/ansible/templates/swhstorage/replayer@.service b/sysadmin/grid5000/cassandra/ansible/templates/swhstorage/replayer@.service index 05989c9..03e77d9 100644 --- a/sysadmin/grid5000/cassandra/ansible/templates/swhstorage/replayer@.service +++ b/sysadmin/grid5000/cassandra/ansible/templates/swhstorage/replayer@.service @@ -1,20 +1,21 @@ [Unit] Description=swh storage {{ item }} replayer ConditionPathExists=/etc/softwareheritage/replayer/replayer-{{ item }}.yml PartOf=replayer-{{item}}.target [Service] Type=forking User=swhstorage Group=swhstorage PIDFile=/run/replayer/replayer-{{item}}-%i.pid Environment=SWH_CONFIG_FILENAME=/etc/softwareheritage/replayer/replayer-{{ item }}.yml Environment=SWH_LOG_TARGET=journal +Environment=STATSD_PORT=9125 ExecStart=daemonize -p /run/replayer/replayer-{{item}}-%i.pid /usr/bin/swh storage replay ExecStop=/bin/kill -TERM $MAINPID ExecReload=/bin/kill -HUP $MAINPID # Restart=Always # RestartSec=10 [Install] WantedBy=multi-user.target diff --git a/sysadmin/grid5000/cassandra/ansible/zfs.yml b/sysadmin/grid5000/cassandra/ansible/zfs.yml index a03a77b..735bc7f 100644 --- a/sysadmin/grid5000/cassandra/ansible/zfs.yml +++ b/sysadmin/grid5000/cassandra/ansible/zfs.yml @@ -1,70 +1,70 @@ --- - name: Install contrib and non-free repositories apt_repository: repo: deb http://deb.debian.org/debian/ buster-backports main contrib non-free filename: backports.sources - name: Install zfs packages apt: update_cache: true # force an apt update before name: - linux-image-amd64 - linux-headers-amd64 - libnvpair1linux - libuutil1linux - libzfs2linux - libzpool2linux - zfs-dkms - zfs-zed - zfsutils-linux ignore_errors: True - name: Ensure zfs initialized shell: /usr/sbin/modprobe zfs - name: Finalize zfs packages installation apt: update_cache: true # force an apt update before name: - linux-image-amd64 - linux-headers-amd64 - libnvpair1linux - libuutil1linux - libzfs2linux - libzpool2linux - zfs-dkms - zfs-zed - zfsutils-linux - name: Install zfs packages after modprobe apt: update_cache: true # force an apt update before name: - zfsutils-linux - zfs-zed - name: test if zfs pools are configured command: zfs list -o name register: pool_list - name: zfs pool import shell: "zpool import {{ item.key }}" loop: "{{ lookup('dict', zfs_pools) }}" when: item.key not in pool_list.stdout_lines ignore_errors: True - name: test if zfs pools are configured command: zfs list -o name register: pool_list - name: create zfs pools shell: "zpool create -f {{ item.key }} {{ item.value.disks | join(' ') }}" - loop: "{{ lookup('dict', zfs_pools) }}" + loop: "{{ lookup('dict', zfs_pools, wantlist=True) }}" when: item.key not in pool_list.stdout_lines - name: call dataset creation include: _zfs_create_dataset.yml obj={{ outside_item }} - loop: "{{ lookup('dict', zfs_pools) }}" + loop: "{{ lookup('dict', zfs_pools, wantlist=True) }}" loop_control: loop_var: outside_item when: outside_item.key not in pool_list.stdout_lines diff --git a/sysadmin/grid5000/cassandra/environment.cfg 
b/sysadmin/grid5000/cassandra/environment.cfg
index ad60d85..60087e7 100644
--- a/sysadmin/grid5000/cassandra/environment.cfg
+++ b/sysadmin/grid5000/cassandra/environment.cfg
@@ -1,16 +1,16 @@
 export G5K_SITE='rennes.grid5000.fr'
 export CASSANDRA_HOSTS="parasilo-2 parasilo-3 parasilo-4 parasilo-5"
 export CASSANDRA_DISKS_COUNT=5
-export STORAGE_HOSTS="parasilo-20"
+export STORAGE_HOSTS="paravance-1"
 export JOURNAL_CLIENT_HOSTS=""
-export MONITORING_HOSTS=""
+export MONITORING_HOSTS="paravance-51"
 # export CASSANDRA_NB_DISK_PER_NODE=5
 export DISK_RESERVATION_DURATION=$((1 * 24)) # 1day during tests
 export NODE_RESERVATION_DURATION=00:50 # in hours
 export SSH_USER=root
 # export SSH_OPTIONS="-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
 export SSH_OPTIONS=""
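
As a final sanity check after 03-deploy_nodes.sh has run, the per-node exporters behind the 'Node exporter' and 'Statsd exporter' scrape jobs can be probed from the frontend using the nodes.lst file the deploy script writes; a minimal sketch (9100 is the prometheus-node-exporter default, 9102 the prometheus-statsd-exporter metrics port, and the latter only answers on the swh-storage hosts where that package is installed):

# Probe the exporter ports targeted by the Prometheus scrape jobs on every deployed node.
while read -r node; do
  for port in 9100 9102; do
    if curl -sf -o /dev/null "http://${node}:${port}/metrics"; then
      echo "${node}:${port} OK"
    else
      echo "${node}:${port} unreachable"
    fi
  done
done < nodes.lst
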