diff --git a/conf/grafana/dashboards/content-replayer.json b/conf/grafana/dashboards/content-replayer.json new file mode 100644 index 0000000..712a0d8 --- /dev/null +++ b/conf/grafana/dashboards/content-replayer.json @@ -0,0 +1,312 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 4, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(swh_content_replayer_bytes[5m])", + "legendFormat": "Volume", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Volume Processed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + 
"dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(swh_content_replayer_operations_total[5m])", + "legendFormat": "Blobs", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of blobs processed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(swh_content_replayer_duration_seconds_sum[5m]) / rate(swh_content_replayer_duration_seconds_count[5m])", + "legendFormat": "{{request}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Content Replayer", + "uid": "XUY3DVaZz", + "version": 6 +} diff --git a/conf/grafana/dashboards/graph-replayer.json b/conf/grafana/dashboards/graph-replayer.json new file mode 100644 index 0000000..88d6288 --- /dev/null +++ b/conf/grafana/dashboards/graph-replayer.json @@ -0,0 +1,138 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + 
"total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(swh_graph_replayer_operations_total[5m])", + "legendFormat": "{{object_type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Graph Replayer Object processed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Graph Replayer", + "uid": "nJ_uDV-Zk", + "version": 1 +} diff --git a/conf/grafana/dashboards/task-processing.json b/conf/grafana/dashboards/task-processing.json new file mode 100644 index 0000000..a1cc4b1 --- /dev/null +++ b/conf/grafana/dashboards/task-processing.json @@ -0,0 +1,373 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 2, + "id": 18, + 
"iteration": 1551112370226, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/uncaught/", + "color": "#bf1b00" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(swh_task_called_count{worker=~\"$worker\"}[$interval])) by (task)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + }, + { + "expr": "sum(rate(swh_task_failure_count{worker=~\"$worker\"}[$interval])) by (task)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}} uncaught exceptions", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task counts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "tasks per second", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#c15c17", + "colorScale": "sqrt", + 
"colorScheme": "interpolateViridis", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "Prometheus", + "description": "Each square's color represents the number of tasks that completed within that duration range.", + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 9 + }, + "heatmap": {}, + "hideTimeOverride": false, + "highlightCards": true, + "id": 2, + "legend": { + "show": true + }, + "links": [], + "repeat": "task", + "repeatDirection": "v", + "scopedVars": { + "task": { + "selected": true, + "text": "swh.loader.git.tasks.UpdateGitRepository", + "value": "swh.loader.git.tasks.UpdateGitRepository" + } + }, + "targets": [ + { + "expr": "sum(increase(swh_task_duration_seconds_bucket{task=~\"$task\",worker=~\"$worker\"}[$interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "$task durations", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": null, + "transparent": false, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "refresh": false, + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 100, + "auto_min": "2m", + "current": { + "text": "auto", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { 
+ "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "2m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "swh.loader.git.tasks.UpdateGitRepository", + "value": "swh.loader.git.tasks.UpdateGitRepository" + }, + "datasource": "Prometheus", + "definition": "label_values(swh_task_called_count, task)", + "hide": 0, + "includeAll": true, + "label": "Task name", + "multi": false, + "name": "task", + "options": [], + "query": "label_values(swh_task_called_count, task)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(swh_task_called_count{task=~\"$task\"}, worker)", + "hide": 0, + "includeAll": true, + "label": "Worker", + "multi": true, + "name": "worker", + "options": [], + "query": "label_values(swh_task_called_count{task=~\"$task\"}, worker)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + 
"12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Worker task processing", + "uid": "b_xh3f9ik", + "version": 8 +} diff --git a/conf/grafana/provisioning/dashboards/all.yaml b/conf/grafana/provisioning/dashboards/all.yaml new file mode 100644 index 0000000..aa79647 --- /dev/null +++ b/conf/grafana/provisioning/dashboards/all.yaml @@ -0,0 +1,10 @@ +# config file version +apiVersion: 1 + +providers: + - name: 'default' # name of this dashboard configuration (not dashboard itself) + orgId: 1 # id of the org to hold the dashboard + folder: '' # name of the folder to put the dashboard (http://docs.grafana.org/v5.0/reference/dashboard_folders/) + type: 'file' # type of dashboard description (json files) + options: + path: '/var/lib/grafana/dashboards' # where dashboards are diff --git a/conf/grafana/provisioning/datasources/prometheus.yaml b/conf/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 0000000..289aaf1 --- /dev/null +++ b/conf/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,11 @@ +# config file version +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + url: http://prometheus:9090/prometheus # queried by the Grafana backend over the stack network; path matches --web.external-url + access: proxy # server-side access; browser ("direct") access is deprecated in Grafana + isDefault: true + version: 1 + editable: false diff --git a/conf/nginx.conf b/conf/nginx.conf index 846f15a..e40515a 100644 --- a/conf/nginx.conf +++ b/conf/nginx.conf @@ -1,36 +1,58 @@ worker_processes 4; # Show startup logs on stderr; switch to debug to print, well, debug logs when # running nginx-debug error_log /dev/stderr info; events { worker_connections 1024; } http { include mime.types; default_type application/octet-stream; sendfile on; keepalive_timeout 65; # Built-in Docker resolver. Needed to allow on-demand resolution of proxy # upstreams. 
# resolver 127.0.0.11 valid=30s; upstream appserver { # fail_timeout=0 means we always retry an upstream even if it failed # to return a good HTTP response server "web:5004" fail_timeout=0; - } + } + + upstream prometheus { + server "prometheus:9090" fail_timeout=0; + } + + upstream grafana { + server "grafana:3000" fail_timeout=0; + } server { listen 80 default_server; + location /prometheus { + set $upstream "http://prometheus"; + proxy_pass $upstream; + } + + location /grafana { + set $upstream "http://grafana"; + # Strip the /grafana prefix, with or without a trailing slash, before proxying + rewrite ^/grafana/?(.*)$ /$1 break; + # Pass the client's Host header through instead of the upstream name + proxy_set_header Host $http_host; + proxy_pass $upstream; + } + + location / { set $upstream "http://appserver"; proxy_pass $upstream; } } } diff --git a/conf/prometheus-statsd-mapping.yml b/conf/prometheus-statsd-mapping.yml new file mode 100644 index 0000000..a994106 --- /dev/null +++ b/conf/prometheus-statsd-mapping.yml @@ -0,0 +1,27 @@ +defaults: + timer_type: histogram + buckets: + - .005 + - .01 + - .025 + - .05 + - .1 + - .25 + - .5 + - .75 + - 1 + - 2 + - 5 + - 10 + - 15 + - 30 + - 45 + - 60 + - 120 + - 300 + - 600 + - 900 + - 1800 + - 2700 + - 3600 + - 7200 diff --git a/conf/prometheus.yml b/conf/prometheus.yml new file mode 100644 index 0000000..f342c98 --- /dev/null +++ b/conf/prometheus.yml @@ -0,0 +1,22 @@ +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). 
+ +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + metrics_path: /prometheus/metrics + + - job_name: statsd-exporter + static_configs: + - targets: + - prometheus-statsd-exporter:9102 + + - job_name: jmx-exporter-cassandra + static_configs: + - targets: + - prometheus-jmx-exporter-cassandra:5556 diff --git a/docker-compose.yml b/docker-compose.yml index 91b26b7..025aa31 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,99 +1,131 @@ version: "3.7" services: memcache: image: memcached deploy: replicas: 1 db-storage: image: postgres:11 deploy: replicas: 1 placement: constraints: - node.role == manager environment: POSTGRES_PASSWORD_FILE: /run/secrets/postgres-password POSTGRES_USER: swh volumes: - "storage:/var/lib/postgresql/data" secrets: - source: postgres-password uid: '999' mode: 0400 web: image: softwareheritage/web:latest configs: - source: web target: /etc/softwareheritage/config.yml command: - serve environment: PORT: "5004" depends_on: - memcache objstorage: image: softwareheritage/base:latest deploy: replicas: 1 placement: constraints: - node.role == manager volumes: - "objstorage:/srv/softwareheritage/objects" configs: - source: objstorage target: /etc/softwareheritage/config.yml environment: PORT: "5003" + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: "9125" command: - objstorage storage: image: softwareheritage/base:latest configs: - source: storage target: /etc/softwareheritage/config.yml environment: PGHOST: db-storage PORT: "5002" + STATSD_HOST: prometheus-statsd-exporter + STATSD_PORT: "9125" command: - storage depends_on: - db-storage secrets: - source: postgres-password mode: 0400 nginx: image: nginx configs: - source: nginx target: /etc/nginx/nginx.conf ports: - "5080:80" deploy: placement: constraints: - node.role == manager + prometheus: + image: prom/prometheus + depends_on: + - prometheus-statsd-exporter + command: + # Needed for the reverse-proxy + - 
"--web.external-url=/prometheus" + - "--config.file=/etc/prometheus/prometheus.yml" + volumes: + - "./conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro" + + prometheus-statsd-exporter: + image: prom/statsd-exporter + command: + - "--statsd.mapping-config=/etc/prometheus/statsd-mapping.yml" + volumes: + - "./conf/prometheus-statsd-mapping.yml:/etc/prometheus/statsd-mapping.yml:ro" + + grafana: + image: grafana/grafana + depends_on: + - prometheus + environment: + GF_SERVER_ROOT_URL: http://localhost:5080/grafana + volumes: + - "./conf/grafana/provisioning:/etc/grafana/provisioning:ro" + - "./conf/grafana/dashboards:/var/lib/grafana/dashboards:ro" + volumes: objstorage: storage: secrets: postgres-password: external: true configs: web: file: conf/web.yml storage: file: conf/storage.yml objstorage: file: conf/objstorage.yml nginx: file: conf/nginx.conf