#!/usr/bin/python3
#
# Originally part of Credativ's Elephant Shed
# https://github.com/credativ/elephant-shed
# prometheus/sql_exporter/update-prometheus-sql-exporter-config
# Licensed under the GPLv3.
#
# Adapted to run on Python 3.5

import sys
import yaml
import subprocess
import traceback
from pkg_resources import parse_version


def get_clusters(min_version=None, max_version=None):
    """Return a list of dicts describing the local PostgreSQL clusters.

    Each dict carries the attributes reported by pg_lsclusters --
    version, name, port, status, owner -- plus a "databases" list
    obtained via get_databases().

    NOTE(review): min_version/max_version are accepted for backward
    compatibility but are currently unused; version filtering happens
    in append_conn_strings_to_template() instead.
    """
    clusters = []

    # Fix: the original passed an argv *list* together with shell=True.
    # On POSIX that makes only the first list element the shell command
    # line and silently drops the rest -- it worked by accident.  Run
    # the command directly, without a shell.
    proc = subprocess.Popen(["pg_lsclusters"],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    (out, err) = proc.communicate()

    out = out.rstrip("\n")
    out_lines = out.splitlines()

    # Start at 1 to skip the header line printed by pg_lsclusters.
    for i in range(1, len(out_lines)):
        cluster = dict()
        version, name, port, status, owner = out_lines[i].split()[:5]
        cluster["version"] = version
        cluster["name"] = name
        cluster["port"] = port
        cluster["status"] = status
        cluster["owner"] = owner
        cluster["databases"] = get_databases(cluster)
        clusters.append(cluster)

    return clusters
def get_databases(cluster):
    """Return the list of database names for the given cluster dict.

    Template databases (template0/template1) are excluded.  The query
    is run via sudo as the cluster owner against the cluster's port.
    """
    databases = []
    query = ("SELECT datname FROM pg_database "
             "WHERE NOT datname ~ '^template(0|1)$';")
    # Fix: build an argv list instead of interpolating owner/port into a
    # shell command string -- no shell, no quoting pitfalls.
    proc = subprocess.Popen(
        ["sudo", "-u", cluster["owner"], "psql", "-p", cluster["port"],
         "-tXA", "-c", query],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        universal_newlines=True)
    (out, err) = proc.communicate()
    for line in out.splitlines():
        databases.append(line.split("|")[0])

    return databases


def read_config_template():
    """Read and parse the config template; return it as a dict."""
    config_filename = "/etc/prometheus-sql-exporter.yml.in"
    with open(config_filename, "r") as f:
        # Fix: yaml.load() without an explicit Loader can construct
        # arbitrary Python objects and is deprecated; safe_load parses
        # plain data, which is all this template contains.
        yaml_conf = yaml.safe_load(f)

    return yaml_conf


def write_config(conf):
    """Serialize the given config dict to the exporter config file."""
    config_filename = "/etc/prometheus-sql-exporter.yml"
    with open(config_filename, "w") as f:
        yaml.dump(conf, f)

    return True


def append_conn_strings_to_template(conf):
    """Fill in the "connections" list of every job in the config.

    Jobs may restrict the clusters they apply to via the optional
    min_version/max_version keys.  Jobs whose name starts with
    "cluster" get one connection per matching cluster; jobs whose name
    starts with "database" get one connection per database.
    """
    clusters = get_clusters()

    for job in conf["jobs"]:
        min_version = parse_version("0")
        max_version = parse_version("99999999999")
        used_clusters = []
        conn_strings = []

        if "min_version" in job:
            min_version = parse_version(job["min_version"])

        if "max_version" in job:
            max_version = parse_version(job["max_version"])

        for cluster in clusters:
            cluster_version = parse_version(cluster["version"])
            if min_version <= cluster_version <= max_version:
                used_clusters.append(cluster)

        for cluster in used_clusters:
            if job["name"].startswith("cluster"):
                conn_strings.extend(build_conn_strings(cluster, False))
            if job["name"].startswith("database"):
                conn_strings.extend(build_conn_strings(cluster, True))

        job["connections"] = conn_strings

    return conf
def build_conn_strings(cluster, per_db=True):
    """Return connection strings for one cluster.

    If per_db is True, one string per database in the cluster is
    returned.  Otherwise a single string for the cluster is returned,
    connecting to the "postgres" maintenance database (the original
    comment said "template1", but the code has always used postgres).
    The host part is left empty on purpose: the exporter supplies it
    via the PGHOST environment variable.
    """
    conn_strings = []

    if per_db:
        for db in cluster["databases"]:
            conn_strings.append('postgres://%s@:%s/%s?sslmode=disable'
                                % (cluster["owner"], cluster["port"], db))
    else:
        conn_strings.append('postgres://%s@:%s/postgres?sslmode=disable'
                            % (cluster["owner"], cluster["port"]))

    return conn_strings


if __name__ == "__main__":
    # Fix: dropped an unused "clusters = get_clusters()" that ran
    # pg_lsclusters (and one psql per cluster) a second time for nothing.
    try:
        yaml_conf = read_config_template()
    except Exception:  # Fix: bare "except:" also swallowed SystemExit
        sys.stderr.write("Could not read config template.\n")
        print('-' * 60)
        traceback.print_exc(file=sys.stdout)
        print('-' * 60)
        sys.exit(1)

    try:
        yaml_conf = append_conn_strings_to_template(yaml_conf)
    except Exception:
        # Fix: the original message claimed the *write* failed here.
        sys.stderr.write("Could not append connection strings to config.\n")
        print('-' * 60)
        traceback.print_exc(file=sys.stdout)
        print('-' * 60)
        sys.exit(1)

    write_config(yaml_conf)

    sys.exit(0)

# Residue of the whitespace-mangled patch this file was pasted from: the
# beginning of manifests/prometheus/sql.pp (Puppet), which continues on the
# following lines of the original paste.  Kept verbatim as data.
_PUPPET_SQL_PP_FRAGMENT = r'''
diff --git a/manifests/prometheus/sql.pp b/manifests/prometheus/sql.pp
new file mode 100644
index 0000000..337b2ee
--- /dev/null
+++ b/manifests/prometheus/sql.pp
@@ -0,0 +1,77 @@
# Deployment of prometheus SQL exporter

class profile::prometheus::sql {
  package {'prometheus-sql-exporter':
    ensure => latest,
  }

  service {'prometheus-sql-exporter':
    ensure => 'running',
    enable => true,
    require => [
      Package['prometheus-sql-exporter'],
      # NOTE(review): Debian convention is /etc/default (singular); confirm
      # this path together with the matching file resource further down --
      # the two must be changed in lockstep.
      File['/etc/defaults/prometheus-sql-exporter'],
      Exec['/usr/bin/update-prometheus-sql-exporter-config'],
    ]
  }

  file {'/usr/bin/update-prometheus-sql-exporter-config':
    ensure => present,
    owner  => 'root',
    group  => 'root',
    mode   => '0755',
    source => 'puppet:///modules/profile/prometheus/sql/update-prometheus-sql-exporter-config',
  }

  # Fix: the update script reads /etc/prometheus-sql-exporter.yml.in, but
  # this template was deployed to /etc/prometheus/prometheus-sql-exporter.yml.in,
  # so the config generator could never find its input.  Nothing else
  # references this resource by title, so the rename is self-contained.
  file {'/etc/prometheus-sql-exporter.yml.in':
    ensure  => present,
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
    content =>
'''
template('profile/prometheus/sql/prometheus-sql-exporter.yml.in.erb'),
    notify  => Exec['/usr/bin/update-prometheus-sql-exporter-config'],
  }

  # Python dependencies of the update script.
  $update_deps = ['python3-pkg-resources', 'python3-yaml']
  ensure_packages(
    $update_deps, {
      ensure => present
    },
  )

  exec {'/usr/bin/update-prometheus-sql-exporter-config':
    # Fix: the original combined "refreshonly" with
    # "creates => /etc/prometheus/prometheus-sql-exporter.yml".  The script
    # actually writes /etc/prometheus-sql-exporter.yml (see CONFIG in the
    # defaults template below), so the guard pointed at a file that never
    # exists; and had the path been corrected, "creates" would have
    # suppressed every regeneration after the first one.  "refreshonly"
    # alone expresses the intent: re-run whenever the template changes.
    refreshonly => true,
    require     => [
      Package[$update_deps],
      File['/usr/bin/update-prometheus-sql-exporter-config'],
    ],
  }

  $listen_network = lookup('prometheus::sql::listen_network', Optional[String], 'first', undef)
  $listen_address = lookup('prometheus::sql::listen_address', Optional[String], 'first', undef)
  $actual_listen_address = pick($listen_address, ip_for_network($listen_network))
  $listen_port = lookup('prometheus::sql::listen_port')
  $target = "${actual_listen_address}:${listen_port}"

  $defaults_config = {
    web => {
      listen_address => $target,
    },
  }

  # NOTE(review): Debian convention is /etc/default (singular); confirm the
  # packaged unit actually reads /etc/defaults/prometheus-sql-exporter.
  # The Service requirement earlier in this class references the same title,
  # so any rename must change both places together.
  file {'/etc/defaults/prometheus-sql-exporter':
    ensure  => present,
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
    content => template('profile/prometheus/sql/prometheus-sql-exporter.defaults.erb'),
    require => Package['prometheus-sql-exporter'],
    notify  => Service['prometheus-sql-exporter'],
  }

  profile::prometheus::export_scrape_config {'sql':
    target => $target,
  }
}
# ---- templates/prometheus/sql/prometheus-sql-exporter.defaults.erb ----
# (new file mode 100644, index 0000000..c5cddbe in the original patch)

CONFIG=/etc/prometheus-sql-exporter.yml
PGHOST=/var/run/postgresql
PGAPPNAME=prometheus-sql-exporter
GOGC=40
GOMAXPROCS=4
LOGLEVEL=WARN
ARGS="\
<%- scope.call_function('flatten_to_argument_list', [@defaults_config]).each do |argument| -%>
  <%= argument %> \
<%- end -%>
"
diff --git
# ---- templates/prometheus/sql/prometheus-sql-exporter.yml.in.erb ----
# (new file mode 100644, index 0000000..f348b5c in the original patch)
---
jobs:
# These checks are done for every cluster
- name: "cluster"
  interval: '0'
  connections:
  # We have to omit the hostname, we use PGHOST as environment variable.
  # See /etc/default/prometheus-sql-exporter
  - 'postgres://postgres@/postgres?sslmode=disable'
  queries:
  - name: "running_queries"
    help: "Number of running queries"
    labels:
    - "datname"
    - "usename"
    values:
    - "count"
    query: |
      SELECT datname::text, usename::text, COUNT(*)::float AS count
      FROM pg_stat_activity
      WHERE NOT datname ~ '^template(0|1)$'
      GROUP BY datname, usename;
  - name: "pg_stat_activity"
    help: "Running Backends by Database and state"
    labels:
    - "datname"
    - "state"
    values:
    - "count"
    - "max_tx_duration"
    query: |
      SELECT
        pg_database.datname::text,
        tmp.state::text,
        COALESCE(count,0) as count,
        COALESCE(max_tx_duration,0) as max_tx_duration
      FROM
      (
        VALUES ('active'),
               ('idle'),
               ('idle in transaction'),
               ('idle in transaction (aborted)'),
               ('fastpath function call'),
               ('disabled')
      ) AS tmp(state) CROSS JOIN pg_database
      LEFT JOIN
      (
        SELECT
          datname,
          state,
          count(*) AS count,
          MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration
        FROM pg_stat_activity
        GROUP BY datname,state) AS tmp2
      ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname
      WHERE NOT pg_database.datname ~ '^template(0|1)$'
  - name: "settings"
    help: "PostgreSQL settings"
    labels:
    - "settings"
    values:
    - "max_connections"
    - "autovacuum_freeze_max_age"
    - "superuser_reserved_connections"
    - "max_wal_senders"
    - "max_prepared_transactions"
    query: |
      SELECT
        current_setting('max_connections')::float AS max_connections,
        current_setting('autovacuum_freeze_max_age')::float AS autovacuum_freeze_max_age,
        current_setting('superuser_reserved_connections')::float AS superuser_reserved_connections,
        current_setting('max_wal_senders')::float AS max_wal_senders,
        current_setting('max_prepared_transactions')::float AS max_prepared_transactions;
  - name: "pg_locks_sum"
    help: "number of current locks"
    labels:
    - "mode"
    values:
    - "count"
    query: |
      SELECT t.mode, count(*)
      FROM
      (VALUES
        ('AccessShareLock'),
        ('RowShareLock'),
        ('RowExclusiveLock'),
        ('ShareUpdateExclusiveLock'),
        ('ShareLock'),
        ('ShareRowExclusiveLock'),
        ('ExclusiveLock'),
        ('AccessExclusiveLock')
      ) t(mode)
      FULL JOIN pg_locks l ON t.mode = l.mode GROUP BY 1;
  - name: "pg_stat_database"
    help: "Database statistics"
    type: "counter"
    labels:
    - "datname"
    values:
    - "numbackends:count"
    - "xact_commit"
    - "xact_rollback"
    - "blks_read"
    - "blks_hit"
    - "tup_returned"
    - "tup_fetched"
    - "tup_inserted"
    - "tup_updated"
    - "tup_deleted"
    - "conflicts"
    - "temp_files"
    - "temp_bytes"
    - "deadlocks"
    - "blk_read_time"
    - "blk_write_time"
    - "freeze_age"
    - "dbsize"
    query: |
      SELECT
        s.datname::text,
        numbackends::float,
        xact_commit::float,
        xact_rollback::float,
        blks_read::float,
        blks_hit::float,
        tup_returned::float,
        tup_fetched::float,
        tup_inserted::float,
        tup_updated::float,
        tup_deleted::float,
        conflicts::float,
        temp_files::float,
        temp_bytes::float,
        deadlocks::float,
        blk_read_time,
        blk_write_time,
        age(d.datfrozenxid) AS freeze_age,
        pg_database_size(s.datname)::float AS dbsize
      FROM pg_stat_database s
      LEFT JOIN pg_database d ON d.datname = s.datname
      WHERE NOT s.datname ~ '^template(0|1)$'
  - name: "pg_stat_archiver"
    help: "pg_stat_archiver"
    values:
    - "archived_count"
    - "last_archived_time"
    - "failed_count"
    - "last_failed_time"
    - "stats_reset"
    query: |
      SELECT
        archived_count,
        EXTRACT(EPOCH FROM last_archived_time),
        failed_count,
        EXTRACT(EPOCH FROM last_failed_time),
        EXTRACT(EPOCH FROM stats_reset)
      FROM pg_stat_archiver
  - name: "checkpoints"
    help: "Requested and timed Checkpoints"
    values:
    - "timed"
    - "requested"
    query: |
      SELECT
        pg_stat_get_bgwriter_timed_checkpoints() timed,
        pg_stat_get_bgwriter_requested_checkpoints() requested;
  - name: "txid"
    help: "current txid"
    values:
    - "txid_current"
    query: |
      SELECT
        txid_current();
  - name: "pg_stat_statements"
    help: "pg_stat_statements"
    labels:
    - "usename"
    - "datname"
    - "queryid"
    - "query"
    values:
    - "calls"
    - "total_time"
    - "rows"
    - "shared_blks_hit"
    - "shared_blks_read"
    - "shared_blks_dirtied"
    # Fix: the query aliases this column "shared_blks_written"; the former
    # "shared_blks_writte" never matched any result column.
    - "shared_blks_written"
    - "local_blks_hit"
    - "local_blks_read"
    - "local_blks_dirtied"
    # Fix: likewise "local_blks_writte" -> "local_blks_written".
    - "local_blks_written"
    - "temp_blks_read"
    - "temp_blks_written"
    query: |
      WITH w_pg_stat_statements AS (
        SELECT * FROM pg_stat_statements
      )
      (SELECT
        usename::text
        , datname::text
        , queryid::text
        , substr(regexp_replace(query, E'[\\n\\r]+', ' ', 'g' ),1,1024) AS query
        , calls
        , total_time
        , rows
        , shared_blks_hit
        , shared_blks_read
        , shared_blks_dirtied
        , shared_blks_written
        , local_blks_hit
        , local_blks_read
        , local_blks_dirtied
        , local_blks_written
        , temp_blks_read
        , temp_blks_written
      FROM w_pg_stat_statements pss
      JOIN pg_database pd
        ON pss.dbid = pd.oid
      JOIN pg_user pu
        ON pss.userid = pu.usesysid
      ORDER BY pss.total_time DESC
      LIMIT 25)
      UNION
      SELECT
        usename::text
        , datname::text
        , queryid::text
        , substr(regexp_replace(query, E'[\\n\\r]+', ' ', 'g' ),1,1024) AS query
        , calls
        , total_time
        , rows
        , shared_blks_hit
        , shared_blks_read
        , shared_blks_dirtied
        , shared_blks_written
        , local_blks_hit
        , local_blks_read
        , local_blks_dirtied
        , local_blks_written
        , temp_blks_read
        , temp_blks_written
      FROM w_pg_stat_statements pss2
      JOIN pg_database pd2
        ON pss2.dbid = pd2.oid
      JOIN pg_user pu2
        ON pss2.userid = pu2.usesysid
      ORDER BY calls DESC
      LIMIT 25;
  - name: "prepared_transactions"
    help: "Prepared Transactions"
    labels:
    - "datname"
    values:
    - "count"
    query: |
      SELECT
        datname::text,
        COUNT(transaction) AS count
      FROM pg_database d
      LEFT JOIN pg_prepared_xacts x ON d.datname = x.database
      WHERE NOT d.datname ~ '^template(0|1)$'
      GROUP BY datname

- name: "cluster_9"
  interval: '0'
  min_version: "9.0"
  max_version: "9.6"
  connections:
  # We have to omit the hostname, we use PGHOST as environment variable.
  # See /etc/default/prometheus-sql-exporter
  queries:
  - name: "pg_stat_replication"
    help: "pg_stat_replication"
    labels:
    - "application_name"
    - "usename"
    - "client_addr"
    - "client_port"
    values:
    - "pid"
    - "current_xlog_location_bytes"
    - "sent_location_bytes"
    - "flush_location_bytes"
    - "replay_location_bytes"
    - "send_lag_bytes"
    - "flush_lag_bytes"
    - "replay_lag_bytes"
    query: |
      SELECT
        COALESCE(pid, 0) AS pid,
        COALESCE(application_name, ' ')::text AS application_name,
        COALESCE(usename, ' ')::text AS usename,
        COALESCE(client_addr::text, 'local')::text AS client_addr,
        COALESCE(client_port::text, ' ') AS client_port,
        COALESCE(pg_xlog_location_diff(pg_current_xlog_location(), '0/0'), 0) AS current_xlog_location_bytes,
        COALESCE(pg_xlog_location_diff(sent_location, '0/0'), 0) AS sent_location_bytes,
        COALESCE(pg_xlog_location_diff(flush_location, '0/0'), 0) AS flush_location_bytes,
        COALESCE(pg_xlog_location_diff(replay_location, '0/0'), 0) AS replay_location_bytes,
        COALESCE(pg_xlog_location_diff(pg_current_xlog_location(), sent_location), 0) AS send_lag_bytes,
        COALESCE(pg_xlog_location_diff(pg_current_xlog_location(), flush_location), 0) AS flush_lag_bytes,
        COALESCE(pg_xlog_location_diff(pg_current_xlog_location(), replay_location), 0) AS replay_lag_bytes
      FROM pg_stat_replication LEFT JOIN (VALUES(0)) filler(i) ON TRUE
  - name: "waldistance"
    help: "amount of wal written since creation"
    values:
    - "location"
    query: |
      SELECT
        pg_xlog_location_diff(pg_current_xlog_location(),'0/0000') "location";
  - name: "LastCheckpointDistance"
    help: "Distance to the last checkpoint"
    values:
    - "distance"
    query: |
      SELECT
        pg_xlog_location_diff( pg_current_xlog_location(), checkpoint_location) "distance" FROM pg_control_checkpoint();
  - name: "archive_ready"
    help: "archive_ready"
    values:
    - "archive_ready"
    query: |
      SELECT COUNT(1) AS archive_ready FROM pg_ls_dir('./pg_xlog/archive_status') WHERE pg_ls_dir ~ '^[0-9a-fA-F]{24}\.ready$';


- name: "cluster_10"
  interval: '0'
  min_version: "10"
  max_version: "999999"
  connections:
  # We have to omit the hostname, we use PGHOST as environment variable.
  # See /etc/default/prometheus-sql-exporter
  queries:
  - name: "pg_stat_replication"
    help: "pg_stat_replication"
    labels:
    - "application_name"
    - "usename"
    - "client_addr"
    - "client_port"
    values:
    - "pid"
    - "current_xlog_lsn_bytes"
    - "sent_lsn_bytes"
    - "flush_lsn_bytes"
    - "replay_lsn_bytes"
    - "send_lag_bytes"
    - "flush_lag_bytes"
    - "replay_lag_bytes"
    query: |
      SELECT
        COALESCE(pid, 0) AS pid,
        COALESCE(application_name, ' ')::text AS application_name,
        COALESCE(usename, ' ')::text AS usename,
        COALESCE(client_addr::text, 'local')::text AS client_addr,
        COALESCE(client_port::text, ' ') AS client_port,
        COALESCE(pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0'), 0) AS current_xlog_lsn_bytes,
        -- Fix: these aliases must match the "values" list above; they were
        -- still named *_location_bytes (copied from the 9.x job), so the
        -- exporter never found sent_lsn_bytes/flush_lsn_bytes/replay_lsn_bytes.
        COALESCE(pg_wal_lsn_diff(sent_lsn, '0/0'), 0) AS sent_lsn_bytes,
        COALESCE(pg_wal_lsn_diff(flush_lsn, '0/0'), 0) AS flush_lsn_bytes,
        COALESCE(pg_wal_lsn_diff(replay_lsn, '0/0'), 0) AS replay_lsn_bytes,
        COALESCE(pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn), 0) AS send_lag_bytes,
        COALESCE(pg_wal_lsn_diff(pg_current_wal_lsn(), flush_lsn), 0) AS flush_lag_bytes,
        COALESCE(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn), 0) AS replay_lag_bytes
      FROM pg_stat_replication FULL JOIN (VALUES(0)) filler(i) ON TRUE
  - name: "waldistance"
    help: "amount of wal written since creation"
    values:
    - "location"
    query: |
      SELECT
        pg_wal_lsn_diff(pg_current_wal_lsn(),'0/0000') "location";
  - name: "LastCheckpointDistance"
    help: "Distance to the last checkpoint"
    values:
    - "distance"
    query: |
      SELECT
        pg_wal_lsn_diff( pg_current_wal_lsn(), checkpoint_lsn) "distance" FROM pg_control_checkpoint();
  - name: "archive_ready"
    help: "archive_ready"
    values:
    - "archive_ready"
    query: |
      SELECT COUNT(1) AS archive_ready FROM pg_ls_dir('./pg_wal/archive_status') WHERE pg_ls_dir ~ '^[0-9a-fA-F]{24}\.ready$';

# These checks are done for every database
- name: "database"
  interval: '24h'
  connections:
  # We have to omit the hostname, we use PGHOST as environment variable.
  # See /etc/default/prometheus-sql-exporter
  - 'postgres://postgres@/postgres?sslmode=disable'
  queries:
  - name: "CREATE EXTENSION pg_stat_statements"
    help: "Creates the needed extension pg_stat_statements and does not produce errors."
    query: |
      DO $$
      BEGIN
        EXECUTE 'CREATE EXTENSION pg_stat_statements';
      EXCEPTION WHEN OTHERS THEN
        RETURN;
      END;
      $$;
# NOTE(review): this job shares the name "database" with the one above;
# confirm the sql_exporter tolerates duplicate job names before renaming.
- name: "database"
  interval: '0'
  connections:
  # We have to omit the hostname, we use PGHOST as environment variable.
  # See /etc/default/prometheus-sql-exporter
  - 'postgres://postgres@/postgres?sslmode=disable'
  queries:
  - name: "pg_stat_user_tables"
    help: "Table stats"
    labels:
    - "datname"
    - "schemaname"
    - "relname"
    values:
    - "seq_scan"
    - "seq_tup_read"
    - "idx_scan"
    - "idx_tup_fetch"
    - "n_tup_ins"
    - "n_tup_upd"
    - "n_tup_del"
    - "n_tup_hot_upd"
    - "n_live_tup"
    - "n_dead_tup"
    - "vacuum_count"
    - "autovacuum_count"
    - "analyze_count"
    - "autoanalyze_count"
    query: |
      SELECT
        current_database()::text AS datname
        , coalesce(schemaname::text, 'null') AS schemaname
        , coalesce(relname::text, 'null') AS relname
        , coalesce(seq_scan,0)::float AS seq_scan
        , coalesce(seq_tup_read,0)::float AS seq_tup_read
        , coalesce(idx_scan,0)::float AS idx_scan
        , coalesce(idx_tup_fetch,0)::float AS idx_tup_fetch
        , coalesce(n_tup_ins,0)::float AS n_tup_ins
        , coalesce(n_tup_upd,0)::float AS n_tup_upd
        , coalesce(n_tup_del,0)::float AS n_tup_del
        , coalesce(n_tup_hot_upd,0)::float AS n_tup_hot_upd
        , coalesce(n_live_tup,0)::float AS n_live_tup
        , coalesce(n_dead_tup,0)::float AS n_dead_tup
        , coalesce(vacuum_count,0)::float AS vacuum_count
        , coalesce(autovacuum_count,0)::float AS autovacuum_count
        , coalesce(analyze_count,0)::float AS analyze_count
        , coalesce(autoanalyze_count,0)::float AS autoanalyze_count
      FROM pg_stat_user_tables FULL JOIN (VALUES(0)) filler(i) ON TRUE;
  - name: "pg_statio_user_tables"
    help: "IO Stats"
    labels:
    - "datname"
    - "schemaname"
    - "relname"
    values:
    - "heap_blks_read"
    - "heap_blks_hit"
    - "idx_blks_read"
    - "idx_blks_hit"
    query: |
      SELECT
        current_database()::text AS datname
        , COALESCE(schemaname::text, 'null') AS schemaname
        , COALESCE(relname::text, 'null') AS relname
        , COALESCE(heap_blks_read::float,0) AS heap_blks_read
        , COALESCE(heap_blks_hit::float,0) AS heap_blks_hit
        , COALESCE(idx_blks_read::float,0) AS idx_blks_read
        , COALESCE(idx_blks_hit::float,0) AS idx_blks_hit
      FROM pg_statio_user_tables FULL JOIN (VALUES(0)) filler(i) ON TRUE;
  - name: "BufferAccess"
    help: "Buffers Accessed"
    labels:
    - "datname"
    - "schemaname"
    - "relname"
    values:
    - "reads"
    query: |
      SELECT
        current_database()::text AS datname,
        COALESCE(schemaname::text, 'null') AS schemaname,
        COALESCE(relname::text, 'null') AS relname,
        -- Fix: the PostgreSQL block size is 8192 bytes; 8196 was a typo.
        sum( coalesce( heap_blks_read,0)
           + coalesce(heap_blks_hit,0)
           + coalesce(idx_blks_hit,0)
           + coalesce(idx_blks_read,0)
           + coalesce(toast_blks_hit,0)
           + coalesce(toast_blks_read,0)
           + coalesce(tidx_blks_hit,0)
           + coalesce(tidx_blks_read,0))*8192::bigint as reads
      FROM pg_statio_user_tables FULL JOIN (VALUES(0)) filler(i) ON TRUE
      GROUP BY 1,2,3;
  - name: "Maintenancecounters"
    help: "Counters for maintenance jobs on user tables"
    labels:
    - "datname"
    values:
    - "vacuum_count"
    - "autovacuum_count"
    - "analyze_count"
    - "autoanalyze_count"
    query: |
      -- NOTE(review): help text says "user tables" but this sums over
      -- pg_stat_all_tables (includes system tables) -- confirm intent.
      SELECT
        current_database()::text AS datname,
        sum( vacuum_count) vacuum_count,
        sum( autovacuum_count) autovacuum_count,
        sum( analyze_count) analyze_count,
        sum( autoanalyze_count) autoanalyze_count
      FROM pg_stat_all_tables;