diff --git a/site-modules/profile/files/prometheus/sql/config/swh-scheduler.yml b/site-modules/profile/files/prometheus/sql/config/swh-scheduler.yml
index 76964295..4305ed80 100644
--- a/site-modules/profile/files/prometheus/sql/config/swh-scheduler.yml
+++ b/site-modules/profile/files/prometheus/sql/config/swh-scheduler.yml
@@ -1,218 +1,220 @@
 - name: swh_scheduler_delay
   scope: database
   database: ^(swh|softwareheritage)-scheduler$
   interval: '1h'
   help: "Software Heritage Scheduler task delay spread. Positive delay for tasks whose execution is late"
   query: |
     with task_count_by_bucket as (
       -- get the count of tasks by delay bucket. Tasks are grouped by their
       -- characteristics (type, status, policy, priority, current interval),
       -- then by delay buckets that are 1 hour wide between -24 and +24 hours,
       -- and 1 day wide outside of this range.
       -- A positive delay means the task execution is late wrt scheduling.
       select
         "type",
         status,
         "policy",
         priority,
         current_interval,
         (
           -- select the bucket widths
           case
             when delay between - 24 * 3600 and 24 * 3600
               then (ceil(delay / 3600)::bigint) * 3600
             else (ceil(delay / (24 * 3600))::bigint) * 24 * 3600
           end
         ) as delay_bucket,
         count(*)
       from task
       join lateral (
         -- this is where the "positive = late" convention is set
         select extract(epoch from (now() - next_run)) as delay
       ) as d on true
       group by "type", status, "policy", priority, current_interval, delay_bucket
       order by "type", status, "policy", priority, current_interval, delay_bucket
     ),
     delay_bounds as (
       -- get the minimum and maximum delay bucket for each task group. This will
       -- let us generate all the buckets, even the empty ones in the next CTE.
       select
         "type",
         status,
         "policy",
         priority,
         current_interval,
         min(delay_bucket) as min,
         max(delay_bucket) as max
       from task_count_by_bucket
       group by "type", status, "policy", priority, current_interval
     ),
     task_buckets as (
       -- Generate all time buckets for all categories.
       select
         "type",
         status,
         "policy",
         priority,
         current_interval,
         delay_bucket
       from delay_bounds
       join lateral (
         -- 1 hour buckets
         select generate_series(- 23, 23) * 3600 as delay_bucket
         union
         -- 1 day buckets. The "- 1" is used to make sure we generate an empty
         -- bucket as lowest delay bucket, so prometheus quantile calculations
         -- stay accurate
         select generate_series(min / (24 * 3600) - 1, max / (24 * 3600)) * 24 * 3600 as delay_bucket
       ) as buckets on true
     ),
     task_count_for_all_buckets as (
       -- This join merges the non-empty buckets (task_count_by_bucket) with
       -- the full list of buckets (task_buckets).
       -- The join clause can't use the "using (x, y, z)" syntax, as it uses
       -- equality and priority and current_interval can be null. This also
       -- forces us to label all the fields in the select. Ugh.
       select
         task_buckets."type",
         task_buckets.status,
         task_buckets."policy",
         task_buckets.priority,
         task_buckets.current_interval,
         task_buckets.delay_bucket,
         coalesce(count, 0) as count -- make sure empty buckets have a 0 count instead of null
       from task_buckets
       left join task_count_by_bucket
         on task_count_by_bucket."type" = task_buckets."type"
         and task_count_by_bucket.status = task_buckets.status
         and task_count_by_bucket."policy" = task_buckets."policy"
         and task_count_by_bucket.priority is not distinct from task_buckets.priority
         and task_count_by_bucket.current_interval is not distinct from task_buckets.current_interval
         and task_count_by_bucket.delay_bucket = task_buckets.delay_bucket
     ),
     cumulative_buckets as (
       -- Prometheus wants cumulative histograms: for each bucket, the value
       -- needs to be the total of all measurements below the given value (this
       -- allows downsampling by just throwing away some buckets). We use the
       -- "sum over partition" window function to compute this.
       -- Prometheus also expects a "+Inf" bucket for the total count. We
       -- generate it with a null le value so we can sort it after the rest of
       -- the buckets.

       -- cumulative data
       select
         "type",
         status,
         "policy",
         priority,
         current_interval,
         delay_bucket as le,
         sum(count) over (
           partition by "type", status, "policy", priority, current_interval
           order by delay_bucket
         )
       from task_count_for_all_buckets

       union all

       -- +Inf data
       select
         "type",
         status,
         "policy",
         priority,
         current_interval,
         null as le,
         sum(count)
       from task_count_for_all_buckets
       group by "type", status, "policy", priority, current_interval

       -- sorting of all buckets
       order by
         "type", status, "policy", priority, current_interval,
         le asc NULLS last -- make sure +Inf ends up last
     )
     -- The final query, which at this point just has to make sure that all
     -- labels are text (or the SQL exporter croaks)
     select
       -- we retrieve the backend name here as that's what we have e.g. on the celery side
       (select backend_name from task_type where cumulative_buckets."type" = task_type."type") as task,
       status::text as status,
       policy::text as policy,
       coalesce(priority::text, '') as priority,
       coalesce(current_interval::text, '') as current_interval,
       coalesce(le::text, '+Inf') as le,
       sum
     from cumulative_buckets
   labels:
     - task
     - status
     - policy
     - priority
     - current_interval
     - le
   values:
     - sum

-- name: swh_scheduler
+- name: swh_scheduler_origins
   scope: database
   database: ^(softwareheritage|swh)-scheduler$
   interval: '15m'
   help: "Software Heritage Scheduler Metrics"
   query: |
     select l.name, l.instance_name, sm.visit_type,
-      extract(epoch from sm.last_update) as last_update,
-      sm.origins_known, sm.origins_enabled, sm.origins_never_visited,
-      sm.origins_with_pending_changes
+      extract(epoch from sm.last_update) as last_update,
+      sm.origins_known as known,
+      sm.origins_enabled as enabled,
+      sm.origins_never_visited as never_visited,
+      sm.origins_with_pending_changes as with_pending_changes
     from scheduler_metrics sm
     inner join listers l on sm.lister_id=l.id
     order by l.name, l.instance_name
   labels:
     - name
     - instance_name
     - visit_type
   values:
     - last_update
-    - origins_known
-    - origins_enabled
-    - origins_never_visited
-    - origins_with_pending_changes
+    - known
+    - enabled
+    - never_visited
+    - with_pending_changes
diff --git a/site-modules/profile/files/prometheus/update-prometheus-config b/site-modules/profile/files/prometheus/update-prometheus-config
index 87fe42da..f79dc94a 100755
--- a/site-modules/profile/files/prometheus/update-prometheus-config
+++ b/site-modules/profile/files/prometheus/update-prometheus-config
@@ -1,133 +1,179 @@
 #!/usr/bin/env python3
 #
 # This generates a static configuration for Prometheus
 #
-# Copyright © 2020 The Software Heritage Developers.
+# Copyright © 2020-2021 The Software Heritage Developers.
 # This file is released under the Apache-2.0 License.
 #
-from collections import defaultdict
 import copy
-from dataclasses import asdict, dataclass, fields
 import datetime
 import os
 import stat
 import sys
+from collections import defaultdict
+from dataclasses import asdict, dataclass, fields, is_dataclass
 from typing import Any, Dict, Iterable, List, Optional, Tuple

 import yaml
+from typing_extensions import Literal
+
+
+@dataclass(frozen=True)
+class RelabelConfig:
+    source_labels: Optional[Tuple[str]]
+    separator: Optional[str]
+    target_label: Optional[str]
+    regex: Optional[str]
+    modulus: Optional[int]
+    replacement: Optional[str]
+    action: Literal[
+        "replace", "keep", "drop", "hashmod", "labelmap", "labeldrop", "labelkeep"
+    ]
+
+    @classmethod
+    def from_dict(cls, dict):
+        init_vars = {field.name: dict.get(field.name) for field in fields(cls)}
+
+        if init_vars.get("source_labels"):
+            init_vars["source_labels"] = tuple(init_vars["source_labels"])
+
+        return cls(**init_vars)


 @dataclass(frozen=True)
 class JobGroup:
     """Job parameters from which to group prometheus jobs"""

     job_name: str
     scrape_interval: Optional[int]
     scrape_timeout: Optional[int]
     metrics_path: Optional[str]
     scheme: Optional[str]
-    params: Optional[Tuple]
+    params: Optional[Tuple[Tuple[str, Tuple[str]], ...]]
+    metric_relabel_configs: Optional[Tuple[RelabelConfig]]

     @classmethod
     def from_dict(cls, dict):
         init_vars = {field.name: dict.get(field.name) for field in fields(cls)}

-        if init_vars.get('metrics_path') == '/metrics':
-            init_vars['metrics_path'] = None
+        if init_vars.get("metrics_path") == "/metrics":
+            init_vars["metrics_path"] = None
+
+        if init_vars.get("scheme") == "http":
+            init_vars["scheme"] = None

-        if init_vars.get('scheme') == 'http':
-            init_vars['scheme'] = None
+        if init_vars.get("metric_relabel_configs"):
+            init_vars["metric_relabel_configs"] = tuple(
+                RelabelConfig.from_dict(args)
+                for args in init_vars.get("metric_relabel_configs")
+            )

         return cls(**init_vars)


 def load_yaml_from_dir(dirname: str) -> Iterable[Dict[str, Any]]:
     """Load all yaml files from a given directory"""
     for filename in os.listdir(dirname):
         if not filename.endswith((".yml", ".yaml")):
             continue
         path = os.path.join(dirname, filename)
         with open(path, "r") as f:
             yield from yaml.safe_load(f)


-def dict_factory(data):
-    d = dict(data)
-
-    if d.get("params") is not None:
-        d["params"] = {k:list(v) for k,v in d["params"]}
+def convert_to_dict(v: Any, field_name: Optional[str] = None) -> Any:
+    if field_name == "params":
+        return {kk: list(vv) for kk, vv in v}
+    elif is_dataclass(v):
+        return {
+            field.name: convert_to_dict(getattr(v, field.name), field.name)
+            for field in fields(v)
+            if getattr(v, field.name) is not None
+        }
+    elif isinstance(v, (list, tuple)):
+        return [convert_to_dict(vv) for vv in v]
+    else:
+        return v
-
-    return d


 def generate_scrape_configs(configs: Dict[JobGroup, List[Dict[str, Any]]]):
     """Generate a scrape_configs entry from a dict"""
+    seen_jobs = set()
     for params, targets in configs.items():
-        yield {
-            **{
-                param: value
-                for param, value in asdict(params, dict_factory=dict_factory).items()
-                if value is not None
-            },
+        ret: Dict[str, Any] = {
+            **convert_to_dict(params),
             "static_configs": targets,
         }
+        ctr = 0
+        orig_job_name = ret["job_name"]
+        while ret["job_name"] in seen_jobs:
+            ctr += 1
+            ret["job_name"] = f"{orig_job_name}-{ctr}"
+            for target in ret["static_configs"]:
+                target.setdefault("labels", {})["job"] = orig_job_name
+
+        seen_jobs.add(ret["job_name"])
+        yield ret
+

 def merge_prometheus_config(
     base_config: Dict[str, Any], scrape_configs: Iterable[Dict[str, Any]]
 ) -> Dict[str, Any]:
     """Merge the main prometheus config with scrape configs"""
     config = copy.deepcopy(base_config)
     config.setdefault("scrape_configs", []).extend(scrape_configs)
     return config


 def replace_file(old_file, new_file):
     """Replace old_file with new_file, ensuring permissions are the same"""
     try:
         info = os.stat(old_file)
         os.chown(new_file, info.st_uid, info.st_gid)
         os.chmod(new_file, stat.S_IMODE(info.st_mode))
     except FileNotFoundError:
         pass

     os.rename(new_file, old_file)


 if __name__ == "__main__":
     base_conffile = sys.argv[1]
     exported_dir = sys.argv[2]
     output = sys.argv[3]

     config_groups: Dict[JobGroup, List[Dict[str, Any]]] = defaultdict(list)

     for conf in load_yaml_from_dir(exported_dir):
-        if 'job' in conf:
-            conf['job_name'] = conf.pop('job')
-        if 'params' in conf:
-            params = conf.pop('params')
-            if params is not None:
-                # Hack to allow the dict serialization (used in the config_groups dict key later)
-                conf['params'] = tuple((k,tuple(v)) for k,v in params.items())
+        if "job" in conf:
+            conf["job_name"] = conf.pop("job")
+        if "params" in conf:
+            params = conf.pop("params")
+            if params is not None:
+                # Hack to allow the dict serialization (used in the config_groups dict key later)
+                conf["params"] = tuple((k, tuple(v)) for k, v in params.items())

         group = JobGroup.from_dict(conf)
         for key in asdict(group):
             conf.pop(key, None)

         config_groups[group].append(conf)

     with open(base_conffile, "r") as f:
         base_config = yaml.safe_load(f)

     full_config = merge_prometheus_config(
-        base_config, generate_scrape_configs(config_groups),
+        base_config,
+        generate_scrape_configs(config_groups),
     )

     now = datetime.datetime.now(tz=datetime.timezone.utc).isoformat()

     with open(output + ".tmp", "w") as f:
         print(f"# This file was generated by {sys.argv[0]} on {now}.", file=f)
-        print(f"# Changes will be lost", file=f)
-        print(f"", file=f)
-        yaml.dump(full_config, f, default_flow_style=False)
+        print("# Changes will be lost", file=f)
+        print("", file=f)
+        yaml.safe_dump(full_config, f, default_flow_style=False)

     replace_file(output, output + ".tmp")
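NOTE: generate_scrape_configs now deduplicates job names. Prometheus rejects a configuration with duplicate job_names, so when two snippet groups share a job_name but differ in another JobGroup field (scheme, params, relabel configs, ...), the later group is emitted with a numeric suffix and its targets keep the original name through a static job label. A hypothetical run, given the definitions above (targets and port invented):

    groups = {
        JobGroup.from_dict({"job_name": "sql"}): [{"targets": ["db1:9237"]}],
        JobGroup.from_dict({"job_name": "sql", "scheme": "https"}): [{"targets": ["db2:9237"]}],
    }
    for scrape_config in generate_scrape_configs(groups):
        print(scrape_config["job_name"])
    # -> sql
    # -> sql-1  (its static_configs entry gains labels: {job: "sql"})

The first, unsuffixed job needs no extra label because Prometheus sets the job label to the job_name by default.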
diff --git a/site-modules/profile/manifests/prometheus/export_scrape_config.pp b/site-modules/profile/manifests/prometheus/export_scrape_config.pp
index 51a3ca26..f592714d 100644
--- a/site-modules/profile/manifests/prometheus/export_scrape_config.pp
+++ b/site-modules/profile/manifests/prometheus/export_scrape_config.pp
@@ -1,23 +1,25 @@
 # Export a scrape config to the configured prometheus server
 define profile::prometheus::export_scrape_config (
   String $target,
   String $job = $name,
   Optional[String] $prometheus_server = undef,
   Hash[String, String] $labels = {},
   Optional[Enum['http', 'https']] $scheme = undef,
   Optional[String] $metrics_path = undef,
   Optional[Hash[String, Array[String]]] $params = undef,
+  Optional[Array[Hash[String, Variant[String, Array[String]]]]] $metric_relabel_configs = undef,
 ) {
   $static_labels = lookup('prometheus::static_labels', Hash)

   @@profile::prometheus::scrape_config {"${facts['swh_hostname']['short']}_${name}":
-    prometheus_server => pick($prometheus_server, lookup('prometheus::server::certname')),
-    target            => $target,
-    job               => $job,
-    labels            => $static_labels + $labels,
-    scheme            => $scheme,
-    metrics_path      => $metrics_path,
-    params            => $params,
+    prometheus_server      => pick($prometheus_server, lookup('prometheus::server::certname')),
+    target                 => $target,
+    job                    => $job,
+    labels                 => $static_labels + $labels,
+    scheme                 => $scheme,
+    metrics_path           => $metrics_path,
+    params                 => $params,
+    metric_relabel_configs => $metric_relabel_configs,
   }
 }
diff --git a/site-modules/profile/manifests/prometheus/scrape_config.pp b/site-modules/profile/manifests/prometheus/scrape_config.pp
index f1a5cac0..f59d7586 100644
--- a/site-modules/profile/manifests/prometheus/scrape_config.pp
+++ b/site-modules/profile/manifests/prometheus/scrape_config.pp
@@ -1,31 +1,33 @@
 # Scrape configuration for a prometheus exporter
 define profile::prometheus::scrape_config (
   String $prometheus_server,
   String $target,
   String $job,
   Hash[String, String] $labels = {},
   Optional[Enum['http', 'https']] $scheme = undef,
   Optional[String] $metrics_path = undef,
   Optional[Hash[String, Array[String]]] $params = undef,
+  Optional[Array[Hash[String, Variant[String, Array[String]]]]] $metric_relabel_configs = undef,
 ){
   $directory = $profile::prometheus::server::scrape_configs_dir

   file {"${directory}/${name}.yaml":
     ensure  => 'present',
     owner   => 'root',
     group   => 'root',
     mode    => '0644',
     content => inline_yaml(
       [
         {
-          job_name     => $job,
-          targets      => [$target],
-          labels       => $labels,
-          scheme       => $scheme,
-          metrics_path => $metrics_path,
-          params       => $params,
+          job_name               => $job,
+          targets                => [$target],
+          labels                 => $labels,
+          scheme                 => $scheme,
+          metrics_path           => $metrics_path,
+          params                 => $params,
+          metric_relabel_configs => $metric_relabel_configs
         },
       ]
     ),
     notify  => Exec['update-prometheus-config'],
   }
 }
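NOTE: each exported profile::prometheus::scrape_config above materializes as one YAML snippet in scrape_configs_dir on the Prometheus server, which update-prometheus-config then loads, groups and merges. A hypothetical snippet, parsed here with the same yaml.safe_load the updater uses (host, port and the cluster label are invented; exact inline_yaml output may differ in quoting):

    import yaml

    snippet = yaml.safe_load("""
    - job_name: sql
      targets: ['db1.example.org:9237']
      labels: {cluster: 'main'}
      metric_relabel_configs:
      - source_labels: ['__name__', 'col']
        regex: 'sql_swh_scheduler_origins;(.*)'
        action: 'replace'
        target_label: '__name__'
        replacement: 'swh_scheduler_origins_${1}'
    """)

The updater folds metric_relabel_configs into a tuple of frozen RelabelConfig dataclasses precisely so the JobGroup key stays hashable and identical configurations collapse into a single scrape job.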
diff --git a/site-modules/profile/manifests/prometheus/sql.pp b/site-modules/profile/manifests/prometheus/sql.pp
index fc427245..5b8fe8d9 100644
--- a/site-modules/profile/manifests/prometheus/sql.pp
+++ b/site-modules/profile/manifests/prometheus/sql.pp
@@ -1,106 +1,113 @@
 # Deployment of prometheus SQL exporter
 class profile::prometheus::sql {
   include profile::prometheus::base

   $exporter_name = 'sql'
   $package_name = "prometheus-${exporter_name}-exporter"
   $service_name = $package_name
   $defaults_file = "/etc/default/${package_name}"
   $config_snippet_dir = "/etc/${package_name}"
   $config_file = "/var/run/postgresql/${package_name}.yml"
   $config_updater = "/usr/bin/update-${package_name}-config"

   package {$package_name:
     ensure => installed,
   }

   service {$service_name:
     ensure  => 'running',
     enable  => true,
     require => [
       Package[$package_name],
     ]
   }

   ::systemd::dropin_file {"${service_name}/restart.conf":
     ensure   => present,
     unit     => "${service_name}.service",
     filename => 'restart.conf',
     content  => "[Service]\nRestart=always\nRestartSec=5\n",
   }

   ::systemd::dropin_file {"${service_name}/update_config.conf":
     ensure   => present,
     unit     => "${service_name}.service",
     filename => 'update_config.conf',
     content  => template('profile/prometheus/sql/systemd/update_config.conf.erb'),
     notify   => Service[$service_name],
   }

   $update_deps = ['postgresql-client-common', 'libyaml-perl']
   ensure_packages(
     $update_deps,
     { ensure => present },
   )

   file {$config_updater:
     ensure  => present,
     owner   => 'root',
     group   => 'root',
     mode    => '0755',
     source  => 'puppet:///modules/profile/prometheus/sql/update-prometheus-sql-exporter-config',
     require => Package[$update_deps],
     notify  => Service[$service_name],
   }

   file {$config_snippet_dir:
     ensure => directory,
     owner  => 'root',
     group  => 'root',
     mode   => '0644',
     notify => Service[$service_name],
   }

   $config_snippets = lookup('prometheus::sql::config_snippets', Array[String], 'unique')

   each($config_snippets) |$snippet| {
     file {"${config_snippet_dir}/${snippet}.yml":
       ensure => present,
       owner  => 'root',
       group  => 'root',
       mode   => '0644',
       source => "puppet:///modules/profile/prometheus/sql/config/${snippet}.yml",
       notify => Service[$service_name],
     }
   }

   $listen_network = lookup('prometheus::sql::listen_network', Optional[String], 'first', undef)
   $listen_ip = lookup('prometheus::sql::listen_address', Optional[String], 'first', undef)
   $actual_listen_ip = pick($listen_ip, ip_for_network($listen_network))
   $listen_port = lookup('prometheus::sql::listen_port')
   $listen_address = "${actual_listen_ip}:${listen_port}"

   file {$defaults_file:
     ensure  => present,
     owner   => 'root',
     group   => 'root',
     mode    => '0644',
     content => template('profile/prometheus/sql/prometheus-sql-exporter.defaults.erb'),
     require => Package[$package_name],
     notify  => Service[$service_name],
   }

   profile::prometheus::export_scrape_config {'sql':
     target => $listen_address,
+    metric_relabel_configs => [{
+      source_labels => ['__name__', 'col'],
+      regex         => 'sql_swh_scheduler_origins;(.*)',
+      action        => 'replace',
+      target_label  => '__name__',
+      replacement   => 'swh_scheduler_origins_${1}',
+    }],
   }

   profile::cron::d {'restart-sql-exporter':
     target  => 'prometheus',
     user    => 'root',
     command => "chronic systemctl restart ${service_name}",
     minute  => 'fqdn_rand',
     hour    => 'fqdn_rand/4',
   }
 }
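NOTE on the metric_relabel_configs rule above: Prometheus joins the source_labels values with the default ';' separator, matches the regex against the whole joined string (anchored), and on a match writes the expanded replacement into target_label, here __name__ itself. The net effect, sketched in Python (helper name is ours):

    import re

    # __name__ and col joined on ';', matched against the anchored regex;
    # '${1}' in the replacement is the captured column name.
    rule = re.compile(r"sql_swh_scheduler_origins;(.*)")

    def rewrite_name(name: str, col: str) -> str:
        m = rule.fullmatch(f"{name};{col}")
        return f"swh_scheduler_origins_{m.group(1)}" if m else name

    assert rewrite_name("sql_swh_scheduler_origins", "known") == "swh_scheduler_origins_known"

So the exporter's sql_swh_scheduler_origins{col="known"} series is exposed as swh_scheduler_origins_known, matching the column names introduced in swh-scheduler.yml; series from other queries fall through the regex unchanged.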