diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -14,5 +14,9 @@ [mypy-requests_mock.*] ignore_missing_imports = True +[mypy-prometheus_client.*] +ignore_missing_imports = True + + # [mypy-add_your_lib_here.*] # ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ click psycopg2 requests +prometheus-client +typing diff --git a/swh/icinga_plugins/base_check.py b/swh/icinga_plugins/base_check.py --- a/swh/icinga_plugins/base_check.py +++ b/swh/icinga_plugins/base_check.py @@ -1,17 +1,32 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import atexit +from typing import Any, Dict, List + +from prometheus_client import CollectorRegistry, Gauge, Summary, write_to_textfile class BaseCheck: - def __init__(self, obj): + PROMETHEUS_METRICS_BASENAME = "swh_e2e_" + + def __init__(self, obj: Dict[str, str], application: str): self.warning_threshold = obj.get( "warning_threshold", self.DEFAULT_WARNING_THRESHOLD ) self.critical_threshold = obj.get( "critical_threshold", self.DEFAULT_CRITICAL_THRESHOLD ) + self.prometheus_enabled = obj.get("prometheus_enabled") + self.prometheus_exporter_directory = obj.get("prometheus_exporter_directory") + self.environment = obj.get("environment") + self.application = application + + self.registry = CollectorRegistry() + self.prometheus_metrics: Dict[str, Any] = {} + + atexit.register(self.save_prometheus_metrics) def get_status(self, value): if self.critical_threshold and value >= self.critical_threshold: @@ -25,3 +40,81 @@ print(f"{self.TYPE} {status_type} - {status_string}") for (metric_name, metric_value) in sorted(metrics.items()): print(f"| '{metric_name}' = {metric_value:.2f}s") + + def observe_prometheus_metric( + self, name: str, value: float, labels: List[str] = [] + ): + metric = self.prometheus_metrics.get(self.PROMETHEUS_METRICS_BASENAME + name) + + if metric is None: + raise ValueError(f"No metric {name} found") + + metric.labels(*self._get_label_values(labels)).observe(value) + + def collect_prometheus_metric( + self, name: str, value: float, labels: List[str] = [] + ): + g = self.prometheus_metrics.get(self.PROMETHEUS_METRICS_BASENAME + name) + + if g is None: + raise ValueError(f"No metric {name} found") + + g.labels(*self._get_label_values(labels)).set(value) + + def _get_label_values(self, labels: List[str]) -> List[str]: + label_list = [] + + if self.environment: + label_list.append(self.environment) + + if self.application is None: + raise ValueError("Application name must be specified") + label_list.append(self.application) + + return label_list + labels + + def _get_label_names(self, values: List[str] = []) -> List[str]: + full_list = [] + + if self.environment: + full_list.append(self.environment) + full_list.append("application") + + full_list += values + + return full_list + + def register_prometheus_summary( + self, name: str, unit: str, labels: List[str] = [] + ) -> Summary: + full_name = self.PROMETHEUS_METRICS_BASENAME + name + + self.prometheus_metrics[full_name] = Summary( + full_name, + "", + registry=self.registry, + unit=unit, + labelnames=self._get_label_names(labels), + ) + + def register_prometheus_gauge( + self, name: str, unit: str, labels: List[str] = [] + ) -> Gauge: + full_name = self.PROMETHEUS_METRICS_BASENAME + name + + self.prometheus_metrics[full_name] = Gauge( + full_name, + "", + registry=self.registry, + unit=unit, + labelnames=self._get_label_names(labels), + ) + + def save_prometheus_metrics(self) -> None: + if self.prometheus_enabled: + if self.application is None: + raise ValueError("Application name must be specified") + filename = ( + self.prometheus_exporter_directory + "/" + self.application + ".prom" + ) + write_to_textfile(filename, self.registry) diff --git a/swh/icinga_plugins/cli.py b/swh/icinga_plugins/cli.py --- a/swh/icinga_plugins/cli.py +++ b/swh/icinga_plugins/cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,8 +16,22 @@ @swh_cli_group.group(name="icinga_plugins", context_settings=CONTEXT_SETTINGS) @click.option("-w", "--warning", type=int, help="Warning threshold.") @click.option("-c", "--critical", type=int, help="Critical threshold.") +@click.option("--prometheus-exporter/--no-prometheus-exporter", default=False) +@click.option( + "--prometheus-exporter-directory", + type=str, + default="/var/lib/prometheus/node-exporter", +) +@click.option("--environment", type=str, help="The tested environment") @click.pass_context -def icinga_cli_group(ctx, warning, critical): +def icinga_cli_group( + ctx, + warning, + critical, + prometheus_exporter: bool, + prometheus_exporter_directory: str, + environment: str, +): """Main command for Icinga plugins """ ctx.ensure_object(dict) @@ -26,6 +40,10 @@ if critical: ctx.obj["critical_threshold"] = int(critical) + ctx.obj["prometheus_enabled"] = prometheus_exporter + ctx.obj["prometheus_exporter_directory"] = prometheus_exporter_directory + ctx.obj["environment"] = environment + @icinga_cli_group.group(name="check-vault") @click.option( diff --git a/swh/icinga_plugins/deposit.py b/swh/icinga_plugins/deposit.py --- a/swh/icinga_plugins/deposit.py +++ b/swh/icinga_plugins/deposit.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -19,7 +19,7 @@ DEFAULT_CRITICAL_THRESHOLD = 3600 def __init__(self, obj): - super().__init__(obj) + super().__init__(obj, application="deposit") self._poll_interval = obj["poll_interval"] self._archive_path = obj["archive"] self._metadata_path = obj["metadata"] @@ -33,6 +33,9 @@ } ) + self.register_prometheus_gauge("duration", "seconds", ["step", "status"]) + self.register_prometheus_gauge("status", "") + def upload_deposit(self): slug = "check-deposit-%s" % datetime.datetime.now().isoformat() result = self._client.deposit_create( @@ -80,6 +83,17 @@ f"started)", **metrics, ) + + self.collect_prometheus_metric( + "duration", + metrics["total_time"], + [result["deposit_status"], "timeout"], + ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["", "timeout"] + ) + self.collect_prometheus_metric("status", 2) + sys.exit(2) time.sleep(self._poll_interval) @@ -108,7 +122,17 @@ f'Deposit was rejected: {result["deposit_status_detail"]}', **metrics, ) + self.collect_prometheus_metric( + "duration", metrics["validation_time"], ["validation", "rejected"] + ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["validation", "rejected"] + ) + self.collect_prometheus_metric("status", 2) return 2 + self.collect_prometheus_metric( + "duration", metrics["validation_time"], ["validation", "ok"] + ) # Wait for loading result = self.wait_while_status( @@ -118,6 +142,9 @@ metrics["load_time"] = ( metrics["total_time"] - metrics["upload_time"] - metrics["validation_time"] ) + self.collect_prometheus_metric( + "duration", metrics["load_time"], ["loading", result["deposit_status"]] + ) # Check loading succeeded if result["deposit_status"] == "failed": @@ -126,6 +153,10 @@ f'Deposit loading failed: {result["deposit_status_detail"]}', **metrics, ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "failed"] + ) + self.collect_prometheus_metric("status", 2) return 2 # Check for unexpected status @@ -136,6 +167,10 @@ f'({result["deposit_status_detail"]})', **metrics, ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", result["deposit_status"]] + ) + self.collect_prometheus_metric("status", 2) return 2 # Everything went fine, check total time wasn't too large and @@ -148,6 +183,7 @@ ) if status_code != 0: # Stop if any problem in the initial scenario + self.collect_prometheus_metric("status", 2) return status_code # Initial deposit is now completed, now we can update the deposit with metadata @@ -169,6 +205,10 @@ f'Deposit Metadata update failed: {result["error"]} ', **metrics_update, ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "metadata_error"] + ) + self.collect_prometheus_metric("status", 2) return 2 (status_code, status) = self.get_status(metrics_update["total_time"]) @@ -178,4 +218,9 @@ "and succeeded.", **metrics_update, ) + + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "done"] + ) + self.collect_prometheus_metric("status", status_code) return status_code diff --git a/swh/icinga_plugins/save_code_now.py b/swh/icinga_plugins/save_code_now.py --- a/swh/icinga_plugins/save_code_now.py +++ b/swh/icinga_plugins/save_code_now.py @@ -21,12 +21,15 @@ DEFAULT_CRITICAL_THRESHOLD = 120 def __init__(self, obj: Dict, origin: str, visit_type: str) -> None: - super().__init__(obj) + super().__init__(obj, application="scn") self.api_url = obj["swh_web_url"].rstrip("/") self.poll_interval = obj["poll_interval"] self.origin = origin self.visit_type = visit_type + self.register_prometheus_gauge("duration", "seconds", ["status"]) + self.register_prometheus_gauge("status", "") + @staticmethod def api_url_scn(root_api_url: str, origin: str, visit_type: str) -> str: """Compute the save code now api url for a given origin""" @@ -85,6 +88,8 @@ f'and has status: {result["save_task_status"]}.', total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["timeout"]) + self.collect_prometheus_metric("status", 2) return 2 if result[status_key] == "succeeded": @@ -94,6 +99,8 @@ f"{REPORT_MSG} {origin_info} took {total_time:.2f}s and succeeded.", total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["succeeded"]) + self.collect_prometheus_metric("status", status_code) return status_code elif result[status_key] == "failed": self.print_result( @@ -101,6 +108,8 @@ f"{REPORT_MSG} {origin_info} took {total_time:.2f}s and failed.", total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["failed"]) + self.collect_prometheus_metric("status", 2) return 2 else: self.print_result( @@ -110,4 +119,6 @@ f"{result['save_request_status']} ; {result[status_key]}.", total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["failed"]) + self.collect_prometheus_metric("status", 2) return 2 diff --git a/swh/icinga_plugins/tests/test_deposit.py b/swh/icinga_plugins/tests/test_deposit.py --- a/swh/icinga_plugins/tests/test_deposit.py +++ b/swh/icinga_plugins/tests/test_deposit.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -204,6 +204,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -270,6 +273,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -326,6 +332,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -373,6 +382,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "--warning", "15", "check-deposit", @@ -420,6 +432,9 @@ [ "--critical", "50", + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -469,6 +484,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -508,6 +526,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -552,6 +573,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", diff --git a/swh/icinga_plugins/tests/test_save_code_now.py b/swh/icinga_plugins/tests/test_save_code_now.py --- a/swh/icinga_plugins/tests/test_save_code_now.py +++ b/swh/icinga_plugins/tests/test_save_code_now.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -76,6 +76,8 @@ # fmt: off result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", "/tmp", "check-savecodenow", "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, @@ -206,7 +208,8 @@ # fmt: off result = invoke( [ - "check-savecodenow", "--swh-web-url", root_api_url, + "check-savecodenow", + "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ], diff --git a/swh/icinga_plugins/vault.py b/swh/icinga_plugins/vault.py --- a/swh/icinga_plugins/vault.py +++ b/swh/icinga_plugins/vault.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -22,11 +22,14 @@ DEFAULT_CRITICAL_THRESHOLD = 3600 def __init__(self, obj): - super().__init__(obj) + super().__init__(obj, application="vault") self._swh_storage = get_storage("remote", url=obj["swh_storage_url"]) self._swh_web_url = obj["swh_web_url"] self._poll_interval = obj["poll_interval"] + self.register_prometheus_gauge("status", "") + self.register_prometheus_gauge("duration", "seconds", ["step", "status"]) + def _url_for_dir(self, dir_id): return self._swh_web_url + f"/api/1/vault/directory/{dir_id.hex()}/" @@ -71,30 +74,50 @@ f'{result["progress_message"]}', total_time=total_time, ) + + self.collect_prometheus_metric("status", 2) + self.collect_prometheus_metric( + "duration", total_time, ["cooking", "timeout"], + ) + return 2 - if result["status"] == "done": - (status_code, status) = self.get_status(total_time) + exit_code = 0 + status = result["status"] + prometheus_status = status + + if status == "done": + (exit_code, state) = self.get_status(total_time) self.print_result( - status, + state, f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f"and succeeded.", total_time=total_time, ) - return status_code - elif result["status"] == "failed": + + elif status == "failed": self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f'and failed with: {result["progress_message"]}', total_time=total_time, ) - return 2 + + exit_code = 2 + else: self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took {total_time:.2f}s " - f'and resulted in unknown status: {result["status"]}', + f"and resulted in unknown status: {status}", total_time=total_time, ) - return 2 + + prometheus_status = "unknown" + exit_code = 2 + + self.collect_prometheus_metric("status", exit_code) + self.collect_prometheus_metric( + "duration", total_time, ["end", prometheus_status], + ) + return exit_code