diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -14,5 +14,9 @@ [mypy-requests_mock.*] ignore_missing_imports = True +[mypy-prometheus_client.*] +ignore_missing_imports = True + + # [mypy-add_your_lib_here.*] # ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ click psycopg2 requests +prometheus-client diff --git a/swh/icinga_plugins/base_check.py b/swh/icinga_plugins/base_check.py --- a/swh/icinga_plugins/base_check.py +++ b/swh/icinga_plugins/base_check.py @@ -2,21 +2,35 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import atexit +from typing import Any, Dict, List -from typing import Dict +from prometheus_client import CollectorRegistry, Gauge, Summary, write_to_textfile class BaseCheck: DEFAULT_WARNING_THRESHOLD = 60 DEFAULT_CRITICAL_THRESHOLD = 120 + PROMETHEUS_METRICS_BASENAME = "swh_e2e_" - def __init__(self, obj: Dict[str, str]) -> None: + def __init__(self, obj: Dict[str, str], application: str) -> None: self.warning_threshold = float( obj.get("warning_threshold", self.DEFAULT_WARNING_THRESHOLD) ) self.critical_threshold = float( obj.get("critical_threshold", self.DEFAULT_CRITICAL_THRESHOLD) ) + self.prometheus_enabled = obj.get("prometheus_enabled") + self.prometheus_exporter_directory = obj.get("prometheus_exporter_directory") + self.environment = obj.get("environment") + self.application = application + + # A new registry is created to not export the default process metrics + self.registry = CollectorRegistry() + + self.prometheus_metrics: Dict[str, Any] = {} + + atexit.register(self.save_prometheus_metrics) def get_status(self, value): if self.critical_threshold and value >= self.critical_threshold: @@ -30,3 +44,76 @@ print(f"{self.TYPE} {status_type} - {status_string}") for (metric_name, 
metric_value) in sorted(metrics.items()): print(f"| '{metric_name}' = {metric_value:.2f}s") + + def collect_prometheus_metric( + self, name: str, value: float, labels: List[str] = [] + ) -> None: + g = self.prometheus_metrics.get(self.PROMETHEUS_METRICS_BASENAME + name) + + if g is None: + raise ValueError(f"No metric {name} found") + + g.labels(*self._get_label_values(labels)).set(value) + + def _get_label_values(self, labels: List[str]) -> List[str]: + label_list = [] + + if self.environment: + label_list.append(self.environment) + + if self.application is None: + raise ValueError("Application name must be specified") + label_list.append(self.application) + + return label_list + labels + + def _get_label_names(self, values: List[str] = []) -> List[str]: + full_list = [] + + if self.environment: + full_list.append("environment") + full_list.append("application") + + full_list += values + + return full_list + + def register_prometheus_summary( + self, name: str, unit: str, labels: List[str] = [] + ) -> None: + full_name = self.PROMETHEUS_METRICS_BASENAME + name + + self.prometheus_metrics[full_name] = Summary( + full_name, + "", + registry=self.registry, + unit=unit, + labelnames=self._get_label_names(labels), + ) + + def register_prometheus_gauge( + self, name: str, unit: str, labels: List[str] = [] + ) -> None: + full_name = self.PROMETHEUS_METRICS_BASENAME + name + + self.prometheus_metrics[full_name] = Gauge( + name=full_name, + documentation="", + registry=self.registry, + unit=unit, + labelnames=self._get_label_names(labels), + ) + + def save_prometheus_metrics(self) -> None: + """Dump on disk the .prom file containing the + metrics collected during the check execution. 
+ + It's a callback method triggered by the atexit + declared in the constructor.""" + if self.prometheus_enabled: + assert self.prometheus_exporter_directory is not None + + filename = ( + self.prometheus_exporter_directory + "/" + self.application + ".prom" + ) + write_to_textfile(filename, self.registry) diff --git a/swh/icinga_plugins/cli.py b/swh/icinga_plugins/cli.py --- a/swh/icinga_plugins/cli.py +++ b/swh/icinga_plugins/cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,8 +16,22 @@ @swh_cli_group.group(name="icinga_plugins", context_settings=CONTEXT_SETTINGS) @click.option("-w", "--warning", type=int, help="Warning threshold.") @click.option("-c", "--critical", type=int, help="Critical threshold.") +@click.option("--prometheus-exporter/--no-prometheus-exporter", default=False) +@click.option( + "--prometheus-exporter-directory", + type=str, + default="/var/lib/prometheus/node-exporter", +) +@click.option("--environment", type=str, help="The tested environment") @click.pass_context -def icinga_cli_group(ctx, warning, critical): +def icinga_cli_group( + ctx, + warning, + critical, + prometheus_exporter: bool, + prometheus_exporter_directory: str, + environment: str, +): """Main command for Icinga plugins""" ctx.ensure_object(dict) if warning: @@ -25,6 +39,10 @@ if critical: ctx.obj["critical_threshold"] = int(critical) + ctx.obj["prometheus_enabled"] = prometheus_exporter + ctx.obj["prometheus_exporter_directory"] = prometheus_exporter_directory + ctx.obj["environment"] = environment + @icinga_cli_group.group(name="check-vault") @click.option( diff --git a/swh/icinga_plugins/deposit.py b/swh/icinga_plugins/deposit.py --- a/swh/icinga_plugins/deposit.py +++ 
b/swh/icinga_plugins/deposit.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -21,7 +21,7 @@ DEFAULT_CRITICAL_THRESHOLD = 3600 def __init__(self, obj): - super().__init__(obj) + super().__init__(obj, application="deposit") self.api_url = obj["swh_web_url"].rstrip("/") self._poll_interval = obj["poll_interval"] self._archive_path = obj["archive"] @@ -37,6 +37,9 @@ } ) + self.register_prometheus_gauge("duration", "seconds", ["step", "status"]) + self.register_prometheus_gauge("status", "") + def upload_deposit(self): slug = ( "check-deposit-%s" @@ -85,6 +88,17 @@ f"started)", **metrics, ) + + self.collect_prometheus_metric( + "duration", + metrics["total_time"], + [result["deposit_status"], "timeout"], + ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["", "timeout"] + ) + self.collect_prometheus_metric("status", 2) + sys.exit(2) time.sleep(self._poll_interval) @@ -116,7 +130,17 @@ f'Deposit was rejected: {result["deposit_status_detail"]}', **metrics, ) + self.collect_prometheus_metric( + "duration", metrics["validation_time"], ["validation", "rejected"] + ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "rejected"] + ) + self.collect_prometheus_metric("status", 2) return 2 + self.collect_prometheus_metric( + "duration", metrics["validation_time"], ["validation", "ok"] + ) # Wait for loading result = self.wait_while_status( @@ -126,6 +150,9 @@ metrics["load_time"] = ( metrics["total_time"] - metrics["upload_time"] - metrics["validation_time"] ) + self.collect_prometheus_metric( + "duration", metrics["load_time"], ["loading", result["deposit_status"]] + ) # Check loading succeeded if result["deposit_status"] == "failed": @@ 
-134,6 +161,10 @@ f'Deposit loading failed: {result["deposit_status_detail"]}', **metrics, ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "failed"] + ) + self.collect_prometheus_metric("status", 2) return 2 # Check for unexpected status @@ -144,6 +175,10 @@ f'({result["deposit_status_detail"]})', **metrics, ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", result["deposit_status"]] + ) + self.collect_prometheus_metric("status", 2) return 2 # Get the SWHID @@ -229,6 +264,7 @@ ) if status_code != 0: # Stop if any problem in the initial scenario + self.collect_prometheus_metric("status", status_code) return status_code # Initial deposit is now completed, now we can update the deposit with metadata @@ -250,6 +286,10 @@ f'Deposit Metadata update failed: {result["error"]} ', **metrics_update, ) + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "metadata_error"] + ) + self.collect_prometheus_metric("status", 2) return 2 (status_code, status) = self.get_status(metrics_update["total_time"]) @@ -259,4 +299,9 @@ "and succeeded.", **metrics_update, ) + + self.collect_prometheus_metric( + "duration", metrics["total_time"], ["total", "done"] + ) + self.collect_prometheus_metric("status", status_code) return status_code diff --git a/swh/icinga_plugins/save_code_now.py b/swh/icinga_plugins/save_code_now.py --- a/swh/icinga_plugins/save_code_now.py +++ b/swh/icinga_plugins/save_code_now.py @@ -21,12 +21,15 @@ DEFAULT_CRITICAL_THRESHOLD = 120 def __init__(self, obj: Dict, origin: str, visit_type: str) -> None: - super().__init__(obj) + super().__init__(obj, application="scn") self.api_url = obj["swh_web_url"].rstrip("/") self.poll_interval = obj["poll_interval"] self.origin = origin self.visit_type = visit_type + self.register_prometheus_gauge("duration", "seconds", ["status"]) + self.register_prometheus_gauge("status", "") + @staticmethod def api_url_scn(root_api_url: str, origin: 
str, visit_type: str) -> str: """Compute the save code now api url for a given origin""" @@ -85,6 +88,8 @@ f'and has status: {result["save_task_status"]}.', total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["timeout"]) + self.collect_prometheus_metric("status", 2) return 2 if result[status_key] == "succeeded": @@ -94,6 +99,8 @@ f"{REPORT_MSG} {origin_info} took {total_time:.2f}s and succeeded.", total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["succeeded"]) + self.collect_prometheus_metric("status", status_code) return status_code elif result[status_key] == "failed": self.print_result( @@ -101,6 +108,8 @@ f"{REPORT_MSG} {origin_info} took {total_time:.2f}s and failed.", total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["failed"]) + self.collect_prometheus_metric("status", 2) return 2 else: self.print_result( @@ -110,4 +119,6 @@ f"{result['save_request_status']} ; {result[status_key]}.", total_time=total_time, ) + self.collect_prometheus_metric("duration", total_time, ["failed"]) + self.collect_prometheus_metric("status", 2) return 2 diff --git a/swh/icinga_plugins/tests/test_base_check.py b/swh/icinga_plugins/tests/test_base_check.py new file mode 100644 --- /dev/null +++ b/swh/icinga_plugins/tests/test_base_check.py @@ -0,0 +1,57 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.icinga_plugins.base_check import BaseCheck + + +def test_inexistent_metric(): + base_check = BaseCheck({}, "test") + + with pytest.raises(ValueError, match="No metric unknown found"): + base_check.collect_prometheus_metric("unknown", 10, []) + + +def test_environment(): + base_check = BaseCheck({"environment": "pytest"}, "test") + + with pytest.raises(ValueError, 
match="No metric unknown found"): + base_check.collect_prometheus_metric("unknown", 10, []) + + +def test_application_not_defined(): + base_check = BaseCheck({"environment": "pytest"}, "test") + base_check.register_prometheus_gauge("gauge", "seconds") + base_check.application = None + + with pytest.raises(ValueError, match="Application name must be specified"): + base_check.collect_prometheus_metric("gauge", 10, []) + + +def test_save_without_directory(tmpdir): + config = { + "prometheus_enabled": True, + } + + base_check = BaseCheck(config, "test") + + with pytest.raises(AssertionError): + base_check.save_prometheus_metrics() + + +def test_save(tmpdir): + application = "my_application" + config = { + "prometheus_enabled": True, + "prometheus_exporter_directory": tmpdir.strpath, + } + + base_check = BaseCheck(config, application) + base_check.register_prometheus_gauge("gauge", "count") + base_check.collect_prometheus_metric("gauge", 10) + base_check.save_prometheus_metrics() + + assert f"{tmpdir.strpath}/{application}.prom" in tmpdir.listdir() diff --git a/swh/icinga_plugins/tests/test_deposit.py b/swh/icinga_plugins/tests/test_deposit.py --- a/swh/icinga_plugins/tests/test_deposit.py +++ b/swh/icinga_plugins/tests/test_deposit.py @@ -247,6 +247,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -344,6 +347,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -427,6 +433,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -498,6 +507,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "--warning", "15", "check-deposit", @@ -573,6 +585,9 @@ [ "--critical", "50", + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", 
"check-deposit", *COMMON_OPTIONS, "single", @@ -622,6 +637,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -873,6 +891,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -921,6 +942,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", @@ -970,6 +994,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-deposit", *COMMON_OPTIONS, "single", diff --git a/swh/icinga_plugins/tests/test_save_code_now.py b/swh/icinga_plugins/tests/test_save_code_now.py --- a/swh/icinga_plugins/tests/test_save_code_now.py +++ b/swh/icinga_plugins/tests/test_save_code_now.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -74,6 +74,8 @@ # fmt: off result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", "/tmp", "check-savecodenow", "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, @@ -204,7 +206,8 @@ # fmt: off result = invoke( [ - "check-savecodenow", "--swh-web-url", root_api_url, + "check-savecodenow", + "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ], diff --git a/swh/icinga_plugins/tests/test_vault.py b/swh/icinga_plugins/tests/test_vault.py --- a/swh/icinga_plugins/tests/test_vault.py +++ b/swh/icinga_plugins/tests/test_vault.py @@ -158,6 +158,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", @@ 
-190,6 +193,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", @@ -225,6 +231,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", @@ -263,6 +272,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", @@ -291,6 +303,9 @@ result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", diff --git a/swh/icinga_plugins/vault.py b/swh/icinga_plugins/vault.py --- a/swh/icinga_plugins/vault.py +++ b/swh/icinga_plugins/vault.py @@ -1,10 +1,11 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tarfile import time +from typing import List import requests @@ -23,11 +24,14 @@ DEFAULT_CRITICAL_THRESHOLD = 3600 def __init__(self, obj): - super().__init__(obj) + super().__init__(obj, application="vault") self._swh_storage = get_storage("remote", url=obj["swh_storage_url"]) self._swh_web_url = obj["swh_web_url"] self._poll_interval = obj["poll_interval"] + self.register_prometheus_gauge("status", "") + self.register_prometheus_gauge("duration", "seconds", ["step", "status"]) + def _url_for_dir(self, dir_id): return self._swh_web_url + f"/api/1/vault/directory/{dir_id.hex()}/" @@ -44,6 +48,14 @@ if response.status_code == 404: return dir_id + def _collect_prometheus_metrics( + self, status: int, duration: float, labels: List[str] + ) -> None: + self.collect_prometheus_metric("status", status) + 
self.collect_prometheus_metric( + "duration", duration, labels, + ) + def main(self): try: dir_id = self._pick_uncached_directory() @@ -72,6 +84,9 @@ f'{result["progress_message"]}', total_time=total_time, ) + + self._collect_prometheus_metrics(2, total_time, ["cooking", "timeout"]) + return 2 if result["status"] == "failed": @@ -81,6 +96,9 @@ f'and failed with: {result["progress_message"]}', total_time=total_time, ) + + self._collect_prometheus_metrics(2, total_time, ["cooking", "failed"]) + return 2 elif result["status"] != "done": self.print_result( @@ -89,6 +107,8 @@ f'and resulted in unknown status: {result["status"]}', total_time=total_time, ) + + self._collect_prometheus_metrics(2, total_time, ["cooking", "unknown"]) return 2 (status_code, status) = self.get_status(total_time) @@ -100,6 +120,7 @@ f"and succeeded, but API response did not contain a fetch_url.", total_time=total_time, ) + self._collect_prometheus_metrics(2, total_time, ["fetch", "no_url"]) return 2 with requests.get(result["fetch_url"], stream=True) as fetch_response: @@ -113,6 +134,7 @@ f"{fetch_response.status_code}.", total_time=total_time, ) + self._collect_prometheus_metrics(2, total_time, ["fetch", "error"]) return 2 content_type = fetch_response.headers.get("Content-Type") @@ -122,6 +144,9 @@ f"Unexpected Content-Type when downloading bundle: {content_type}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["download", "unexpected_content_type"] + ) return 2 try: @@ -140,6 +165,9 @@ f"Unexpected member in tarball: {tarinfo.name}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 except tarfile.ReadError as e: self.print_result( @@ -147,6 +175,9 @@ f"ReadError while reading tarball: {e}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 except tarfile.StreamError as e: if e.args[0] == "seeking backwards is not 
allowed": @@ -156,6 +187,9 @@ f"StreamError while reading tarball (empty file?): {e}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 self.print_result( @@ -163,6 +197,9 @@ f"StreamError while reading tarball: {e}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 self.print_result( @@ -171,4 +208,6 @@ f"and succeeded.", total_time=total_time, ) + + self._collect_prometheus_metrics(status_code, total_time, ["end", ""]) return status_code