Changeset View
Changeset View
Standalone View
Standalone View
swh/icinga_plugins/vault.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import tarfile | import tarfile | ||||
import time | import time | ||||
from typing import List | |||||
import requests | import requests | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from .base_check import BaseCheck | from .base_check import BaseCheck | ||||
class NoDirectory(Exception):
    """Raised when the archive contains no directory at all (storage's
    directory_get_random() returned None), so the check cannot proceed."""

    pass
class VaultCheck(BaseCheck):
    """Icinga check that cooks a random, not-yet-cached directory through
    the Software Heritage vault and verifies the resulting tarball."""

    # Service type reported in the Nagios/Icinga output lines.
    TYPE = "VAULT"
    # Cooking-duration thresholds in seconds, consumed by
    # BaseCheck.get_status() (see main()).
    DEFAULT_WARNING_THRESHOLD = 0
    DEFAULT_CRITICAL_THRESHOLD = 3600

    def __init__(self, obj):
        # "application" tags the Prometheus metrics emitted by this check.
        super().__init__(obj, application="vault")
        # Remote storage client, used only to draw random directory ids.
        self._swh_storage = get_storage("remote", url=obj["swh_storage_url"])
        # Base URL of the swh-web instance exposing the vault API.
        self._swh_web_url = obj["swh_web_url"]
        # Seconds between successive polls of the cooking status
        # (used in the collapsed part of main(), not visible here).
        self._poll_interval = obj["poll_interval"]
        # Gauges later filled in by _collect_prometheus_metrics().
        self.register_prometheus_gauge("status", "")
        self.register_prometheus_gauge("duration", "seconds", ["step", "status"])
def _url_for_dir(self, dir_id): | def _url_for_dir(self, dir_id): | ||||
return self._swh_web_url + f"/api/1/vault/directory/{dir_id.hex()}/" | return self._swh_web_url + f"/api/1/vault/directory/{dir_id.hex()}/" | ||||
def _pick_directory(self): | def _pick_directory(self): | ||||
dir_ = self._swh_storage.directory_get_random() | dir_ = self._swh_storage.directory_get_random() | ||||
if dir_ is None: | if dir_ is None: | ||||
raise NoDirectory() | raise NoDirectory() | ||||
return dir_ | return dir_ | ||||
def _pick_uncached_directory(self):
    """Keep drawing random directories until one is not yet in the vault
    cache, i.e. the vault API answers 404 for it."""
    while True:
        candidate = self._pick_directory()
        status_code = requests.get(self._url_for_dir(candidate)).status_code
        if status_code == 404:
            return candidate
def _collect_prometheus_metrics( | |||||
self, status: int, duration: float, labels: List[str] | |||||
) -> None: | |||||
self.collect_prometheus_metric("status", status) | |||||
self.collect_prometheus_metric( | |||||
"duration", duration, labels, | |||||
) | |||||
def main(self): | def main(self): | ||||
try: | try: | ||||
dir_id = self._pick_uncached_directory() | dir_id = self._pick_uncached_directory() | ||||
except NoDirectory: | except NoDirectory: | ||||
self.print_result("CRITICAL", "No directory exists in the archive.") | self.print_result("CRITICAL", "No directory exists in the archive.") | ||||
return 2 | return 2 | ||||
start_time = time.time() | start_time = time.time() | ||||
Show All 12 Lines | def main(self): | ||||
if total_time > self.critical_threshold: | if total_time > self.critical_threshold: | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"cooking directory {dir_id.hex()} took more than " | f"cooking directory {dir_id.hex()} took more than " | ||||
f"{total_time:.2f}s and has status: " | f"{total_time:.2f}s and has status: " | ||||
f'{result["progress_message"]}', | f'{result["progress_message"]}', | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics(2, total_time, ["cooking", "timeout"]) | |||||
return 2 | return 2 | ||||
if result["status"] == "failed": | if result["status"] == "failed": | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | ||||
f'and failed with: {result["progress_message"]}', | f'and failed with: {result["progress_message"]}', | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics(2, total_time, ["cooking", "failed"]) | |||||
return 2 | return 2 | ||||
elif result["status"] != "done": | elif result["status"] != "done": | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | ||||
f'and resulted in unknown status: {result["status"]}', | f'and resulted in unknown status: {result["status"]}', | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics(2, total_time, ["cooking", "unknown"]) | |||||
return 2 | return 2 | ||||
(status_code, status) = self.get_status(total_time) | (status_code, status) = self.get_status(total_time) | ||||
if "fetch_url" not in result: | if "fetch_url" not in result: | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | ||||
f"and succeeded, but API response did not contain a fetch_url.", | f"and succeeded, but API response did not contain a fetch_url.", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics(2, total_time, ["fetch", "no_url"]) | |||||
return 2 | return 2 | ||||
with requests.get(result["fetch_url"], stream=True) as fetch_response: | with requests.get(result["fetch_url"], stream=True) as fetch_response: | ||||
try: | try: | ||||
fetch_response.raise_for_status() | fetch_response.raise_for_status() | ||||
except requests.HTTPError: | except requests.HTTPError: | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | ||||
f"and succeeded, but fetch failed with status code " | f"and succeeded, but fetch failed with status code " | ||||
f"{fetch_response.status_code}.", | f"{fetch_response.status_code}.", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics(2, total_time, ["fetch", "error"]) | |||||
return 2 | return 2 | ||||
content_type = fetch_response.headers.get("Content-Type") | content_type = fetch_response.headers.get("Content-Type") | ||||
if content_type != "application/gzip": | if content_type != "application/gzip": | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"Unexpected Content-Type when downloading bundle: {content_type}", | f"Unexpected Content-Type when downloading bundle: {content_type}", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics( | |||||
2, total_time, ["download", "unexpected_content_type"] | |||||
) | |||||
return 2 | return 2 | ||||
try: | try: | ||||
with tarfile.open(fileobj=fetch_response.raw, mode="r|gz") as tf: | with tarfile.open(fileobj=fetch_response.raw, mode="r|gz") as tf: | ||||
# Note that we are streaming the tarfile from the network, | # Note that we are streaming the tarfile from the network, | ||||
# so we are allowed at most one pass on the tf object; | # so we are allowed at most one pass on the tf object; | ||||
# and the sooner we close it the better. | # and the sooner we close it the better. | ||||
# Fortunately, checking only the first member is good enough: | # Fortunately, checking only the first member is good enough: | ||||
tarinfo = tf.next() | tarinfo = tf.next() | ||||
swhid = f"swh:1:dir:{dir_id.hex()}" | swhid = f"swh:1:dir:{dir_id.hex()}" | ||||
if tarinfo.name != swhid and not tarinfo.name.startswith( | if tarinfo.name != swhid and not tarinfo.name.startswith( | ||||
f"{swhid}/" | f"{swhid}/" | ||||
): | ): | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"Unexpected member in tarball: {tarinfo.name}", | f"Unexpected member in tarball: {tarinfo.name}", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics( | |||||
2, total_time, ["check", "archive_content"] | |||||
) | |||||
return 2 | return 2 | ||||
except tarfile.ReadError as e: | except tarfile.ReadError as e: | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"ReadError while reading tarball: {e}", | f"ReadError while reading tarball: {e}", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics( | |||||
2, total_time, ["check", "archive_content"] | |||||
) | |||||
return 2 | return 2 | ||||
except tarfile.StreamError as e: | except tarfile.StreamError as e: | ||||
if e.args[0] == "seeking backwards is not allowed": | if e.args[0] == "seeking backwards is not allowed": | ||||
# Probably https://bugs.python.org/issue46922 | # Probably https://bugs.python.org/issue46922 | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"StreamError while reading tarball (empty file?): {e}", | f"StreamError while reading tarball (empty file?): {e}", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics( | |||||
2, total_time, ["check", "archive_content"] | |||||
) | |||||
return 2 | return 2 | ||||
self.print_result( | self.print_result( | ||||
"CRITICAL", | "CRITICAL", | ||||
f"StreamError while reading tarball: {e}", | f"StreamError while reading tarball: {e}", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics( | |||||
2, total_time, ["check", "archive_content"] | |||||
) | |||||
return 2 | return 2 | ||||
self.print_result( | self.print_result( | ||||
status, | status, | ||||
f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | f"cooking directory {dir_id.hex()} took {total_time:.2f}s " | ||||
f"and succeeded.", | f"and succeeded.", | ||||
total_time=total_time, | total_time=total_time, | ||||
) | ) | ||||
self._collect_prometheus_metrics(status_code, total_time, ["end", ""]) | |||||
return status_code | return status_code |