class BaseCheck:
    """Threshold handling shared by the Icinga checks.

    Subclasses must define the class attributes
    ``DEFAULT_WARNING_THRESHOLD`` and ``DEFAULT_CRITICAL_THRESHOLD``; they
    are used whenever the configuration does not provide a value.
    """

    def __init__(self, obj):
        """Read the warning/critical thresholds from the mapping *obj*.

        Each threshold is looked up under its legacy, underscore-prefixed
        key first (``'_warning_threshold'``) and then under the key filled
        in by the command line (``'warning_threshold'``).  A missing key or
        a ``None`` value (option not given on the command line) falls back
        to the subclass default.
        """
        self.warning_threshold = self._read_threshold(
            obj, 'warning_threshold', self.DEFAULT_WARNING_THRESHOLD)
        self.critical_threshold = self._read_threshold(
            obj, 'critical_threshold', self.DEFAULT_CRITICAL_THRESHOLD)

    @staticmethod
    def _read_threshold(obj, name, default):
        """Return the first non-``None`` value among the accepted keys."""
        for key in ('_' + name, name):
            value = obj.get(key)
            if value is not None:
                return value
        return default

    def get_status(self, value):
        """Map a measured *value* to an ``(exit_code, label)`` pair.

        Returns ``(2, 'CRITICAL')`` when *value* reaches the critical
        threshold, ``(1, 'WARNING')`` when it reaches the warning
        threshold, and ``(0, 'OK')`` otherwise.  A falsy threshold
        (``0`` or ``None``) disables the corresponding level.
        """
        if self.critical_threshold and value >= self.critical_threshold:
            return (2, 'CRITICAL')
        if self.warning_threshold and value >= self.warning_threshold:
            return (1, 'WARNING')
        return (0, 'OK')
def test_vault_immediate_success(requests_mock, mocker):
    """Cooking that is done on the first poll yields an OK result.

    The mocked API answers 404 to the first GET (directory not cooked
    yet), ``pending`` to the POST, and ``done`` to the following GET.  The
    check must print the OK line followed by the perfdata line, exit with
    code 0, and sleep exactly once for 10 seconds.
    """

    class Step(enum.Enum):
        NOTHING_DONE = 0
        CHECKED_UNCOOKED = 1
        REQUESTED_COOKING = 2

    step = Step.NOTHING_DONE

    def post_callback(request, context):
        nonlocal step
        if step == Step.CHECKED_UNCOOKED:
            step = Step.REQUESTED_COOKING
            return json.dumps(response_pending)
        else:
            assert False, step

    def get_callback(request, context):
        nonlocal step
        context.json = True
        if step == Step.NOTHING_DONE:
            context.status_code = 404
            step = Step.CHECKED_UNCOOKED
        elif step == Step.CHECKED_UNCOOKED:
            # The check must POST a cooking request before polling again.
            assert False
        elif step == Step.REQUESTED_COOKING:
            return json.dumps(response_done)
        else:
            assert False, step

    requests_mock.get(
        f'mock://swh-web.example.org/api/1/vault/directory/{dir_id}/',
        text=get_callback)
    requests_mock.post(
        f'mock://swh-web.example.org/api/1/vault/directory/{dir_id}/',
        text=post_callback)

    get_storage_mock = mocker.patch('swh.icinga_plugins.vault.get_storage')
    get_storage_mock.side_effect = FakeStorage

    sleep_mock = mocker.patch('time.sleep')

    result = invoke([
        '--swh-web-url', 'mock://swh-web.example.org',
        '--swh-storage-url', 'foo://example.org',
        'check-vault', 'directory',
    ])

    # The perfdata line starts with a literal '|'.  It has to be escaped:
    # a bare '|' is regex alternation, which would make the whole
    # perfdata half of the pattern optional and therefore never checked.
    assert re.match(
        rf'VAULT OK - cooking directory {dir_id} took '
        r'[0-9]\.[0-9]{2}s and succeeded\.\n'
        r"\| 'total time' = [0-9]\.[0-9]{2}s",
        result.output)
    assert result.exit_code == 0, result.output

    sleep_mock.assert_called_once_with(10)
def test_vault_failure(requests_mock, mocker):
    """A cooking task reported as ``failed`` yields a CRITICAL result.

    The mocked API answers 404 to the first GET, ``pending`` to the POST,
    and ``failed`` (with progress message ``foobar``) to the following
    GET.  The check must print the CRITICAL line followed by the perfdata
    line, exit with code 2, and sleep exactly once for 10 seconds.
    """

    class Step(enum.Enum):
        NOTHING_DONE = 0
        CHECKED_UNCOOKED = 1
        REQUESTED_COOKING = 2

    step = Step.NOTHING_DONE

    def post_callback(request, context):
        nonlocal step
        if step == Step.CHECKED_UNCOOKED:
            step = Step.REQUESTED_COOKING
            return json.dumps(response_pending)
        else:
            assert False, step

    def get_callback(request, context):
        nonlocal step
        context.json = True
        if step == Step.NOTHING_DONE:
            context.status_code = 404
            step = Step.CHECKED_UNCOOKED
        elif step == Step.CHECKED_UNCOOKED:
            # The check must POST a cooking request before polling again.
            assert False
        elif step == Step.REQUESTED_COOKING:
            return json.dumps(response_failed)
        else:
            assert False, step

    requests_mock.get(
        f'mock://swh-web.example.org/api/1/vault/directory/{dir_id}/',
        text=get_callback)
    requests_mock.post(
        f'mock://swh-web.example.org/api/1/vault/directory/{dir_id}/',
        text=post_callback)

    get_storage_mock = mocker.patch('swh.icinga_plugins.vault.get_storage')
    get_storage_mock.side_effect = FakeStorage

    sleep_mock = mocker.patch('time.sleep')

    # A non-zero exit surfaces as an exception on the click result, so the
    # helper is told not to re-raise it.
    result = invoke([
        '--swh-web-url', 'mock://swh-web.example.org',
        '--swh-storage-url', 'foo://example.org',
        'check-vault', 'directory',
    ], catch_exceptions=True)

    # The perfdata line starts with a literal '|'.  It has to be escaped:
    # a bare '|' is regex alternation, which would make the whole
    # perfdata half of the pattern optional and therefore never checked.
    assert re.match(
        rf'VAULT CRITICAL - cooking directory {dir_id} took '
        r'[0-9]\.[0-9]{2}s and failed with: foobar\n'
        r"\| 'total time' = [0-9]\.[0-9]{2}s",
        result.output)
    assert result.exit_code == 2, result.output

    sleep_mock.assert_called_once_with(10)
class VaultCheck(BaseCheck):
    """Icinga check that requests the cooking of a directory and times it.

    A random directory that the vault has not cooked yet is picked, its
    cooking is requested through the swh-web API, and the API is polled
    until the task leaves the ``new``/``pending`` states or the critical
    threshold is exceeded.
    """

    # A warning threshold of 0 is falsy, which get_status() treats as
    # "no warning level".
    DEFAULT_WARNING_THRESHOLD = 0
    DEFAULT_CRITICAL_THRESHOLD = 3600

    def __init__(self, obj):
        """Configure the check from the mapping *obj*.

        Reads ``swh_storage_url``, ``swh_web_url`` and ``poll_interval``
        (seconds slept between two polls) in addition to the thresholds
        handled by the base class.
        """
        super().__init__(obj)
        self._swh_storage = get_storage('remote', url=obj['swh_storage_url'])
        self._swh_web_url = obj['swh_web_url']
        self._poll_interval = obj['poll_interval']

    def _url_for_dir(self, dir_id):
        """Return the vault API URL of the directory *dir_id*.

        *dir_id* must provide ``.hex()`` (presumably ``bytes`` -- confirm
        against the storage API).
        """
        return self._swh_web_url + f'/api/1/vault/directory/{dir_id.hex()}/'

    def _pick_directory(self):
        """Return a random directory id, or raise NoDirectory if none."""
        dir_ = self._swh_storage.directory_get_random()
        if dir_ is None:
            raise NoDirectory()
        return dir_

    def _pick_uncached_directory(self):
        """Pick random directories until one's vault URL answers 404."""
        while True:
            dir_id = self._pick_directory()
            response = requests.get(self._url_for_dir(dir_id))
            if response.status_code == 404:
                return dir_id

    @staticmethod
    def _report(status, dir_id, total_time, outcome):
        """Print the status line followed by the perfdata line."""
        print(f'VAULT {status} - cooking directory {dir_id.hex()} {outcome}')
        print(f"| 'total time' = {total_time:.2f}s")

    def main(self):
        """Run the check and return the plugin exit code.

        Returns the code computed by ``get_status`` when cooking
        succeeds, and 2 (CRITICAL) when no directory exists, when cooking
        takes longer than the critical threshold, when it fails, or when
        it ends in an unknown status.
        """
        try:
            dir_id = self._pick_uncached_directory()
        except NoDirectory:
            print('VAULT CRITICAL - No directory exists in the archive')
            return 2

        start_time = time.time()
        response = requests.post(self._url_for_dir(dir_id))
        # NOTE(review): these asserts validate remote responses and are
        # stripped under ``python -O``; kept as-is so the failure mode
        # seen by callers does not change.
        assert response.status_code == 200, (response, response.text)
        result = response.json()
        # Measure the request itself too, so that a task which is already
        # in a final state is not reported as having taken 0.00s.
        total_time = time.time() - start_time

        while result['status'] in ('new', 'pending'):
            time.sleep(self._poll_interval)
            response = requests.get(self._url_for_dir(dir_id))
            assert response.status_code == 200, (response, response.text)
            result = response.json()
            total_time = time.time() - start_time

            if total_time > self.critical_threshold:
                # .get(): do not crash with KeyError if the API response
                # carries no progress message.
                self._report(
                    'CRITICAL', dir_id, total_time,
                    f'took more than {total_time:.2f}s and has status: '
                    f'{result.get("progress_message")}')
                return 2

        if result['status'] == 'done':
            (status_code, status) = self.get_status(total_time)
            self._report(
                status, dir_id, total_time,
                f'took {total_time:.2f}s and succeeded.')
            return status_code
        elif result['status'] == 'failed':
            self._report(
                'CRITICAL', dir_id, total_time,
                f'took {total_time:.2f}s and failed with: '
                f'{result.get("progress_message")}')
            return 2
        else:
            self._report(
                'CRITICAL', dir_id, total_time,
                f'took {total_time:.2f}s and resulted in unknown: '
                f'status: {result["status"]}')
            return 2