diff --git a/swh/icinga_plugins/base_check.py b/swh/icinga_plugins/base_check.py
new file mode 100644
--- /dev/null
+++ b/swh/icinga_plugins/base_check.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class BaseCheck:
+    """Base class for checks comparing a value against two thresholds.
+
+    Subclasses must define ``DEFAULT_WARNING_THRESHOLD`` and
+    ``DEFAULT_CRITICAL_THRESHOLD``, used when ``obj`` has no value for
+    the corresponding threshold.
+    """
+
+    def __init__(self, obj):
+        """Read the thresholds from the dict-like ``obj``.
+
+        The ``cli`` command stores them as ``warning_threshold`` and
+        ``critical_threshold``; click leaves an option that was not
+        given as ``None``, which must fall back to the default too.
+        """
+        self.warning_threshold = self._threshold(
+            obj, 'warning_threshold', self.DEFAULT_WARNING_THRESHOLD)
+        self.critical_threshold = self._threshold(
+            obj, 'critical_threshold', self.DEFAULT_CRITICAL_THRESHOLD)
+
+    @staticmethod
+    def _threshold(obj, key, default):
+        """Return ``obj[key]``, or ``default`` if missing or ``None``."""
+        value = obj.get(key)
+        return default if value is None else value
+
+    def get_status(self, value):
+        """Return the ``(exit_code, label)`` pair for ``value``.
+
+        A threshold of ``0`` disables the corresponding level.
+        """
+        if self.critical_threshold and value >= self.critical_threshold:
+            return (2, 'CRITICAL')
+        elif self.warning_threshold and value >= self.warning_threshold:
+            return (1, 'WARNING')
+        else:
+            return (0, 'OK')
diff --git a/swh/icinga_plugins/cli.py b/swh/icinga_plugins/cli.py
--- a/swh/icinga_plugins/cli.py
+++ b/swh/icinga_plugins/cli.py
@@ -17,13 +17,19 @@
               help='URL to an swh-storage HTTP API')
 @click.option('--swh-web-url', type=str,
               help='URL to an swh-web instance')
+@click.option('-w', '--warning', type=int,
+              help='Warning threshold.')
+@click.option('-c', '--critical', type=int,
+              help='Critical threshold.')
 @click.pass_context
-def cli(ctx, swh_storage_url, swh_web_url):
+def cli(ctx, swh_storage_url, swh_web_url, warning, critical):
     """Main command for Icinga plugins
     """
     ctx.ensure_object(dict)
     ctx.obj['swh_storage_url'] = swh_storage_url
     ctx.obj['swh_web_url'] = swh_web_url
+    ctx.obj['warning_threshold'] = warning
+    ctx.obj['critical_threshold'] = critical
 
 
 @cli.group(name='check-vault')
diff --git a/swh/icinga_plugins/tests/test_vault.py b/swh/icinga_plugins/tests/test_vault.py
--- a/swh/icinga_plugins/tests/test_vault.py
+++ b/swh/icinga_plugins/tests/test_vault.py
@@ -6,6 +6,7 @@
 import enum
 import json
 import re
+import time
 
 from click.testing import CliRunner
 
@@ -107,6 +108,7 @@
         r'[0-9]\.[0-9]{2}s and succeeded.\n'
         r"| 'total time' = [0-9]\.[0-9]{2}s",
         result.output)
+    assert result.exit_code == 0, result.output
 
     sleep_mock.assert_called_once_with(10)
 
@@ -168,6 +170,7 @@
         r'[0-9]\.[0-9]{2}s and succeeded.\n'
         r"| 'total time' = [0-9]\.[0-9]{2}s",
         result.output)
+    assert result.exit_code == 0, result.output
 
     assert sleep_mock.call_count == 2
 
@@ -225,5 +228,75 @@
         r'[0-9]\.[0-9]{2}s and failed with: foobar\n'
         r"| 'total time' = [0-9]\.[0-9]{2}s",
         result.output)
+    assert result.exit_code == 2, result.output
 
     sleep_mock.assert_called_once_with(10)
+
+
+def test_vault_timeout(requests_mock, mocker):
+    """Cooking still pending past the critical threshold must be CRITICAL."""
+
+    class Step(enum.Enum):
+        NOTHING_DONE = 0
+        CHECKED_UNCOOKED = 1
+        REQUESTED_COOKING = 2
+        PENDING = 3
+
+    step = Step.NOTHING_DONE
+
+    def post_callback(request, context):
+        nonlocal step
+        if step == Step.CHECKED_UNCOOKED:
+            step = Step.REQUESTED_COOKING
+            return json.dumps(response_pending)
+        else:
+            assert False, step
+
+    def get_callback(request, context):
+        context.json = True
+        nonlocal step, time_offset
+        if step == Step.NOTHING_DONE:
+            context.status_code = 404
+            step = Step.CHECKED_UNCOOKED
+        elif step == Step.CHECKED_UNCOOKED:
+            assert False
+        elif step == Step.REQUESTED_COOKING:
+            step = Step.PENDING
+            return json.dumps(response_pending)
+        elif step == Step.PENDING:
+            time_offset += 4000  # jump forward in time more than 1h
+            return json.dumps(response_pending)
+        else:
+            assert False, step
+
+    requests_mock.get(
+        f'mock://swh-web.example.org/api/1/vault/directory/{dir_id}/',
+        text=get_callback)
+    requests_mock.post(
+        f'mock://swh-web.example.org/api/1/vault/directory/{dir_id}/',
+        text=post_callback)
+
+    get_storage_mock = mocker.patch('swh.icinga_plugins.vault.get_storage')
+    get_storage_mock.side_effect = FakeStorage
+
+    sleep_mock = mocker.patch('time.sleep')
+
+    real_time = time.time
+    time_offset = 0
+    mocker.patch(
+        'time.time', side_effect=lambda: real_time() + time_offset)
+
+    result = invoke([
+        '--swh-web-url', 'mock://swh-web.example.org',
+        '--swh-storage-url', 'foo://example.org',
+        'check-vault', 'directory',
+    ], catch_exceptions=True)
+
+    assert re.match(
+        rf'VAULT CRITICAL - cooking directory {dir_id} took more than '
+        r'[0-9]+\.[0-9]{2}s and has status: foo\n'
+        r"\| 'total time' = [0-9]+\.[0-9]{2}s",
+        result.output)
+    assert result.exit_code == 2, result.output
+
+    assert sleep_mock.call_count == 2
diff --git a/swh/icinga_plugins/vault.py b/swh/icinga_plugins/vault.py
--- a/swh/icinga_plugins/vault.py
+++ b/swh/icinga_plugins/vault.py
@@ -9,13 +9,19 @@
 
 from swh.storage import get_storage
 
+from .base_check import BaseCheck
+
 
 class NoDirectory(Exception):
     pass
 
 
-class VaultCheck:
+class VaultCheck(BaseCheck):
+    DEFAULT_WARNING_THRESHOLD = 0
+    DEFAULT_CRITICAL_THRESHOLD = 3600
+
     def __init__(self, obj):
+        super().__init__(obj)
         self._swh_storage = get_storage('remote', url=obj['swh_storage_url'])
         self._swh_web_url = obj['swh_web_url']
         self._poll_interval = obj['poll_interval']
@@ -44,6 +50,7 @@
             return 2
 
         start_time = time.time()
+        total_time = 0
         response = requests.post(self._url_for_dir(dir_id))
         assert response.status_code == 200, (response, response.text)
         result = response.json()
@@ -53,23 +60,31 @@
             assert response.status_code == 200, (response, response.text)
             result = response.json()
 
-        end_time = time.time()
-        total_time = end_time - start_time
+            total_time = time.time() - start_time
+
+            # Give up once cooking has outlasted the critical threshold.
+            if total_time > self.critical_threshold:
+                print(f'VAULT CRITICAL - cooking directory {dir_id.hex()} '
+                      f'took more than {total_time:.2f}s and has status: '
+                      f'{result["progress_message"]}')
+                print(f"| 'total time' = {total_time:.2f}s")
+                return 2
 
         if result['status'] == 'done':
-            print(f'VAULT OK - cooking directory {dir_id.hex()} '
+            (status_code, status) = self.get_status(total_time)
+            print(f'VAULT {status} - cooking directory {dir_id.hex()} '
                   f'took {total_time:.2f}s and succeeded.')
             print(f"| 'total time' = {total_time:.2f}s")
-            return 0
+            return status_code
         elif result['status'] == 'failed':
             print(f'VAULT CRITICAL - cooking directory {dir_id.hex()} '
                   f'took {total_time:.2f}s and failed with: '
                   f'{result["progress_message"]}')
             print(f"| 'total time' = {total_time:.2f}s")
-            return 3
+            return 2
         else:
             print(f'VAULT CRITICAL - cooking directory {dir_id.hex()} '
                   f'took {total_time:.2f}s and resulted in unknown: '
                   f'status: {result["status"]}')
             print(f"| 'total time' = {total_time:.2f}s")
-            return 3
+            return 2