diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000..73145d3
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# python: Reformat code with black 22.3.0
+cb62ef1b400559657f9211650f5fab2063d8578e
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 05398bb..1c95e3d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,42 +1,40 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.1.0
   hooks:
   - id: trailing-whitespace
   - id: check-json
   - id: check-yaml
 - repo: https://gitlab.com/pycqa/flake8
   rev: 4.0.1
   hooks:
   - id: flake8
+    additional_dependencies: [flake8-bugbear==22.3.23]
 - repo: https://github.com/codespell-project/codespell
   rev: v2.1.0
   hooks:
   - id: codespell
     name: Check source code spelling
     stages: [commit]
-  - id: codespell
-    name: Check commit message spelling
-    stages: [commit-msg]
 - repo: local
   hooks:
   - id: mypy
     name: mypy
     entry: mypy
     args: [swh]
     pass_filenames: false
     language: system
     types: [python]
 - repo: https://github.com/PyCQA/isort
   rev: 5.10.1
   hooks:
   - id: isort
 - repo: https://github.com/python/black
-  rev: 19.10b0
+  rev: 22.3.0
   hooks:
   - id: black
diff --git a/PKG-INFO b/PKG-INFO
index befa3cd..753dded 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,29 +1,29 @@
 Metadata-Version: 2.1
 Name: swh.icinga_plugins
-Version: 0.4.2
+Version: 0.5.0
 Summary: Icinga plugins for Software Heritage infrastructure monitoring
 Home-page: https://forge.softwareheritage.org/diffusion/swh-icinga-plugins
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-icinga-plugins
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 3 - Alpha
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-icinga-plugins
 ==================
 
 Scripts for end-to-end monitoring of the SWH infrastructure
diff --git a/mypy.ini b/mypy.ini
index 2b77ba2..f8f07e3 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,18 +1,22 @@
 [mypy]
 namespace_packages = True
 warn_unused_ignores = True
 
 # 3rd party libraries without stubs (yet)
 
 [mypy-pkg_resources.*]
 ignore_missing_imports = True
 
 [mypy-pytest.*]
 ignore_missing_imports = True
 
 [mypy-requests_mock.*]
 ignore_missing_imports = True
 
+[mypy-prometheus_client.*]
+ignore_missing_imports = True
+
+
 # [mypy-add_your_lib_here.*]
 # ignore_missing_imports = True
diff --git a/pytest.ini b/pytest.ini
index 9454e79..114a44a 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,5 @@
 [pytest]
 addopts = -p no:django
-norecursedirs = docs .*
+norecursedirs = build docs .*
+
+asyncio_mode = strict
diff --git a/requirements-test.txt b/requirements-test.txt
index 888161b..e470c85 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,7 +1,7 @@
-pytest < 7.0.0  # v7.0.0 removed _pytest.tmpdir.TempdirFactory, which is used by some of the pytest plugins we use
+pytest
 pytest-mock
 requests-mock
 types-click
 types-requests
 types-python-dateutil
 types-PyYAML
diff --git a/requirements.txt b/requirements.txt
index ced35f8..516e676 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 click
 psycopg2
 requests
+prometheus-client
+typing
diff --git a/setup.cfg b/setup.cfg
index 1d722c2..f65ba0a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,8 +1,9 @@
 [flake8]
-ignore = E203,E231,W503
+select = C,E,F,W,B950
+ignore = E203,E231,E501,W503
 max-line-length = 88
 
 [egg_info]
 tag_build = 
 tag_date = 0
diff --git a/swh.icinga_plugins.egg-info/PKG-INFO b/swh.icinga_plugins.egg-info/PKG-INFO
index d49054b..0320d1f 100644
--- a/swh.icinga_plugins.egg-info/PKG-INFO
+++ b/swh.icinga_plugins.egg-info/PKG-INFO
@@ -1,29 +1,29 @@
 Metadata-Version: 2.1
 Name: swh.icinga-plugins
-Version: 0.4.2
+Version: 0.5.0
 Summary: Icinga plugins for Software Heritage infrastructure monitoring
 Home-page: https://forge.softwareheritage.org/diffusion/swh-icinga-plugins
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-icinga-plugins
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 3 - Alpha
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-icinga-plugins
 ==================
 
 Scripts for end-to-end monitoring of the SWH infrastructure
diff --git a/swh.icinga_plugins.egg-info/SOURCES.txt b/swh.icinga_plugins.egg-info/SOURCES.txt
index e7829f9..97debe4 100644
--- a/swh.icinga_plugins.egg-info/SOURCES.txt
+++ b/swh.icinga_plugins.egg-info/SOURCES.txt
@@ -1,47 +1,49 @@
+.git-blame-ignore-revs
 .gitignore
 .pre-commit-config.yaml
 AUTHORS
 CODE_OF_CONDUCT.md
 CONTRIBUTORS
 LICENSE
 MANIFEST.in
 Makefile
 README.md
 mypy.ini
 pyproject.toml
 pytest.ini
 requirements-swh.txt
 requirements-test.txt
 requirements.txt
 setup.cfg
 setup.py
 tox.ini
 data/deposit/jesuisgpl.tgz
 data/deposit/jesuisgpl.tgz.xml
 docs/.gitignore
 docs/Makefile
 docs/conf.py
 docs/index.rst
 docs/_static/.placeholder
 docs/_templates/.placeholder
 swh/__init__.py
 swh.icinga_plugins.egg-info/PKG-INFO
 swh.icinga_plugins.egg-info/SOURCES.txt
 swh.icinga_plugins.egg-info/dependency_links.txt
 swh.icinga_plugins.egg-info/entry_points.txt
 swh.icinga_plugins.egg-info/requires.txt
 swh.icinga_plugins.egg-info/top_level.txt
 swh/icinga_plugins/__init__.py
 swh/icinga_plugins/base_check.py
 swh/icinga_plugins/cli.py
 swh/icinga_plugins/deposit.py
 swh/icinga_plugins/py.typed
 swh/icinga_plugins/save_code_now.py
 swh/icinga_plugins/vault.py
 swh/icinga_plugins/tests/__init__.py
 swh/icinga_plugins/tests/conftest.py
+swh/icinga_plugins/tests/test_base_check.py
 swh/icinga_plugins/tests/test_deposit.py
 swh/icinga_plugins/tests/test_save_code_now.py
 swh/icinga_plugins/tests/test_vault.py
 swh/icinga_plugins/tests/utils.py
 swh/icinga_plugins/tests/web_scenario.py
\ No newline at end of file
diff --git a/swh.icinga_plugins.egg-info/requires.txt b/swh.icinga_plugins.egg-info/requires.txt
index eb46405..0f59140 100644
--- a/swh.icinga_plugins.egg-info/requires.txt
+++ b/swh.icinga_plugins.egg-info/requires.txt
@@ -1,15 +1,17 @@
 click
 psycopg2
 requests
+prometheus-client
+typing
 swh.core[http]>=0.3
 swh.deposit>=0.3
 swh.storage>=0.0.162
 
 [testing]
-pytest<7.0.0
+pytest
 pytest-mock
 requests-mock
 types-click
 types-requests
 types-python-dateutil
 types-PyYAML
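
Side note, not part of the patch: the new .git-blame-ignore-revs file has no effect until git blame is told to read it, so the revision listed above is typically paired with a one-time local configuration:

    git config blame.ignoreRevsFile .git-blame-ignore-revs
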
diff --git a/swh/icinga_plugins/base_check.py b/swh/icinga_plugins/base_check.py
index 7110a9e..08be439 100644
--- a/swh/icinga_plugins/base_check.py
+++ b/swh/icinga_plugins/base_check.py
@@ -1,32 +1,119 @@
 # Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import atexit
+from typing import Any, Dict, List
 
-from typing import Dict
+from prometheus_client import CollectorRegistry, Gauge, Summary, write_to_textfile
 
 
 class BaseCheck:
     DEFAULT_WARNING_THRESHOLD = 60
     DEFAULT_CRITICAL_THRESHOLD = 120
+    PROMETHEUS_METRICS_BASENAME = "swh_e2e_"
 
-    def __init__(self, obj: Dict[str, str]) -> None:
+    def __init__(self, obj: Dict[str, str], application: str):
         self.warning_threshold = float(
             obj.get("warning_threshold", self.DEFAULT_WARNING_THRESHOLD)
         )
         self.critical_threshold = float(
             obj.get("critical_threshold", self.DEFAULT_CRITICAL_THRESHOLD)
         )
+        self.prometheus_enabled = obj.get("prometheus_enabled")
+        self.prometheus_exporter_directory = obj.get("prometheus_exporter_directory")
+        self.environment = obj.get("environment")
+        self.application = application
+
+        # A new registry is created to not export the default process metrics
+        self.registry = CollectorRegistry()
+
+        self.prometheus_metrics: Dict[str, Any] = {}
+
+        atexit.register(self.save_prometheus_metrics)
 
     def get_status(self, value):
         if self.critical_threshold and value >= self.critical_threshold:
             return (2, "CRITICAL")
         elif self.warning_threshold and value >= self.warning_threshold:
             return (1, "WARNING")
         else:
             return (0, "OK")
 
     def print_result(self, status_type, status_string, **metrics):
         print(f"{self.TYPE} {status_type} - {status_string}")
         for (metric_name, metric_value) in sorted(metrics.items()):
             print(f"| '{metric_name}' = {metric_value:.2f}s")
+
+    def collect_prometheus_metric(
+        self, name: str, value: float, labels: List[str] = []
+    ):
+        g = self.prometheus_metrics.get(self.PROMETHEUS_METRICS_BASENAME + name)
+
+        if g is None:
+            raise ValueError(f"No metric {name} found")
+
+        g.labels(*self._get_label_values(labels)).set(value)
+
+    def _get_label_values(self, labels: List[str]) -> List[str]:
+        label_list = []
+
+        if self.environment:
+            label_list.append(self.environment)
+
+        if self.application is None:
+            raise ValueError("Application name must be specified")
+        label_list.append(self.application)
+
+        return label_list + labels
+
+    def _get_label_names(self, values: List[str] = []) -> List[str]:
+        full_list = []
+
+        if self.environment:
+            full_list.append(self.environment)
+        full_list.append("application")
+
+        full_list += values
+
+        return full_list
+
+    def register_prometheus_summary(
+        self, name: str, unit: str, labels: List[str] = []
+    ) -> None:
+        full_name = self.PROMETHEUS_METRICS_BASENAME + name
+
+        self.prometheus_metrics[full_name] = Summary(
+            full_name,
+            "",
+            registry=self.registry,
+            unit=unit,
+            labelnames=self._get_label_names(labels),
+        )
+
+    def register_prometheus_gauge(
+        self, name: str, unit: str, labels: List[str] = []
+    ) -> None:
+        full_name = self.PROMETHEUS_METRICS_BASENAME + name
+
+        self.prometheus_metrics[full_name] = Gauge(
+            name=full_name,
+            documentation="",
+            registry=self.registry,
+            unit=unit,
+            labelnames=self._get_label_names(labels),
+        )
+
+    def save_prometheus_metrics(self) -> None:
+        """Dump on disk the .prom file containing the
+        metrics collected during the check execution.
+
+        It's a callback method triggered by the atexit handler
+        registered in the constructor."""
+        if self.prometheus_enabled:
+            assert self.prometheus_exporter_directory is not None
+
+            filename = (
+                self.prometheus_exporter_directory + "/" + self.application + ".prom"
+            )
+            write_to_textfile(filename, self.registry)
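
A minimal sketch (not part of the patch) of how the new BaseCheck helpers fit together; the DemoCheck class and its configuration values are hypothetical:

    from swh.icinga_plugins.base_check import BaseCheck


    class DemoCheck(BaseCheck):
        TYPE = "DEMO"

        def __init__(self, obj):
            super().__init__(obj, application="demo")
            # Registered under the swh_e2e_ basename, i.e. swh_e2e_duration_seconds
            self.register_prometheus_gauge("duration", "seconds", ["status"])

        def main(self) -> int:
            # Label values line up with _get_label_names(): application, then "status"
            self.collect_prometheus_metric("duration", 1.23, ["succeeded"])
            return 0


    DemoCheck(
        {
            "prometheus_enabled": True,
            "prometheus_exporter_directory": "/tmp",  # hypothetical target directory
        }
    ).main()
    # At interpreter exit, atexit triggers save_prometheus_metrics(), which writes
    # /tmp/demo.prom for the node-exporter textfile collector to pick up.
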
diff --git a/swh/icinga_plugins/cli.py b/swh/icinga_plugins/cli.py
index b9412ae..9ee8bd0 100644
--- a/swh/icinga_plugins/cli.py
+++ b/swh/icinga_plugins/cli.py
@@ -1,147 +1,164 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
 import sys
 
 import click
 
 from swh.core.cli import CONTEXT_SETTINGS
 from swh.core.cli import swh as swh_cli_group
 
 
 @swh_cli_group.group(name="icinga_plugins", context_settings=CONTEXT_SETTINGS)
 @click.option("-w", "--warning", type=int, help="Warning threshold.")
 @click.option("-c", "--critical", type=int, help="Critical threshold.")
+@click.option("--prometheus-exporter/--no-prometheus-exporter", default=False)
+@click.option(
+    "--prometheus-exporter-directory",
+    type=str,
+    default="/var/lib/prometheus/node-exporter",
+)
+@click.option("--environment", type=str, help="The tested environment")
 @click.pass_context
-def icinga_cli_group(ctx, warning, critical):
-    """Main command for Icinga plugins
-    """
+def icinga_cli_group(
+    ctx,
+    warning,
+    critical,
+    prometheus_exporter: bool,
+    prometheus_exporter_directory: str,
+    environment: str,
+):
+    """Main command for Icinga plugins"""
     ctx.ensure_object(dict)
     if warning:
         ctx.obj["warning_threshold"] = int(warning)
     if critical:
         ctx.obj["critical_threshold"] = int(critical)
 
+    ctx.obj["prometheus_enabled"] = prometheus_exporter
+    ctx.obj["prometheus_exporter_directory"] = prometheus_exporter_directory
+    ctx.obj["environment"] = environment
+
 
 @icinga_cli_group.group(name="check-vault")
 @click.option(
     "--swh-storage-url", type=str, required=True, help="URL to an swh-storage HTTP API"
 )
 @click.option(
     "--swh-web-url", type=str, required=True, help="URL to an swh-web instance"
 )
 @click.option(
     "--poll-interval",
     type=int,
     default=10,
     help="Interval (in seconds) between two polls to the API, "
     "to check for cooking status.",
 )
 @click.pass_context
 def check_vault(ctx, **kwargs):
     ctx.obj.update(kwargs)
 
 
 @check_vault.command(name="directory")
 @click.pass_context
 def check_vault_directory(ctx):
     """Picks a random directory, requests its cooking via swh-web,
     and waits for completion."""
     from .vault import VaultCheck
 
     sys.exit(VaultCheck(ctx.obj).main())
 
 
 @icinga_cli_group.group(name="check-savecodenow")
 @click.option(
     "--swh-web-url", type=str, required=True, help="URL to an swh-web instance"
 )
 @click.option(
     "--poll-interval",
     type=int,
     default=10,
     help="Interval (in seconds) between two polls to the API, "
     "to check for save code now status.",
 )
 @click.pass_context
 def check_scn(ctx, **kwargs):
     ctx.obj.update(kwargs)
 
 
 @check_scn.command(name="origin")
 @click.argument("origin", type=str)
 @click.option("--visit-type", type=str, required=True, help="Visit type for origin")
 @click.pass_context
 def check_scn_origin(ctx, origin, visit_type):
     """Requests a save code now via the api for a given origin with type
     visit_type, waits for its completion, reports the approximate time of
     completion (failed or succeeded), and warns if the threshold is exceeded.
 
     """
     from .save_code_now import SaveCodeNowCheck
 
     sys.exit(SaveCodeNowCheck(ctx.obj, origin, visit_type).main())
 
 
 @icinga_cli_group.group(name="check-deposit")
 @click.option(
     "--server",
     type=str,
     default="https://deposit.softwareheritage.org/1",
     help="URL to the SWORD server to test",
 )
 @click.option(
     "--provider-url",
     type=str,
     required=True,
     help=(
         "Root URL of the deposit client, as defined in the "
         "'deposit_client.provider_url' column in the deposit DB"
     ),
 )
 @click.option("--username", type=str, required=True, help="Login for the SWORD server")
 @click.option(
     "--password", type=str, required=True, help="Password for the SWORD server"
 )
 @click.option(
     "--collection",
     type=str,
     required=True,
     help="Software collection to use on the SWORD server",
 )
 @click.option(
     "--poll-interval",
     type=int,
     default=10,
     help="Interval (in seconds) between two polls to the API, "
     "to check for ingestion status.",
 )
 @click.option(
     "--swh-web-url", type=str, required=True, help="URL to an swh-web instance"
 )
 @click.pass_context
 def check_deposit(ctx, **kwargs):
     ctx.obj.update(kwargs)
 
 
 @check_deposit.command(name="single")
 @click.option(
     "--archive", type=click.Path(), required=True, help="Software artefact to upload"
 )
 @click.option(
     "--metadata",
     type=click.Path(),
     required=True,
     help="Metadata file for the software artefact.",
 )
 @click.pass_context
 def check_deposit_single(ctx, **kwargs):
     """Checks that the provided archive and metadata file can be deposited."""
     from .deposit import DepositCheck
 
     ctx.obj.update(kwargs)
     sys.exit(DepositCheck(ctx.obj).main())
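
For reference (not part of the patch), with these new top-level options a check invocation might look like this; the URLs and environment name are made up:

    swh icinga_plugins --environment staging \
        --prometheus-exporter --prometheus-exporter-directory /tmp \
        check-savecodenow --swh-web-url https://archive.softwareheritage.org \
        origin https://example.org/repo.git --visit-type git
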
diff --git a/swh/icinga_plugins/deposit.py b/swh/icinga_plugins/deposit.py
index 8f94c0f..863c4a9 100644
--- a/swh/icinga_plugins/deposit.py
+++ b/swh/icinga_plugins/deposit.py
@@ -1,264 +1,307 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 import sys
 import time
 from typing import Any, Dict, Optional
 
 import requests
 
 from swh.deposit.client import PublicApiDepositClient
 
 from .base_check import BaseCheck
 
 
 class DepositCheck(BaseCheck):
     TYPE = "DEPOSIT"
     DEFAULT_WARNING_THRESHOLD = 120
     DEFAULT_CRITICAL_THRESHOLD = 3600
 
     def __init__(self, obj):
-        super().__init__(obj)
+        super().__init__(obj, application="deposit")
         self.api_url = obj["swh_web_url"].rstrip("/")
         self._poll_interval = obj["poll_interval"]
         self._archive_path = obj["archive"]
         self._metadata_path = obj["metadata"]
         self._collection = obj["collection"]
         self._slug: Optional[str] = None
         self._provider_url = obj["provider_url"]
 
         self._client = PublicApiDepositClient(
             {
                 "url": obj["server"],
                 "auth": {"username": obj["username"], "password": obj["password"]},
             }
         )
 
+        self.register_prometheus_gauge("duration", "seconds", ["step", "status"])
+        self.register_prometheus_gauge("status", "")
+
     def upload_deposit(self):
         slug = (
             "check-deposit-%s"
             % datetime.datetime.fromtimestamp(time.time()).isoformat()
         )
         result = self._client.deposit_create(
             archive=self._archive_path,
             metadata=self._metadata_path,
             collection=self._collection,
             in_progress=False,
             slug=slug,
         )
         self._slug = slug
         self._deposit_id = result["deposit_id"]
         return result
 
     def update_deposit_with_metadata(self) -> Dict[str, Any]:
-        """Trigger a metadata update on the deposit once it's completed.
-
-        """
+        """Trigger a metadata update on the deposit once it's completed."""
         deposit = self.get_deposit_status()
         swhid = deposit["deposit_swh_id"]
         assert deposit["deposit_id"] == self._deposit_id
 
         # We can reuse the initial metadata file we already sent
         return self._client.deposit_update(
             self._collection,
             self._deposit_id,
             self._slug,
             metadata=self._metadata_path,
             swhid=swhid,
         )
 
     def get_deposit_status(self):
         return self._client.deposit_status(
             collection=self._collection, deposit_id=self._deposit_id
         )
 
     def wait_while_status(self, statuses, start_time, metrics, result):
         while result["deposit_status"] in statuses:
             metrics["total_time"] = time.time() - start_time
             if metrics["total_time"] > self.critical_threshold:
                 self.print_result(
                     "CRITICAL",
                     f"Timed out while in status "
                     f'{result["deposit_status"]} '
                     f'({metrics["total_time"]}s since deposit '
                     f"started)",
                     **metrics,
                 )
+
+                self.collect_prometheus_metric(
+                    "duration",
+                    metrics["total_time"],
+                    [result["deposit_status"], "timeout"],
+                )
+                self.collect_prometheus_metric(
+                    "duration", metrics["total_time"], ["", "timeout"]
+                )
+                self.collect_prometheus_metric("status", 2)
+
                 sys.exit(2)
 
             time.sleep(self._poll_interval)
 
             result = self.get_deposit_status()
 
         return result
 
     def main(self):
         start_time = time.time()
         start_datetime = datetime.datetime.fromtimestamp(
             start_time, tz=datetime.timezone.utc
         )
         metrics = {}
 
         # Upload the archive and metadata
         result = self.upload_deposit()
         metrics["upload_time"] = time.time() - start_time
 
         # Wait for validation
         result = self.wait_while_status(["deposited"], start_time, metrics, result)
         metrics["total_time"] = time.time() - start_time
         metrics["validation_time"] = metrics["total_time"] - metrics["upload_time"]
 
         # Check validation succeeded
         if result["deposit_status"] == "rejected":
             self.print_result(
                 "CRITICAL",
                 f'Deposit was rejected: {result["deposit_status_detail"]}',
                 **metrics,
             )
+            self.collect_prometheus_metric(
+                "duration", metrics["validation_time"], ["validation", "rejected"]
+            )
+            self.collect_prometheus_metric(
+                "duration", metrics["total_time"], ["validation", "rejected"]
+            )
+            self.collect_prometheus_metric("status", 2)
             return 2
+        self.collect_prometheus_metric(
+            "duration", metrics["validation_time"], ["validation", "ok"]
+        )
 
         # Wait for loading
         result = self.wait_while_status(
             ["verified", "loading"], start_time, metrics, result
         )
         metrics["total_time"] = time.time() - start_time
         metrics["load_time"] = (
             metrics["total_time"] - metrics["upload_time"] - metrics["validation_time"]
         )
+        self.collect_prometheus_metric(
+            "duration", metrics["load_time"], ["loading", result["deposit_status"]]
+        )
 
         # Check loading succeeded
         if result["deposit_status"] == "failed":
             self.print_result(
                 "CRITICAL",
                 f'Deposit loading failed: {result["deposit_status_detail"]}',
                 **metrics,
             )
+            self.collect_prometheus_metric(
+                "duration", metrics["total_time"], ["total", "failed"]
+            )
+            self.collect_prometheus_metric("status", 2)
             return 2
 
         # Check for unexpected status
         if result["deposit_status"] != "done":
             self.print_result(
                 "CRITICAL",
                 f'Deposit got unexpected status: {result["deposit_status"]} '
                 f'({result["deposit_status_detail"]})',
                 **metrics,
             )
+            self.collect_prometheus_metric(
+                "duration", metrics["total_time"], ["total", result["deposit_status"]]
+            )
+            self.collect_prometheus_metric("status", 2)
             return 2
 
         # Get the SWHID
         if "deposit_swh_id" not in result:
             # if the deposit succeeded immediately (which is rare), it does not
             # contain the SWHID, so we need to re-fetch its status.
             result = self.get_deposit_status()
         if result.get("deposit_swh_id") is None:
             self.print_result(
                 "CRITICAL",
                 f"'deposit_swh_id' missing from result: {result!r}",
                 **metrics,
             )
             return 2
         swhid = result["deposit_swh_id"]
 
         # Check for unexpected status
         if result["deposit_status"] != "done":
             self.print_result(
                 "CRITICAL",
                 f'Deposit status went from "done" to: {result["deposit_status"]} '
                 f'({result["deposit_status_detail"]})',
                 **metrics,
             )
             return 2
 
         # Get metadata list from swh-web
         response = requests.get(
             f"{self.api_url}/api/1/raw-extrinsic-metadata/swhid/{swhid}/",
             params={
                 "authority": f"deposit_client {self._provider_url}",
                 "after": start_datetime.isoformat(),
             },
         )
         if response.status_code != 200:
             self.print_result(
                 "CRITICAL",
                 f"Getting the list of metadata returned code {response.status_code}: "
                 f"{response.content!r}",
                 **metrics,
             )
             return 2
         metadata_objects = response.json()
         expected_origin = f"{self._provider_url}/{self._slug}"
 
         # Filter out objects that were clearly not created by this deposit
         relevant_metadata_objects = [
             d for d in metadata_objects if d.get("origin") == expected_origin
         ]
         if not relevant_metadata_objects:
             self.print_result(
                 "CRITICAL",
                 f"No recent metadata on {swhid} with origin {expected_origin} in: "
                 f"{metadata_objects!r}",
                 **metrics,
             )
             return 2
 
         # Check the metadata was loaded as-is
         metadata_url = relevant_metadata_objects[0]["metadata_url"]
         metadata_file = requests.get(metadata_url).content
         with open(self._metadata_path, "rb") as fd:
             expected_metadata_file = fd.read()
         if metadata_file != expected_metadata_file:
             self.print_result(
                 "CRITICAL",
                 f"Metadata on {swhid} with origin {expected_origin} "
                 f"(at {metadata_url}) differs from uploaded Atom document "
                 f"(at {self._metadata_path})",
                 **metrics,
             )
             return 2
 
         # Everything went fine, check total time wasn't too large and
         # print result
         (status_code, status) = self.get_status(metrics["total_time"])
         self.print_result(
             status,
             f'Deposit took {metrics["total_time"]:.2f}s and succeeded.',
             **metrics,
         )
 
         if status_code != 0:  # Stop if any problem in the initial scenario
+            self.collect_prometheus_metric("status", status_code)
             return status_code
 
         # Initial deposit is now completed; now we can update the deposit with metadata
         result = self.update_deposit_with_metadata()
         total_time = time.time() - start_time
         metrics_update = {
             "total_time": total_time,
             "update_time": (
                 total_time
                 - metrics["upload_time"]
                 - metrics["validation_time"]
                 - metrics["load_time"]
             ),
         }
 
         if "error" in result:
             self.print_result(
                 "CRITICAL",
                 f'Deposit Metadata update failed: {result["error"]} ',
                 **metrics_update,
             )
+            self.collect_prometheus_metric(
+                "duration", metrics["total_time"], ["total", "metadata_error"]
+            )
+            self.collect_prometheus_metric("status", 2)
             return 2
 
         (status_code, status) = self.get_status(metrics_update["total_time"])
         self.print_result(
             status,
             f'Deposit Metadata update took {metrics_update["update_time"]:.2f}s '
             "and succeeded.",
             **metrics_update,
         )
+
+        self.collect_prometheus_metric(
+            "duration", metrics["total_time"], ["total", "done"]
+        )
+        self.collect_prometheus_metric("status", status_code)
         return status_code
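
For orientation (not part of the patch): on a successful run, the gauges collected above end up in the textfile roughly as follows. This is a hand-written approximation, not captured output; prometheus-client also emits # HELP/# TYPE lines, and the _seconds suffix comes from the unit argument at registration time:

    swh_e2e_duration_seconds{application="deposit",step="validation",status="ok"} 10.0
    swh_e2e_duration_seconds{application="deposit",step="loading",status="done"} 20.0
    swh_e2e_duration_seconds{application="deposit",step="total",status="done"} 30.0
    swh_e2e_status{application="deposit"} 0.0
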
diff --git a/swh/icinga_plugins/save_code_now.py b/swh/icinga_plugins/save_code_now.py
index 131c080..1922225 100644
--- a/swh/icinga_plugins/save_code_now.py
+++ b/swh/icinga_plugins/save_code_now.py
@@ -1,113 +1,124 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import time
 from typing import Dict, List
 
 import requests
 
 from .base_check import BaseCheck
 
 REPORT_MSG = "Save code now request for origin"
 WAITING_STATUSES = ("not yet scheduled", "running", "scheduled")
 
 
 class SaveCodeNowCheck(BaseCheck):
     TYPE = "SAVECODENOW"
     DEFAULT_WARNING_THRESHOLD = 60
     DEFAULT_CRITICAL_THRESHOLD = 120
 
     def __init__(self, obj: Dict, origin: str, visit_type: str) -> None:
-        super().__init__(obj)
+        super().__init__(obj, application="scn")
         self.api_url = obj["swh_web_url"].rstrip("/")
         self.poll_interval = obj["poll_interval"]
         self.origin = origin
         self.visit_type = visit_type
 
+        self.register_prometheus_gauge("duration", "seconds", ["status"])
+        self.register_prometheus_gauge("status", "")
+
     @staticmethod
     def api_url_scn(root_api_url: str, origin: str, visit_type: str) -> str:
         """Compute the save code now api url for a given origin"""
         return f"{root_api_url}/api/1/origin/save/{visit_type}/url/{origin}/"
 
     def main(self) -> int:
         """Scenario description:
 
         1. Requests a save code now request via the api for origin self.origin
            with type self.visit_type.
         2. Polls the completion status every self.poll_interval seconds.
         3. When the request either succeeds, fails, or exceeds the threshold,
            reports the approximate time of completion, warning if thresholds
            are exceeded.
 
         """
         start_time: float = time.time()
         total_time: float = 0.0
         scn_url = self.api_url_scn(self.api_url, self.origin, self.visit_type)
         response = requests.post(scn_url)
         assert response.status_code == 200, (response, response.text)
 
         result: Dict = response.json()
 
         status_key = "save_task_status"
         request_date = result["save_request_date"]
         origin_info = (self.visit_type, self.origin)
 
         while result[status_key] in WAITING_STATUSES:
             time.sleep(self.poll_interval)
 
             response = requests.get(scn_url)
             assert (
                 response.status_code == 200
             ), f"Unexpected response: {response}, {response.text}"
             raw_result: List[Dict] = response.json()
             assert len(raw_result) > 0, f"Unexpected result: {raw_result}"
 
             if len(raw_result) > 1:
                 # retrieve only the one status result we are interested in
                 result = next(
                     filter(lambda r: r["save_request_date"] == request_date, raw_result)
                 )
             else:
                 result = raw_result[0]
 
             # this because the api can return multiple entries for the same origin
             assert result["save_request_date"] == request_date
 
             total_time = time.time() - start_time
 
             if total_time > self.critical_threshold:
                 self.print_result(
                     "CRITICAL",
                     f"{REPORT_MSG} {origin_info} took more than {total_time:.2f}s "
                     f'and has status: {result["save_task_status"]}.',
                     total_time=total_time,
                 )
+                self.collect_prometheus_metric("duration", total_time, ["timeout"])
+                self.collect_prometheus_metric("status", 2)
                 return 2
 
         if result[status_key] == "succeeded":
             (status_code, status) = self.get_status(total_time)
             self.print_result(
                 status,
                 f"{REPORT_MSG} {origin_info} took {total_time:.2f}s and succeeded.",
                 total_time=total_time,
             )
+            self.collect_prometheus_metric("duration", total_time, ["succeeded"])
+            self.collect_prometheus_metric("status", status_code)
             return status_code
         elif result[status_key] == "failed":
             self.print_result(
                 "CRITICAL",
                 f"{REPORT_MSG} {origin_info} took {total_time:.2f}s and failed.",
                 total_time=total_time,
             )
+            self.collect_prometheus_metric("duration", total_time, ["failed"])
+            self.collect_prometheus_metric("status", 2)
             return 2
         else:
             self.print_result(
                 "CRITICAL",
                 f"{REPORT_MSG} {origin_info} took {total_time:.2f}s "
                 "and resulted in unsupported status: "
                 f"{result['save_request_status']} ; {result[status_key]}.",
                 total_time=total_time,
             )
+            self.collect_prometheus_metric("duration", total_time, ["failed"])
+            self.collect_prometheus_metric("status", 2)
             return 2
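
Worth keeping in mind (not part of the patch): the checks call collect_prometheus_metric() several times per run, and since the metrics are Gauges, only the last set() per label combination survives in the exported file. A small, self-contained illustration of that semantics using prometheus-client directly:

    from prometheus_client import CollectorRegistry, Gauge, generate_latest

    registry = CollectorRegistry()
    g = Gauge(
        "swh_e2e_demo",  # hypothetical metric name
        "",
        labelnames=["status"],
        unit="seconds",
        registry=registry,
    )
    g.labels("succeeded").set(10)
    g.labels("succeeded").set(30)  # overwrites the previous sample
    print(generate_latest(registry).decode())
    # ... swh_e2e_demo_seconds{status="succeeded"} 30.0
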
diff --git a/swh/icinga_plugins/tests/test_base_check.py b/swh/icinga_plugins/tests/test_base_check.py
new file mode 100644
index 0000000..fe25dd3
--- /dev/null
+++ b/swh/icinga_plugins/tests/test_base_check.py
@@ -0,0 +1,57 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.icinga_plugins.base_check import BaseCheck
+
+
+def test_inexistent_metric():
+    base_check = BaseCheck({}, "test")
+
+    with pytest.raises(ValueError, match="No metric unknown found"):
+        base_check.collect_prometheus_metric("unknown", 10, [])
+
+
+def test_environment():
+    base_check = BaseCheck({"environment": "pytest"}, "test")
+
+    with pytest.raises(ValueError, match="No metric unknown found"):
+        base_check.collect_prometheus_metric("unknown", 10, [])
+
+
+def test_application_not_defined():
+    base_check = BaseCheck({"environment": "pytest"}, "test")
+    base_check.register_prometheus_gauge("gauge", "seconds")
+    base_check.application = None
+
+    with pytest.raises(ValueError, match="Application name must be specified"):
+        base_check.collect_prometheus_metric("gauge", 10, [])
+
+
+def test_save_without_directory(tmpdir):
+    config = {
+        "prometheus_enabled": True,
+    }
+
+    base_check = BaseCheck(config, "test")
+
+    with pytest.raises(AssertionError):
+        base_check.save_prometheus_metrics()
+
+
+def test_save(tmpdir):
+    application = "my_application"
+    config = {
+        "prometheus_enabled": True,
+        "prometheus_exporter_directory": tmpdir.strpath,
+    }
+
+    base_check = BaseCheck(config, application)
+    base_check.register_prometheus_gauge("gauge", "count")
+    base_check.collect_prometheus_metric("gauge", 10)
+    base_check.save_prometheus_metrics()
+
+    assert f"{tmpdir.strpath}/{application}.prom" in tmpdir.listdir()
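
For orientation (not part of the patch): after test_save above, the written my_application.prom should contain, give or take the # HELP/# TYPE comment lines, a single sample along these lines:

    swh_e2e_gauge_count{application="my_application"} 10.0
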
diff --git a/swh/icinga_plugins/tests/test_deposit.py b/swh/icinga_plugins/tests/test_deposit.py
index d0acc62..18c472f 100644
--- a/swh/icinga_plugins/tests/test_deposit.py
+++ b/swh/icinga_plugins/tests/test_deposit.py
@@ -1,925 +1,1018 @@
 # Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 import io
 import os
 import tarfile
 import time
 from typing import Optional
 
 import pytest
 
 from swh.icinga_plugins.tests.utils import invoke
 
 from .web_scenario import WebScenario
 
 POLL_INTERVAL = 10
 
 BASE_URL = "http://swh-deposit.example.org/1"
 BASE_WEB_URL = "http+mock://swh-web.example.org"
 PROVIDER_URL = "http://icinga-checker.example.org"
 
 COMMON_OPTIONS = [
     "--server",
     BASE_URL,
     "--username",
     "test",
     "--password",
     "test",
     "--collection",
     "testcol",
     "--swh-web-url",
     BASE_WEB_URL,
     "--provider-url",
     PROVIDER_URL,
 ]
 
 SAMPLE_METADATA = """
Test Software
swh
test-software
No One
"""
 
 ENTRY_TEMPLATE = """
42
2019-12-19 18:11:00
foo.tar.gz
{status}
http://purl.org/net/sword/package/SimpleZip
"""
 
 STATUS_TEMPLATE = """
42
{status}
{status_detail}%s
"""
 
 
 def compute_origin():
     # This is the same origin the checker would compute, because we mock time.time
     # to be constant until time.sleep is called
     return (
         PROVIDER_URL
         + "/check-deposit-%s"
         % datetime.datetime.fromtimestamp(time.time()).isoformat()
     )
 
 
 def status_template(
     status: str, status_detail: str = "", swhid: Optional[str] = None
 ) -> str:
-    """Generate a proper status template out of status, status_detail and optional swhid
-
-    """
+    """Generate a proper status template out of status, status_detail and optional swhid"""
     if swhid is not None:
         template = (
             STATUS_TEMPLATE % f"\n{swhid}"
         )
         return template.format(status=status, status_detail=status_detail, swhid=swhid)
 
     template = STATUS_TEMPLATE % ""
     return template.format(status=status, status_detail=status_detail)
 
 
 def test_status_template():
     actual_status = status_template(status="deposited")
     assert (
         actual_status
         == """
42
deposited

"""
     )
 
     actual_status = status_template(status="verified", status_detail="detail")
     assert (
         actual_status
         == """
42
verified
detail
"""
     )
 
     actual_status = status_template(
         status="done", swhid="swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
     )
     assert (
         actual_status
         == """
42
done

swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74
"""
     )
 
 
 @pytest.fixture(scope="session")
 def tmp_path(tmp_path_factory):
     return tmp_path_factory.mktemp(__name__)
 
 
 @pytest.fixture(scope="session")
 def sample_metadata(tmp_path):
-    """Returns a sample metadata file's path
-
-    """
+    """Returns a sample metadata file's path"""
     path = os.path.join(tmp_path, "metadata.xml")
     with open(path, "w") as fd:
         fd.write(SAMPLE_METADATA)
     return path
 
 
 @pytest.fixture(scope="session")
 def sample_archive(tmp_path):
-    """Returns a sample archive's path
-
-    """
+    """Returns a sample archive's path"""
     path = os.path.join(tmp_path, "archive.tar.gz")
     with tarfile.open(path, "w:gz") as tf:
         tf.addfile(tarfile.TarInfo("hello.py"), io.BytesIO(b'print("Hello world")'))
     return path
 
 
 def test_deposit_immediate_success(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
-    """Both deposit creation and deposit metadata update passed without delays
-
-    """
+    """Both deposit creation and deposit metadata update passed without delays"""
     origin = compute_origin()
     scenario = WebScenario()
 
     status_xml = status_template(
         status="done",
         status_detail="",
         swhid="swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74",
     )
 
     # Initial deposit
     scenario.add_step(
-        "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="done"),
+        "post",
+        f"{BASE_URL}/testcol/",
+        ENTRY_TEMPLATE.format(status="done"),
     )
 
     # Checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", status_xml)
 
     # Then the checker checks the metadata appeared on the website
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         [
             {
                 "swhid": swhid,
                 "origin": origin,
                 "discovery_date": "2999-03-03T10:48:47+00:00",
                 "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
             }
         ],
     )
     scenario.add_step("get", f"{BASE_WEB_URL}/the-metadata-url", SAMPLE_METADATA)
 
     # Then metadata update
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", status_xml)
     # internal deposit client does call status, then update metadata then status api
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
     scenario.add_step(
-        "put", f"{BASE_URL}/testcol/42/atom/", status_xml,
+        "put",
+        f"{BASE_URL}/testcol/42/atom/",
+        status_xml,
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ]
     )
 
     assert result.output == (
         "DEPOSIT OK - Deposit took 0.00s and succeeded.\n"
         "| 'load_time' = 0.00s\n"
         "| 'total_time' = 0.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 0.00s\n"
         "DEPOSIT OK - Deposit Metadata update took 0.00s and succeeded.\n"
         "| 'total_time' = 0.00s\n"
         "| 'update_time' = 0.00s\n"
     )
     assert result.exit_code == 0, f"Unexpected output: {result.output}"
 
 
 def test_deposit_delays(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     """Deposit creation passed with some delays, deposit metadata update passed
     without delay
 
     """
     origin = compute_origin()
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="loading"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="loading"),
    )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", status_xml)
 
     # Then the checker checks the metadata appeared on the website
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         [
             {
                 "swhid": swhid,
                 "origin": origin,
                 "discovery_date": "2999-03-03T10:48:47+00:00",
                 "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
             }
         ],
     )
     scenario.add_step("get", f"{BASE_WEB_URL}/the-metadata-url", SAMPLE_METADATA)
 
     # Then metadata update
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", status_xml)
     # internal deposit client does call status, then update metadata then status api
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
     scenario.add_step(
-        "put", f"{BASE_URL}/testcol/42/atom/", status_xml,
+        "put",
+        f"{BASE_URL}/testcol/42/atom/",
+        status_xml,
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ]
     )
 
     assert result.output == (
         "DEPOSIT OK - Deposit took 30.00s and succeeded.\n"
         "| 'load_time' = 20.00s\n"
         "| 'total_time' = 30.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
         "DEPOSIT OK - Deposit Metadata update took 0.00s and succeeded.\n"
         "| 'total_time' = 30.00s\n"
         "| 'update_time' = 0.00s\n"
     )
     assert result.exit_code == 0, f"Unexpected output: {result.output}"
 
 
 def test_deposit_then_metadata_update_failed(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
-    """Deposit creation passed, deposit metadata update failed
-
-    """
+    """Deposit creation passed, deposit metadata update failed"""
     origin = compute_origin()
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="loading"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="loading"),
     )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", status_xml)
 
     # Then the checker checks the metadata appeared on the website
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         [
             {
                 "swhid": swhid,
                 "origin": origin,
                 "discovery_date": "2999-03-03T10:48:47+00:00",
                 "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
             }
         ],
     )
     scenario.add_step("get", f"{BASE_WEB_URL}/the-metadata-url", SAMPLE_METADATA)
 
     # Then metadata update calls
     failed_status_xml = status_template(
         status="failed",  # lying here
         status_detail="Failure to ingest",
         swhid="swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74",
     )
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", failed_status_xml)
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", failed_status_xml)
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT OK - Deposit took 30.00s and succeeded.\n"
         "| 'load_time' = 20.00s\n"
         "| 'total_time' = 30.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
         "DEPOSIT CRITICAL - Deposit Metadata update failed: You can only update "
         "metadata on deposit with status 'done' \n"
         "| 'total_time' = 30.00s\n"
         "| 'update_time' = 0.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_delay_warning(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
-    """Deposit creation exceeded delays, no deposit update occurred.
-
-    """
+    """Deposit creation exceeded delays, no deposit update occurred."""
     origin = compute_origin()
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step("get", f"{BASE_URL}/testcol/42/status/", status_xml)
 
     # Then the checker checks the metadata appeared on the website
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         [
             {
                 "swhid": swhid,
                 "origin": origin,
                 "discovery_date": "2999-03-03T10:48:47+00:00",
                 "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
             }
         ],
     )
     scenario.add_step("get", f"{BASE_WEB_URL}/the-metadata-url", SAMPLE_METADATA)
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "--warning",
             "15",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT WARNING - Deposit took 20.00s and succeeded.\n"
         "| 'load_time' = 10.00s\n"
         "| 'total_time' = 20.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 1, f"Unexpected output: {result.output}"
 
 
 def test_deposit_delay_critical(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     origin = compute_origin()
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step(
         "get",
         f"{BASE_URL}/testcol/42/status/",
         status_xml,
         callback=lambda: time.sleep(60),
     )
 
     # Then the checker checks the metadata appeared on the website
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         [
             {
                 "swhid": swhid,
                 "origin": origin,
                 "discovery_date": "2999-03-03T10:48:47+00:00",
                 "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
             }
         ],
     )
     scenario.add_step("get", f"{BASE_WEB_URL}/the-metadata-url", SAMPLE_METADATA)
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
             "--critical",
             "50",
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT CRITICAL - Deposit took 80.00s and succeeded.\n"
         "| 'load_time' = 70.00s\n"
         "| 'total_time' = 80.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_timeout(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     scenario = WebScenario()
 
     scenario.add_step(
         "post",
         f"{BASE_URL}/testcol/",
         ENTRY_TEMPLATE.format(status="deposited"),
         callback=lambda: time.sleep(1500),
     )
     scenario.add_step(
         "get",
         f"{BASE_URL}/testcol/42/status/",
         status_template(status="verified"),
         callback=lambda: time.sleep(1500),
     )
     scenario.add_step(
         "get",
         f"{BASE_URL}/testcol/42/status/",
         status_template(status="loading"),
         callback=lambda: time.sleep(1500),
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT CRITICAL - Timed out while in status loading "
         "(4520.0s since deposit started)\n"
         "| 'total_time' = 4520.00s\n"
         "| 'upload_time' = 1500.00s\n"
         "| 'validation_time' = 1510.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_metadata_missing(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     origin = compute_origin()
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
 
     # Then the checker checks the metadata appeared on the website
     metadata_list = [
         {
             # Filtered out, because wrong origin
             "swhid": swhid,
             "origin": "http://wrong-origin.example.org",
             "discovery_date": "2999-03-03T10:48:47+00:00",
             "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
         },
     ]
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         metadata_list,
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         f"DEPOSIT CRITICAL - No recent metadata on {swhid} with origin {origin} in: "
         f"{metadata_list!r}\n"
         "| 'load_time' = 10.00s\n"
         "| 'total_time' = 20.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_metadata_error(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
 
     # Then the checker checks the metadata appeared on the website
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         "foo\nbar",
         status_code=400,
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT CRITICAL - Getting the list of metadata returned code 400: "
         "b'foo\\nbar'\n"
         "| 'load_time' = 10.00s\n"
         "| 'total_time' = 20.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_metadata_corrupt(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     origin = compute_origin()
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
 
     # Deposit done, checker gets the SWHID
     swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
-    status_xml = status_template(status="done", status_detail="", swhid=swhid,)
+    status_xml = status_template(
+        status="done",
+        status_detail="",
+        swhid=swhid,
+    )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_xml,
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_xml,
     )
 
     # Then the checker checks the metadata appeared on the website
     metadata_list = [
         {
             "swhid": swhid,
             "origin": origin,
             "discovery_date": "2999-03-03T09:48:47+00:00",
             "metadata_url": f"{BASE_WEB_URL}/the-metadata-url",
         },
     ]
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/api/1/raw-extrinsic-metadata/swhid/{swhid}/"
         f"?authority=deposit_client+http%3A%2F%2Ficinga-checker.example.org"
         f"&after=2022-03-04T17%3A02%3A39%2B00%3A00",
         metadata_list,
     )
     scenario.add_step(
         "get",
         f"{BASE_WEB_URL}/the-metadata-url",
         SAMPLE_METADATA[0:-1],  # corrupting the metadata by dropping the last byte
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         f"DEPOSIT CRITICAL - Metadata on {swhid} with origin {origin} (at "
         f"{BASE_WEB_URL}/the-metadata-url) differs from uploaded Atom document (at "
         f"{sample_metadata})\n"
         "| 'load_time' = 10.00s\n"
         "| 'total_time' = 20.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_rejected(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
         "get",
         f"{BASE_URL}/testcol/42/status/",
         status_template(status="rejected", status_detail="booo"),
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT CRITICAL - Deposit was rejected: booo\n"
         "| 'total_time' = 10.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_failed(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="loading"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="loading"),
     )
     scenario.add_step(
         "get",
         f"{BASE_URL}/testcol/42/status/",
         status_template(status="failed", status_detail="booo"),
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT CRITICAL - Deposit loading failed: booo\n"
         "| 'load_time' = 20.00s\n"
         "| 'total_time' = 30.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
 
 
 def test_deposit_unexpected_status(
     requests_mock, mocker, sample_archive, sample_metadata, mocked_time
 ):
     scenario = WebScenario()
 
     scenario.add_step(
         "post", f"{BASE_URL}/testcol/", ENTRY_TEMPLATE.format(status="deposited")
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="verified"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="verified"),
     )
     scenario.add_step(
-        "get", f"{BASE_URL}/testcol/42/status/", status_template(status="loading"),
+        "get",
+        f"{BASE_URL}/testcol/42/status/",
+        status_template(status="loading"),
     )
     scenario.add_step(
         "get",
         f"{BASE_URL}/testcol/42/status/",
         status_template(status="what", status_detail="booo"),
     )
 
     scenario.install_mock(requests_mock)
 
     result = invoke(
         [
+            "--prometheus-exporter",
+            "--prometheus-exporter-directory",
+            "/tmp",
             "check-deposit",
             *COMMON_OPTIONS,
             "single",
             "--archive",
             sample_archive,
             "--metadata",
             sample_metadata,
         ],
         catch_exceptions=True,
     )
 
     assert result.output == (
         "DEPOSIT CRITICAL - Deposit got unexpected status: what (booo)\n"
         "| 'load_time' = 20.00s\n"
         "| 'total_time' = 30.00s\n"
         "| 'upload_time' = 0.00s\n"
         "| 'validation_time' = 10.00s\n"
     )
     assert result.exit_code == 2, f"Unexpected output: {result.output}"
SaveCodeNowCheck, ) from .utils import invoke from .web_scenario import WebScenario def fake_response( origin: str, visit_type: str, sor_status: str = "pending", task_status: Optional[str] = None, ) -> Dict: """Fake a save code now request api response""" visit_date = None if task_status in ("failed", "succeeded"): visit_date = str(datetime.now(tz=timezone.utc)) return { "visit_type": visit_type, "origin_url": origin, "save_request_date": "to-replace", "save_request_status": sor_status, "save_task_status": task_status, "visit_date": visit_date, } @pytest.fixture def origin_info() -> Tuple[str, str]: - """Build an origin info to request save code now - - """ + """Build an origin info to request save code now""" origin_name = random.choice(range(10)) return random.choice(["git", "svn", "hg"]), f"mock://fake-origin-url/{origin_name}" def test_save_code_now_success(requests_mock, mocker, mocked_time, origin_info): """Successful ingestion scenario below threshold""" scenario = WebScenario() visit_type, origin = origin_info root_api_url = "mock://swh-web.example.org" api_url = SaveCodeNowCheck.api_url_scn(root_api_url, origin, visit_type) # creation request scenario.add_step( "post", api_url, fake_response(origin, visit_type, "accepted", "not yet scheduled"), ) response_scheduled = fake_response(origin, visit_type, "accepted", "scheduled") # status polling requests scenario.add_step("get", api_url, [response_scheduled]) # sometimes we can have multiple response so we fake that here scenario.add_step("get", api_url, [response_scheduled, response_scheduled]) scenario.add_step( "get", api_url, [fake_response(origin, visit_type, "accepted", "succeeded")] ) scenario.install_mock(requests_mock) # fmt: off result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", "/tmp", "check-savecodenow", "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ] ) # fmt: on assert result.output == ( f"{SaveCodeNowCheck.TYPE} OK - {REPORT_MSG} {origin_info} took " f"30.00s and succeeded.\n" f"| 'total_time' = 30.00s\n" ) assert result.exit_code == 0, f"Unexpected result: {result.output}" def test_save_code_now_failure(requests_mock, mocker, mocked_time, origin_info): """Failed ingestion scenario should be reported""" scenario = WebScenario() visit_type, origin = origin_info root_api_url = "mock://swh-web.example.org" api_url = SaveCodeNowCheck.api_url_scn(root_api_url, origin, visit_type) # creation request scenario.add_step( "post", api_url, fake_response(origin, visit_type, "accepted", "not yet scheduled"), ) # status polling requests scenario.add_step( "get", api_url, [fake_response(origin, visit_type, "accepted", "scheduled")] ) scenario.add_step( "get", api_url, [fake_response(origin, visit_type, "accepted", "failed")] ) scenario.install_mock(requests_mock) # fmt: off result = invoke( [ "check-savecodenow", "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ], catch_exceptions=True, ) # fmt: on assert result.output == ( f"{SaveCodeNowCheck.TYPE} CRITICAL - {REPORT_MSG} {origin_info} took " f"20.00s and failed.\n" f"| 'total_time' = 20.00s\n" ) assert result.exit_code == 2, f"Unexpected result: {result.output}" def test_save_code_now_pending_state_unsupported( requests_mock, mocker, mocked_time, origin_info ): """Pending save requests are not supported in the test so they should fail early Pending requests are requests that need a moderator to accept the repository into the save code now flow. 
Do not actually use such an origin to trigger the checks. """ scenario = WebScenario() visit_type, origin = origin_info root_api_url = "mock://swh-web2.example.org" api_url = SaveCodeNowCheck.api_url_scn(root_api_url, origin, visit_type) # creation request scenario.add_step( - "post", api_url, fake_response(origin, visit_type, "pending", "not created"), + "post", + api_url, + fake_response(origin, visit_type, "pending", "not created"), ) scenario.install_mock(requests_mock) # fmt: off result = invoke( [ "check-savecodenow", "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ], catch_exceptions=True, ) # fmt: on assert result.output == ( f"{SaveCodeNowCheck.TYPE} CRITICAL - {REPORT_MSG} {origin_info} took " f"0.00s and resulted in unsupported status: pending ; not created.\n" f"| 'total_time' = 0.00s\n" ) assert result.exit_code == 2, f"Unexpected output: {result.output}" def test_save_code_now_threshold_exceeded( requests_mock, mocker, mocked_time, origin_info ): - """Saving requests exceeding threshold should mention warning in output - - """ + """Save requests exceeding the threshold should be reported in the output""" scenario = WebScenario() visit_type, origin = origin_info root_api_url = "mock://swh-web2.example.org" api_url = SaveCodeNowCheck.api_url_scn(root_api_url, origin, visit_type) # creation request scenario.add_step( "post", api_url, fake_response(origin, visit_type, "accepted", "not yet scheduled"), ) # we'll keep the responses in a waiting status # beyond 13 polls, this will exceed the threshold for i in range(13): waiting_status = random.choice(WAITING_STATUSES) response_scheduled = fake_response( origin, visit_type, "accepted", waiting_status ) scenario.add_step("get", api_url, [response_scheduled]) scenario.install_mock(requests_mock) # fmt: off result = invoke( [ - "check-savecodenow", "--swh-web-url", root_api_url, + "check-savecodenow", + "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ], catch_exceptions=True, ) # fmt: on assert result.output == ( f"{SaveCodeNowCheck.TYPE} CRITICAL - {REPORT_MSG} {origin_info} took " f"more than 130.00s and has status: {waiting_status}.\n" f"| 'total_time' = 130.00s\n" ) assert result.exit_code == 2, f"Unexpected output: {result.output}" def test_save_code_now_unexpected_failure( requests_mock, mocker, mocked_time, origin_info ): """Unexpected failure, for example when the web API refuses to answer""" scenario = WebScenario() visit_type, origin = origin_info root_api_url = "mock://swh-web.example.org" api_url = SaveCodeNowCheck.api_url_scn(root_api_url, origin, visit_type) # creation request scenario.add_step( "post", api_url, fake_response(origin, visit_type, "accepted", "not yet scheduled"), ) # status polling requests scenario.add_step( "get", api_url, [fake_response(origin, visit_type, "accepted", "scheduled")] ) # unexpected issue when communicating with the API scenario.add_step("get", api_url, {}, status_code=500) scenario.install_mock(requests_mock) with pytest.raises(AssertionError): # fmt: off invoke( [ "check-savecodenow", "--swh-web-url", root_api_url, "origin", origin, "--visit-type", visit_type, ], ) # fmt: on diff --git a/swh/icinga_plugins/tests/test_vault.py b/swh/icinga_plugins/tests/test_vault.py index e421a90..07beb09 100644 --- a/swh/icinga_plugins/tests/test_vault.py +++ b/swh/icinga_plugins/tests/test_vault.py @@ -1,530 +1,551 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution #
License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import tarfile import time from swh.icinga_plugins.tests.utils import invoke from .web_scenario import WebScenario DIR_ID = "ab" * 20 url_api = f"mock://swh-web.example.org/api/1/vault/directory/{DIR_ID}/" url_fetch = f"mock://swh-web.example.org/api/1/vault/directory/{DIR_ID}/raw/" def _make_tarfile(): fd = io.BytesIO() with tarfile.open(fileobj=fd, mode="w:gz") as tf: tf.addfile(tarfile.TarInfo(f"swh:1:dir:{DIR_ID}/README"), b"this is a readme\n") tarinfo = tarfile.TarInfo(f"swh:1:dir:{DIR_ID}") tarinfo.type = tarfile.DIRTYPE tf.addfile(tarinfo) return fd.getvalue() TARBALL = _make_tarfile() response_pending = { "obj_id": DIR_ID, "obj_type": "directory", "progress_message": "foo", "status": "pending", } response_done = { "fetch_url": url_fetch, "id": 9, "obj_id": DIR_ID, "obj_type": "directory", "status": "done", } response_done_no_fetch = { "id": 9, "obj_id": DIR_ID, "obj_type": "directory", "status": "done", } response_failed = { "obj_id": DIR_ID, "obj_type": "directory", "progress_message": "foobar", "status": "failed", } response_unknown_status = { "obj_id": DIR_ID, "obj_type": "directory", "progress_message": "what", "status": "boo", } class FakeStorage: def __init__(self, foo, **kwargs): pass def directory_get_random(self): return bytes.fromhex(DIR_ID) def test_vault_immediate_success(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( "get", url_fetch, TARBALL, headers={"Content-Type": "application/gzip"} ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ] ) assert result.output == ( f"VAULT OK - cooking directory {DIR_ID} took " f"10.00s and succeeded.\n" f"| 'total_time' = 10.00s\n" ) assert result.exit_code == 0, result.output def test_vault_delayed_success(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( "get", url_fetch, TARBALL, headers={"Content-Type": "application/gzip"} ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ] ) assert result.output == ( f"VAULT OK - cooking directory {DIR_ID} took " f"20.00s and succeeded.\n" f"| 'total_time' = 20.00s\n" ) assert result.exit_code == 0, result.output def test_vault_failure(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_failed) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ + "--prometheus-exporter", + 
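# export Prometheus metrics to the directory given below +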
"--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( f"VAULT CRITICAL - cooking directory {DIR_ID} took " f"10.00s and failed with: foobar\n" f"| 'total_time' = 10.00s\n" ) assert result.exit_code == 2, result.output def test_vault_unknown_status(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_unknown_status) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( f"VAULT CRITICAL - cooking directory {DIR_ID} took " f"10.00s and resulted in unknown status: boo\n" f"| 'total_time' = 10.00s\n" ) assert result.exit_code == 2, result.output def test_vault_timeout(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_pending) scenario.add_step( "get", url_api, response_pending, callback=lambda: time.sleep(4000) ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( f"VAULT CRITICAL - cooking directory {DIR_ID} took more than " f"4020.00s and has status: foo\n" f"| 'total_time' = 4020.00s\n" ) assert result.exit_code == 2, result.output def test_vault_cached_directory(requests_mock, mocker, mocked_time): """First serves a directory that's already in the cache, to test that vault_check requests another one.""" scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=200) scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( "get", url_fetch, TARBALL, headers={"Content-Type": "application/gzip"} ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ] ) assert result.output == ( f"VAULT OK - cooking directory {DIR_ID} took " f"10.00s and succeeded.\n" f"| 'total_time' = 10.00s\n" ) assert result.exit_code == 0, result.output def test_vault_no_directory(requests_mock, mocker, mocked_time): """Tests with an empty storage""" scenario = WebScenario() scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage mocker.patch(f"{__name__}.FakeStorage.directory_get_random", 
return_value=None) result = invoke( [ + "--prometheus-exporter", + "--prometheus-exporter-directory", + "/tmp", "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ("VAULT CRITICAL - No directory exists in the archive.\n") assert result.exit_code == 2, result.output def test_vault_fetch_failed(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( "get", url_fetch, "", status_code=500, headers={"Content-Type": "application/gzip"}, ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( f"VAULT CRITICAL - cooking directory {DIR_ID} took " f"10.00s and succeeded, but fetch failed with status code 500.\n" f"| 'total_time' = 10.00s\n" ) assert result.exit_code == 2, result.output def test_vault_fetch_missing_content_type(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step("get", url_fetch, "") scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( "VAULT CRITICAL - Unexpected Content-Type when downloading bundle: None\n" "| 'total_time' = 10.00s\n" ) assert result.exit_code == 2, result.output def test_vault_corrupt_tarball_gzip(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( "get", url_fetch, b"this-is-not-a-tarball", headers={"Content-Type": "application/gzip"}, ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( "VAULT CRITICAL - ReadError while reading tarball: not a gzip file\n" "| 'total_time' = 20.00s\n" ) assert result.exit_code == 2, result.output def test_vault_corrupt_tarball_member(requests_mock, mocker, mocked_time): fd = io.BytesIO() with tarfile.open(fileobj=fd, mode="w:gz") as tf: tf.addfile(tarfile.TarInfo("wrong_dir_name/README"), b"this is a readme\n") tarball = fd.getvalue() scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( - "get", url_fetch, tarball, headers={"Content-Type": "application/gzip"}, + "get", + 
url_fetch, + tarball, + headers={"Content-Type": "application/gzip"}, ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( "VAULT CRITICAL - Unexpected member in tarball: wrong_dir_name/README\n" "| 'total_time' = 20.00s\n" ) assert result.exit_code == 2, result.output def test_vault_empty_tarball(requests_mock, mocker, mocked_time): fd = io.BytesIO() with tarfile.open(fileobj=fd, mode="w:gz"): pass tarball = fd.getvalue() print(tarball) scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_pending) scenario.add_step("get", url_api, response_done) scenario.add_step( - "get", url_fetch, tarball, headers={"Content-Type": "application/gzip"}, + "get", + url_fetch, + tarball, + headers={"Content-Type": "application/gzip"}, ) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) # This error message will need to be updated when https://bugs.python.org/issue46922 # is resolved. assert result.output == ( "VAULT CRITICAL - StreamError while reading tarball (empty file?): " "seeking backwards is not allowed\n" "| 'total_time' = 20.00s\n" ) assert result.exit_code == 2, result.output def test_vault_no_fetch_url(requests_mock, mocker, mocked_time): scenario = WebScenario() scenario.add_step("get", url_api, {}, status_code=404) scenario.add_step("post", url_api, response_pending) scenario.add_step("get", url_api, response_done_no_fetch) scenario.install_mock(requests_mock) get_storage_mock = mocker.patch("swh.icinga_plugins.vault.get_storage") get_storage_mock.side_effect = FakeStorage result = invoke( [ "check-vault", "--swh-web-url", "mock://swh-web.example.org", "--swh-storage-url", "foo://example.org", "directory", ], catch_exceptions=True, ) assert result.output == ( f"VAULT CRITICAL - cooking directory {DIR_ID} took 10.00s and succeeded, " f"but API response did not contain a fetch_url.\n" f"| 'total_time' = 10.00s\n" ) assert result.exit_code == 2, result.output diff --git a/swh/icinga_plugins/tests/web_scenario.py b/swh/icinga_plugins/tests/web_scenario.py index 18a7e90..454bb47 100644 --- a/swh/icinga_plugins/tests/web_scenario.py +++ b/swh/icinga_plugins/tests/web_scenario.py @@ -1,94 +1,96 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Wrapper around requests-mock to mock successive responses from a web service. 
Tests can build successive steps by calling :py:meth:`WebScenario.add_step` with specifications of what endpoints should be called and in what order.""" import dataclasses import json from typing import Callable, Dict, List, Optional, Set, Union import requests_mock @dataclasses.dataclass(frozen=True) class Step: expected_method: str expected_url: str response: Union[str, bytes, Dict, List] status_code: int = 200 headers: Dict[str, str] = dataclasses.field(default_factory=dict) callback: Optional[Callable[[], int]] = None @dataclasses.dataclass(frozen=True) class Endpoint: method: str url: str class WebScenario: """Stores the state of the successive calls to the web service expected by tests.""" _steps: List[Step] _endpoints: Set[Endpoint] _current_step: int def __init__(self): self._steps = [] self._endpoints = set() self._current_step = 0 def add_endpoint(self, *args, **kwargs): """Adds an endpoint to be mocked. Arguments are the same as :py:class:`Endpoint`. """ self._endpoints.add(Endpoint(*args, **kwargs)) def add_step(self, *args, **kwargs): """Adds an expected call to the list of expected calls. Also automatically calls :py:meth:`add_endpoint` so the associated endpoint is mocked. Arguments are the same as :py:class:`Step`. """ step = Step(*args, **kwargs) self._steps.append(step) self.add_endpoint(step.expected_method, step.expected_url) def install_mock(self, mocker: requests_mock.Mocker): """Mocks endpoints registered with :py:meth:`add_endpoint` (or :py:meth:`add_step`) using the provided mocker. """ for endpoint in self._endpoints: mocker.register_uri( - endpoint.method.upper(), endpoint.url, content=self._request_callback, + endpoint.method.upper(), + endpoint.url, + content=self._request_callback, ) def _request_callback(self, request, context): step = self._steps[self._current_step] assert request.url == step.expected_url assert request.method.upper() == step.expected_method.upper() self._current_step += 1 context.status_code = step.status_code context.headers.update(step.headers) if step.callback: step.callback() if isinstance(step.response, str): return step.response.encode() elif isinstance(step.response, bytes): return step.response else: return json.dumps(step.response).encode() diff --git a/swh/icinga_plugins/vault.py b/swh/icinga_plugins/vault.py index 25d8693..8265920 100644 --- a/swh/icinga_plugins/vault.py +++ b/swh/icinga_plugins/vault.py @@ -1,174 +1,213 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tarfile import time +from typing import List import requests from swh.storage import get_storage from .base_check import BaseCheck class NoDirectory(Exception): pass class VaultCheck(BaseCheck): TYPE = "VAULT" DEFAULT_WARNING_THRESHOLD = 0 DEFAULT_CRITICAL_THRESHOLD = 3600 def __init__(self, obj): - super().__init__(obj) + super().__init__(obj, application="vault") self._swh_storage = get_storage("remote", url=obj["swh_storage_url"]) self._swh_web_url = obj["swh_web_url"] self._poll_interval = obj["poll_interval"] + self.register_prometheus_gauge("status", "") + self.register_prometheus_gauge("duration", "seconds", ["step", "status"]) + def _url_for_dir(self, dir_id): return self._swh_web_url + f"/api/1/vault/directory/{dir_id.hex()}/" def _pick_directory(self): dir_ =
self._swh_storage.directory_get_random() if dir_ is None: raise NoDirectory() return dir_ def _pick_uncached_directory(self): while True: dir_id = self._pick_directory() response = requests.get(self._url_for_dir(dir_id)) if response.status_code == 404: return dir_id + def _collect_prometheus_metrics( + self, status: int, duration: float, labels: List[str] + ) -> None: + self.collect_prometheus_metric("status", status) + self.collect_prometheus_metric( + "duration", duration, labels, + ) + def main(self): try: dir_id = self._pick_uncached_directory() except NoDirectory: self.print_result("CRITICAL", "No directory exists in the archive.") return 2 start_time = time.time() total_time = 0 response = requests.post(self._url_for_dir(dir_id)) assert response.status_code == 200, (response, response.text) result = response.json() while result["status"] in ("new", "pending"): time.sleep(self._poll_interval) response = requests.get(self._url_for_dir(dir_id)) assert response.status_code == 200, (response, response.text) result = response.json() total_time = time.time() - start_time if total_time > self.critical_threshold: self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took more than " f"{total_time:.2f}s and has status: " f'{result["progress_message"]}', total_time=total_time, ) + + self._collect_prometheus_metrics(2, total_time, ["cooking", "timeout"]) + return 2 if result["status"] == "failed": self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f'and failed with: {result["progress_message"]}', total_time=total_time, ) + + self._collect_prometheus_metrics(2, total_time, ["cooking", "failed"]) + return 2 elif result["status"] != "done": self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f'and resulted in unknown status: {result["status"]}', total_time=total_time, ) + + self._collect_prometheus_metrics(2, total_time, ["cooking", "unknown"]) return 2 (status_code, status) = self.get_status(total_time) if "fetch_url" not in result: self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f"and succeeded, but API response did not contain a fetch_url.", total_time=total_time, ) + self._collect_prometheus_metrics(2, total_time, ["fetch", "no_url"]) return 2 with requests.get(result["fetch_url"], stream=True) as fetch_response: try: fetch_response.raise_for_status() except requests.HTTPError: self.print_result( "CRITICAL", f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f"and succeeded, but fetch failed with status code " f"{fetch_response.status_code}.", total_time=total_time, ) + self._collect_prometheus_metrics(2, total_time, ["fetch", "error"]) return 2 content_type = fetch_response.headers.get("Content-Type") if content_type != "application/gzip": self.print_result( "CRITICAL", f"Unexpected Content-Type when downloading bundle: {content_type}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["download", "unexpected_content_type"] + ) return 2 try: with tarfile.open(fileobj=fetch_response.raw, mode="r|gz") as tf: # Note that we are streaming the tarfile from the network, # so we are allowed at most one pass on the tf object; # and the sooner we close it the better. 
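# ("r|gz" is stream mode: tarfile raises StreamError on any backwards seek.)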
# Fortunately, checking only the first member is good enough: tarinfo = tf.next() swhid = f"swh:1:dir:{dir_id.hex()}" if tarinfo.name != swhid and not tarinfo.name.startswith( f"{swhid}/" ): self.print_result( "CRITICAL", f"Unexpected member in tarball: {tarinfo.name}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 except tarfile.ReadError as e: self.print_result( "CRITICAL", f"ReadError while reading tarball: {e}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 except tarfile.StreamError as e: if e.args[0] == "seeking backwards is not allowed": # Probably https://bugs.python.org/issue46922 self.print_result( "CRITICAL", f"StreamError while reading tarball (empty file?): {e}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 self.print_result( "CRITICAL", f"StreamError while reading tarball: {e}", total_time=total_time, ) + self._collect_prometheus_metrics( + 2, total_time, ["check", "archive_content"] + ) return 2 self.print_result( status, f"cooking directory {dir_id.hex()} took {total_time:.2f}s " f"and succeeded.", total_time=total_time, ) + + self._collect_prometheus_metrics(status_code, total_time, ["end", ""]) return status_code diff --git a/tox.ini b/tox.ini index 601a983..9b44907 100644 --- a/tox.ini +++ b/tox.ini @@ -1,73 +1,74 @@ [tox] envlist=black,flake8,mypy,py3 [testenv:py3] deps = .[testing] pytest-cov commands = pytest --doctest-modules \ {envsitepackagesdir}/swh/icinga_plugins \ --cov={envsitepackagesdir}/swh/icinga_plugins \ --cov-branch {posargs} [testenv:black] skip_install = true deps = - black==19.10b0 + black==22.3.0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = - flake8 + flake8==4.0.1 + flake8-bugbear==22.3.23 commands = {envpython} -m flake8 [testenv:mypy] skip_install = true deps = .[testing] - mypy==0.920 + mypy==0.942 commands = mypy swh # build documentation outside swh-environment using the current # git HEAD of swh-docs, is executed on CI for each diff to prevent # breaking doc build [testenv:sphinx] whitelist_externals = make usedevelop = true extras = testing deps = # fetch and install swh-docs in develop mode -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs # build documentation only inside swh-environment using local state # of swh-docs package [testenv:sphinx-dev] whitelist_externals = make usedevelop = true extras = testing deps = # install swh-docs in develop mode -e ../swh-docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs
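As a quick illustration of the WebScenario helper touched by this diff, here is a minimal, hypothetical pytest snippet (not part of the changeset above); it assumes the requests-mock pytest plugin provides the requests_mock fixture, as in the test modules it accompanies:

import requests

from swh.icinga_plugins.tests.web_scenario import WebScenario


def test_polling_scenario(requests_mock):
    scenario = WebScenario()
    url = "mock://swh-web.example.org/api/1/ping/"  # hypothetical endpoint
    # Steps are consumed in order: the first GET sees "pending", the next "done".
    scenario.add_step("get", url, {"status": "pending"})
    scenario.add_step("get", url, {"status": "done"})
    scenario.install_mock(requests_mock)

    assert requests.get(url).json() == {"status": "pending"}
    assert requests.get(url).json() == {"status": "done"}

Dict responses are serialized by _request_callback with json.dumps, so .json() on the client side round-trips them.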