From 878a4fc092135ce45dc1ce1211e0b8ea8677b2a4 Mon Sep 17 00:00:00 2001 From: Alex Tau Date: Fri, 15 Aug 2025 20:11:32 +0300 Subject: [PATCH] send start and stop healthchecks signals correctly --- modules/options.nix | 6 ++---- src/lego_monitoring/__init__.py | 12 +++++++---- src/lego_monitoring/alerting/enum.py | 2 +- src/lego_monitoring/alerting/sender.py | 30 ++++++-------------------- src/lego_monitoring/checks/__init__.py | 1 + src/lego_monitoring/checks/self.py | 30 ++++++++++++++++++++++++++ src/lego_monitoring/config/enums.py | 3 +-- src/lego_monitoring/core/checkers.py | 8 +++---- 8 files changed, 54 insertions(+), 38 deletions(-) create mode 100644 src/lego_monitoring/checks/self.py diff --git a/modules/options.nix b/modules/options.nix index 8af2ab5..128dd72 100644 --- a/modules/options.nix +++ b/modules/options.nix @@ -26,8 +26,7 @@ in enabledCheckSets = lib.mkOption { type = lib.types.listOf (lib.types.enum [ - "start" - "stop" + "self" "remind" "cpu" @@ -40,8 +39,7 @@ in default = [ ]; description = '' List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets: - * start -- send an alert when lego-monitoring is started - * stop -- send an alert when lego-monitoring is stopped + * self -- send an alert when lego-monitoring is started and stopped * remind -- periodically (daily by default) remind about ongoing unresolved alerts * cpu -- alerts when CPU usage is above threshold * ram -- alerts when RAM usage is above threshold diff --git a/src/lego_monitoring/__init__.py b/src/lego_monitoring/__init__.py index 488bf1a..16bde25 100644 --- a/src/lego_monitoring/__init__.py +++ b/src/lego_monitoring/__init__.py @@ -52,8 +52,10 @@ async def async_main(): check_sets = config_enums.CheckSet checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = { - check_sets.START: [sender.send_start_alert()], - check_sets.STOP: [], # this is checked later + check_sets.SELF: [ + sender.send_alert(checks.generate_start_alert()), + IntervalChecker(checks.self_check, interval=datetime.timedelta(minutes=5), persistent=False), + ], check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)], check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)], check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)], @@ -120,8 +122,10 @@ async def async_main(): checker_tasks.add(task) while True: if stopping: - if "stop" in config.enabled_check_sets: - await sender.send_stop_alert() + if "self" in config.enabled_check_sets: + alert = checks.generate_stop_alert() + await sender.send_alert(alert) + await sender.send_healthchecks_status(alert) await tg_client.disconnect() raise SystemExit else: diff --git a/src/lego_monitoring/alerting/enum.py b/src/lego_monitoring/alerting/enum.py index b79abff..5e14034 100644 --- a/src/lego_monitoring/alerting/enum.py +++ b/src/lego_monitoring/alerting/enum.py @@ -2,7 +2,7 @@ from enum import IntEnum, StrEnum class AlertType(StrEnum): - BOOT = "BOOT" + SELF = "SELF" ERROR = "ERROR" TEST = "TEST" REMIND = "REMIND" diff --git a/src/lego_monitoring/alerting/sender.py b/src/lego_monitoring/alerting/sender.py index edd35c0..8601172 100644 --- a/src/lego_monitoring/alerting/sender.py +++ b/src/lego_monitoring/alerting/sender.py @@ -1,7 +1,11 @@ +import logging +from socket import gethostname + from telethon import TelegramClient from telethon.sessions import MemorySession from uplink import AiohttpClient +from ..checks.utils import format_for_healthchecks_slug from ..core import cvars from .alert import Alert from .clients.healthchecks import HealthchecksClient @@ -37,6 +41,7 @@ def format_message(alert: Alert, note: str) -> str: async def send_alert(alert: Alert, note: str = "") -> None: + logging.debug(f"Sending {alert.alert_type} alert to Telegram") try: tg_client = cvars.tg_client.get() except LookupError: # being called standalone @@ -62,6 +67,8 @@ async def send_healthchecks_status(alert: Alert) -> None: else: return keys["default"] + logging.debug(f"Sending {alert.alert_type} to Healthchecks") + if alert.healthchecks_slug is None: return try: @@ -76,26 +83,3 @@ async def send_healthchecks_status(alert: Alert) -> None: await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message) else: await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message) - - -# TODO service itself has to be monitored like everything else - with regular pinging - if we're -# using healthchecks -async def send_start_alert() -> None: - config = cvars.config.get() - await send_alert( - Alert( - alert_type=AlertType.BOOT, - message=f"Service running with enabled checks: {', '.join(config.enabled_check_sets)}", - severity=Severity.INFO, - ) - ) - - -async def send_stop_alert() -> None: - await send_alert( - Alert( - alert_type=AlertType.BOOT, - message="Service stopping.", - severity=Severity.INFO, - ) - ) diff --git a/src/lego_monitoring/checks/__init__.py b/src/lego_monitoring/checks/__init__.py index 62a11da..ae61375 100644 --- a/src/lego_monitoring/checks/__init__.py +++ b/src/lego_monitoring/checks/__init__.py @@ -2,5 +2,6 @@ from .cpu import cpu_check from .net import NetIOTracker from .ram import ram_check from .remind import remind_check +from .self import generate_start_alert, generate_stop_alert, self_check from .temp import temp_check from .vulnix import vulnix_check diff --git a/src/lego_monitoring/checks/self.py b/src/lego_monitoring/checks/self.py new file mode 100644 index 0000000..a907055 --- /dev/null +++ b/src/lego_monitoring/checks/self.py @@ -0,0 +1,30 @@ +from socket import gethostname + +from lego_monitoring.alerting.alert import Alert +from lego_monitoring.alerting.enum import AlertType, Severity +from lego_monitoring.core import cvars + +from .utils import format_for_healthchecks_slug + + +def self_check() -> list[Alert]: + return [generate_start_alert()] + + +def generate_start_alert() -> Alert: + config = cvars.config.get() + return Alert( + alert_type=AlertType.SELF, + message=f"Host is up, lego-monitoring is running. Enabled checks: {', '.join(config.enabled_check_sets)}", + severity=Severity.OK, + healthchecks_slug=f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring", + ) + + +def generate_stop_alert() -> Alert: + return Alert( + alert_type=AlertType.SELF, + message=f"Lego-monitoring service stopping.", + severity=Severity.INFO, + healthchecks_slug=f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring", + ) diff --git a/src/lego_monitoring/config/enums.py b/src/lego_monitoring/config/enums.py index a020a8b..1e73da3 100644 --- a/src/lego_monitoring/config/enums.py +++ b/src/lego_monitoring/config/enums.py @@ -2,8 +2,7 @@ from enum import StrEnum class CheckSet(StrEnum): - START = "start" - STOP = "stop" + SELF = "self" REMIND = "remind" CPU = "cpu" diff --git a/src/lego_monitoring/core/checkers.py b/src/lego_monitoring/core/checkers.py index 4cd14d0..78168bf 100644 --- a/src/lego_monitoring/core/checkers.py +++ b/src/lego_monitoring/core/checkers.py @@ -63,6 +63,10 @@ class BaseChecker: return result async def _handle_alerts(self, alerts: list[Alert]) -> None: + if not self.is_reminder: + for alert in alerts: + await send_healthchecks_status(alert) + if not self.persistent: for alert in alerts: if alert.severity != Severity.OK: @@ -75,10 +79,6 @@ class BaseChecker: for alert in alerts: await send_alert(alert, note="ongoing") - if not self.is_reminder: - for alert in alerts: - await send_healthchecks_status(alert) - async def run_checker(self) -> None: raise NotImplementedError