send start and stop healthchecks signals correctly

2026-03-10 04:41:10 +00:00 · 2025-08-15 20:11:32 +03:00 · 2025-08-15 20:11:32 +03:00 · 878a4fc092
commit 878a4fc092
parent 13fd4b05d9
8 changed files with 54 additions and 38 deletions
--- a/modules/options.nix
+++ b/modules/options.nix
@ -26,8 +26,7 @@ in
    enabledCheckSets = lib.mkOption {
      type = lib.types.listOf (lib.types.enum [
-        "start"
+        "self"
        "stop"
        "remind"
        "cpu"
@ -40,8 +39,7 @@ in
      default = [ ];
      description = ''
        List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets:
-        * start -- send an alert when lego-monitoring is started
+        * self -- send an alert when lego-monitoring is started and stopped
        * stop -- send an alert when lego-monitoring is stopped
        * remind -- periodically (daily by default) remind about ongoing unresolved alerts
        * cpu -- alerts when CPU usage is above threshold
        * ram -- alerts when RAM usage is above threshold
--- a/src/lego_monitoring/init.py
+++ b/src/lego_monitoring/init.py
@ -52,8 +52,10 @@ async def async_main():
    check_sets = config_enums.CheckSet
    checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = {
-        check_sets.START: [sender.send_start_alert()],
+        check_sets.SELF: [
-        check_sets.STOP: [],  # this is checked later
+            sender.send_alert(checks.generate_start_alert()),
            IntervalChecker(checks.self_check, interval=datetime.timedelta(minutes=5), persistent=False),
        ],
        check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)],
        check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)],
        check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)],
@ -120,8 +122,10 @@ async def async_main():
            checker_tasks.add(task)
        while True:
            if stopping:
-                if "stop" in config.enabled_check_sets:
+                if "self" in config.enabled_check_sets:
-                    await sender.send_stop_alert()
+                    alert = checks.generate_stop_alert()
                    await sender.send_alert(alert)
                    await sender.send_healthchecks_status(alert)
                await tg_client.disconnect()
                raise SystemExit
            else:
--- a/src/lego_monitoring/alerting/enum.py
+++ b/src/lego_monitoring/alerting/enum.py
@ -2,7 +2,7 @@ from enum import IntEnum, StrEnum
 class AlertType(StrEnum):
-    BOOT = "BOOT"
+    SELF = "SELF"
    ERROR = "ERROR"
    TEST = "TEST"
    REMIND = "REMIND"
--- a/src/lego_monitoring/alerting/sender.py
+++ b/src/lego_monitoring/alerting/sender.py
@ -1,7 +1,11 @@
 import logging
 from socket import gethostname
 from telethon import TelegramClient
 from telethon.sessions import MemorySession
 from uplink import AiohttpClient
 from ..checks.utils import format_for_healthchecks_slug
 from ..core import cvars
 from .alert import Alert
 from .clients.healthchecks import HealthchecksClient
@ -37,6 +41,7 @@ def format_message(alert: Alert, note: str) -> str:
 async def send_alert(alert: Alert, note: str = "") -> None:
    logging.debug(f"Sending {alert.alert_type} alert to Telegram")
    try:
        tg_client = cvars.tg_client.get()
    except LookupError:  # being called standalone
@ -62,6 +67,8 @@ async def send_healthchecks_status(alert: Alert) -> None:
        else:
            return keys["default"]
    logging.debug(f"Sending {alert.alert_type} to Healthchecks")
    if alert.healthchecks_slug is None:
        return
    try:
@ -76,26 +83,3 @@ async def send_healthchecks_status(alert: Alert) -> None:
        await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
    else:
        await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
 # TODO service itself has to be monitored like everything else - with regular pinging - if we're
 # using healthchecks
 async def send_start_alert() -> None:
    """Send an INFO-severity ``BOOT`` alert announcing that the service is running.

    The message lists the check sets enabled in the current config. The alert
    is dispatched through ``send_alert``.
    """
    config = cvars.config.get()
    start_alert = Alert(
        alert_type=AlertType.BOOT,
        message=f"Service running with enabled checks: {', '.join(config.enabled_check_sets)}",
        severity=Severity.INFO,
    )
    await send_alert(start_alert)
 async def send_stop_alert() -> None:
    """Send an INFO-severity ``BOOT`` alert announcing that the service is stopping.

    The alert is dispatched through ``send_alert``.
    """
    stop_alert = Alert(
        alert_type=AlertType.BOOT,
        message="Service stopping.",
        severity=Severity.INFO,
    )
    await send_alert(stop_alert)
--- a/src/lego_monitoring/checks/init.py
+++ b/src/lego_monitoring/checks/init.py
@ -2,5 +2,6 @@ from .cpu import cpu_check
 from .net import NetIOTracker
 from .ram import ram_check
 from .remind import remind_check
 from .self import generate_start_alert, generate_stop_alert, self_check
 from .temp import temp_check
 from .vulnix import vulnix_check
--- a/src/lego_monitoring/checks/self.py
+++ b/src/lego_monitoring/checks/self.py
@ -0,0 +1,30 @@
 from socket import gethostname
 from lego_monitoring.alerting.alert import Alert
 from lego_monitoring.alerting.enum import AlertType, Severity
 from lego_monitoring.core import cvars
 from .utils import format_for_healthchecks_slug
 def self_check() -> list[Alert]:
    """Return the alerts produced by the service's self check.

    The result is always a single-element list holding the alert built by
    ``generate_start_alert``.
    """
    running_alert = generate_start_alert()
    return [running_alert]
 def generate_start_alert() -> Alert:
    """Build the OK-severity ``SELF`` alert stating that lego-monitoring is running.

    The message lists the check sets enabled in the current config. The
    Healthchecks slug is the local hostname, passed through
    ``format_for_healthchecks_slug``, with ``-lego_monitoring`` appended.
    """
    config = cvars.config.get()
    message = f"Host is up, lego-monitoring is running. Enabled checks: {', '.join(config.enabled_check_sets)}"
    slug = f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring"
    return Alert(
        alert_type=AlertType.SELF,
        message=message,
        severity=Severity.OK,
        healthchecks_slug=slug,
    )
 def generate_stop_alert() -> Alert:
    """Build the INFO-severity ``SELF`` alert stating that the service is stopping.

    The Healthchecks slug is the local hostname, passed through
    ``format_for_healthchecks_slug``, with ``-lego_monitoring`` appended --
    the same slug expression ``generate_start_alert`` uses.
    """
    return Alert(
        alert_type=AlertType.SELF,
        # Plain literal: the message has no placeholders, so no f-prefix is needed.
        message="Lego-monitoring service stopping.",
        severity=Severity.INFO,
        healthchecks_slug=f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring",
    )
--- a/src/lego_monitoring/config/enums.py
+++ b/src/lego_monitoring/config/enums.py
@ -2,8 +2,7 @@ from enum import StrEnum
 class CheckSet(StrEnum):
-    START = "start"
+    SELF = "self"
    STOP = "stop"
    REMIND = "remind"
    CPU = "cpu"
--- a/src/lego_monitoring/core/checkers.py
+++ b/src/lego_monitoring/core/checkers.py
@ -63,6 +63,10 @@ class BaseChecker:
        return result
    async def _handle_alerts(self, alerts: list[Alert]) -> None:
        if not self.is_reminder:
            for alert in alerts:
                await send_healthchecks_status(alert)
        if not self.persistent:
            for alert in alerts:
                if alert.severity != Severity.OK:
@ -75,10 +79,6 @@ class BaseChecker:
            for alert in alerts:
                await send_alert(alert, note="ongoing")
        if not self.is_reminder:
            for alert in alerts:
                await send_healthchecks_status(alert)
    async def run_checker(self) -> None:
        raise NotImplementedError