send start and stop healthchecks signals correctly

This commit is contained in:
Alex Tau 2025-08-15 20:11:32 +03:00
parent 13fd4b05d9
commit 878a4fc092
8 changed files with 54 additions and 38 deletions

View file

@ -26,8 +26,7 @@ in
enabledCheckSets = lib.mkOption { enabledCheckSets = lib.mkOption {
type = lib.types.listOf (lib.types.enum [ type = lib.types.listOf (lib.types.enum [
"start" "self"
"stop"
"remind" "remind"
"cpu" "cpu"
@ -40,8 +39,7 @@ in
default = [ ]; default = [ ];
description = '' description = ''
List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets: List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets:
* start -- send an alert when lego-monitoring is started * self -- send an alert when lego-monitoring is started and stopped
* stop -- send an alert when lego-monitoring is stopped
* remind -- periodically (daily by default) remind about ongoing unresolved alerts * remind -- periodically (daily by default) remind about ongoing unresolved alerts
* cpu -- alerts when CPU usage is above threshold * cpu -- alerts when CPU usage is above threshold
* ram -- alerts when RAM usage is above threshold * ram -- alerts when RAM usage is above threshold

View file

@ -52,8 +52,10 @@ async def async_main():
check_sets = config_enums.CheckSet check_sets = config_enums.CheckSet
checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = { checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = {
check_sets.START: [sender.send_start_alert()], check_sets.SELF: [
check_sets.STOP: [], # this is checked later sender.send_alert(checks.generate_start_alert()),
IntervalChecker(checks.self_check, interval=datetime.timedelta(minutes=5), persistent=False),
],
check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)], check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)],
check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)], check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)],
check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)], check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)],
@ -120,8 +122,10 @@ async def async_main():
checker_tasks.add(task) checker_tasks.add(task)
while True: while True:
if stopping: if stopping:
if "stop" in config.enabled_check_sets: if "self" in config.enabled_check_sets:
await sender.send_stop_alert() alert = checks.generate_stop_alert()
await sender.send_alert(alert)
await sender.send_healthchecks_status(alert)
await tg_client.disconnect() await tg_client.disconnect()
raise SystemExit raise SystemExit
else: else:

View file

@ -2,7 +2,7 @@ from enum import IntEnum, StrEnum
class AlertType(StrEnum): class AlertType(StrEnum):
BOOT = "BOOT" SELF = "SELF"
ERROR = "ERROR" ERROR = "ERROR"
TEST = "TEST" TEST = "TEST"
REMIND = "REMIND" REMIND = "REMIND"

View file

@ -1,7 +1,11 @@
import logging
from socket import gethostname
from telethon import TelegramClient from telethon import TelegramClient
from telethon.sessions import MemorySession from telethon.sessions import MemorySession
from uplink import AiohttpClient from uplink import AiohttpClient
from ..checks.utils import format_for_healthchecks_slug
from ..core import cvars from ..core import cvars
from .alert import Alert from .alert import Alert
from .clients.healthchecks import HealthchecksClient from .clients.healthchecks import HealthchecksClient
@ -37,6 +41,7 @@ def format_message(alert: Alert, note: str) -> str:
async def send_alert(alert: Alert, note: str = "") -> None: async def send_alert(alert: Alert, note: str = "") -> None:
logging.debug(f"Sending {alert.alert_type} alert to Telegram")
try: try:
tg_client = cvars.tg_client.get() tg_client = cvars.tg_client.get()
except LookupError: # being called standalone except LookupError: # being called standalone
@ -62,6 +67,8 @@ async def send_healthchecks_status(alert: Alert) -> None:
else: else:
return keys["default"] return keys["default"]
logging.debug(f"Sending {alert.alert_type} to Healthchecks")
if alert.healthchecks_slug is None: if alert.healthchecks_slug is None:
return return
try: try:
@ -76,26 +83,3 @@ async def send_healthchecks_status(alert: Alert) -> None:
await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message) await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
else: else:
await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message) await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
# TODO: the service itself has to be monitored like everything else (with regular
# pings) when Healthchecks is in use.
async def send_start_alert() -> None:
    """Send an informational BOOT alert announcing that the service is running.

    The enabled check sets are read from the configuration stored in
    ``cvars.config`` and listed, comma-separated, in the alert message.
    The alert is delivered through ``send_alert``.
    """
    enabled_sets = cvars.config.get().enabled_check_sets
    start_alert = Alert(
        alert_type=AlertType.BOOT,
        message=f"Service running with enabled checks: {', '.join(enabled_sets)}",
        severity=Severity.INFO,
    )
    await send_alert(start_alert)
async def send_stop_alert() -> None:
    """Send an informational BOOT alert announcing that the service is stopping.

    The alert is delivered through ``send_alert``.
    """
    stop_alert = Alert(
        alert_type=AlertType.BOOT,
        message="Service stopping.",
        severity=Severity.INFO,
    )
    await send_alert(stop_alert)

View file

@ -2,5 +2,6 @@ from .cpu import cpu_check
from .net import NetIOTracker from .net import NetIOTracker
from .ram import ram_check from .ram import ram_check
from .remind import remind_check from .remind import remind_check
from .self import generate_start_alert, generate_stop_alert, self_check
from .temp import temp_check from .temp import temp_check
from .vulnix import vulnix_check from .vulnix import vulnix_check

View file

@ -0,0 +1,30 @@
from socket import gethostname
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
from .utils import format_for_healthchecks_slug
def self_check() -> list[Alert]:
    """Return a one-element list holding the alert from ``generate_start_alert``."""
    alive_alert = generate_start_alert()
    return [alive_alert]
def generate_start_alert() -> Alert:
    """Build the OK-severity SELF alert stating that lego-monitoring is running.

    The message lists the enabled check sets taken from the configuration in
    ``cvars.config``. The Healthchecks slug is the local hostname, passed
    through ``format_for_healthchecks_slug``, with ``-lego_monitoring`` appended.
    """
    enabled_sets = ", ".join(cvars.config.get().enabled_check_sets)
    slug = f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring"
    return Alert(
        alert_type=AlertType.SELF,
        message=f"Host is up, lego-monitoring is running. Enabled checks: {enabled_sets}",
        severity=Severity.OK,
        healthchecks_slug=slug,
    )
def generate_stop_alert() -> Alert:
    """Build the INFO-severity SELF alert stating that the service is stopping.

    The Healthchecks slug is the local hostname, passed through
    ``format_for_healthchecks_slug``, with ``-lego_monitoring`` appended.
    """
    slug = f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring"
    return Alert(
        alert_type=AlertType.SELF,
        # Plain string literal: the message has no placeholders, so the
        # original f-prefix was superfluous (pyflakes/ruff F541).
        message="Lego-monitoring service stopping.",
        severity=Severity.INFO,
        healthchecks_slug=slug,
    )

View file

@ -2,8 +2,7 @@ from enum import StrEnum
class CheckSet(StrEnum): class CheckSet(StrEnum):
START = "start" SELF = "self"
STOP = "stop"
REMIND = "remind" REMIND = "remind"
CPU = "cpu" CPU = "cpu"

View file

@ -63,6 +63,10 @@ class BaseChecker:
return result return result
async def _handle_alerts(self, alerts: list[Alert]) -> None: async def _handle_alerts(self, alerts: list[Alert]) -> None:
if not self.is_reminder:
for alert in alerts:
await send_healthchecks_status(alert)
if not self.persistent: if not self.persistent:
for alert in alerts: for alert in alerts:
if alert.severity != Severity.OK: if alert.severity != Severity.OK:
@ -75,10 +79,6 @@ class BaseChecker:
for alert in alerts: for alert in alerts:
await send_alert(alert, note="ongoing") await send_alert(alert, note="ongoing")
if not self.is_reminder:
for alert in alerts:
await send_healthchecks_status(alert)
async def run_checker(self) -> None: async def run_checker(self) -> None:
raise NotImplementedError raise NotImplementedError