send start and stop healthchecks signals correctly

This commit is contained in:
Alex Tau 2025-08-15 20:11:32 +03:00
parent 13fd4b05d9
commit 878a4fc092
8 changed files with 54 additions and 38 deletions

View file

@ -26,8 +26,7 @@ in
enabledCheckSets = lib.mkOption {
type = lib.types.listOf (lib.types.enum [
"start"
"stop"
"self"
"remind"
"cpu"
@ -40,8 +39,7 @@ in
default = [ ];
description = ''
List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets:
* start -- send an alert when lego-monitoring is started
* stop -- send an alert when lego-monitoring is stopped
* self -- send an alert when lego-monitoring is started and stopped
* remind -- periodically (daily by default) remind about ongoing unresolved alerts
* cpu -- alerts when CPU usage is above threshold
* ram -- alerts when RAM usage is above threshold

View file

@ -52,8 +52,10 @@ async def async_main():
check_sets = config_enums.CheckSet
checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = {
check_sets.START: [sender.send_start_alert()],
check_sets.STOP: [], # this is checked later
check_sets.SELF: [
sender.send_alert(checks.generate_start_alert()),
IntervalChecker(checks.self_check, interval=datetime.timedelta(minutes=5), persistent=False),
],
check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)],
check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)],
check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)],
@ -120,8 +122,10 @@ async def async_main():
checker_tasks.add(task)
while True:
if stopping:
if "stop" in config.enabled_check_sets:
await sender.send_stop_alert()
if "self" in config.enabled_check_sets:
alert = checks.generate_stop_alert()
await sender.send_alert(alert)
await sender.send_healthchecks_status(alert)
await tg_client.disconnect()
raise SystemExit
else:

View file

@ -2,7 +2,7 @@ from enum import IntEnum, StrEnum
class AlertType(StrEnum):
BOOT = "BOOT"
SELF = "SELF"
ERROR = "ERROR"
TEST = "TEST"
REMIND = "REMIND"

View file

@ -1,7 +1,11 @@
import logging
from socket import gethostname
from telethon import TelegramClient
from telethon.sessions import MemorySession
from uplink import AiohttpClient
from ..checks.utils import format_for_healthchecks_slug
from ..core import cvars
from .alert import Alert
from .clients.healthchecks import HealthchecksClient
@ -37,6 +41,7 @@ def format_message(alert: Alert, note: str) -> str:
async def send_alert(alert: Alert, note: str = "") -> None:
logging.debug(f"Sending {alert.alert_type} alert to Telegram")
try:
tg_client = cvars.tg_client.get()
except LookupError: # being called standalone
@ -62,6 +67,8 @@ async def send_healthchecks_status(alert: Alert) -> None:
else:
return keys["default"]
logging.debug(f"Sending {alert.alert_type} to Healthchecks")
if alert.healthchecks_slug is None:
return
try:
@ -76,26 +83,3 @@ async def send_healthchecks_status(alert: Alert) -> None:
await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
else:
await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
# TODO: the service itself has to be monitored like everything else (with regular pinging)
# if we're using healthchecks
async def send_start_alert() -> None:
    """Send a BOOT alert announcing that the service is running.

    The message lists the check sets enabled in the current config
    (read from the ``cvars.config`` context variable).
    """
    enabled_sets = cvars.config.get().enabled_check_sets
    start_alert = Alert(
        alert_type=AlertType.BOOT,
        message=f"Service running with enabled checks: {', '.join(enabled_sets)}",
        severity=Severity.INFO,
    )
    await send_alert(start_alert)
async def send_stop_alert() -> None:
    """Send a BOOT alert announcing that the service is stopping."""
    stop_alert = Alert(
        alert_type=AlertType.BOOT,
        message="Service stopping.",
        severity=Severity.INFO,
    )
    await send_alert(stop_alert)

View file

@ -2,5 +2,6 @@ from .cpu import cpu_check
from .net import NetIOTracker
from .ram import ram_check
from .remind import remind_check
from .self import generate_start_alert, generate_stop_alert, self_check
from .temp import temp_check
from .vulnix import vulnix_check

View file

@ -0,0 +1,30 @@
from socket import gethostname
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
from .utils import format_for_healthchecks_slug
def self_check() -> list[Alert]:
    """Return the self-check result: a one-element list holding the "service is running" alert."""
    running_alert = generate_start_alert()
    return [running_alert]
def generate_start_alert() -> Alert:
    """Build the SELF alert reporting that lego-monitoring is up.

    The enabled check sets are read from the ``cvars.config`` context variable
    and listed in the message. The healthchecks slug is derived from the local
    hostname with a ``-lego_monitoring`` suffix.
    """
    enabled = ", ".join(cvars.config.get().enabled_check_sets)
    slug = f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring"
    return Alert(
        alert_type=AlertType.SELF,
        message=f"Host is up, lego-monitoring is running. Enabled checks: {enabled}",
        severity=Severity.OK,
        healthchecks_slug=slug,
    )
def generate_stop_alert() -> Alert:
    """Build the SELF alert announcing that lego-monitoring is stopping.

    The healthchecks slug is derived from the local hostname with a
    ``-lego_monitoring`` suffix, the same expression generate_start_alert uses.
    """
    return Alert(
        alert_type=AlertType.SELF,
        # Plain string literal: the message has no placeholders, so no f-prefix is needed.
        message="Lego-monitoring service stopping.",
        severity=Severity.INFO,
        healthchecks_slug=f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring",
    )

View file

@ -2,8 +2,7 @@ from enum import StrEnum
class CheckSet(StrEnum):
START = "start"
STOP = "stop"
SELF = "self"
REMIND = "remind"
CPU = "cpu"

View file

@ -63,6 +63,10 @@ class BaseChecker:
return result
async def _handle_alerts(self, alerts: list[Alert]) -> None:
if not self.is_reminder:
for alert in alerts:
await send_healthchecks_status(alert)
if not self.persistent:
for alert in alerts:
if alert.severity != Severity.OK:
@ -75,10 +79,6 @@ class BaseChecker:
for alert in alerts:
await send_alert(alert, note="ongoing")
if not self.is_reminder:
for alert in alerts:
await send_healthchecks_status(alert)
async def run_checker(self) -> None:
raise NotImplementedError