try to use OK alerts to reflect successful checks

This commit is contained in:
Alex Tau 2025-08-15 18:02:43 +03:00
parent d59d5ac4e2
commit 5f9952314d
8 changed files with 22 additions and 17 deletions

View file

@ -80,7 +80,10 @@ in
Specify `default` as the slug to use this key for check types that don't have a key explicitly assigned to them.
If you are unsure of the exact slug a check will generate, it is recommended to try it out with the default key first, before
assigning a specific one.'';
assigning a specific one.
**Note**: checks will be auto-provisioned, but correct intervals and grace periods have to be configured manually from the web console,
otherwise silent failures will not be recorded until after 1 day (the default healthchecks interval).'';
};
};
};

View file

@ -96,17 +96,17 @@ async def async_main():
command_manager = CommandHandlerManager(checkers)
await command_manager.attach_handlers(tg_client)
cvars.tg_client.set(tg_client)
else:
logging.info("Telegram integration is disabled")
tg_client = None
cvars.tg_client.set(tg_client)
if config.alert_channels.healthchecks is not None:
healthchecks_client = sender.get_healthchecks_client()
logging.info("Ready to send pings to healthchecks")
cvars.healthchecks_client.set(healthchecks_client)
else:
healthchecks_client = None
logging.info("Healthchecks integration is disabled")
signal.signal(signal.SIGTERM, stop_gracefully)

View file

@ -56,7 +56,7 @@ class CommandHandlerManager:
if not isinstance(c, BaseChecker) or not c.persistent:
continue
for a in c.current_alerts:
if a.alert_type not in alert_num_by_state_with_max_type:
if a.alert_type not in alert_num_by_state_with_max_type and a.severity != Severity.OK:
alert_num_by_state_with_max_type[a.alert_type] = [a.severity, 1]
else:
existing_list = alert_num_by_state_with_max_type[a.alert_type]
@ -80,6 +80,8 @@ class CommandHandlerManager:
if not isinstance(c, BaseChecker) or not c.persistent:
continue
for a in c.current_alerts:
if a.severity == Severity.OK:
continue
message = format_message(a, note="ongoing")
messages.add(message)
if len(messages) == 0:

View file

@ -22,7 +22,7 @@ class AlertType(StrEnum):
class Severity(IntEnum):
OK = 0 # should only be used when persistent alerts resolve
OK = 0
INFO = 1
WARNING = 2
CRITICAL = 3

View file

@ -38,7 +38,7 @@ def format_message(alert: Alert, note: str) -> str:
async def send_alert(alert: Alert, note: str = "") -> None:
try:
client = cvars.tg_client.get()
tg_client = cvars.tg_client.get()
except LookupError: # being called standalone
# cvars.config.set(get_config())
# temp_client = True
@ -47,10 +47,10 @@ async def send_alert(alert: Alert, note: str = "") -> None:
raise NotImplementedError # TODO
else:
... # temp_client = False
if client is not None:
if tg_client is not None:
room_id = cvars.config.get().alert_channels.telegram.room_id
message = format_message(alert, note)
await client.send_message(entity=room_id, message=message)
await tg_client.send_message(entity=room_id, message=message)
# if temp_client:
# await client.close()

View file

@ -26,4 +26,6 @@ def temp_check() -> list[Alert]:
else:
continue
alert_list.append(alert)
if len(alert_list) == 0:
alert_list.append(Alert(alert_type=AlertType.TEMP, message="All sensors nominal", severity=Severity.OK))
return alert_list

View file

@ -24,7 +24,7 @@ class BaseChecker:
"""
False: this persistent checker only emits messages when its max alert severity is changed
True: this persistent checker emits messages every times it checks
True: this persistent checker emits messages every time it checks and any non-OK alerts are present
Has no effect if persistent == False
"""
@ -65,17 +65,15 @@ class BaseChecker:
async def _handle_alerts(self, alerts: list[Alert]) -> None:
if not self.persistent:
for alert in alerts:
await send_alert(alert, "ongoing" if self.is_reminder else "")
if alert.severity != Severity.OK:
await send_alert(alert, "ongoing" if self.is_reminder else "")
return
old_types = self.current_alerts.get_types()
old_severity, new_severity = self.current_alerts.update(alerts)
new_types = self.current_alerts.get_types()
if old_severity != new_severity or self.send_any_state:
if (old_severity != new_severity or self.send_any_state) and not (
old_severity == None and new_severity == Severity.OK
):
for alert in alerts:
await send_alert(alert, note="ongoing")
for alert_type in old_types - new_types:
alert = Alert(alert_type=alert_type, message="Situation resolved", severity=Severity.OK)
await send_alert(alert)
async def run_checker(self) -> None:
raise NotImplementedError

View file

@ -9,6 +9,6 @@ from lego_monitoring.alerting.current import CurrentAlerts
from ..config import Config
config: ContextVar[Config] = ContextVar("config")
tg_client: ContextVar[Optional[TelegramClient]] = ContextVar("tg_client", default=None)
tg_client: ContextVar[Optional[TelegramClient]] = ContextVar("tg_client")
healthchecks_client: ContextVar[Optional[HealthchecksClient]] = ContextVar("healthchecks_client", default=None)
current_alerts: ContextVar[list[CurrentAlerts]] = ContextVar("current_alerts", default=[])