mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
try to use OK alerts to reflect successful checks
This commit is contained in:
parent
d59d5ac4e2
commit
5f9952314d
8 changed files with 22 additions and 17 deletions
|
|
@ -80,7 +80,10 @@ in
|
|||
Specify `default` as the slug to use this key for check types that don't have a key explicitly assigned to them.
|
||||
|
||||
If you are unsure of the exact slug a check will generate, it is recommended to try it out with the default key first, before
|
||||
assigning a specific one.'';
|
||||
assigning a specific one.
|
||||
|
||||
**Note**: checks will be auto-provisioned, but correct intervals and grace periods have to be configured manually from the web console,
|
||||
otherwise silent failures will not be recorded until after 1 day (the default healthchecks interval).'';
|
||||
};
|
||||
};
|
||||
};
|
||||
|
|
|
|||
|
|
@ -96,17 +96,17 @@ async def async_main():
|
|||
|
||||
command_manager = CommandHandlerManager(checkers)
|
||||
await command_manager.attach_handlers(tg_client)
|
||||
cvars.tg_client.set(tg_client)
|
||||
else:
|
||||
logging.info("Telegram integration is disabled")
|
||||
tg_client = None
|
||||
|
||||
cvars.tg_client.set(tg_client)
|
||||
|
||||
if config.alert_channels.healthchecks is not None:
|
||||
healthchecks_client = sender.get_healthchecks_client()
|
||||
logging.info("Ready to send pings to healthchecks")
|
||||
cvars.healthchecks_client.set(healthchecks_client)
|
||||
else:
|
||||
healthchecks_client = None
|
||||
logging.info("Healthchecks integration is disabled")
|
||||
|
||||
signal.signal(signal.SIGTERM, stop_gracefully)
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ class CommandHandlerManager:
|
|||
if not isinstance(c, BaseChecker) or not c.persistent:
|
||||
continue
|
||||
for a in c.current_alerts:
|
||||
if a.alert_type not in alert_num_by_state_with_max_type:
|
||||
if a.alert_type not in alert_num_by_state_with_max_type and a.severity != Severity.OK:
|
||||
alert_num_by_state_with_max_type[a.alert_type] = [a.severity, 1]
|
||||
else:
|
||||
existing_list = alert_num_by_state_with_max_type[a.alert_type]
|
||||
|
|
@ -80,6 +80,8 @@ class CommandHandlerManager:
|
|||
if not isinstance(c, BaseChecker) or not c.persistent:
|
||||
continue
|
||||
for a in c.current_alerts:
|
||||
if a.severity == Severity.OK:
|
||||
continue
|
||||
message = format_message(a, note="ongoing")
|
||||
messages.add(message)
|
||||
if len(messages) == 0:
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ class AlertType(StrEnum):
|
|||
|
||||
|
||||
class Severity(IntEnum):
|
||||
OK = 0 # should only be used when persistent alerts resolve
|
||||
OK = 0
|
||||
INFO = 1
|
||||
WARNING = 2
|
||||
CRITICAL = 3
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ def format_message(alert: Alert, note: str) -> str:
|
|||
|
||||
async def send_alert(alert: Alert, note: str = "") -> None:
|
||||
try:
|
||||
client = cvars.tg_client.get()
|
||||
tg_client = cvars.tg_client.get()
|
||||
except LookupError: # being called standalone
|
||||
# cvars.config.set(get_config())
|
||||
# temp_client = True
|
||||
|
|
@ -47,10 +47,10 @@ async def send_alert(alert: Alert, note: str = "") -> None:
|
|||
raise NotImplementedError # TODO
|
||||
else:
|
||||
... # temp_client = False
|
||||
if client is not None:
|
||||
if tg_client is not None:
|
||||
room_id = cvars.config.get().alert_channels.telegram.room_id
|
||||
message = format_message(alert, note)
|
||||
await client.send_message(entity=room_id, message=message)
|
||||
await tg_client.send_message(entity=room_id, message=message)
|
||||
# if temp_client:
|
||||
# await client.close()
|
||||
|
||||
|
|
|
|||
|
|
@ -26,4 +26,6 @@ def temp_check() -> list[Alert]:
|
|||
else:
|
||||
continue
|
||||
alert_list.append(alert)
|
||||
if len(alert_list) == 0:
|
||||
alert_list.append(Alert(alert_type=AlertType.TEMP, message="All sensors nominal", severity=Severity.OK))
|
||||
return alert_list
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ class BaseChecker:
|
|||
"""
|
||||
False: this persistent checker only emits messages when its max alert severity changes
|
||||
|
||||
True: this persistent checker emits messages every times it checks
|
||||
True: this persistent checker emits messages every time it checks, as long as any non-OK alerts are present
|
||||
|
||||
Has no effect if persistent == False
|
||||
"""
|
||||
|
|
@ -65,17 +65,15 @@ class BaseChecker:
|
|||
async def _handle_alerts(self, alerts: list[Alert]) -> None:
|
||||
if not self.persistent:
|
||||
for alert in alerts:
|
||||
await send_alert(alert, "ongoing" if self.is_reminder else "")
|
||||
if alert.severity != Severity.OK:
|
||||
await send_alert(alert, "ongoing" if self.is_reminder else "")
|
||||
return
|
||||
old_types = self.current_alerts.get_types()
|
||||
old_severity, new_severity = self.current_alerts.update(alerts)
|
||||
new_types = self.current_alerts.get_types()
|
||||
if old_severity != new_severity or self.send_any_state:
|
||||
if (old_severity != new_severity or self.send_any_state) and not (
|
||||
old_severity == None and new_severity == Severity.OK
|
||||
):
|
||||
for alert in alerts:
|
||||
await send_alert(alert, note="ongoing")
|
||||
for alert_type in old_types - new_types:
|
||||
alert = Alert(alert_type=alert_type, message="Situation resolved", severity=Severity.OK)
|
||||
await send_alert(alert)
|
||||
|
||||
async def run_checker(self) -> None:
|
||||
raise NotImplementedError
|
||||
|
|
|
|||
|
|
@ -9,6 +9,6 @@ from lego_monitoring.alerting.current import CurrentAlerts
|
|||
from ..config import Config
|
||||
|
||||
config: ContextVar[Config] = ContextVar("config")
|
||||
tg_client: ContextVar[Optional[TelegramClient]] = ContextVar("tg_client", default=None)
|
||||
tg_client: ContextVar[Optional[TelegramClient]] = ContextVar("tg_client")
|
||||
healthchecks_client: ContextVar[Optional[HealthchecksClient]] = ContextVar("healthchecks_client", default=None)
|
||||
current_alerts: ContextVar[list[CurrentAlerts]] = ContextVar("current_alerts", default=[])
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue