mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-09 20:31:10 +00:00
send start and stop healthchecks signals correctly
This commit is contained in:
parent
13fd4b05d9
commit
878a4fc092
8 changed files with 54 additions and 38 deletions
|
|
@ -26,8 +26,7 @@ in
|
|||
|
||||
enabledCheckSets = lib.mkOption {
|
||||
type = lib.types.listOf (lib.types.enum [
|
||||
"start"
|
||||
"stop"
|
||||
"self"
|
||||
"remind"
|
||||
|
||||
"cpu"
|
||||
|
|
@ -40,8 +39,7 @@ in
|
|||
default = [ ];
|
||||
description = ''
|
||||
List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets:
|
||||
* start -- send an alert when lego-monitoring is started
|
||||
* stop -- send an alert when lego-monitoring is stopped
|
||||
* self -- send an alert when lego-monitoring is started and stopped
|
||||
* remind -- periodically (daily by default) remind about ongoing unresolved alerts
|
||||
* cpu -- alerts when CPU usage is above threshold
|
||||
* ram -- alerts when RAM usage is above threshold
|
||||
|
|
|
|||
|
|
@ -52,8 +52,10 @@ async def async_main():
|
|||
check_sets = config_enums.CheckSet
|
||||
|
||||
checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = {
|
||||
check_sets.START: [sender.send_start_alert()],
|
||||
check_sets.STOP: [], # this is checked later
|
||||
check_sets.SELF: [
|
||||
sender.send_alert(checks.generate_start_alert()),
|
||||
IntervalChecker(checks.self_check, interval=datetime.timedelta(minutes=5), persistent=False),
|
||||
],
|
||||
check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)],
|
||||
check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)],
|
||||
check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)],
|
||||
|
|
@ -120,8 +122,10 @@ async def async_main():
|
|||
checker_tasks.add(task)
|
||||
while True:
|
||||
if stopping:
|
||||
if "stop" in config.enabled_check_sets:
|
||||
await sender.send_stop_alert()
|
||||
if "self" in config.enabled_check_sets:
|
||||
alert = checks.generate_stop_alert()
|
||||
await sender.send_alert(alert)
|
||||
await sender.send_healthchecks_status(alert)
|
||||
await tg_client.disconnect()
|
||||
raise SystemExit
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ from enum import IntEnum, StrEnum
|
|||
|
||||
|
||||
class AlertType(StrEnum):
|
||||
BOOT = "BOOT"
|
||||
SELF = "SELF"
|
||||
ERROR = "ERROR"
|
||||
TEST = "TEST"
|
||||
REMIND = "REMIND"
|
||||
|
|
|
|||
|
|
@ -1,7 +1,11 @@
|
|||
import logging
|
||||
from socket import gethostname
|
||||
|
||||
from telethon import TelegramClient
|
||||
from telethon.sessions import MemorySession
|
||||
from uplink import AiohttpClient
|
||||
|
||||
from ..checks.utils import format_for_healthchecks_slug
|
||||
from ..core import cvars
|
||||
from .alert import Alert
|
||||
from .clients.healthchecks import HealthchecksClient
|
||||
|
|
@ -37,6 +41,7 @@ def format_message(alert: Alert, note: str) -> str:
|
|||
|
||||
|
||||
async def send_alert(alert: Alert, note: str = "") -> None:
|
||||
logging.debug(f"Sending {alert.alert_type} alert to Telegram")
|
||||
try:
|
||||
tg_client = cvars.tg_client.get()
|
||||
except LookupError: # being called standalone
|
||||
|
|
@ -62,6 +67,8 @@ async def send_healthchecks_status(alert: Alert) -> None:
|
|||
else:
|
||||
return keys["default"]
|
||||
|
||||
logging.debug(f"Sending {alert.alert_type} to Healthchecks")
|
||||
|
||||
if alert.healthchecks_slug is None:
|
||||
return
|
||||
try:
|
||||
|
|
@ -76,26 +83,3 @@ async def send_healthchecks_status(alert: Alert) -> None:
|
|||
await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
|
||||
else:
|
||||
await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
|
||||
|
||||
|
||||
# TODO service itself has to be monitored like everything else - with regular pinging - if we're
|
||||
# using healthchecks
|
||||
async def send_start_alert() -> None:
    """Send an alert reporting that the service is running.

    The message lists the check sets enabled in the current configuration
    (read from ``cvars.config``) and is delivered through ``send_alert``.
    """
    enabled_checks = ", ".join(cvars.config.get().enabled_check_sets)
    startup_alert = Alert(
        alert_type=AlertType.BOOT,
        message=f"Service running with enabled checks: {enabled_checks}",
        severity=Severity.INFO,
    )
    await send_alert(startup_alert)
|
||||
|
||||
|
||||
async def send_stop_alert() -> None:
    """Send an informational alert reporting that the service is stopping.

    The alert is delivered through ``send_alert``.
    """
    shutdown_alert = Alert(
        alert_type=AlertType.BOOT,
        message="Service stopping.",
        severity=Severity.INFO,
    )
    await send_alert(shutdown_alert)
|
||||
|
|
|
|||
|
|
@ -2,5 +2,6 @@ from .cpu import cpu_check
|
|||
from .net import NetIOTracker
|
||||
from .ram import ram_check
|
||||
from .remind import remind_check
|
||||
from .self import generate_start_alert, generate_stop_alert, self_check
|
||||
from .temp import temp_check
|
||||
from .vulnix import vulnix_check
|
||||
|
|
|
|||
30
src/lego_monitoring/checks/self.py
Normal file
30
src/lego_monitoring/checks/self.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
from socket import gethostname
|
||||
|
||||
from lego_monitoring.alerting.alert import Alert
|
||||
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||
from lego_monitoring.core import cvars
|
||||
|
||||
from .utils import format_for_healthchecks_slug
|
||||
|
||||
|
||||
def self_check() -> list[Alert]:
    """Produce the alert list for the service's self check.

    Returns:
        A single-element list holding the alert built by
        ``generate_start_alert``.
    """
    running_alert = generate_start_alert()
    return [running_alert]
|
||||
|
||||
|
||||
def generate_start_alert() -> Alert:
    """Build the alert announcing that lego-monitoring is running.

    The enabled check sets are read from the configuration stored in
    ``cvars.config`` and listed in the message.

    Returns:
        An ``AlertType.SELF`` alert with ``Severity.OK`` whose Healthchecks
        slug is built from the local hostname plus ``-lego_monitoring``.
    """
    enabled_checks = ", ".join(cvars.config.get().enabled_check_sets)
    slug = f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring"
    return Alert(
        alert_type=AlertType.SELF,
        message=f"Host is up, lego-monitoring is running. Enabled checks: {enabled_checks}",
        severity=Severity.OK,
        healthchecks_slug=slug,
    )
|
||||
|
||||
|
||||
def generate_stop_alert() -> Alert:
    """Build the alert announcing that lego-monitoring is stopping.

    Returns:
        An ``AlertType.SELF`` alert with ``Severity.INFO`` whose Healthchecks
        slug is built from the local hostname plus ``-lego_monitoring`` (the
        same slug format used by ``generate_start_alert``).
    """
    return Alert(
        alert_type=AlertType.SELF,
        # Plain literal: the message has no placeholders, so no f-prefix.
        message="Lego-monitoring service stopping.",
        severity=Severity.INFO,
        healthchecks_slug=f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring",
    )
|
||||
|
|
@ -2,8 +2,7 @@ from enum import StrEnum
|
|||
|
||||
|
||||
class CheckSet(StrEnum):
|
||||
START = "start"
|
||||
STOP = "stop"
|
||||
SELF = "self"
|
||||
REMIND = "remind"
|
||||
|
||||
CPU = "cpu"
|
||||
|
|
|
|||
|
|
@ -63,6 +63,10 @@ class BaseChecker:
|
|||
return result
|
||||
|
||||
async def _handle_alerts(self, alerts: list[Alert]) -> None:
|
||||
if not self.is_reminder:
|
||||
for alert in alerts:
|
||||
await send_healthchecks_status(alert)
|
||||
|
||||
if not self.persistent:
|
||||
for alert in alerts:
|
||||
if alert.severity != Severity.OK:
|
||||
|
|
@ -75,10 +79,6 @@ class BaseChecker:
|
|||
for alert in alerts:
|
||||
await send_alert(alert, note="ongoing")
|
||||
|
||||
if not self.is_reminder:
|
||||
for alert in alerts:
|
||||
await send_healthchecks_status(alert)
|
||||
|
||||
async def run_checker(self) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue