mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
send start and stop healthchecks signals correctly
This commit is contained in:
parent
13fd4b05d9
commit
878a4fc092
8 changed files with 54 additions and 38 deletions
|
|
@ -26,8 +26,7 @@ in
|
||||||
|
|
||||||
enabledCheckSets = lib.mkOption {
|
enabledCheckSets = lib.mkOption {
|
||||||
type = lib.types.listOf (lib.types.enum [
|
type = lib.types.listOf (lib.types.enum [
|
||||||
"start"
|
"self"
|
||||||
"stop"
|
|
||||||
"remind"
|
"remind"
|
||||||
|
|
||||||
"cpu"
|
"cpu"
|
||||||
|
|
@ -40,8 +39,7 @@ in
|
||||||
default = [ ];
|
default = [ ];
|
||||||
description = ''
|
description = ''
|
||||||
List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets:
|
List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results. Available check sets:
|
||||||
* start -- send an alert when lego-monitoring is started
|
* self -- send an alert when lego-monitoring is started and stopped
|
||||||
* stop -- send an alert when lego-monitoring is stopped
|
|
||||||
* remind -- periodically (daily by default) remind about ongoing unresolved alerts
|
* remind -- periodically (daily by default) remind about ongoing unresolved alerts
|
||||||
* cpu -- alerts when CPU usage is above threshold
|
* cpu -- alerts when CPU usage is above threshold
|
||||||
* ram -- alerts when RAM usage is above threshold
|
* ram -- alerts when RAM usage is above threshold
|
||||||
|
|
|
||||||
|
|
@ -52,8 +52,10 @@ async def async_main():
|
||||||
check_sets = config_enums.CheckSet
|
check_sets = config_enums.CheckSet
|
||||||
|
|
||||||
checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = {
|
checker_sets: dict[config_enums.CheckSet, list[Coroutine | BaseChecker]] = {
|
||||||
check_sets.START: [sender.send_start_alert()],
|
check_sets.SELF: [
|
||||||
check_sets.STOP: [], # this is checked later
|
sender.send_alert(checks.generate_start_alert()),
|
||||||
|
IntervalChecker(checks.self_check, interval=datetime.timedelta(minutes=5), persistent=False),
|
||||||
|
],
|
||||||
check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)],
|
check_sets.CPU: [IntervalChecker(checks.cpu_check, interval=datetime.timedelta(minutes=3), persistent=True)],
|
||||||
check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)],
|
check_sets.RAM: [IntervalChecker(checks.ram_check, interval=datetime.timedelta(minutes=1), persistent=True)],
|
||||||
check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)],
|
check_sets.TEMP: [IntervalChecker(checks.temp_check, interval=datetime.timedelta(minutes=5), persistent=True)],
|
||||||
|
|
@ -120,8 +122,10 @@ async def async_main():
|
||||||
checker_tasks.add(task)
|
checker_tasks.add(task)
|
||||||
while True:
|
while True:
|
||||||
if stopping:
|
if stopping:
|
||||||
if "stop" in config.enabled_check_sets:
|
if "self" in config.enabled_check_sets:
|
||||||
await sender.send_stop_alert()
|
alert = checks.generate_stop_alert()
|
||||||
|
await sender.send_alert(alert)
|
||||||
|
await sender.send_healthchecks_status(alert)
|
||||||
await tg_client.disconnect()
|
await tg_client.disconnect()
|
||||||
raise SystemExit
|
raise SystemExit
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ from enum import IntEnum, StrEnum
|
||||||
|
|
||||||
|
|
||||||
class AlertType(StrEnum):
|
class AlertType(StrEnum):
|
||||||
BOOT = "BOOT"
|
SELF = "SELF"
|
||||||
ERROR = "ERROR"
|
ERROR = "ERROR"
|
||||||
TEST = "TEST"
|
TEST = "TEST"
|
||||||
REMIND = "REMIND"
|
REMIND = "REMIND"
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,11 @@
|
||||||
|
import logging
|
||||||
|
from socket import gethostname
|
||||||
|
|
||||||
from telethon import TelegramClient
|
from telethon import TelegramClient
|
||||||
from telethon.sessions import MemorySession
|
from telethon.sessions import MemorySession
|
||||||
from uplink import AiohttpClient
|
from uplink import AiohttpClient
|
||||||
|
|
||||||
|
from ..checks.utils import format_for_healthchecks_slug
|
||||||
from ..core import cvars
|
from ..core import cvars
|
||||||
from .alert import Alert
|
from .alert import Alert
|
||||||
from .clients.healthchecks import HealthchecksClient
|
from .clients.healthchecks import HealthchecksClient
|
||||||
|
|
@ -37,6 +41,7 @@ def format_message(alert: Alert, note: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
async def send_alert(alert: Alert, note: str = "") -> None:
|
async def send_alert(alert: Alert, note: str = "") -> None:
|
||||||
|
logging.debug(f"Sending {alert.alert_type} alert to Telegram")
|
||||||
try:
|
try:
|
||||||
tg_client = cvars.tg_client.get()
|
tg_client = cvars.tg_client.get()
|
||||||
except LookupError: # being called standalone
|
except LookupError: # being called standalone
|
||||||
|
|
@ -62,6 +67,8 @@ async def send_healthchecks_status(alert: Alert) -> None:
|
||||||
else:
|
else:
|
||||||
return keys["default"]
|
return keys["default"]
|
||||||
|
|
||||||
|
logging.debug(f"Sending {alert.alert_type} to Healthchecks")
|
||||||
|
|
||||||
if alert.healthchecks_slug is None:
|
if alert.healthchecks_slug is None:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
|
|
@ -76,26 +83,3 @@ async def send_healthchecks_status(alert: Alert) -> None:
|
||||||
await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
|
await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
|
||||||
else:
|
else:
|
||||||
await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
|
await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
|
||||||
|
|
||||||
|
|
||||||
# TODO service itself has to be monitored like everything else - with regular pinging - if we're
|
|
||||||
# using healthchecks
|
|
||||||
async def send_start_alert() -> None:
|
|
||||||
config = cvars.config.get()
|
|
||||||
await send_alert(
|
|
||||||
Alert(
|
|
||||||
alert_type=AlertType.BOOT,
|
|
||||||
message=f"Service running with enabled checks: {', '.join(config.enabled_check_sets)}",
|
|
||||||
severity=Severity.INFO,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def send_stop_alert() -> None:
|
|
||||||
await send_alert(
|
|
||||||
Alert(
|
|
||||||
alert_type=AlertType.BOOT,
|
|
||||||
message="Service stopping.",
|
|
||||||
severity=Severity.INFO,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
|
||||||
|
|
@ -2,5 +2,6 @@ from .cpu import cpu_check
|
||||||
from .net import NetIOTracker
|
from .net import NetIOTracker
|
||||||
from .ram import ram_check
|
from .ram import ram_check
|
||||||
from .remind import remind_check
|
from .remind import remind_check
|
||||||
|
from .self import generate_start_alert, generate_stop_alert, self_check
|
||||||
from .temp import temp_check
|
from .temp import temp_check
|
||||||
from .vulnix import vulnix_check
|
from .vulnix import vulnix_check
|
||||||
|
|
|
||||||
30
src/lego_monitoring/checks/self.py
Normal file
30
src/lego_monitoring/checks/self.py
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
from socket import gethostname
|
||||||
|
|
||||||
|
from lego_monitoring.alerting.alert import Alert
|
||||||
|
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||||
|
from lego_monitoring.core import cvars
|
||||||
|
|
||||||
|
from .utils import format_for_healthchecks_slug
|
||||||
|
|
||||||
|
|
||||||
|
def self_check() -> list[Alert]:
    """Return a single-element list containing the start alert.

    The alert is built by ``generate_start_alert`` on every call.
    """
    alive_alert = generate_start_alert()
    return [alive_alert]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_start_alert() -> Alert:
    """Build the SELF alert stating that lego-monitoring is running.

    The message lists the enabled check sets taken from the config context
    variable; the Healthchecks slug is derived from the local hostname.

    Returns:
        An ``Alert`` of type ``AlertType.SELF`` with severity ``Severity.OK``.
    """
    enabled = ", ".join(cvars.config.get().enabled_check_sets)
    host_slug = format_for_healthchecks_slug(gethostname())
    return Alert(
        alert_type=AlertType.SELF,
        message=f"Host is up, lego-monitoring is running. Enabled checks: {enabled}",
        severity=Severity.OK,
        healthchecks_slug=f"{host_slug}-lego_monitoring",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def generate_stop_alert() -> Alert:
    """Build the SELF alert stating that lego-monitoring is stopping.

    The Healthchecks slug is derived from the local hostname in the same
    ``<host>-lego_monitoring`` form used by the start alert, so both alerts
    address the same check.

    Returns:
        An ``Alert`` of type ``AlertType.SELF`` with severity ``Severity.INFO``.
    """
    return Alert(
        alert_type=AlertType.SELF,
        # Plain literal: the message has no placeholders, so no f-prefix.
        message="Lego-monitoring service stopping.",
        severity=Severity.INFO,
        healthchecks_slug=f"{format_for_healthchecks_slug(gethostname())}-lego_monitoring",
    )
|
||||||
|
|
@ -2,8 +2,7 @@ from enum import StrEnum
|
||||||
|
|
||||||
|
|
||||||
class CheckSet(StrEnum):
|
class CheckSet(StrEnum):
|
||||||
START = "start"
|
SELF = "self"
|
||||||
STOP = "stop"
|
|
||||||
REMIND = "remind"
|
REMIND = "remind"
|
||||||
|
|
||||||
CPU = "cpu"
|
CPU = "cpu"
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,10 @@ class BaseChecker:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def _handle_alerts(self, alerts: list[Alert]) -> None:
|
async def _handle_alerts(self, alerts: list[Alert]) -> None:
|
||||||
|
if not self.is_reminder:
|
||||||
|
for alert in alerts:
|
||||||
|
await send_healthchecks_status(alert)
|
||||||
|
|
||||||
if not self.persistent:
|
if not self.persistent:
|
||||||
for alert in alerts:
|
for alert in alerts:
|
||||||
if alert.severity != Severity.OK:
|
if alert.severity != Severity.OK:
|
||||||
|
|
@ -75,10 +79,6 @@ class BaseChecker:
|
||||||
for alert in alerts:
|
for alert in alerts:
|
||||||
await send_alert(alert, note="ongoing")
|
await send_alert(alert, note="ongoing")
|
||||||
|
|
||||||
if not self.is_reminder:
|
|
||||||
for alert in alerts:
|
|
||||||
await send_healthchecks_status(alert)
|
|
||||||
|
|
||||||
async def run_checker(self) -> None:
|
async def run_checker(self) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue