mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
use NestedDeserializableDataclass for config
This commit is contained in:
parent
96664684f8
commit
3eb358d618
13 changed files with 188 additions and 130 deletions
|
|
@ -1,34 +1,14 @@
|
||||||
import json
|
import json
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum, StrEnum
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import nio
|
import nio
|
||||||
|
|
||||||
from alerting.common import CONFIG_FILE
|
from alerting.enum import AlertType, Severity
|
||||||
from misc import cvars
|
from misc import cvars
|
||||||
|
from misc.common import CONFIG_FILE
|
||||||
|
from misc.config import get_config
|
||||||
class AlertType(StrEnum):
|
|
||||||
TEST = "TEST"
|
|
||||||
ERROR = "ERROR"
|
|
||||||
RAM = "RAM"
|
|
||||||
CPU = "CPU"
|
|
||||||
TEMP = "TEMP"
|
|
||||||
VULN = "VULN"
|
|
||||||
LOGIN = "LOGIN" # TODO
|
|
||||||
SMART = "SMART" # TODO
|
|
||||||
RAID = "RAID"
|
|
||||||
DISKS = "DISKS"
|
|
||||||
UPS = "UPS"
|
|
||||||
UPDATE = "UPDATE"
|
|
||||||
|
|
||||||
|
|
||||||
class Severity(StrEnum):
|
|
||||||
INFO = "INFO"
|
|
||||||
WARNING = "WARNING"
|
|
||||||
CRITICAL = "CRITICAL"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
@ -44,11 +24,11 @@ async def get_client() -> nio.AsyncClient:
|
||||||
Returns a Matrix client.
|
Returns a Matrix client.
|
||||||
It is better to call get_client once and use it for multiple send_alert calls
|
It is better to call get_client once and use it for multiple send_alert calls
|
||||||
"""
|
"""
|
||||||
matrix_cfg = cvars.config.get()["matrix"]
|
matrix_cfg = cvars.config.get().matrix
|
||||||
client = nio.AsyncClient(matrix_cfg["homeserver"])
|
client = nio.AsyncClient(matrix_cfg.homeserver)
|
||||||
client.access_token = matrix_cfg["access_token"]
|
client.access_token = matrix_cfg.access_token
|
||||||
client.user_id = matrix_cfg["user_id"]
|
client.user_id = matrix_cfg.user_id
|
||||||
client.device_id = matrix_cfg["device_id"]
|
client.device_id = matrix_cfg.device_id
|
||||||
return client
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -72,15 +52,13 @@ async def send_alert(alert: Alert) -> None:
|
||||||
try:
|
try:
|
||||||
client = cvars.matrix_client.get()
|
client = cvars.matrix_client.get()
|
||||||
except LookupError: # being called standalone
|
except LookupError: # being called standalone
|
||||||
async with aiofiles.open(CONFIG_FILE) as f:
|
cvars.config.set(get_config())
|
||||||
contents = await f.read()
|
|
||||||
cvars.config.set(json.loads(contents))
|
|
||||||
temp_client = True
|
temp_client = True
|
||||||
client = await get_client()
|
client = await get_client()
|
||||||
cvars.matrix_client.set(client)
|
cvars.matrix_client.set(client)
|
||||||
else:
|
else:
|
||||||
temp_client = False
|
temp_client = False
|
||||||
room_id = cvars.config.get()["matrix"]["room_id"]
|
room_id = cvars.config.get().matrix.room_id
|
||||||
message, html_message = format_message(alert)
|
message, html_message = format_message(alert)
|
||||||
content = {
|
content = {
|
||||||
"msgtype": "m.text",
|
"msgtype": "m.text",
|
||||||
|
|
|
||||||
22
alerting/enum.py
Normal file
22
alerting/enum.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
from enum import StrEnum
|
||||||
|
|
||||||
|
|
||||||
|
class AlertType(StrEnum):
|
||||||
|
TEST = "TEST"
|
||||||
|
ERROR = "ERROR"
|
||||||
|
RAM = "RAM"
|
||||||
|
CPU = "CPU"
|
||||||
|
TEMP = "TEMP"
|
||||||
|
VULN = "VULN"
|
||||||
|
LOGIN = "LOGIN" # TODO
|
||||||
|
SMART = "SMART" # TODO
|
||||||
|
RAID = "RAID"
|
||||||
|
DISKS = "DISKS"
|
||||||
|
UPS = "UPS"
|
||||||
|
UPDATE = "UPDATE"
|
||||||
|
|
||||||
|
|
||||||
|
class Severity(StrEnum):
|
||||||
|
INFO = "INFO"
|
||||||
|
WARNING = "WARNING"
|
||||||
|
CRITICAL = "CRITICAL"
|
||||||
|
|
@ -4,9 +4,10 @@ import getpass
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from common import CONFIG_FILE
|
|
||||||
from nio import AsyncClient, LoginResponse
|
from nio import AsyncClient, LoginResponse
|
||||||
|
|
||||||
|
from misc.common import CONFIG_FILE
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
async def main() -> None:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
113
misc/checks.py
113
misc/checks.py
|
|
@ -3,6 +3,7 @@ import traceback
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
from alerting import alerts
|
from alerting import alerts
|
||||||
|
from alerting.enum import AlertType, Severity
|
||||||
from misc import cvars, docker_registry, sensors, vuln
|
from misc import cvars, docker_registry, sensors, vuln
|
||||||
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
|
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
|
||||||
|
|
||||||
|
|
@ -18,15 +19,15 @@ def temp_check() -> list[alerts.Alert]:
|
||||||
continue # little valuable info and too low limits there, might as well ignore it
|
continue # little valuable info and too low limits there, might as well ignore it
|
||||||
if sensor.critical_temp is not None and (IS_TESTING or sensor.current_temp > sensor.critical_temp):
|
if sensor.critical_temp is not None and (IS_TESTING or sensor.current_temp > sensor.critical_temp):
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType("TEMP"),
|
alert_type=AlertType("TEMP"),
|
||||||
message=f"{sensor.sensor_type} {sensor.sensor_label}: {sensor.current_temp}°C > {sensor.critical_temp}°C",
|
message=f"{sensor.sensor_type} {sensor.sensor_label}: {sensor.current_temp}°C > {sensor.critical_temp}°C",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
elif sensor.highest_temp is not None and (IS_TESTING or sensor.current_temp > sensor.highest_temp):
|
elif sensor.highest_temp is not None and (IS_TESTING or sensor.current_temp > sensor.highest_temp):
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType("TEMP"),
|
alert_type=AlertType("TEMP"),
|
||||||
message=f"{sensor.sensor_type} {sensor.sensor_label}: {sensor.current_temp}°C > {sensor.highest_temp}°C",
|
message=f"{sensor.sensor_type} {sensor.sensor_label}: {sensor.current_temp}°C > {sensor.highest_temp}°C",
|
||||||
severity=alerts.Severity.WARNING,
|
severity=Severity.WARNING,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
@ -38,15 +39,15 @@ def cpu_check() -> list[alerts.Alert]:
|
||||||
sensor = sensors.Sensors.get_cpu()
|
sensor = sensors.Sensors.get_cpu()
|
||||||
if IS_TESTING or sensor.current_load > sensor.critical_load:
|
if IS_TESTING or sensor.current_load > sensor.critical_load:
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType("CPU"),
|
alert_type=AlertType("CPU"),
|
||||||
message=f"{sensor.current_load}% > {sensor.critical_load}%",
|
message=f"{sensor.current_load}% > {sensor.critical_load}%",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
elif IS_TESTING or sensor.current_load > sensor.highest_load:
|
elif IS_TESTING or sensor.current_load > sensor.highest_load:
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType("CPU"),
|
alert_type=AlertType("CPU"),
|
||||||
message=f"{sensor.current_load}% > {sensor.highest_load}%",
|
message=f"{sensor.current_load}% > {sensor.highest_load}%",
|
||||||
severity=alerts.Severity.WARNING,
|
severity=Severity.WARNING,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
@ -57,15 +58,15 @@ def ram_check() -> list[alerts.Alert]:
|
||||||
sensor = sensors.Sensors.get_ram()
|
sensor = sensors.Sensors.get_ram()
|
||||||
if IS_TESTING or sensor.current_avail < sensor.critical_avail:
|
if IS_TESTING or sensor.current_avail < sensor.critical_avail:
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType("RAM"),
|
alert_type=AlertType("RAM"),
|
||||||
message=f"{(sensor.current_avail / 1024**3):.2f} GiB < {(sensor.critical_avail / 1024**3):.2f} GiB",
|
message=f"{(sensor.current_avail / 1024**3):.2f} GiB < {(sensor.critical_avail / 1024**3):.2f} GiB",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
elif IS_TESTING or sensor.current_avail < sensor.warning_avail:
|
elif IS_TESTING or sensor.current_avail < sensor.warning_avail:
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType("RAM"),
|
alert_type=AlertType("RAM"),
|
||||||
message=f"{(sensor.current_avail / 1024**3):.2f} GiB < {(sensor.warning_avail / 1024**3):.2f} GiB",
|
message=f"{(sensor.current_avail / 1024**3):.2f} GiB < {(sensor.warning_avail / 1024**3):.2f} GiB",
|
||||||
severity=alerts.Severity.WARNING,
|
severity=Severity.WARNING,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
@ -79,18 +80,18 @@ async def vuln_check() -> list[alerts.Alert]:
|
||||||
if IS_TESTING or v.fixed or v.severity in (vuln.Severity.HIGH, vuln.Severity.CRITICAL):
|
if IS_TESTING or v.fixed or v.severity in (vuln.Severity.HIGH, vuln.Severity.CRITICAL):
|
||||||
match v.severity:
|
match v.severity:
|
||||||
case vuln.Severity.LOW:
|
case vuln.Severity.LOW:
|
||||||
severity = alerts.Severity.INFO
|
severity = Severity.INFO
|
||||||
case vuln.Severity.MEDIUM:
|
case vuln.Severity.MEDIUM:
|
||||||
severity = alerts.Severity.WARNING
|
severity = Severity.WARNING
|
||||||
case vuln.Severity.HIGH | vuln.Severity.CRITICAL:
|
case vuln.Severity.HIGH | vuln.Severity.CRITICAL:
|
||||||
severity = alerts.Severity.CRITICAL
|
severity = Severity.CRITICAL
|
||||||
message = f"{v.id}: {v.vuln_type} in {','.join(v.packages)}"
|
message = f"{v.id}: {v.vuln_type} in {','.join(v.packages)}"
|
||||||
html_message = f"<a href='{v.link}'>{v.id}</a>: {v.vuln_type} in {','.join(v.packages)}"
|
html_message = f"<a href='{v.link}'>{v.id}</a>: {v.vuln_type} in {','.join(v.packages)}"
|
||||||
if v.fixed:
|
if v.fixed:
|
||||||
message.append(f" -- update to {v.fixed} ASAP")
|
message.append(f" -- update to {v.fixed} ASAP")
|
||||||
html_message.append(f" -- update to {v.fixed} ASAP")
|
html_message.append(f" -- update to {v.fixed} ASAP")
|
||||||
alert = alerts.Alert(
|
alert = alerts.Alert(
|
||||||
alert_type=alerts.AlertType.VULN,
|
alert_type=AlertType.VULN,
|
||||||
message=message,
|
message=message,
|
||||||
html_message=html_message,
|
html_message=html_message,
|
||||||
severity=severity,
|
severity=severity,
|
||||||
|
|
@ -110,49 +111,47 @@ async def ups_check() -> list[alerts.Alert]:
|
||||||
if IS_TESTING or sensor.battery_charge_percentage < sensor.battery_critical_percentage:
|
if IS_TESTING or sensor.battery_charge_percentage < sensor.battery_critical_percentage:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.UPS,
|
alert_type=AlertType.UPS,
|
||||||
message=f"Battery is under {sensor.battery_critical_percentage}%\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
|
message=f"Battery is under {sensor.battery_critical_percentage}%\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
elif IS_TESTING or sensor.battery_charge_percentage < sensor.battery_warning_percentage:
|
elif IS_TESTING or sensor.battery_charge_percentage < sensor.battery_warning_percentage:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.UPS,
|
alert_type=AlertType.UPS,
|
||||||
message=f"Battery is under {sensor.battery_warning_percentage}%\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
|
message=f"Battery is under {sensor.battery_warning_percentage}%\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
|
||||||
severity=alerts.Severity.WARNING,
|
severity=Severity.WARNING,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
for status in sensor.ups_status:
|
for status in sensor.ups_status:
|
||||||
if IS_TESTING or status == sensors.UPSStatus.UPS_OVERLOAD:
|
if IS_TESTING or status == sensors.UPSStatus.UPS_OVERLOAD:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(alert_type=AlertType.UPS, message=f"UPS is overloaded!", severity=Severity.CRITICAL)
|
||||||
alert_type=alerts.AlertType.UPS, message=f"UPS is overloaded!", severity=alerts.Severity.CRITICAL
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
elif IS_TESTING or status == sensors.UPSStatus.ON_BATTERY:
|
elif IS_TESTING or status == sensors.UPSStatus.ON_BATTERY:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.UPS,
|
alert_type=AlertType.UPS,
|
||||||
message=f"UPS is on battery.\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
|
message=f"UPS is on battery.\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
|
||||||
severity=alerts.Severity.INFO,
|
severity=Severity.INFO,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
elif IS_TESTING or status == sensors.UPSStatus.UPS_TRIM:
|
elif IS_TESTING or status == sensors.UPSStatus.UPS_TRIM:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.UPS,
|
alert_type=AlertType.UPS,
|
||||||
message=f"Overvoltage detected: trimming voltage to nominal.",
|
message=f"Overvoltage detected: trimming voltage to nominal.",
|
||||||
severity=alerts.Severity.INFO,
|
severity=Severity.INFO,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
elif IS_TESTING or status == sensors.UPSStatus.UPS_BOOST:
|
elif IS_TESTING or status == sensors.UPSStatus.UPS_BOOST:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.UPS,
|
alert_type=AlertType.UPS,
|
||||||
message=f"Undervoltage detected: boosting voltage to nominal.",
|
message=f"Undervoltage detected: boosting voltage to nominal.",
|
||||||
severity=alerts.Severity.INFO,
|
severity=Severity.INFO,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -165,26 +164,26 @@ async def docker_registry_check() -> list[alerts.Alert]:
|
||||||
for image in updated_images:
|
for image in updated_images:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.UPDATE,
|
alert_type=AlertType.UPDATE,
|
||||||
message=f"{image} docker image: new version available",
|
message=f"{image} docker image: new version available",
|
||||||
severity=alerts.Severity.INFO,
|
severity=Severity.INFO,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return alert_list
|
return alert_list
|
||||||
|
|
||||||
|
|
||||||
def raid_check() -> list[alerts.Alert]:
|
def raid_check() -> list[alerts.Alert]:
|
||||||
check_config = cvars.config.get()["checks"]["raid"]
|
check_config = cvars.config.get().checks.raid
|
||||||
alert_list = []
|
alert_list = []
|
||||||
for lv in check_config["lvs"]:
|
for lv in check_config.lvs:
|
||||||
try:
|
try:
|
||||||
lv_attr = LVAttr.from_cli(lv)
|
lv_attr = LVAttr.from_cli(lv)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message=f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
|
message=f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logging.error(traceback.format_exc())
|
logging.error(traceback.format_exc())
|
||||||
|
|
@ -194,9 +193,9 @@ def raid_check() -> list[alerts.Alert]:
|
||||||
if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
|
if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message=f"LV {lv} is not of RAID type",
|
message=f"LV {lv} is not of RAID type",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
@ -204,9 +203,9 @@ def raid_check() -> list[alerts.Alert]:
|
||||||
if IS_TESTING:
|
if IS_TESTING:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.RAID,
|
alert_type=AlertType.RAID,
|
||||||
message=f"Test alert: LV {lv} health is {lv_attr.health}",
|
message=f"Test alert: LV {lv} health is {lv_attr.health}",
|
||||||
severity=alerts.Severity.INFO,
|
severity=Severity.INFO,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -214,33 +213,33 @@ def raid_check() -> list[alerts.Alert]:
|
||||||
case LVAttr.Health.PARTIAL:
|
case LVAttr.Health.PARTIAL:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.RAID,
|
alert_type=AlertType.RAID,
|
||||||
message=f"LV {lv} operating in partial mode; one of PVs has failed",
|
message=f"LV {lv} operating in partial mode; one of PVs has failed",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
case LVAttr.Health.UNKNOWN:
|
case LVAttr.Health.UNKNOWN:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.RAID,
|
alert_type=AlertType.RAID,
|
||||||
message=f"LV {lv}'s state is unknown",
|
message=f"LV {lv}'s state is unknown",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
case LVAttr.Health.REFRESH_NEEDED:
|
case LVAttr.Health.REFRESH_NEEDED:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.RAID,
|
alert_type=AlertType.RAID,
|
||||||
message=f"LV {lv} has suffered a write error; run a refresh or replace the failing PV",
|
message=f"LV {lv} has suffered a write error; run a refresh or replace the failing PV",
|
||||||
severity=alerts.Severity.WARNING,
|
severity=Severity.WARNING,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
case LVAttr.Health.MISMATCHES:
|
case LVAttr.Health.MISMATCHES:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.RAID,
|
alert_type=AlertType.RAID,
|
||||||
message=f"LV {lv} is partially incoherent; run a repairing scrub operation",
|
message=f"LV {lv} is partially incoherent; run a repairing scrub operation",
|
||||||
severity=alerts.Severity.WARNING,
|
severity=Severity.WARNING,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -248,17 +247,17 @@ def raid_check() -> list[alerts.Alert]:
|
||||||
|
|
||||||
|
|
||||||
def disk_wearout_check() -> list[alerts.Alert]:
|
def disk_wearout_check() -> list[alerts.Alert]:
|
||||||
check_config = cvars.config.get()["checks"]["wearout"]
|
check_config = cvars.config.get().checks.wearout
|
||||||
alert_list = []
|
alert_list = []
|
||||||
for disk in check_config["disks"]:
|
for disk in check_config.disks:
|
||||||
try:
|
try:
|
||||||
wearout_reading = get_wearout_reading(disk["name"])
|
wearout_reading = get_wearout_reading(disk.name)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message=f"Could not check wearout for disk {disk['name']}: {repr(exc)}, see logs",
|
message=f"Could not check wearout for disk {disk.name}: {repr(exc)}, see logs",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logging.error(traceback.format_exc())
|
logging.error(traceback.format_exc())
|
||||||
|
|
@ -267,13 +266,11 @@ def disk_wearout_check() -> list[alerts.Alert]:
|
||||||
if IS_TESTING or wearout_reading.current_reading < wearout_reading.threshold_reading:
|
if IS_TESTING or wearout_reading.current_reading < wearout_reading.threshold_reading:
|
||||||
match wearout_reading.indicator:
|
match wearout_reading.indicator:
|
||||||
case WearoutIndicator.REALLOCATED_SECTORS:
|
case WearoutIndicator.REALLOCATED_SECTORS:
|
||||||
message = f"Disk {disk['name']} has reallocated sectors (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
|
message = f"Disk {disk.name} has reallocated sectors (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
|
||||||
case WearoutIndicator.SPARE_BLOCKS:
|
case WearoutIndicator.SPARE_BLOCKS:
|
||||||
message = f"Disk {disk['name']} has too few spare blocks (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
|
message = f"Disk {disk.name} has too few spare blocks (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
|
||||||
alert_list.append(
|
alert_list.append(
|
||||||
alerts.Alert(
|
alerts.Alert(alert_type=AlertType.DISKS, message=message, severity=Severity[disk.severity])
|
||||||
alert_type=alerts.AlertType.DISKS, message=message, severity=alerts.Severity[disk["severity"]]
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return alert_list
|
return alert_list
|
||||||
|
|
|
||||||
58
misc/config.py
Normal file
58
misc/config.py
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from alt_utils import NestedDeserializableDataclass
|
||||||
|
|
||||||
|
from alerting.enum import Severity
|
||||||
|
from misc.common import CONFIG_FILE
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MatrixConfig:
    """Settings for the Matrix account and room used to deliver alerts."""

    # Passed to nio.AsyncClient() as the homeserver when the client is built.
    homeserver: str
    # Copied onto the client as its user_id.
    user_id: str
    # Copied onto the client as its device_id.
    device_id: str
    # Copied onto the client as its access_token.
    access_token: str
    # Room that send_alert() reads when posting an alert.
    room_id: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CheckDockerRegistryConfig:
    """Settings for the Docker image update check."""

    # Used as the base_url of the Docker Hub client.
    hub_url: str
    # Image references to check; each one is split on "/" by the checker.
    images: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CheckRaidConfig:
    """Settings for the RAID health check."""

    # Logical volumes to inspect; each entry is handed to LVAttr.from_cli().
    lvs: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CheckWearoutDiskConfig:
    """Per-disk settings for the wearout check."""

    # Disk identifier passed to get_wearout_reading() and shown in alert text.
    name: str
    # Severity used for the alert raised when this disk trips the check.
    severity: Severity
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CheckWearoutConfig(NestedDeserializableDataclass):
    """Settings for the disk wearout check."""

    # One entry per monitored disk; the check iterates over this list.
    # NOTE(review): presumably the base class turns the raw dicts into
    # CheckWearoutDiskConfig instances -- confirm against alt_utils.
    disks: list[CheckWearoutDiskConfig]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChecksConfig(NestedDeserializableDataclass):
    """Groups the per-check configuration sections."""

    # Read by the Docker image update check.
    docker_registry: CheckDockerRegistryConfig
    # Read by raid_check().
    raid: CheckRaidConfig
    # Read by disk_wearout_check().
    wearout: CheckWearoutConfig
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Config(NestedDeserializableDataclass):
    """Top-level application configuration, built via ``Config.from_dict``."""

    # Matrix connection and room settings.
    matrix: MatrixConfig
    # Settings for the individual checks.
    checks: ChecksConfig
|
||||||
|
|
||||||
|
|
||||||
|
def get_config() -> Config:
    """Read ``CONFIG_FILE`` and return it as a ``Config`` object.

    The file is parsed as JSON and the resulting dict is handed to
    ``Config.from_dict``.

    Returns:
        The deserialized configuration.

    Raises:
        OSError: If the config file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(CONFIG_FILE, encoding="utf-8") as f:
        cfg_dict = json.load(f)
    return Config.from_dict(cfg_dict)
|
||||||
|
|
@ -2,5 +2,7 @@ from contextvars import ContextVar
|
||||||
|
|
||||||
import nio
|
import nio
|
||||||
|
|
||||||
config: ContextVar[dict] = ContextVar("config")
|
from misc.config import Config
|
||||||
|
|
||||||
|
config: ContextVar[Config] = ContextVar("config")
|
||||||
matrix_client: ContextVar[nio.AsyncClient] = ContextVar("matrix_client")
|
matrix_client: ContextVar[nio.AsyncClient] = ContextVar("matrix_client")
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ from urllib.parse import urlparse
|
||||||
import uplink
|
import uplink
|
||||||
|
|
||||||
from alerting import alerts
|
from alerting import alerts
|
||||||
|
from alerting.enum import AlertType, Severity
|
||||||
from misc import cvars
|
from misc import cvars
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -67,12 +68,12 @@ class DockerRegistryClient(uplink.Consumer):
|
||||||
|
|
||||||
|
|
||||||
async def get_updated_images() -> list[str]:
|
async def get_updated_images() -> list[str]:
|
||||||
check_config = cvars.config.get()["checks"]["docker_registry"]
|
check_config = cvars.config.get().checks.docker_registry
|
||||||
hub_client = DockerHubClient(base_url=check_config["hub_url"], client=uplink.AiohttpClient())
|
hub_client = DockerHubClient(base_url=check_config.hub_url, client=uplink.AiohttpClient())
|
||||||
now = datetime.datetime.now(datetime.timezone.utc)
|
now = datetime.datetime.now(datetime.timezone.utc)
|
||||||
|
|
||||||
updated_images = []
|
updated_images = []
|
||||||
for image in check_config["images"]:
|
for image in check_config.images:
|
||||||
image_split = image.split("/")
|
image_split = image.split("/")
|
||||||
match len(image_split):
|
match len(image_split):
|
||||||
case 2:
|
case 2:
|
||||||
|
|
@ -84,9 +85,9 @@ async def get_updated_images() -> list[str]:
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await alerts.send_alert(
|
await alerts.send_alert(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message=f"Could not query Docker Hub: {repr(exc)}, see logs",
|
message=f"Could not query Docker Hub: {repr(exc)}, see logs",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logging.error(traceback.format_exc())
|
logging.error(traceback.format_exc())
|
||||||
|
|
@ -111,9 +112,9 @@ async def get_updated_images() -> list[str]:
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await alerts.send_alert(
|
await alerts.send_alert(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message=f"Could not query Docker registry {registry}: {repr(exc)}, see logs",
|
message=f"Could not query Docker registry {registry}: {repr(exc)}, see logs",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logging.error(traceback.format_exc())
|
logging.error(traceback.format_exc())
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ from enum import StrEnum
|
||||||
from psutil import cpu_percent, sensors_temperatures, virtual_memory
|
from psutil import cpu_percent, sensors_temperatures, virtual_memory
|
||||||
|
|
||||||
from alerting import alerts
|
from alerting import alerts
|
||||||
|
from alerting.enum import AlertType, Severity
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
@ -138,9 +139,9 @@ class Sensors:
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
await alerts.send_alert(
|
await alerts.send_alert(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message="upsc is not installed!",
|
message="upsc is not installed!",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ from enum import StrEnum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from alerting import alerts
|
from alerting import alerts
|
||||||
|
from alerting.enum import AlertType, Severity
|
||||||
|
|
||||||
|
|
||||||
class Severity(StrEnum):
|
class Severity(StrEnum):
|
||||||
|
|
@ -48,18 +49,18 @@ async def get_vulns() -> list[Vulnerability]:
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
await alerts.send_alert(
|
await alerts.send_alert(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message="arch-audit not installed!",
|
message="arch-audit not installed!",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await alerts.send_alert(
|
await alerts.send_alert(
|
||||||
alerts.Alert(
|
alerts.Alert(
|
||||||
alert_type=alerts.AlertType.ERROR,
|
alert_type=AlertType.ERROR,
|
||||||
message=f"arch-audit failed to run: {repr(exc)}, see logs",
|
message=f"arch-audit failed to run: {repr(exc)}, see logs",
|
||||||
severity=alerts.Severity.CRITICAL,
|
severity=Severity.CRITICAL,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logging.error(traceback.format_exc())
|
logging.error(traceback.format_exc())
|
||||||
|
|
|
||||||
|
|
@ -3,3 +3,4 @@ psutil==5.9.8
|
||||||
matrix-nio[e2e]==0.24.0
|
matrix-nio[e2e]==0.24.0
|
||||||
uplink[aiohttp]==0.9.7
|
uplink[aiohttp]==0.9.7
|
||||||
setuptools==75.2.0
|
setuptools==75.2.0
|
||||||
|
alt-utils==0.0.4
|
||||||
|
|
|
||||||
|
|
@ -4,25 +4,26 @@ from os import environ
|
||||||
from sys import argv
|
from sys import argv
|
||||||
|
|
||||||
from alerting import alerts
|
from alerting import alerts
|
||||||
|
from alerting.enum import AlertType, Severity
|
||||||
|
|
||||||
type_priority_map = {
|
type_priority_map = {
|
||||||
"ONLINE": alerts.Severity.INFO, # UPS is back online
|
"ONLINE": Severity.INFO, # UPS is back online
|
||||||
"ONBATT": alerts.Severity.WARNING, # UPS is on battery
|
"ONBATT": Severity.WARNING, # UPS is on battery
|
||||||
"LOWBATT": alerts.Severity.CRITICAL, # UPS is on battery and has a low battery (is critical)
|
"LOWBATT": Severity.CRITICAL, # UPS is on battery and has a low battery (is critical)
|
||||||
"FSD": alerts.Severity.CRITICAL, # UPS is being shutdown by the primary (FSD = "Forced Shutdown")
|
"FSD": Severity.CRITICAL, # UPS is being shutdown by the primary (FSD = "Forced Shutdown")
|
||||||
"COMMOK": alerts.Severity.INFO, # Communications established with the UPS
|
"COMMOK": Severity.INFO, # Communications established with the UPS
|
||||||
"COMMBAD": alerts.Severity.WARNING, # Communications lost to the UPS
|
"COMMBAD": Severity.WARNING, # Communications lost to the UPS
|
||||||
"SHUTDOWN": alerts.Severity.CRITICAL, # The system is being shutdown
|
"SHUTDOWN": Severity.CRITICAL, # The system is being shutdown
|
||||||
"REPLBATT": alerts.Severity.WARNING, # The UPS battery is bad and needs to be replaced
|
"REPLBATT": Severity.WARNING, # The UPS battery is bad and needs to be replaced
|
||||||
"NOCOMM": alerts.Severity.WARNING, # A UPS is unavailable (can’t be contacted for monitoring)
|
"NOCOMM": Severity.WARNING, # A UPS is unavailable (can’t be contacted for monitoring)
|
||||||
"NOPARENT": alerts.Severity.CRITICAL, # upsmon parent process died - shutdown impossible
|
"NOPARENT": Severity.CRITICAL, # upsmon parent process died - shutdown impossible
|
||||||
"CAL": alerts.Severity.INFO, # UPS calibration in progress
|
"CAL": Severity.INFO, # UPS calibration in progress
|
||||||
"NOTCAL": alerts.Severity.INFO, # UPS calibration finished
|
"NOTCAL": Severity.INFO, # UPS calibration finished
|
||||||
"OFF": alerts.Severity.CRITICAL, # UPS administratively OFF or asleep
|
"OFF": Severity.CRITICAL, # UPS administratively OFF or asleep
|
||||||
"NOTOFF": alerts.Severity.INFO, # UPS no longer administratively OFF or asleep
|
"NOTOFF": Severity.INFO, # UPS no longer administratively OFF or asleep
|
||||||
"BYPASS": alerts.Severity.WARNING, # UPS on bypass (powered, not protecting)
|
"BYPASS": Severity.WARNING, # UPS on bypass (powered, not protecting)
|
||||||
"NOTBYPASS": alerts.Severity.INFO, # UPS no longer on bypass
|
"NOTBYPASS": Severity.INFO, # UPS no longer on bypass
|
||||||
None: alerts.Severity.CRITICAL, # unknown alert type
|
None: Severity.CRITICAL, # unknown alert type
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -33,7 +34,7 @@ async def main():
|
||||||
message = argv[1]
|
message = argv[1]
|
||||||
typestr = environ.get("NOTIFYTYPE", None)
|
typestr = environ.get("NOTIFYTYPE", None)
|
||||||
severity = type_priority_map[typestr]
|
severity = type_priority_map[typestr]
|
||||||
alert = alerts.Alert(alert_type=alerts.AlertType.UPS, message=message, severity=severity)
|
alert = alerts.Alert(alert_type=AlertType.UPS, message=message, severity=severity)
|
||||||
await alerts.send_alert(alert)
|
await alerts.send_alert(alert)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,13 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import asyncio
|
import asyncio
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import signal
|
import signal
|
||||||
|
|
||||||
import aiofiles
|
|
||||||
|
|
||||||
from alerting import alerts
|
from alerting import alerts
|
||||||
from alerting.common import CONFIG_FILE
|
|
||||||
from misc import checks, cvars
|
from misc import checks, cvars
|
||||||
from misc.checkers import interval_checker, scheduled_checker
|
from misc.checkers import interval_checker, scheduled_checker
|
||||||
|
from misc.config import get_config
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
@ -25,9 +22,7 @@ def stop_gracefully(signum, frame):
|
||||||
async def main():
|
async def main():
|
||||||
signal.signal(signal.SIGTERM, stop_gracefully)
|
signal.signal(signal.SIGTERM, stop_gracefully)
|
||||||
|
|
||||||
async with aiofiles.open(CONFIG_FILE) as f:
|
cvars.config.set(get_config())
|
||||||
contents = await f.read()
|
|
||||||
cvars.config.set(json.loads(contents))
|
|
||||||
|
|
||||||
client = await alerts.get_client()
|
client = await alerts.get_client()
|
||||||
cvars.matrix_client.set(client)
|
cvars.matrix_client.set(client)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue