add temp monitoring

This commit is contained in:
Alex Tau 2025-05-02 15:25:27 +03:00
parent 19ee6f487b
commit 758438382d
13 changed files with 272 additions and 25 deletions

View file

@ -7,17 +7,21 @@ package:
... ...
}: }:
let
tempSensorOptions = (import ./submodules/tempSensorOptions.nix) { inherit lib; };
in
{ {
options.services.lego-monitoring = { options.services.lego-monitoring = {
enable = lib.mkEnableOption "lego-monitoring service."; enable = lib.mkEnableOption "lego-monitoring service.";
enabledCheckerSets = lib.mkOption { enabledCheckSets = lib.mkOption {
type = lib.types.listOf (lib.types.enum [ type = lib.types.listOf (lib.types.enum [
"start" "start"
"stop" "stop"
"temp"
]); ]);
default = [ ]; default = [ ];
description = "List of enabled checker sets. Each checker set is a module which checks something and generates alerts based on check results."; description = "List of enabled check sets. Each check set is a module which checks something and generates alerts based on check results.";
}; };
telegram = { telegram = {
@ -30,17 +34,61 @@ package:
description = "ID of chat where to send alerts."; description = "ID of chat where to send alerts.";
}; };
}; };
checks = {
temp = {
sensors = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule tempSensorOptions);
default = { };
description = ''
Temp sensor override definitions. Sensors not defined here, or missing options in definitions, will be read with default parameters.
To get list of sensors and their default configurations, run `lego-monitoring --print-temp`.'';
example = lib.literalExpression ''
{
amdgpu.readings.edge.label = "Integrated GPU";
k10temp.readings = {
Tctl = {
label = "AMD CPU";
criticalTemp = 95.0;
};
Tccd1.enabled = false;
Tccd2.enabled = false;
};
nvme.readings = {
"Sensor 1".enabled = false;
"Sensor 2".enabled = false;
};
}
'';
};
};
};
}; };
config = let config = let
cfg = config.services.lego-monitoring; cfg = config.services.lego-monitoring;
json = pkgs.formats.json {}; json = pkgs.formats.json {};
serviceConfigFile = json.generate "config.json" { serviceConfigFile = json.generate "config.json" {
enabled_checker_sets = cfg.enabledCheckerSets; enabled_check_sets = cfg.enabledCheckSets;
telegram = with cfg.telegram; { telegram = with cfg.telegram; {
creds_secret_path = credsSecretPath; creds_secret_path = credsSecretPath;
room_id = roomId; room_id = roomId;
}; };
checks = {
temp.sensors = lib.mapAttrs (_: sensorCfg: {
inherit (sensorCfg) name enabled;
readings = lib.mapAttrs (_: readingCfg: {
inherit (readingCfg) label enabled;
warning_temp = readingCfg.warningTemp;
critical_temp = readingCfg.criticalTemp;
}) sensorCfg.readings;
}) cfg.checks.temp.sensors;
};
}; };
in lib.mkIf cfg.enable { in lib.mkIf cfg.enable {
systemd.services.lego-monitoring = { systemd.services.lego-monitoring = {

View file

@ -0,0 +1,49 @@
# Option declarations for one temperature sensor override.
# The returned attribute set is a module body intended for `lib.types.submodule`.
{
  lib,
}:
let
  inherit (lib) mkOption types;

  # Options for a single reading of a sensor. Nullable options default to
  # null, i.e. no override is supplied for that field.
  tempReadingOptions.options = {
    label = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Friendly label of the reading.";
    };
    enabled = mkOption {
      type = types.bool;
      default = true;
      description = "Whether this reading is enabled.";
    };
    warningTemp = mkOption {
      type = types.nullOr types.float;
      default = null;
      description = "Warning temperature threshold.";
    };
    criticalTemp = mkOption {
      type = types.nullOr types.float;
      default = null;
      description = "Critical temperature threshold.";
    };
  };
in
{
  # Options for the sensor itself.
  options = {
    name = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Friendly name of the sensor.";
    };
    enabled = mkOption {
      type = types.bool;
      default = true;
      description = "Whether sensor is enabled.";
    };
    # Per-reading overrides; each value is validated against tempReadingOptions.
    readings = mkOption {
      type = types.attrsOf (types.submodule tempReadingOptions);
      default = { };
      description = "Overrides for specific readings of the sensor, by label.";
    };
  };
}

View file

@ -6,6 +6,7 @@ readme = "README.md"
requires-python = ">=3.12" requires-python = ">=3.12"
dependencies = [ dependencies = [
"alt-utils>=0.0.6", "alt-utils>=0.0.6",
"psutil>=7.0.0",
"telethon>=1.40.0", "telethon>=1.40.0",
] ]

View file

@ -1,12 +1,16 @@
import argparse import argparse
import asyncio import asyncio
import datetime
import logging import logging
import signal import signal
import time import time
from . import checks
from .alerting import alerts from .alerting import alerts
from .checks.temp.sensors import print_readings
from .config import load_config
from .core import cvars from .core import cvars
from .core.config import load_config from .core.checkers import interval_checker
stopping = False stopping = False
@ -27,11 +31,21 @@ async def async_main():
prog="lego-monitoring", prog="lego-monitoring",
description="Lego-monitoring service", description="Lego-monitoring service",
) )
parser.add_argument("-c", "--config", required=True) parser.add_argument("-c", "--config", help="config file")
parser.add_argument("--print-temp", help="print temp sensor readings and exit", action="store_true")
args = parser.parse_args()
config_path = parser.parse_args().config if args.config:
config = load_config(config_path) config_path = parser.parse_args().config
cvars.config.set(config) config = load_config(config_path)
cvars.config.set(config)
if args.print_temp:
print_readings()
raise SystemExit
if not args.config:
raise RuntimeError("--config must be specified in standard operating mode")
tg_client = await alerts.get_client() tg_client = await alerts.get_client()
cvars.tg_client.set(tg_client) cvars.tg_client.set(tg_client)
@ -41,10 +55,11 @@ async def async_main():
alerts.send_start_alert(), alerts.send_start_alert(),
], ],
"stop": [], # this is checked later "stop": [], # this is checked later
"temp": [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
} }
checkers = [] checkers = []
for enabled_set in config.enabled_checker_sets: for enabled_set in config.enabled_check_sets:
for checker in checker_sets[enabled_set]: for checker in checker_sets[enabled_set]:
checkers.append(checker) checkers.append(checker)
@ -57,7 +72,7 @@ async def async_main():
checker_tasks.add(task) checker_tasks.add(task)
while True: while True:
if stopping: if stopping:
if "stop" in config.enabled_checker_sets: if "stop" in config.enabled_check_sets:
await alerts.send_stop_alert() await alerts.send_stop_alert()
await tg_client.disconnect() await tg_client.disconnect()
raise SystemExit raise SystemExit

View file

@ -57,7 +57,7 @@ async def send_start_alert() -> None:
await send_alert( await send_alert(
Alert( Alert(
alert_type=AlertType.BOOT, alert_type=AlertType.BOOT,
message=f"Service running with enabled checkers: {', '.join(config.enabled_checker_sets)}", message=f"Service running with enabled checks: {', '.join(config.enabled_check_sets)}",
severity=Severity.INFO, severity=Severity.INFO,
) )
) )

View file

@ -3,11 +3,11 @@ from enum import StrEnum
class AlertType(StrEnum): class AlertType(StrEnum):
BOOT = "BOOT" BOOT = "BOOT"
TEMP = "TEMP"
TEST = "TEST" TEST = "TEST"
# ERROR = "ERROR" # ERROR = "ERROR"
# RAM = "RAM" # RAM = "RAM"
# CPU = "CPU" # CPU = "CPU"
# TEMP = "TEMP"
# VULN = "VULN" # VULN = "VULN"
# LOGIN = "LOGIN" # LOGIN = "LOGIN"
# SMART = "SMART" # TODO # SMART = "SMART" # TODO

View file

@ -0,0 +1 @@
from .temp import temp_check

View file

@ -0,0 +1,29 @@
from lego_monitoring.alerting import alerts
from lego_monitoring.alerting.enum import AlertType, Severity
from . import sensors
IS_TESTING = False
def temp_check() -> list[alerts.Alert]:
    """Build temperature alerts from the current sensor readings.

    For every reading returned by ``sensors.get_readings()`` the critical
    threshold is checked first, then the warning threshold; at most one alert
    is produced per reading. A threshold that is ``None`` is skipped. When
    ``IS_TESTING`` is true, any reading that has a threshold set produces an
    alert regardless of its current temperature.

    Returns:
        The list of generated alerts (empty when nothing exceeds a threshold).
    """
    alert_list = []
    for sensor, readings in sensors.get_readings().items():
        for r in readings:
            # Pick the most severe threshold that is exceeded; critical wins.
            if r.critical_temp is not None and (IS_TESTING or r.current_temp > r.critical_temp):
                threshold, severity = r.critical_temp, Severity.CRITICAL
            elif r.warning_temp is not None and (IS_TESTING or r.current_temp > r.warning_temp):
                threshold, severity = r.warning_temp, Severity.WARNING
            else:
                continue
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.TEMP,
                    message=f"{sensor} {r.label}: {r.current_temp}°C > {threshold}°C",
                    severity=severity,
                )
            )
    return alert_list

View file

@ -0,0 +1,66 @@
from dataclasses import dataclass
from typing import Optional
from psutil import sensors_temperatures
from lego_monitoring.config.checks.temp import TempSensorConfig
from lego_monitoring.core import cvars
@dataclass
class TemperatureReading:
    """One temperature value of a sensor together with its alert thresholds."""

    # Label of the reading.
    label: str
    # Current temperature value.
    current_temp: float
    # Warning threshold; None when no threshold is available.
    warning_temp: Optional[float]
    # Critical threshold; None when no threshold is available.
    critical_temp: Optional[float]
def print_readings():
    """Print all readings returned by ``get_readings()`` to stdout, grouped by sensor."""
    for sensor, readings in get_readings().items():
        print(f"*** Sensor {sensor}***\n")
        for r in readings:
            # One print per reading; sep="\n" yields the same four lines
            # followed by a blank line as four separate print calls would.
            print(
                f"Label: {r.label}",
                f"Current temp: {r.current_temp}",
                f"Warning temp: {r.warning_temp}",
                f"Critical temp: {r.critical_temp}\n",
                sep="\n",
            )
def get_readings() -> dict[str, list[TemperatureReading]]:
    """Collect current temperature readings with configured overrides applied.

    Sensors are read through ``sensors_temperatures()``. If a config has been
    set in ``cvars.config``, its ``checks.temp.sensors`` mapping is consulted,
    keyed by raw sensor name and then by reading label:

    * a sensor or reading with ``enabled`` false is left out;
    * a non-empty ``name`` / ``label`` replaces the raw one;
    * a ``warning_temp`` / ``critical_temp`` that is not ``None`` replaces the
      value reported by the sensor (``high`` / ``critical``).

    Returns:
        Mapping of sensor friendly name to the list of its enabled readings.
    """
    try:
        config: dict[str, TempSensorConfig] = cvars.config.get().checks.temp.sensors
    except LookupError:
        # No config was set in this context; fall back to "no overrides".
        config = {}

    sensor_readings: dict[str, list[TemperatureReading]] = {}
    for sensor, readings in sensors_temperatures().items():
        sensor_cfg = config.get(sensor)
        if sensor_cfg is not None and not sensor_cfg.enabled:
            continue

        if sensor_cfg is not None and sensor_cfg.name:
            sensor_friendly_name = sensor_cfg.name
        else:
            sensor_friendly_name = sensor
        reading_overrides = sensor_cfg.readings if sensor_cfg is not None else {}

        # setdefault rather than assignment: if two sensors are given the same
        # friendly name, their readings are merged instead of the later sensor
        # silently discarding the earlier one's readings.
        friendly_readings = sensor_readings.setdefault(sensor_friendly_name, [])

        for r in readings:
            config_r = reading_overrides.get(r.label)
            if config_r is None:
                # No override for this reading: use the sensor-reported values.
                friendly_readings.append(
                    TemperatureReading(
                        label=r.label, current_temp=r.current, warning_temp=r.high, critical_temp=r.critical
                    )
                )
                continue
            if not config_r.enabled:
                continue
            friendly_readings.append(
                TemperatureReading(
                    label=config_r.label if config_r.label else r.label,
                    current_temp=r.current,
                    # Compare against None explicitly so that an override of
                    # 0.0 is honoured instead of being treated as "unset".
                    warning_temp=config_r.warning_temp if config_r.warning_temp is not None else r.high,
                    critical_temp=config_r.critical_temp if config_r.critical_temp is not None else r.critical,
                )
            )
    return sensor_readings

View file

@ -3,6 +3,13 @@ from dataclasses import dataclass
from alt_utils import NestedDeserializableDataclass from alt_utils import NestedDeserializableDataclass
from .checks.temp import TempCheckConfig
@dataclass
class ChecksConfig(NestedDeserializableDataclass):
    """Container for the per-check configuration sections."""

    # Settings of the temperature check.
    temp: TempCheckConfig
@dataclass @dataclass
class TelegramConfig: class TelegramConfig:
@ -12,7 +19,8 @@ class TelegramConfig:
@dataclass @dataclass
class Config(NestedDeserializableDataclass): class Config(NestedDeserializableDataclass):
enabled_checker_sets: list[str] enabled_check_sets: list[str]
checks: ChecksConfig
telegram: TelegramConfig telegram: TelegramConfig

View file

@ -0,0 +1,24 @@
from dataclasses import dataclass
from typing import Optional
from alt_utils import NestedDeserializableDataclass
@dataclass
class TempReadingConfig:
    """Override settings for a single reading of a temperature sensor."""

    # Friendly label for the reading; None means no label override.
    label: Optional[str]
    # Whether the reading is enabled.
    enabled: bool
    # Warning threshold override; None means no override.
    warning_temp: Optional[float]
    # Critical threshold override; None means no override.
    critical_temp: Optional[float]
@dataclass
class TempSensorConfig(NestedDeserializableDataclass):
    """Override settings for one temperature sensor."""

    # Friendly name for the sensor; None means no name override.
    name: Optional[str]
    # Whether the sensor is enabled.
    enabled: bool
    # Per-reading overrides, keyed by reading label.
    readings: dict[str, TempReadingConfig]
@dataclass
class TempCheckConfig(NestedDeserializableDataclass):
    """Configuration section of the temperature check."""

    # Sensor overrides, keyed by sensor name.
    sensors: dict[str, TempSensorConfig]

View file

@ -2,7 +2,7 @@ from contextvars import ContextVar
from telethon import TelegramClient from telethon import TelegramClient
from .config import Config from ..config import Config
config: ContextVar[Config] = ContextVar("config") config: ContextVar[Config] = ContextVar("config")
tg_client: ContextVar[TelegramClient] = ContextVar("tg_client") tg_client: ContextVar[TelegramClient] = ContextVar("tg_client")

28
uv.lock generated
View file

@ -16,17 +16,32 @@ version = "0.1.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "alt-utils" }, { name = "alt-utils" },
{ name = "setuptools" }, { name = "psutil" },
{ name = "telethon" }, { name = "telethon" },
] ]
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "alt-utils", specifier = ">=0.0.6" }, { name = "alt-utils", specifier = ">=0.0.6" },
{ name = "setuptools", specifier = ">=80.0.0" }, { name = "psutil", specifier = ">=7.0.0" },
{ name = "telethon", specifier = ">=1.40.0" }, { name = "telethon", specifier = ">=1.40.0" },
] ]
[[package]]
name = "psutil"
version = "7.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051 },
{ url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535 },
{ url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004 },
{ url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986 },
{ url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544 },
{ url = "https://files.pythonhosted.org/packages/50/e6/eecf58810b9d12e6427369784efe814a1eec0f492084ce8eb8f4d89d6d61/psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99", size = 241053 },
{ url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885 },
]
[[package]] [[package]]
name = "pyaes" name = "pyaes"
version = "1.6.1" version = "1.6.1"
@ -54,15 +69,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696 }, { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696 },
] ]
[[package]]
name = "setuptools"
version = "80.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/44/80/97e25f0f1e4067677806084b7382a6ff9979f3d15119375c475c288db9d7/setuptools-80.0.0.tar.gz", hash = "sha256:c40a5b3729d58dd749c0f08f1a07d134fb8a0a3d7f87dc33e7c5e1f762138650", size = 1354221 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/23/63/5517029d6696ddf2bd378d46f63f479be001c31b462303170a1da57650cb/setuptools-80.0.0-py3-none-any.whl", hash = "sha256:a38f898dcd6e5380f4da4381a87ec90bd0a7eec23d204a5552e80ee3cab6bd27", size = 1240907 },
]
[[package]] [[package]]
name = "telethon" name = "telethon"
version = "1.40.0" version = "1.40.0"