ram check, configurable loglevel

This commit is contained in:
Alex Tau 2025-05-13 14:15:56 +03:00
parent 5095057a13
commit da85a566c4
10 changed files with 180 additions and 18 deletions

View file

@ -33,7 +33,7 @@ List of enabled check sets\. Each check set is a module which checks something a
*Type:*
list of (one of “start”, “stop”, “temp”, “cpu”, “vulnix”)
list of (one of “start”, “stop”, “cpu”, “ram”, “temp”, “vulnix”)
@ -68,7 +68,49 @@ null or floating point number
CPU load percentage for a warning alert is sent\. Null means never generate a CPU warning alert\.
CPU load percentage for a warning alert to be sent\. Null means never generate a CPU warning alert\.
*Type:*
null or floating point number
*Default:*
` 80.0 `
*Declared by:*
- [modules/options\.nix](../modules/options.nix)
## services\.lego-monitoring\.checks\.ram\.criticalPercentage
RAM usage percentage for a critical alert to be sent\. Null means never generate a RAM critical alert\.
*Type:*
null or floating point number
*Default:*
` 90.0 `
*Declared by:*
- [modules/options\.nix](../modules/options.nix)
## services\.lego-monitoring\.checks\.ram\.warningPercentage
RAM usage percentage for a warning alert to be sent\. Null means never generate a RAM warning alert\.
@ -377,6 +419,27 @@ null or string
## services\.lego-monitoring\.logLevel
Level of logging\. INFO generates a log message with every check\.
*Type:*
one of “CRITICAL”, “ERROR”, “WARNING”, “INFO”, “DEBUG”
*Default:*
` "INFO" `
*Declared by:*
- [modules/options\.nix](../modules/options.nix)
## services\.lego-monitoring\.telegram\.credsSecretPath

View file

@ -30,6 +30,7 @@ package:
serviceConfigFile = json.generate "config.json" {
enabled_check_sets = cfg.enabledCheckSets;
log_level = cfg.logLevel;
telegram = with cfg.telegram; {
creds_secret_path = credsSecretPath;
room_id = roomId;
@ -54,6 +55,11 @@ package:
warning_percentage = warningPercentage;
critical_percentage = criticalPercentage;
};
ram = with cfg.checks.ram; {
warning_percentage = warningPercentage;
critical_percentage = criticalPercentage;
};
};
};
in lib.mkIf cfg.enable {

View file

@ -11,12 +11,27 @@ in
options.services.lego-monitoring = {
enable = lib.mkEnableOption "lego-monitoring service";
logLevel = lib.mkOption {
type = lib.types.enum [
"CRITICAL"
"ERROR"
"WARNING"
"INFO"
"DEBUG"
];
default = "INFO";
description = "Level of logging. INFO generates a log message with every check.";
};
enabledCheckSets = lib.mkOption {
type = lib.types.listOf (lib.types.enum [
"start"
"stop"
"temp"
"cpu"
"ram"
"temp"
"vulnix"
]);
default = [ ];
@ -82,7 +97,7 @@ in
warningPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float;
default = 80.0;
description = "CPU load percentage for a warning alert is sent. Null means never generate a CPU warning alert.";
description = "CPU load percentage for a warning alert to be sent. Null means never generate a CPU warning alert.";
};
criticalPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float;
@ -90,6 +105,19 @@ in
description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert.";
};
};
ram = {
warningPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float;
default = 80.0;
description = "RAM usage percentage for a warning alert to be sent. Null means never generate a RAM warning alert.";
};
criticalPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float;
default = 90.0;
description = "RAM usage percentage for a critical alert to be sent. Null means never generate a RAM critical alert.";
};
};
};
};
}

View file

@ -7,6 +7,7 @@ import signal
from . import checks
from .alerting import alerts
from .checks.temp.sensors import print_readings
from .config import enums as config_enums
from .config import load_config
from .core import cvars
from .core.checkers import interval_checker
@ -20,8 +21,6 @@ def stop_gracefully(signum, frame):
def main() -> None:
logging.basicConfig(level=logging.INFO)
asyncio.run(async_main())
@ -46,17 +45,20 @@ async def async_main():
if not args.config:
raise RuntimeError("--config must be specified in standard operating mode")
logging.basicConfig(level=config.log_level)
tg_client = await alerts.get_client()
cvars.tg_client.set(tg_client)
check_sets = config_enums.CheckSet
checker_sets = {
"start": [
alerts.send_start_alert(),
],
"stop": [], # this is checked later
"temp": [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
"vulnix": [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
"cpu": [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
check_sets.START: [alerts.send_start_alert()],
check_sets.STOP: [], # this is checked later
check_sets.CPU: [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
check_sets.RAM: [interval_checker(checks.ram_check, datetime.timedelta(minutes=1))],
check_sets.TEMP: [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
check_sets.VULNIX: [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
}
checkers = []

View file

@ -5,10 +5,10 @@ class AlertType(StrEnum):
BOOT = "BOOT"
CPU = "CPU"
ERROR = "ERROR"
RAM = "RAM"
TEMP = "TEMP"
TEST = "TEST"
VULN = "VULN"
# RAM = "RAM"
# LOGIN = "LOGIN"
# SMART = "SMART" # TODO
# RAID = "RAID"

View file

@ -1,3 +1,4 @@
from .cpu import cpu_check
from .ram import ram_check
from .temp import temp_check
from .vulnix import vulnix_check

View file

@ -0,0 +1,30 @@
from psutil import virtual_memory
from lego_monitoring.alerting import alerts
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
# Debug switch read by ram_check: when True, the usage-vs-threshold comparison
# is bypassed, so an alert is produced for the first configured threshold
# regardless of the actual RAM usage.
IS_TESTING = False
def ram_check() -> list[alerts.Alert]:
    """Compare current RAM usage against the configured thresholds.

    Reads the usage percentage from ``psutil.virtual_memory()`` and the
    thresholds from the ``checks.ram`` section of the active config.

    Returns:
        A list with a single alert for the most severe threshold that is
        exceeded (critical takes precedence over warning), or an empty list
        when no threshold is exceeded. A threshold set to ``None`` is skipped.
    """
    percentage = virtual_memory().percent
    config = cvars.config.get().checks.ram

    # Ordered from most to least severe so that the first match wins.
    thresholds = (
        (config.critical_percentage, Severity.CRITICAL),
        (config.warning_percentage, Severity.WARNING),
    )

    for threshold, severity in thresholds:
        # Explicit None check: a threshold of 0.0 is a valid setting and must
        # not be mistaken for "disabled", which a truthiness test would do.
        if threshold is None:
            continue
        if IS_TESTING or percentage > threshold:
            return [
                alerts.Alert(
                    alert_type=AlertType.RAM,
                    message=f"RAM usage: {percentage:.2f}% > {threshold:.2f}%",
                    severity=severity,
                )
            ]

    return []

View file

@ -4,16 +4,19 @@ from typing import Optional
from alt_utils import NestedDeserializableDataclass
from . import enums
from .checks.cpu import CpuCheckConfig
from .checks.ram import RamCheckConfig
from .checks.temp import TempCheckConfig
from .checks.vulnix import VulnixCheckConfig
@dataclass
class ChecksConfig(NestedDeserializableDataclass):
cpu: Optional[CpuCheckConfig] = None
temp: Optional[TempCheckConfig] = None
vulnix: Optional[VulnixCheckConfig] = None
cpu: CpuCheckConfig = field(default_factory=CpuCheckConfig)
ram: RamCheckConfig = field(default_factory=RamCheckConfig)
temp: TempCheckConfig = field(default_factory=TempCheckConfig)
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
@dataclass
@ -26,7 +29,8 @@ class TelegramConfig:
class Config(NestedDeserializableDataclass):
checks: ChecksConfig
telegram: TelegramConfig
enabled_check_sets: list[str] = field(default_factory=list)
enabled_check_sets: list[enums.CheckSet] = field(default_factory=list)
log_level: enums.LogLevelName = enums.LogLevelName.INFO
def load_config(filepath: str) -> Config:

View file

@ -0,0 +1,8 @@
from dataclasses import dataclass
from typing import Optional
@dataclass
class RamCheckConfig:
    """Thresholds for the RAM usage check.

    Both values are RAM usage percentages. ``None`` means that the alert of
    the corresponding severity is never generated.
    """

    # Float literals so the defaults match the declared float type.
    warning_percentage: Optional[float] = 80.0
    critical_percentage: Optional[float] = 90.0

View file

@ -0,0 +1,20 @@
from enum import StrEnum
class CheckSet(StrEnum):
START = "start"
STOP = "stop"
CPU = "cpu"
RAM = "ram"
TEMP = "temp"
VULNIX = "vulnix"
class LogLevelName(StrEnum):
CRITICAL = "CRITICAL"
ERROR = "ERROR"
WARNING = "WARNING"
INFO = "INFO"
DEBUG = "DEBUG"