ram check, configurable loglevel

This commit is contained in:
Alex Tau 2025-05-13 14:15:56 +03:00
parent 5095057a13
commit da85a566c4
10 changed files with 180 additions and 18 deletions

View file

@ -33,7 +33,7 @@ List of enabled check sets\. Each check set is a module which checks something a
*Type:* *Type:*
list of (one of “start”, “stop”, “temp”, “cpu”, “vulnix”) list of (one of “start”, “stop”, “cpu”, “ram”, “temp”, “vulnix”)
@ -68,7 +68,49 @@ null or floating point number
CPU load percentage for a warning alert is sent\. Null means never generate a CPU warning alert\. CPU load percentage for a warning alert to be sent\. Null means never generate a CPU warning alert\.
*Type:*
null or floating point number
*Default:*
` 80.0 `
*Declared by:*
- [modules/options\.nix](../modules/options.nix)
## services\.lego-monitoring\.checks\.ram\.criticalPercentage
RAM usage percentage for a critical alert to be sent\. Null means never generate a RAM critical alert\.
*Type:*
null or floating point number
*Default:*
` 90.0 `
*Declared by:*
- [modules/options\.nix](../modules/options.nix)
## services\.lego-monitoring\.checks\.ram\.warningPercentage
RAM usage percentage for a warning alert to be sent\. Null means never generate a RAM warning alert\.
@ -377,6 +419,27 @@ null or string
## services\.lego-monitoring\.logLevel
Level of logging\. INFO generates a log message with every check\.
*Type:*
one of “CRITICAL”, “ERROR”, “WARNING”, “INFO”, “DEBUG”
*Default:*
` "INFO" `
*Declared by:*
- [modules/options\.nix](../modules/options.nix)
## services\.lego-monitoring\.telegram\.credsSecretPath ## services\.lego-monitoring\.telegram\.credsSecretPath

View file

@ -30,6 +30,7 @@ package:
serviceConfigFile = json.generate "config.json" { serviceConfigFile = json.generate "config.json" {
enabled_check_sets = cfg.enabledCheckSets; enabled_check_sets = cfg.enabledCheckSets;
log_level = cfg.logLevel;
telegram = with cfg.telegram; { telegram = with cfg.telegram; {
creds_secret_path = credsSecretPath; creds_secret_path = credsSecretPath;
room_id = roomId; room_id = roomId;
@ -54,6 +55,11 @@ package:
warning_percentage = warningPercentage; warning_percentage = warningPercentage;
critical_percentage = criticalPercentage; critical_percentage = criticalPercentage;
}; };
# RAM check thresholds: copies the camelCase module options under
# cfg.checks.ram into the snake_case keys of the generated config.
ram = with cfg.checks.ram; {
warning_percentage = warningPercentage;
critical_percentage = criticalPercentage;
};
}; };
}; };
in lib.mkIf cfg.enable { in lib.mkIf cfg.enable {

View file

@ -11,12 +11,27 @@ in
options.services.lego-monitoring = { options.services.lego-monitoring = {
enable = lib.mkEnableOption "lego-monitoring service"; enable = lib.mkEnableOption "lego-monitoring service";
# Logging verbosity of the service. Only the level names listed in the enum
# below are accepted; the default is "INFO".
logLevel = lib.mkOption {
type = lib.types.enum [
"CRITICAL"
"ERROR"
"WARNING"
"INFO"
"DEBUG"
];
default = "INFO";
description = "Level of logging. INFO generates a log message with every check.";
};
enabledCheckSets = lib.mkOption { enabledCheckSets = lib.mkOption {
type = lib.types.listOf (lib.types.enum [ type = lib.types.listOf (lib.types.enum [
"start" "start"
"stop" "stop"
"temp"
"cpu" "cpu"
"ram"
"temp"
"vulnix" "vulnix"
]); ]);
default = [ ]; default = [ ];
@ -82,7 +97,7 @@ in
warningPercentage = lib.mkOption { warningPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float; type = lib.types.nullOr lib.types.float;
default = 80.0; default = 80.0;
description = "CPU load percentage for a warning alert is sent. Null means never generate a CPU warning alert."; description = "CPU load percentage for a warning alert to be sent. Null means never generate a CPU warning alert.";
}; };
criticalPercentage = lib.mkOption { criticalPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float; type = lib.types.nullOr lib.types.float;
@ -90,6 +105,19 @@ in
description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert."; description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert.";
}; };
}; };
# Options for the RAM check. Both thresholds are nullable floats expressed in
# percent; per their descriptions, null disables the corresponding alert.
ram = {
# Warning threshold, 80.0 by default.
warningPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float;
default = 80.0;
description = "RAM usage percentage for a warning alert to be sent. Null means never generate a RAM warning alert.";
};
# Critical threshold, 90.0 by default.
criticalPercentage = lib.mkOption {
type = lib.types.nullOr lib.types.float;
default = 90.0;
description = "RAM usage percentage for a critical alert to be sent. Null means never generate a RAM critical alert.";
};
};
}; };
}; };
} }

View file

@ -7,6 +7,7 @@ import signal
from . import checks from . import checks
from .alerting import alerts from .alerting import alerts
from .checks.temp.sensors import print_readings from .checks.temp.sensors import print_readings
from .config import enums as config_enums
from .config import load_config from .config import load_config
from .core import cvars from .core import cvars
from .core.checkers import interval_checker from .core.checkers import interval_checker
@ -20,8 +21,6 @@ def stop_gracefully(signum, frame):
def main() -> None: def main() -> None:
logging.basicConfig(level=logging.INFO)
asyncio.run(async_main()) asyncio.run(async_main())
@ -46,17 +45,20 @@ async def async_main():
if not args.config: if not args.config:
raise RuntimeError("--config must be specified in standard operating mode") raise RuntimeError("--config must be specified in standard operating mode")
logging.basicConfig(level=config.log_level)
tg_client = await alerts.get_client() tg_client = await alerts.get_client()
cvars.tg_client.set(tg_client) cvars.tg_client.set(tg_client)
check_sets = config_enums.CheckSet
checker_sets = { checker_sets = {
"start": [ check_sets.START: [alerts.send_start_alert()],
alerts.send_start_alert(), check_sets.STOP: [], # this is checked later
], check_sets.CPU: [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
"stop": [], # this is checked later check_sets.RAM: [interval_checker(checks.ram_check, datetime.timedelta(minutes=1))],
"temp": [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))], check_sets.TEMP: [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
"vulnix": [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))], check_sets.VULNIX: [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
"cpu": [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
} }
checkers = [] checkers = []

View file

@ -5,10 +5,10 @@ class AlertType(StrEnum):
BOOT = "BOOT" BOOT = "BOOT"
CPU = "CPU" CPU = "CPU"
ERROR = "ERROR" ERROR = "ERROR"
RAM = "RAM"
TEMP = "TEMP" TEMP = "TEMP"
TEST = "TEST" TEST = "TEST"
VULN = "VULN" VULN = "VULN"
# RAM = "RAM"
# LOGIN = "LOGIN" # LOGIN = "LOGIN"
# SMART = "SMART" # TODO # SMART = "SMART" # TODO
# RAID = "RAID" # RAID = "RAID"

View file

@ -1,3 +1,4 @@
from .cpu import cpu_check from .cpu import cpu_check
from .ram import ram_check
from .temp import temp_check from .temp import temp_check
from .vulnix import vulnix_check from .vulnix import vulnix_check

View file

@ -0,0 +1,30 @@
from psutil import virtual_memory
from lego_monitoring.alerting import alerts
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
IS_TESTING = False
def ram_check() -> list[alerts.Alert]:
    """Compare current RAM usage with the configured alert thresholds.

    The usage figure is the ``percent`` field reported by
    ``psutil.virtual_memory()``. Thresholds come from the ``checks.ram``
    section of the config held in ``cvars.config``.

    Returns:
        A one-element list holding a CRITICAL alert when usage is above the
        critical threshold; otherwise a one-element list holding a WARNING
        alert when usage is above the warning threshold; otherwise an empty
        list. At most one alert is returned per call.
    """
    percentage = virtual_memory().percent
    config = cvars.config.get().checks.ram

    # Ordered from most to least severe so the first match wins.
    thresholds = (
        (config.critical_percentage, Severity.CRITICAL),
        (config.warning_percentage, Severity.WARNING),
    )
    for threshold, severity in thresholds:
        # Only None disables a threshold. An explicit identity check keeps a
        # configured value of 0 active instead of treating it as "disabled".
        if threshold is None:
            continue
        # IS_TESTING forces an alert for every enabled threshold regardless
        # of the measured usage.
        if IS_TESTING or percentage > threshold:
            return [
                alerts.Alert(
                    alert_type=AlertType.RAM,
                    message=f"RAM usage: {percentage:.2f}% > {threshold:.2f}%",
                    severity=severity,
                )
            ]
    return []

View file

@ -4,16 +4,19 @@ from typing import Optional
from alt_utils import NestedDeserializableDataclass from alt_utils import NestedDeserializableDataclass
from . import enums
from .checks.cpu import CpuCheckConfig from .checks.cpu import CpuCheckConfig
from .checks.ram import RamCheckConfig
from .checks.temp import TempCheckConfig from .checks.temp import TempCheckConfig
from .checks.vulnix import VulnixCheckConfig from .checks.vulnix import VulnixCheckConfig
@dataclass @dataclass
class ChecksConfig(NestedDeserializableDataclass): class ChecksConfig(NestedDeserializableDataclass):
cpu: Optional[CpuCheckConfig] = None cpu: CpuCheckConfig = field(default_factory=CpuCheckConfig)
temp: Optional[TempCheckConfig] = None ram: RamCheckConfig = field(default_factory=RamCheckConfig)
vulnix: Optional[VulnixCheckConfig] = None temp: TempCheckConfig = field(default_factory=TempCheckConfig)
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
@dataclass @dataclass
@ -26,7 +29,8 @@ class TelegramConfig:
class Config(NestedDeserializableDataclass): class Config(NestedDeserializableDataclass):
checks: ChecksConfig checks: ChecksConfig
telegram: TelegramConfig telegram: TelegramConfig
enabled_check_sets: list[str] = field(default_factory=list) enabled_check_sets: list[enums.CheckSet] = field(default_factory=list)
log_level: enums.LogLevelName = enums.LogLevelName.INFO
def load_config(filepath: str) -> Config: def load_config(filepath: str) -> Config:

View file

@ -0,0 +1,8 @@
from dataclasses import dataclass
from typing import Optional
@dataclass
class RamCheckConfig:
    """Thresholds for the RAM usage check, expressed in percent.

    Setting a threshold to ``None`` turns the corresponding alert off.
    Defaults are float literals so they match the annotated type and the
    ``80.0`` / ``90.0`` defaults of the Nix module options.
    """

    # RAM usage above this percentage produces a warning alert.
    warning_percentage: Optional[float] = 80.0
    # RAM usage above this percentage produces a critical alert.
    critical_percentage: Optional[float] = 90.0

View file

@ -0,0 +1,20 @@
from enum import StrEnum
class CheckSet(StrEnum):
START = "start"
STOP = "stop"
CPU = "cpu"
RAM = "ram"
TEMP = "temp"
VULNIX = "vulnix"
class LogLevelName(StrEnum):
CRITICAL = "CRITICAL"
ERROR = "ERROR"
WARNING = "WARNING"
INFO = "INFO"
DEBUG = "DEBUG"