From da85a566c4684a9ecb5bb02175285f9eeda8d5ec Mon Sep 17 00:00:00 2001 From: Alex Tau Date: Tue, 13 May 2025 14:15:56 +0300 Subject: [PATCH] ram check, configurable loglevel --- docs/nixos-options.md | 67 +++++++++++++++++++++++- modules/default.nix | 6 +++ modules/options.nix | 32 ++++++++++- src/lego_monitoring/__init__.py | 20 +++---- src/lego_monitoring/alerting/enum.py | 2 +- src/lego_monitoring/checks/__init__.py | 1 + src/lego_monitoring/checks/ram.py | 30 +++++++++++ src/lego_monitoring/config/__init__.py | 12 +++-- src/lego_monitoring/config/checks/ram.py | 8 +++ src/lego_monitoring/config/enums.py | 20 +++++++ 10 files changed, 180 insertions(+), 18 deletions(-) create mode 100644 src/lego_monitoring/checks/ram.py create mode 100644 src/lego_monitoring/config/checks/ram.py create mode 100644 src/lego_monitoring/config/enums.py diff --git a/docs/nixos-options.md b/docs/nixos-options.md index 1839ef9..596cc5e 100644 --- a/docs/nixos-options.md +++ b/docs/nixos-options.md @@ -33,7 +33,7 @@ List of enabled check sets\. Each check set is a module which checks something a *Type:* -list of (one of “start”, “stop”, “temp”, “cpu”, “vulnix”) +list of (one of “start”, “stop”, “cpu”, “ram”, “temp”, “vulnix”) @@ -68,7 +68,49 @@ null or floating point number -CPU load percentage for a warning alert is sent\. Null means never generate a CPU warning alert\. +CPU load percentage for a warning alert to be sent\. Null means never generate a CPU warning alert\. + + + +*Type:* +null or floating point number + + + +*Default:* +` 80.0 ` + +*Declared by:* + - [modules/options\.nix](../modules/options.nix) + + + +## services\.lego-monitoring\.checks\.ram\.criticalPercentage + + + +RAM usage percentage for a critical alert to be sent\. Null means never generate a RAM critical alert\. 
+ + + +*Type:* +null or floating point number + + + +*Default:* +` 90.0 ` + +*Declared by:* + - [modules/options\.nix](../modules/options.nix) + + + +## services\.lego-monitoring\.checks\.ram\.warningPercentage + + + +RAM usage percentage for a warning alert to be sent\. Null means never generate a RAM warning alert\. @@ -377,6 +419,27 @@ null or string +## services\.lego-monitoring\.logLevel + + + +Level of logging\. INFO generates a log message with every check\. + + + +*Type:* +one of “CRITICAL”, “ERROR”, “WARNING”, “INFO”, “DEBUG” + + + +*Default:* +` "INFO" ` + +*Declared by:* + - [modules/options\.nix](../modules/options.nix) + + + ## services\.lego-monitoring\.telegram\.credsSecretPath diff --git a/modules/default.nix b/modules/default.nix index 448fe11..18153ab 100644 --- a/modules/default.nix +++ b/modules/default.nix @@ -30,6 +30,7 @@ package: serviceConfigFile = json.generate "config.json" { enabled_check_sets = cfg.enabledCheckSets; + log_level = cfg.logLevel; telegram = with cfg.telegram; { creds_secret_path = credsSecretPath; room_id = roomId; @@ -54,6 +55,11 @@ package: warning_percentage = warningPercentage; critical_percentage = criticalPercentage; }; + + ram = with cfg.checks.ram; { + warning_percentage = warningPercentage; + critical_percentage = criticalPercentage; + }; }; }; in lib.mkIf cfg.enable { diff --git a/modules/options.nix b/modules/options.nix index 27e9f82..815a640 100644 --- a/modules/options.nix +++ b/modules/options.nix @@ -11,12 +11,27 @@ in options.services.lego-monitoring = { enable = lib.mkEnableOption "lego-monitoring service"; + logLevel = lib.mkOption { + type = lib.types.enum [ + "CRITICAL" + "ERROR" + "WARNING" + "INFO" + "DEBUG" + ]; + default = "INFO"; + description = "Level of logging. 
INFO generates a log message with every check."; + }; + enabledCheckSets = lib.mkOption { type = lib.types.listOf (lib.types.enum [ "start" "stop" - "temp" + "cpu" + "ram" + "temp" + "vulnix" ]); default = [ ]; @@ -82,7 +97,7 @@ in warningPercentage = lib.mkOption { type = lib.types.nullOr lib.types.float; default = 80.0; - description = "CPU load percentage for a warning alert is sent. Null means never generate a CPU warning alert."; + description = "CPU load percentage for a warning alert to be sent. Null means never generate a CPU warning alert."; }; criticalPercentage = lib.mkOption { type = lib.types.nullOr lib.types.float; @@ -90,6 +105,19 @@ in description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert."; }; }; + + ram = { + warningPercentage = lib.mkOption { + type = lib.types.nullOr lib.types.float; + default = 80.0; + description = "RAM usage percentage for a warning alert to be sent. Null means never generate a RAM warning alert."; + }; + criticalPercentage = lib.mkOption { + type = lib.types.nullOr lib.types.float; + default = 90.0; + description = "RAM usage percentage for a critical alert to be sent. Null means never generate a RAM critical alert."; + }; + }; }; }; } diff --git a/src/lego_monitoring/__init__.py b/src/lego_monitoring/__init__.py index f1299cc..987c225 100644 --- a/src/lego_monitoring/__init__.py +++ b/src/lego_monitoring/__init__.py @@ -7,6 +7,7 @@ import signal from . 
import checks from .alerting import alerts from .checks.temp.sensors import print_readings +from .config import enums as config_enums from .config import load_config from .core import cvars from .core.checkers import interval_checker @@ -20,8 +21,6 @@ def stop_gracefully(signum, frame): def main() -> None: - logging.basicConfig(level=logging.INFO) - asyncio.run(async_main()) @@ -46,17 +45,20 @@ async def async_main(): if not args.config: raise RuntimeError("--config must be specified in standard operating mode") + logging.basicConfig(level=config.log_level) + tg_client = await alerts.get_client() cvars.tg_client.set(tg_client) + check_sets = config_enums.CheckSet + checker_sets = { - "start": [ - alerts.send_start_alert(), - ], - "stop": [], # this is checked later - "temp": [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))], - "vulnix": [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))], - "cpu": [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))], + check_sets.START: [alerts.send_start_alert()], + check_sets.STOP: [], # this is checked later + check_sets.CPU: [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))], + check_sets.RAM: [interval_checker(checks.ram_check, datetime.timedelta(minutes=1))], + check_sets.TEMP: [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))], + check_sets.VULNIX: [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))], } checkers = [] diff --git a/src/lego_monitoring/alerting/enum.py b/src/lego_monitoring/alerting/enum.py index 0b92bb1..d3b6a18 100644 --- a/src/lego_monitoring/alerting/enum.py +++ b/src/lego_monitoring/alerting/enum.py @@ -5,10 +5,10 @@ class AlertType(StrEnum): BOOT = "BOOT" CPU = "CPU" ERROR = "ERROR" + RAM = "RAM" TEMP = "TEMP" TEST = "TEST" VULN = "VULN" - # RAM = "RAM" # LOGIN = "LOGIN" # SMART = "SMART" # TODO # RAID = "RAID" diff --git a/src/lego_monitoring/checks/__init__.py b/src/lego_monitoring/checks/__init__.py 
index 8818d25..ff19608 100644
--- a/src/lego_monitoring/checks/__init__.py
+++ b/src/lego_monitoring/checks/__init__.py
@@ -1,3 +1,4 @@
 from .cpu import cpu_check
+from .ram import ram_check
 from .temp import temp_check
 from .vulnix import vulnix_check
diff --git a/src/lego_monitoring/checks/ram.py b/src/lego_monitoring/checks/ram.py
new file mode 100644
index 0000000..334465f
--- /dev/null
+++ b/src/lego_monitoring/checks/ram.py
@@ -0,0 +1,36 @@
+from psutil import virtual_memory
+
+from lego_monitoring.alerting import alerts
+from lego_monitoring.alerting.enum import AlertType, Severity
+from lego_monitoring.core import cvars
+
+# When True, every enabled threshold counts as exceeded regardless of actual usage.
+IS_TESTING = False
+
+
+def ram_check() -> list[alerts.Alert]:
+    """Compare current RAM usage against the configured thresholds.
+
+    Returns a single-element list holding a CRITICAL or WARNING alert when the
+    usage percentage reported by psutil's virtual_memory() exceeds the matching
+    threshold, and an empty list otherwise. The critical threshold is checked
+    first. A threshold of None disables that severity; 0 is a valid threshold.
+    """
+    percentage = virtual_memory().percent
+    config = cvars.config.get().checks.ram
+    # Ordered from most to least severe so only the highest applicable alert is returned.
+    thresholds = (
+        (config.critical_percentage, Severity.CRITICAL),
+        (config.warning_percentage, Severity.WARNING),
+    )
+    for threshold, severity in thresholds:
+        # Compare against None explicitly: a threshold of 0 must not be mistaken for "disabled".
+        if threshold is not None and (IS_TESTING or percentage > threshold):
+            return [
+                alerts.Alert(
+                    alert_type=AlertType.RAM,
+                    message=f"RAM usage: {percentage:.2f}% > {threshold:.2f}%",
+                    severity=severity,
+                )
+            ]
+    return []
diff --git a/src/lego_monitoring/config/__init__.py b/src/lego_monitoring/config/__init__.py
index e08fdd9..3d8917e 100644
--- a/src/lego_monitoring/config/__init__.py
+++ b/src/lego_monitoring/config/__init__.py
@@ -4,16 +4,19 @@ from typing import Optional
 
 from alt_utils import NestedDeserializableDataclass
 
+from . 
import enums
 from .checks.cpu import CpuCheckConfig
+from .checks.ram import RamCheckConfig
 from .checks.temp import TempCheckConfig
 from .checks.vulnix import VulnixCheckConfig
 
 
 @dataclass
 class ChecksConfig(NestedDeserializableDataclass):
-    cpu: Optional[CpuCheckConfig] = None
-    temp: Optional[TempCheckConfig] = None
-    vulnix: Optional[VulnixCheckConfig] = None
+    cpu: CpuCheckConfig = field(default_factory=CpuCheckConfig)
+    ram: RamCheckConfig = field(default_factory=RamCheckConfig)
+    temp: TempCheckConfig = field(default_factory=TempCheckConfig)
+    vulnix: Optional[VulnixCheckConfig] = None  # vulnix check WILL raise if this config section is None
 
 
 @dataclass
@@ -26,7 +29,8 @@ class TelegramConfig:
 class Config(NestedDeserializableDataclass):
     checks: ChecksConfig
     telegram: TelegramConfig
-    enabled_check_sets: list[str] = field(default_factory=list)
+    enabled_check_sets: list[enums.CheckSet] = field(default_factory=list)  # enabled check sets; none by default
+    log_level: enums.LogLevelName = enums.LogLevelName.INFO  # logging level name; INFO when not configured
 
 
 def load_config(filepath: str) -> Config:
diff --git a/src/lego_monitoring/config/checks/ram.py b/src/lego_monitoring/config/checks/ram.py
new file mode 100644
index 0000000..2f46d8d
--- /dev/null
+++ b/src/lego_monitoring/config/checks/ram.py
@@ -0,0 +1,8 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class RamCheckConfig:
+    warning_percentage: Optional[float] = 80  # RAM usage % that triggers a WARNING alert when exceeded; None = never
+    critical_percentage: Optional[float] = 90  # RAM usage % that triggers a CRITICAL alert when exceeded; None = never
diff --git a/src/lego_monitoring/config/enums.py b/src/lego_monitoring/config/enums.py
new file mode 100644
index 0000000..54954cf
--- /dev/null
+++ b/src/lego_monitoring/config/enums.py
@@ -0,0 +1,20 @@
+from enum import StrEnum
+
+
+class CheckSet(StrEnum):  # names usable in Config.enabled_check_sets
+    START = "start"
+    STOP = "stop"
+
+    CPU = "cpu"
+    RAM = "ram"
+    TEMP = "temp"
+
+    VULNIX = "vulnix"
+
+
+class LogLevelName(StrEnum):  # names usable as Config.log_level
+    CRITICAL = "CRITICAL"
+    ERROR = "ERROR"
+    WARNING = "WARNING"
+    INFO = "INFO"
+    DEBUG = "DEBUG"