mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-09 20:31:10 +00:00
ram check, configurable loglevel
This commit is contained in:
parent
5095057a13
commit
da85a566c4
10 changed files with 180 additions and 18 deletions
|
|
@ -33,7 +33,7 @@ List of enabled check sets\. Each check set is a module which checks something a
|
||||||
|
|
||||||
|
|
||||||
*Type:*
|
*Type:*
|
||||||
list of (one of “start”, “stop”, “temp”, “cpu”, “vulnix”)
|
list of (one of “start”, “stop”, “cpu”, “ram”, “temp”, “vulnix”)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -68,7 +68,49 @@ null or floating point number
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CPU load percentage for a warning alert is sent\. Null means never generate a CPU warning alert\.
|
CPU load percentage for a warning alert to be sent\. Null means never generate a CPU warning alert\.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
*Type:*
|
||||||
|
null or floating point number
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
*Default:*
|
||||||
|
` 80.0 `
|
||||||
|
|
||||||
|
*Declared by:*
|
||||||
|
- [modules/options\.nix](../modules/options.nix)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## services\.lego-monitoring\.checks\.ram\.criticalPercentage
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
RAM usage percentage for a critical alert to be sent\. Null means never generate a RAM critical alert\.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
*Type:*
|
||||||
|
null or floating point number
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
*Default:*
|
||||||
|
` 90.0 `
|
||||||
|
|
||||||
|
*Declared by:*
|
||||||
|
- [modules/options\.nix](../modules/options.nix)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## services\.lego-monitoring\.checks\.ram\.warningPercentage
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
RAM usage percentage for a warning alert to be sent\. Null means never generate a RAM warning alert\.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -377,6 +419,27 @@ null or string
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## services\.lego-monitoring\.logLevel
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Level of logging\. INFO generates a log message with every check\.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
*Type:*
|
||||||
|
one of “CRITICAL”, “ERROR”, “WARNING”, “INFO”, “DEBUG”
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
*Default:*
|
||||||
|
` "INFO" `
|
||||||
|
|
||||||
|
*Declared by:*
|
||||||
|
- [modules/options\.nix](../modules/options.nix)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## services\.lego-monitoring\.telegram\.credsSecretPath
|
## services\.lego-monitoring\.telegram\.credsSecretPath
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ package:
|
||||||
|
|
||||||
serviceConfigFile = json.generate "config.json" {
|
serviceConfigFile = json.generate "config.json" {
|
||||||
enabled_check_sets = cfg.enabledCheckSets;
|
enabled_check_sets = cfg.enabledCheckSets;
|
||||||
|
log_level = cfg.logLevel;
|
||||||
telegram = with cfg.telegram; {
|
telegram = with cfg.telegram; {
|
||||||
creds_secret_path = credsSecretPath;
|
creds_secret_path = credsSecretPath;
|
||||||
room_id = roomId;
|
room_id = roomId;
|
||||||
|
|
@ -54,6 +55,11 @@ package:
|
||||||
warning_percentage = warningPercentage;
|
warning_percentage = warningPercentage;
|
||||||
critical_percentage = criticalPercentage;
|
critical_percentage = criticalPercentage;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ram = with cfg.checks.ram; {
|
||||||
|
warning_percentage = warningPercentage;
|
||||||
|
critical_percentage = criticalPercentage;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
in lib.mkIf cfg.enable {
|
in lib.mkIf cfg.enable {
|
||||||
|
|
|
||||||
|
|
@ -11,12 +11,27 @@ in
|
||||||
options.services.lego-monitoring = {
|
options.services.lego-monitoring = {
|
||||||
enable = lib.mkEnableOption "lego-monitoring service";
|
enable = lib.mkEnableOption "lego-monitoring service";
|
||||||
|
|
||||||
|
logLevel = lib.mkOption {
|
||||||
|
type = lib.types.enum [
|
||||||
|
"CRITICAL"
|
||||||
|
"ERROR"
|
||||||
|
"WARNING"
|
||||||
|
"INFO"
|
||||||
|
"DEBUG"
|
||||||
|
];
|
||||||
|
default = "INFO";
|
||||||
|
description = "Level of logging. INFO generates a log message with every check.";
|
||||||
|
};
|
||||||
|
|
||||||
enabledCheckSets = lib.mkOption {
|
enabledCheckSets = lib.mkOption {
|
||||||
type = lib.types.listOf (lib.types.enum [
|
type = lib.types.listOf (lib.types.enum [
|
||||||
"start"
|
"start"
|
||||||
"stop"
|
"stop"
|
||||||
"temp"
|
|
||||||
"cpu"
|
"cpu"
|
||||||
|
"ram"
|
||||||
|
"temp"
|
||||||
|
|
||||||
"vulnix"
|
"vulnix"
|
||||||
]);
|
]);
|
||||||
default = [ ];
|
default = [ ];
|
||||||
|
|
@ -82,7 +97,7 @@ in
|
||||||
warningPercentage = lib.mkOption {
|
warningPercentage = lib.mkOption {
|
||||||
type = lib.types.nullOr lib.types.float;
|
type = lib.types.nullOr lib.types.float;
|
||||||
default = 80.0;
|
default = 80.0;
|
||||||
description = "CPU load percentage for a warning alert is sent. Null means never generate a CPU warning alert.";
|
description = "CPU load percentage for a warning alert to be sent. Null means never generate a CPU warning alert.";
|
||||||
};
|
};
|
||||||
criticalPercentage = lib.mkOption {
|
criticalPercentage = lib.mkOption {
|
||||||
type = lib.types.nullOr lib.types.float;
|
type = lib.types.nullOr lib.types.float;
|
||||||
|
|
@ -90,6 +105,19 @@ in
|
||||||
description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert.";
|
description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert.";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ram = {
|
||||||
|
warningPercentage = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.float;
|
||||||
|
default = 80.0;
|
||||||
|
description = "RAM usage percentage for a warning alert to be sent. Null means never generate a RAM warning alert.";
|
||||||
|
};
|
||||||
|
criticalPercentage = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.float;
|
||||||
|
default = 90.0;
|
||||||
|
description = "RAM usage percentage for a critical alert to be sent. Null means never generate a RAM critical alert.";
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import signal
|
||||||
from . import checks
|
from . import checks
|
||||||
from .alerting import alerts
|
from .alerting import alerts
|
||||||
from .checks.temp.sensors import print_readings
|
from .checks.temp.sensors import print_readings
|
||||||
|
from .config import enums as config_enums
|
||||||
from .config import load_config
|
from .config import load_config
|
||||||
from .core import cvars
|
from .core import cvars
|
||||||
from .core.checkers import interval_checker
|
from .core.checkers import interval_checker
|
||||||
|
|
@ -20,8 +21,6 @@ def stop_gracefully(signum, frame):
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
|
|
||||||
asyncio.run(async_main())
|
asyncio.run(async_main())
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -46,17 +45,20 @@ async def async_main():
|
||||||
if not args.config:
|
if not args.config:
|
||||||
raise RuntimeError("--config must be specified in standard operating mode")
|
raise RuntimeError("--config must be specified in standard operating mode")
|
||||||
|
|
||||||
|
logging.basicConfig(level=config.log_level)
|
||||||
|
|
||||||
tg_client = await alerts.get_client()
|
tg_client = await alerts.get_client()
|
||||||
cvars.tg_client.set(tg_client)
|
cvars.tg_client.set(tg_client)
|
||||||
|
|
||||||
|
check_sets = config_enums.CheckSet
|
||||||
|
|
||||||
checker_sets = {
|
checker_sets = {
|
||||||
"start": [
|
check_sets.START: [alerts.send_start_alert()],
|
||||||
alerts.send_start_alert(),
|
check_sets.STOP: [], # this is checked later
|
||||||
],
|
check_sets.CPU: [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
|
||||||
"stop": [], # this is checked later
|
check_sets.RAM: [interval_checker(checks.ram_check, datetime.timedelta(minutes=1))],
|
||||||
"temp": [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
|
check_sets.TEMP: [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
|
||||||
"vulnix": [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
|
check_sets.VULNIX: [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
|
||||||
"cpu": [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
checkers = []
|
checkers = []
|
||||||
|
|
|
||||||
|
|
@ -5,10 +5,10 @@ class AlertType(StrEnum):
|
||||||
BOOT = "BOOT"
|
BOOT = "BOOT"
|
||||||
CPU = "CPU"
|
CPU = "CPU"
|
||||||
ERROR = "ERROR"
|
ERROR = "ERROR"
|
||||||
|
RAM = "RAM"
|
||||||
TEMP = "TEMP"
|
TEMP = "TEMP"
|
||||||
TEST = "TEST"
|
TEST = "TEST"
|
||||||
VULN = "VULN"
|
VULN = "VULN"
|
||||||
# RAM = "RAM"
|
|
||||||
# LOGIN = "LOGIN"
|
# LOGIN = "LOGIN"
|
||||||
# SMART = "SMART" # TODO
|
# SMART = "SMART" # TODO
|
||||||
# RAID = "RAID"
|
# RAID = "RAID"
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
from .cpu import cpu_check
|
from .cpu import cpu_check
|
||||||
|
from .ram import ram_check
|
||||||
from .temp import temp_check
|
from .temp import temp_check
|
||||||
from .vulnix import vulnix_check
|
from .vulnix import vulnix_check
|
||||||
|
|
|
||||||
30
src/lego_monitoring/checks/ram.py
Normal file
30
src/lego_monitoring/checks/ram.py
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
from psutil import virtual_memory
|
||||||
|
|
||||||
|
from lego_monitoring.alerting import alerts
|
||||||
|
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||||
|
from lego_monitoring.core import cvars
|
||||||
|
|
||||||
|
IS_TESTING = False
|
||||||
|
|
||||||
|
|
||||||
|
def ram_check() -> list[alerts.Alert]:
    """Compare current RAM usage against the configured thresholds.

    Reads the used-memory percentage from ``virtual_memory().percent`` and the
    ``checks.ram`` section of the active config. The critical threshold is
    evaluated before the warning threshold, so at most one alert is returned.

    A threshold of ``None`` disables that severity. Thresholds are tested with
    ``is not None`` rather than by truthiness, so an explicitly configured
    ``0.0`` still counts as a real threshold instead of being silently
    treated as "disabled".

    When the module-level ``IS_TESTING`` flag is true, the first configured
    threshold fires regardless of the measured usage.

    Returns:
        A one-element list holding a CRITICAL or WARNING alert, or an empty
        list when no configured threshold is exceeded.
    """
    percentage = virtual_memory().percent
    config = cvars.config.get().checks.ram

    # Ordered most severe first: only the highest exceeded level is reported.
    thresholds = (
        (config.critical_percentage, Severity.CRITICAL),
        (config.warning_percentage, Severity.WARNING),
    )
    for threshold, severity in thresholds:
        if threshold is None:
            # This severity is switched off in the config.
            continue
        if IS_TESTING or percentage > threshold:
            return [
                alerts.Alert(
                    alert_type=AlertType.RAM,
                    message=f"RAM usage: {percentage:.2f}% > {threshold:.2f}%",
                    severity=severity,
                )
            ]
    return []
|
||||||
|
|
@ -4,16 +4,19 @@ from typing import Optional
|
||||||
|
|
||||||
from alt_utils import NestedDeserializableDataclass
|
from alt_utils import NestedDeserializableDataclass
|
||||||
|
|
||||||
|
from . import enums
|
||||||
from .checks.cpu import CpuCheckConfig
|
from .checks.cpu import CpuCheckConfig
|
||||||
|
from .checks.ram import RamCheckConfig
|
||||||
from .checks.temp import TempCheckConfig
|
from .checks.temp import TempCheckConfig
|
||||||
from .checks.vulnix import VulnixCheckConfig
|
from .checks.vulnix import VulnixCheckConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ChecksConfig(NestedDeserializableDataclass):
|
class ChecksConfig(NestedDeserializableDataclass):
|
||||||
cpu: Optional[CpuCheckConfig] = None
|
cpu: CpuCheckConfig = field(default_factory=CpuCheckConfig)
|
||||||
temp: Optional[TempCheckConfig] = None
|
ram: RamCheckConfig = field(default_factory=RamCheckConfig)
|
||||||
vulnix: Optional[VulnixCheckConfig] = None
|
temp: TempCheckConfig = field(default_factory=TempCheckConfig)
|
||||||
|
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
@ -26,7 +29,8 @@ class TelegramConfig:
|
||||||
class Config(NestedDeserializableDataclass):
|
class Config(NestedDeserializableDataclass):
|
||||||
checks: ChecksConfig
|
checks: ChecksConfig
|
||||||
telegram: TelegramConfig
|
telegram: TelegramConfig
|
||||||
enabled_check_sets: list[str] = field(default_factory=list)
|
enabled_check_sets: list[enums.CheckSet] = field(default_factory=list)
|
||||||
|
log_level: enums.LogLevelName = enums.LogLevelName.INFO
|
||||||
|
|
||||||
|
|
||||||
def load_config(filepath: str) -> Config:
|
def load_config(filepath: str) -> Config:
|
||||||
|
|
|
||||||
8
src/lego_monitoring/config/checks/ram.py
Normal file
8
src/lego_monitoring/config/checks/ram.py
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RamCheckConfig:
    """Alert thresholds for the RAM usage check.

    Both fields are RAM usage percentages. Setting a field to ``None`` means
    no alert of that severity is ever generated.
    """

    # Defaults are float literals so they match the ``Optional[float]``
    # annotation and the 80.0 / 90.0 defaults declared in the Nix module.
    warning_percentage: Optional[float] = 80.0
    critical_percentage: Optional[float] = 90.0
|
||||||
20
src/lego_monitoring/config/enums.py
Normal file
20
src/lego_monitoring/config/enums.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
from enum import StrEnum
|
||||||
|
|
||||||
|
|
||||||
|
class CheckSet(StrEnum):
|
||||||
|
START = "start"
|
||||||
|
STOP = "stop"
|
||||||
|
|
||||||
|
CPU = "cpu"
|
||||||
|
RAM = "ram"
|
||||||
|
TEMP = "temp"
|
||||||
|
|
||||||
|
VULNIX = "vulnix"
|
||||||
|
|
||||||
|
|
||||||
|
class LogLevelName(StrEnum):
|
||||||
|
CRITICAL = "CRITICAL"
|
||||||
|
ERROR = "ERROR"
|
||||||
|
WARNING = "WARNING"
|
||||||
|
INFO = "INFO"
|
||||||
|
DEBUG = "DEBUG"
|
||||||
Loading…
Add table
Add a link
Reference in a new issue