mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-09 20:31:10 +00:00
ram check, configurable loglevel
This commit is contained in:
parent
5095057a13
commit
da85a566c4
10 changed files with 180 additions and 18 deletions
|
|
@ -33,7 +33,7 @@ List of enabled check sets\. Each check set is a module which checks something a
|
|||
|
||||
|
||||
*Type:*
|
||||
list of (one of “start”, “stop”, “temp”, “cpu”, “vulnix”)
|
||||
list of (one of “start”, “stop”, “cpu”, “ram”, “temp”, “vulnix”)
|
||||
|
||||
|
||||
|
||||
|
|
@ -68,7 +68,49 @@ null or floating point number
|
|||
|
||||
|
||||
|
||||
CPU load percentage for a warning alert is sent\. Null means never generate a CPU warning alert\.
|
||||
CPU load percentage for a warning alert to be sent\. Null means never generate a CPU warning alert\.
|
||||
|
||||
|
||||
|
||||
*Type:*
|
||||
null or floating point number
|
||||
|
||||
|
||||
|
||||
*Default:*
|
||||
` 80.0 `
|
||||
|
||||
*Declared by:*
|
||||
- [modules/options\.nix](../modules/options.nix)
|
||||
|
||||
|
||||
|
||||
## services\.lego-monitoring\.checks\.ram\.criticalPercentage
|
||||
|
||||
|
||||
|
||||
RAM usage percentage for a critical alert to be sent\. Null means never generate a RAM critical alert\.
|
||||
|
||||
|
||||
|
||||
*Type:*
|
||||
null or floating point number
|
||||
|
||||
|
||||
|
||||
*Default:*
|
||||
` 90.0 `
|
||||
|
||||
*Declared by:*
|
||||
- [modules/options\.nix](../modules/options.nix)
|
||||
|
||||
|
||||
|
||||
## services\.lego-monitoring\.checks\.ram\.warningPercentage
|
||||
|
||||
|
||||
|
||||
RAM usage percentage for a warning alert to be sent\. Null means never generate a RAM warning alert\.
|
||||
|
||||
|
||||
|
||||
|
|
@ -377,6 +419,27 @@ null or string
|
|||
|
||||
|
||||
|
||||
## services\.lego-monitoring\.logLevel
|
||||
|
||||
|
||||
|
||||
Level of logging\. INFO generates a log message with every check\.
|
||||
|
||||
|
||||
|
||||
*Type:*
|
||||
one of “CRITICAL”, “ERROR”, “WARNING”, “INFO”, “DEBUG”
|
||||
|
||||
|
||||
|
||||
*Default:*
|
||||
` "INFO" `
|
||||
|
||||
*Declared by:*
|
||||
- [modules/options\.nix](../modules/options.nix)
|
||||
|
||||
|
||||
|
||||
## services\.lego-monitoring\.telegram\.credsSecretPath
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ package:
|
|||
|
||||
serviceConfigFile = json.generate "config.json" {
|
||||
enabled_check_sets = cfg.enabledCheckSets;
|
||||
log_level = cfg.logLevel;
|
||||
telegram = with cfg.telegram; {
|
||||
creds_secret_path = credsSecretPath;
|
||||
room_id = roomId;
|
||||
|
|
@ -54,6 +55,11 @@ package:
|
|||
warning_percentage = warningPercentage;
|
||||
critical_percentage = criticalPercentage;
|
||||
};
|
||||
|
||||
ram = with cfg.checks.ram; {
|
||||
warning_percentage = warningPercentage;
|
||||
critical_percentage = criticalPercentage;
|
||||
};
|
||||
};
|
||||
};
|
||||
in lib.mkIf cfg.enable {
|
||||
|
|
|
|||
|
|
@ -11,12 +11,27 @@ in
|
|||
options.services.lego-monitoring = {
|
||||
enable = lib.mkEnableOption "lego-monitoring service";
|
||||
|
||||
logLevel = lib.mkOption {
|
||||
type = lib.types.enum [
|
||||
"CRITICAL"
|
||||
"ERROR"
|
||||
"WARNING"
|
||||
"INFO"
|
||||
"DEBUG"
|
||||
];
|
||||
default = "INFO";
|
||||
description = "Level of logging. INFO generates a log message with every check.";
|
||||
};
|
||||
|
||||
enabledCheckSets = lib.mkOption {
|
||||
type = lib.types.listOf (lib.types.enum [
|
||||
"start"
|
||||
"stop"
|
||||
"temp"
|
||||
|
||||
"cpu"
|
||||
"ram"
|
||||
"temp"
|
||||
|
||||
"vulnix"
|
||||
]);
|
||||
default = [ ];
|
||||
|
|
@ -82,7 +97,7 @@ in
|
|||
warningPercentage = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.float;
|
||||
default = 80.0;
|
||||
description = "CPU load percentage for a warning alert is sent. Null means never generate a CPU warning alert.";
|
||||
description = "CPU load percentage for a warning alert to be sent. Null means never generate a CPU warning alert.";
|
||||
};
|
||||
criticalPercentage = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.float;
|
||||
|
|
@ -90,6 +105,19 @@ in
|
|||
description = "CPU load percentage for a critical alert to be sent. Null means never generate a CPU critical alert.";
|
||||
};
|
||||
};
|
||||
|
||||
ram = {
|
||||
warningPercentage = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.float;
|
||||
default = 80.0;
|
||||
description = "RAM usage percentage for a warning alert to be sent. Null means never generate a RAM warning alert.";
|
||||
};
|
||||
criticalPercentage = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.float;
|
||||
default = 90.0;
|
||||
description = "RAM usage percentage for a critical alert to be sent. Null means never generate a RAM critical alert.";
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import signal
|
|||
from . import checks
|
||||
from .alerting import alerts
|
||||
from .checks.temp.sensors import print_readings
|
||||
from .config import enums as config_enums
|
||||
from .config import load_config
|
||||
from .core import cvars
|
||||
from .core.checkers import interval_checker
|
||||
|
|
@ -20,8 +21,6 @@ def stop_gracefully(signum, frame):
|
|||
|
||||
|
||||
def main() -> None:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
asyncio.run(async_main())
|
||||
|
||||
|
||||
|
|
@ -46,17 +45,20 @@ async def async_main():
|
|||
if not args.config:
|
||||
raise RuntimeError("--config must be specified in standard operating mode")
|
||||
|
||||
logging.basicConfig(level=config.log_level)
|
||||
|
||||
tg_client = await alerts.get_client()
|
||||
cvars.tg_client.set(tg_client)
|
||||
|
||||
check_sets = config_enums.CheckSet
|
||||
|
||||
checker_sets = {
|
||||
"start": [
|
||||
alerts.send_start_alert(),
|
||||
],
|
||||
"stop": [], # this is checked later
|
||||
"temp": [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
|
||||
"vulnix": [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
|
||||
"cpu": [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
|
||||
check_sets.START: [alerts.send_start_alert()],
|
||||
check_sets.STOP: [], # this is checked later
|
||||
check_sets.CPU: [interval_checker(checks.cpu_check, datetime.timedelta(minutes=5))],
|
||||
check_sets.RAM: [interval_checker(checks.ram_check, datetime.timedelta(minutes=1))],
|
||||
check_sets.TEMP: [interval_checker(checks.temp_check, datetime.timedelta(minutes=5))],
|
||||
check_sets.VULNIX: [interval_checker(checks.vulnix_check, datetime.timedelta(days=3))],
|
||||
}
|
||||
|
||||
checkers = []
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ class AlertType(StrEnum):
|
|||
BOOT = "BOOT"
|
||||
CPU = "CPU"
|
||||
ERROR = "ERROR"
|
||||
RAM = "RAM"
|
||||
TEMP = "TEMP"
|
||||
TEST = "TEST"
|
||||
VULN = "VULN"
|
||||
# RAM = "RAM"
|
||||
# LOGIN = "LOGIN"
|
||||
# SMART = "SMART" # TODO
|
||||
# RAID = "RAID"
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
from .cpu import cpu_check
|
||||
from .ram import ram_check
|
||||
from .temp import temp_check
|
||||
from .vulnix import vulnix_check
|
||||
|
|
|
|||
30
src/lego_monitoring/checks/ram.py
Normal file
30
src/lego_monitoring/checks/ram.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
from psutil import virtual_memory
|
||||
|
||||
from lego_monitoring.alerting import alerts
|
||||
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||
from lego_monitoring.core import cvars
|
||||
|
||||
IS_TESTING = False
|
||||
|
||||
|
||||
def ram_check() -> list[alerts.Alert]:
    """Compare current RAM usage against the configured thresholds.

    The usage percentage is read from ``psutil.virtual_memory()`` and the
    thresholds from the ``checks.ram`` section of the active config. The
    critical threshold is evaluated before the warning one, so at most one
    alert is produced per call.

    A threshold of ``None`` disables alerts of that severity. ``0`` is a
    legitimate threshold (alert on any usage), so thresholds are compared
    against ``None`` explicitly instead of being tested for truthiness.

    Returns:
        A one-element list holding a CRITICAL or WARNING RAM alert when the
        corresponding threshold is exceeded, otherwise an empty list.
    """
    percentage = virtual_memory().percent
    config = cvars.config.get().checks.ram

    # Most severe first: the first enabled threshold that matches wins.
    thresholds = (
        (config.critical_percentage, Severity.CRITICAL),
        (config.warning_percentage, Severity.WARNING),
    )
    for threshold, severity in thresholds:
        if threshold is None:
            # This severity is switched off in the config.
            continue
        # IS_TESTING forces an alert for the first enabled threshold,
        # regardless of the measured usage.
        if IS_TESTING or percentage > threshold:
            return [
                alerts.Alert(
                    alert_type=AlertType.RAM,
                    message=f"RAM usage: {percentage:.2f}% > {threshold:.2f}%",
                    severity=severity,
                )
            ]
    return []
|
||||
|
|
@ -4,16 +4,19 @@ from typing import Optional
|
|||
|
||||
from alt_utils import NestedDeserializableDataclass
|
||||
|
||||
from . import enums
|
||||
from .checks.cpu import CpuCheckConfig
|
||||
from .checks.ram import RamCheckConfig
|
||||
from .checks.temp import TempCheckConfig
|
||||
from .checks.vulnix import VulnixCheckConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChecksConfig(NestedDeserializableDataclass):
|
||||
cpu: Optional[CpuCheckConfig] = None
|
||||
temp: Optional[TempCheckConfig] = None
|
||||
vulnix: Optional[VulnixCheckConfig] = None
|
||||
cpu: CpuCheckConfig = field(default_factory=CpuCheckConfig)
|
||||
ram: RamCheckConfig = field(default_factory=RamCheckConfig)
|
||||
temp: TempCheckConfig = field(default_factory=TempCheckConfig)
|
||||
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
@ -26,7 +29,8 @@ class TelegramConfig:
|
|||
class Config(NestedDeserializableDataclass):
|
||||
checks: ChecksConfig
|
||||
telegram: TelegramConfig
|
||||
enabled_check_sets: list[str] = field(default_factory=list)
|
||||
enabled_check_sets: list[enums.CheckSet] = field(default_factory=list)
|
||||
log_level: enums.LogLevelName = enums.LogLevelName.INFO
|
||||
|
||||
|
||||
def load_config(filepath: str) -> Config:
|
||||
|
|
|
|||
8
src/lego_monitoring/config/checks/ram.py
Normal file
8
src/lego_monitoring/config/checks/ram.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class RamCheckConfig:
    """Thresholds for the RAM usage check.

    Both values are percentages of RAM in use. ``None`` disables the alert
    of that severity.
    """

    # RAM usage percentage above which a warning alert is generated.
    warning_percentage: Optional[float] = 80.0
    # RAM usage percentage above which a critical alert is generated.
    critical_percentage: Optional[float] = 90.0
|
||||
20
src/lego_monitoring/config/enums.py
Normal file
20
src/lego_monitoring/config/enums.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from enum import StrEnum
|
||||
|
||||
|
||||
class CheckSet(StrEnum):
|
||||
START = "start"
|
||||
STOP = "stop"
|
||||
|
||||
CPU = "cpu"
|
||||
RAM = "ram"
|
||||
TEMP = "temp"
|
||||
|
||||
VULNIX = "vulnix"
|
||||
|
||||
|
||||
class LogLevelName(StrEnum):
|
||||
CRITICAL = "CRITICAL"
|
||||
ERROR = "ERROR"
|
||||
WARNING = "WARNING"
|
||||
INFO = "INFO"
|
||||
DEBUG = "DEBUG"
|
||||
Loading…
Add table
Add a link
Reference in a new issue