network monitoring

This commit is contained in:
Alex Tau 2025-06-07 15:59:05 +03:00
parent 8af7b683b6
commit 8b18d407d7
21 changed files with 434 additions and 53 deletions

View file

@ -82,6 +82,9 @@ async def async_main():
is_reminder=True,
)
],
check_sets.NET: [
IntervalChecker(checks.NetIOTracker().net_check, interval=datetime.timedelta(minutes=5), persistent=True)
],
}
checkers = []

View file

@ -1,4 +1,5 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from datetime import datetime
from .enum import AlertType, Severity
@ -8,3 +9,4 @@ class Alert:
alert_type: AlertType
message: str
severity: Severity
created: datetime = field(default_factory=datetime.now)

View file

@ -3,13 +3,16 @@ from enum import IntEnum, StrEnum
class AlertType(StrEnum):
BOOT = "BOOT"
CPU = "CPU"
ERROR = "ERROR"
TEST = "TEST"
REMIND = "REMIND"
CPU = "CPU"
NET = "NET"
RAM = "RAM"
TEMP = "TEMP"
TEST = "TEST"
VULN = "VULN"
REMIND = "REMIND"
# LOGIN = "LOGIN"
# SMART = "SMART" # TODO
# RAID = "RAID"

View file

@ -18,8 +18,12 @@ async def get_client() -> TelegramClient:
def format_message(alert: Alert, note: str) -> str:
    """Render an alert as the HTML text of a notification message.

    The first line carries the severity emoji, the alert type and an
    italicised annotation; the second line is the alert's own message.

    The annotation is the optional ``note`` followed by the alert's creation
    time in ISO format. When the note contains the word "ongoing" the time is
    introduced with "since", otherwise with "at".

    Args:
        alert: The alert to render; its ``severity``, ``alert_type``,
            ``created`` and ``message`` attributes are read.
        note: Free-form annotation; an empty string means "no note".

    Returns:
        The formatted message text.
    """
    severity_emoji = SEVERITY_TO_EMOJI[alert.severity]
    timestamp = alert.created.isoformat()

    # The note, when present, is separated from the timestamp by a comma.
    note_formatted = f"{note}, " if note else ""
    preposition = "since" if "ongoing" in note_formatted else "at"
    note_formatted += f"{preposition} {timestamp}"

    return f"{severity_emoji} {alert.alert_type} Alert - <i>{note_formatted}</i>\n{alert.message}"

View file

@ -1,4 +1,5 @@
from .cpu import cpu_check
from .net import NetIOTracker
from .ram import ram_check
from .remind import remind_check
from .temp import temp_check

View file

@ -10,19 +10,19 @@ IS_TESTING = False
def cpu_check() -> list[Alert]:
percentage = cpu_percent()
config = cvars.config.get().checks.cpu
if config.critical_percentage and (IS_TESTING or percentage > config.critical_percentage):
if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
return [
Alert(
alert_type=AlertType.CPU,
message=f"CPU load: {percentage:.2f}% > {config.critical_percentage:.2f}%",
message=f"CPU load: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
severity=Severity.CRITICAL,
)
]
elif config.warning_percentage and (IS_TESTING or percentage > config.warning_percentage):
elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
return [
Alert(
alert_type=AlertType.CPU,
message=f"CPU load: {percentage:.2f}% > {config.warning_percentage:.2f}%",
message=f"CPU load: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
severity=Severity.WARNING,
)
]

View file

@ -0,0 +1,87 @@
from dataclasses import dataclass, field
from typing import Optional
from humanize import naturalsize
from psutil import net_io_counters
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
IS_TESTING = False
SECONDS_BETWEEN_CHECKS = 5 * 60
@dataclass
class NetIOTracker:
    """Alert on per-interface network throughput between consecutive checks.

    The tracker remembers the cumulative byte counters reported for each
    configured interface and, on every ``net_check`` call, converts the
    difference to the previous call into a bytes-per-second rate by dividing
    it by ``SECONDS_BETWEEN_CHECKS``.
    """

    # Cumulative byte counters observed at the previous check, keyed by
    # interface name. An interface without an entry has no baseline yet.
    sent_per_interface: dict[str, int] = field(default_factory=dict, init=False)
    recv_per_interface: dict[str, int] = field(default_factory=dict, init=False)

    @staticmethod
    def check_threshold(
        current_stat_bytes_per_sec: float,
        critical_threshold: Optional[int],
        warning_threshold: Optional[int],
        stat_name: str,
        interface: str,
    ) -> Optional[Alert]:
        """Compare one rate against its thresholds and build an alert if exceeded.

        The critical threshold takes precedence over the warning threshold.
        A threshold that is ``None`` or ``0`` is treated as not configured.
        When ``IS_TESTING`` is set, every configured threshold fires.

        Args:
            current_stat_bytes_per_sec: Measured rate in bytes per second.
            critical_threshold: Rate at or above which a CRITICAL alert is raised.
            warning_threshold: Rate at or above which a WARNING alert is raised.
            stat_name: Label of the statistic, used in the alert message.
            interface: Interface name, used in the alert message.

        Returns:
            An ``Alert`` when a threshold is reached, otherwise ``None``.
        """
        if critical_threshold and (IS_TESTING or current_stat_bytes_per_sec >= critical_threshold):
            threshold, severity = critical_threshold, Severity.CRITICAL
        elif warning_threshold and (IS_TESTING or current_stat_bytes_per_sec >= warning_threshold):
            threshold, severity = warning_threshold, Severity.WARNING
        else:
            return None

        current_stat_natural = naturalsize(current_stat_bytes_per_sec, binary=True)
        threshold_natural = naturalsize(threshold, binary=True)
        return Alert(
            alert_type=AlertType.NET,
            message=f"Interface {interface} {stat_name} {current_stat_natural}/s >= {threshold_natural}/s",
            severity=severity,
        )

    def net_check(self) -> list[Alert]:
        """Compute rates for every configured interface and collect alerts.

        The first call for an interface only records a baseline and produces
        no alerts for it. A configured interface that is absent from the
        current counters is skipped and its baseline is dropped, so one
        missing interface does not abort the check for the others.

        Returns:
            The alerts raised by this check, possibly empty.
        """
        alerts = []
        current_stats = net_io_counters(pernic=True)
        config = cvars.config.get().checks.net

        for interface, thresholds in config.interfaces.items():
            stats = current_stats.get(interface)
            if stats is None:
                # The interface does not exist right now. Forget the old
                # baseline so a later reappearance starts from fresh counters
                # instead of producing a delta that spans the gap.
                self.sent_per_interface.pop(interface, None)
                self.recv_per_interface.pop(interface, None)
                continue

            if interface in self.sent_per_interface and interface in self.recv_per_interface:
                # Counters can restart from zero (e.g. the interface was
                # re-created); clamp so a reset never yields a negative rate
                # that would understate the combined figure.
                sent_delta = max(stats.bytes_sent - self.sent_per_interface[interface], 0)
                recv_delta = max(stats.bytes_recv - self.recv_per_interface[interface], 0)
                sent_since_last_check_per_sec = sent_delta / SECONDS_BETWEEN_CHECKS
                recv_since_last_check_per_sec = recv_delta / SECONDS_BETWEEN_CHECKS
                comb_since_last_check_per_sec = sent_since_last_check_per_sec + recv_since_last_check_per_sec

                checks = (
                    (
                        sent_since_last_check_per_sec,
                        thresholds.critical_threshold_sent_bytes,
                        thresholds.warning_threshold_sent_bytes,
                        "sent",
                    ),
                    (
                        recv_since_last_check_per_sec,
                        thresholds.critical_threshold_recv_bytes,
                        thresholds.warning_threshold_recv_bytes,
                        "recv",
                    ),
                    (
                        comb_since_last_check_per_sec,
                        thresholds.critical_threshold_comb_bytes,
                        thresholds.warning_threshold_comb_bytes,
                        "comb",
                    ),
                )
                for rate, critical, warning, stat_name in checks:
                    if alert := self.check_threshold(rate, critical, warning, stat_name, interface):
                        alerts.append(alert)

            # Record the current counters as the baseline for the next check.
            self.sent_per_interface[interface] = stats.bytes_sent
            self.recv_per_interface[interface] = stats.bytes_recv

        return alerts

View file

@ -10,19 +10,19 @@ IS_TESTING = False
def ram_check() -> list[Alert]:
percentage = virtual_memory().percent
config = cvars.config.get().checks.ram
if config.critical_percentage and (IS_TESTING or percentage > config.critical_percentage):
if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
return [
Alert(
alert_type=AlertType.RAM,
message=f"RAM usage: {percentage:.2f}% > {config.critical_percentage:.2f}%",
message=f"RAM usage: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
severity=Severity.CRITICAL,
)
]
elif config.warning_percentage and (IS_TESTING or percentage > config.warning_percentage):
elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
return [
Alert(
alert_type=AlertType.RAM,
message=f"RAM usage: {percentage:.2f}% > {config.warning_percentage:.2f}%",
message=f"RAM usage: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
severity=Severity.WARNING,
)
]

View file

@ -11,16 +11,16 @@ def temp_check() -> list[Alert]:
temps = sensors.get_readings()
for sensor, readings in temps.items():
for r in readings:
if r.critical_temp is not None and (IS_TESTING or r.current_temp > r.critical_temp):
if r.critical_temp is not None and (IS_TESTING or r.current_temp >= r.critical_temp):
alert = Alert(
alert_type=AlertType.TEMP,
message=f"{sensor} {r.label}: {r.current_temp}°C > {r.critical_temp}°C",
message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.critical_temp}°C",
severity=Severity.CRITICAL,
)
elif r.warning_temp is not None and (IS_TESTING or r.current_temp > r.warning_temp):
elif r.warning_temp is not None and (IS_TESTING or r.current_temp >= r.warning_temp):
alert = Alert(
alert_type=AlertType.TEMP,
message=f"{sensor} {r.label}: {r.current_temp}°C > {r.warning_temp}°C",
message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.warning_temp}°C",
severity=Severity.WARNING,
)
else:

View file

@ -6,6 +6,7 @@ from alt_utils import NestedDeserializableDataclass
from . import enums
from .checks.cpu import CpuCheckConfig
from .checks.net import NetCheckConfig
from .checks.ram import RamCheckConfig
from .checks.temp import TempCheckConfig
from .checks.vulnix import VulnixCheckConfig
@ -17,6 +18,7 @@ class ChecksConfig(NestedDeserializableDataclass):
ram: RamCheckConfig = field(default_factory=RamCheckConfig)
temp: TempCheckConfig = field(default_factory=TempCheckConfig)
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
net: NetCheckConfig = field(default_factory=NetCheckConfig)
@dataclass

View file

@ -0,0 +1,19 @@
from dataclasses import dataclass, field
from typing import Optional
from alt_utils import NestedDeserializableDataclass
@dataclass
class NetInterfaceConfig:
    """Alert thresholds for a single network interface.

    Every threshold is optional; ``None`` means no threshold is set for that
    statistic and severity.
    """

    # Outgoing traffic.
    warning_threshold_sent_bytes: Optional[int] = None
    critical_threshold_sent_bytes: Optional[int] = None

    # Incoming traffic.
    warning_threshold_recv_bytes: Optional[int] = None
    critical_threshold_recv_bytes: Optional[int] = None

    # Outgoing and incoming traffic combined.
    warning_threshold_comb_bytes: Optional[int] = None
    critical_threshold_comb_bytes: Optional[int] = None
@dataclass
class NetCheckConfig(NestedDeserializableDataclass):
    """Configuration section for the network check."""

    # Thresholds keyed by interface name; empty by default.
    interfaces: dict[str, NetInterfaceConfig] = field(default_factory=dict)

View file

@ -9,6 +9,7 @@ class CheckSet(StrEnum):
CPU = "cpu"
RAM = "ram"
TEMP = "temp"
NET = "net"
VULNIX = "vulnix"