mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
network monitoring
This commit is contained in:
parent
8af7b683b6
commit
8b18d407d7
21 changed files with 434 additions and 53 deletions
|
|
@ -82,6 +82,9 @@ async def async_main():
|
|||
is_reminder=True,
|
||||
)
|
||||
],
|
||||
check_sets.NET: [
|
||||
IntervalChecker(checks.NetIOTracker().net_check, interval=datetime.timedelta(minutes=5), persistent=True)
|
||||
],
|
||||
}
|
||||
|
||||
checkers = []
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
|
||||
from .enum import AlertType, Severity
|
||||
|
||||
|
|
@ -8,3 +9,4 @@ class Alert:
|
|||
alert_type: AlertType
|
||||
message: str
|
||||
severity: Severity
|
||||
created: datetime = field(default_factory=datetime.now)
|
||||
|
|
|
|||
|
|
@ -3,13 +3,16 @@ from enum import IntEnum, StrEnum
|
|||
|
||||
class AlertType(StrEnum):
|
||||
BOOT = "BOOT"
|
||||
CPU = "CPU"
|
||||
ERROR = "ERROR"
|
||||
TEST = "TEST"
|
||||
REMIND = "REMIND"
|
||||
|
||||
CPU = "CPU"
|
||||
NET = "NET"
|
||||
RAM = "RAM"
|
||||
TEMP = "TEMP"
|
||||
TEST = "TEST"
|
||||
|
||||
VULN = "VULN"
|
||||
REMIND = "REMIND"
|
||||
# LOGIN = "LOGIN"
|
||||
# SMART = "SMART" # TODO
|
||||
# RAID = "RAID"
|
||||
|
|
|
|||
|
|
@ -18,8 +18,12 @@ async def get_client() -> TelegramClient:
|
|||
|
||||
def format_message(alert: Alert, note: str) -> str:
|
||||
severity_emoji = SEVERITY_TO_EMOJI[alert.severity]
|
||||
note_formatted = f" - <i>{note}</i>" if note else ""
|
||||
message = f"{severity_emoji} {alert.alert_type} Alert{note_formatted}\n{alert.message}"
|
||||
note_formatted = f"{note}, " if note else ""
|
||||
if "ongoing" in note_formatted:
|
||||
note_formatted += f"since {alert.created.isoformat()}"
|
||||
else:
|
||||
note_formatted += f"at {alert.created.isoformat()}"
|
||||
message = f"{severity_emoji} {alert.alert_type} Alert - <i>{note_formatted}</i>\n{alert.message}"
|
||||
return message
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from .cpu import cpu_check
|
||||
from .net import NetIOTracker
|
||||
from .ram import ram_check
|
||||
from .remind import remind_check
|
||||
from .temp import temp_check
|
||||
|
|
|
|||
|
|
@ -10,19 +10,19 @@ IS_TESTING = False
|
|||
def cpu_check() -> list[Alert]:
|
||||
percentage = cpu_percent()
|
||||
config = cvars.config.get().checks.cpu
|
||||
if config.critical_percentage and (IS_TESTING or percentage > config.critical_percentage):
|
||||
if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
|
||||
return [
|
||||
Alert(
|
||||
alert_type=AlertType.CPU,
|
||||
message=f"CPU load: {percentage:.2f}% > {config.critical_percentage:.2f}%",
|
||||
message=f"CPU load: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
|
||||
severity=Severity.CRITICAL,
|
||||
)
|
||||
]
|
||||
elif config.warning_percentage and (IS_TESTING or percentage > config.warning_percentage):
|
||||
elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
|
||||
return [
|
||||
Alert(
|
||||
alert_type=AlertType.CPU,
|
||||
message=f"CPU load: {percentage:.2f}% > {config.warning_percentage:.2f}%",
|
||||
message=f"CPU load: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
|
||||
severity=Severity.WARNING,
|
||||
)
|
||||
]
|
||||
|
|
|
|||
87
src/lego_monitoring/checks/net.py
Normal file
87
src/lego_monitoring/checks/net.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from humanize import naturalsize
|
||||
from psutil import net_io_counters
|
||||
|
||||
from lego_monitoring.alerting.alert import Alert
|
||||
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||
from lego_monitoring.core import cvars
|
||||
|
||||
IS_TESTING = False
|
||||
SECONDS_BETWEEN_CHECKS = 5 * 60
|
||||
|
||||
|
||||
@dataclass
class NetIOTracker:
    """Track per-interface byte counters between checks and alert on throughput.

    Each call to ``net_check`` compares the current psutil byte counters with
    the values remembered from the previous call, converts the difference to
    an average rate in bytes per second (assuming ``SECONDS_BETWEEN_CHECKS``
    elapsed between calls) and compares that rate with the thresholds
    configured for the interface.
    """

    # Byte counters observed at the previous check, keyed by interface name.
    sent_per_interface: dict[str, int] = field(default_factory=dict, init=False)
    recv_per_interface: dict[str, int] = field(default_factory=dict, init=False)

    @staticmethod
    def check_threshold(
        current_stat_bytes_per_sec: float,
        critical_threshold: Optional[int],
        warning_threshold: Optional[int],
        stat_name: str,
        interface: str,
    ) -> Optional[Alert]:
        """Return an alert if the rate reaches a configured threshold.

        The critical threshold is evaluated before the warning one, so at most
        one alert (the most severe applicable) is produced. A threshold that is
        ``None`` or ``0`` is treated as not configured.

        Args:
            current_stat_bytes_per_sec: Measured rate in bytes per second.
            critical_threshold: Rate at or above which a CRITICAL alert fires.
            warning_threshold: Rate at or above which a WARNING alert fires.
            stat_name: Short label of the statistic, used in the message.
            interface: Network interface name, used in the message.

        Returns:
            The alert to raise, or ``None`` if no threshold was reached.
        """
        for threshold, severity in (
            (critical_threshold, Severity.CRITICAL),
            (warning_threshold, Severity.WARNING),
        ):
            if threshold and (IS_TESTING or current_stat_bytes_per_sec >= threshold):
                current_stat_natural = naturalsize(current_stat_bytes_per_sec, binary=True)
                threshold_natural = naturalsize(threshold, binary=True)
                return Alert(
                    alert_type=AlertType.NET,
                    message=f"Interface {interface} {stat_name} {current_stat_natural}/s >= {threshold_natural}/s",
                    severity=severity,
                )
        return None

    def net_check(self) -> list[Alert]:
        """Check every configured interface and return the resulting alerts.

        The first check that sees an interface only records its counters as a
        baseline; rates are computed from the second check onwards.

        Returns:
            A list with up to three alerts per interface (sent, recv, comb).
        """
        alerts: list[Alert] = []
        current_stats = net_io_counters(pernic=True)
        config = cvars.config.get().checks.net
        for interface, thresholds in config.interfaces.items():
            stats = current_stats.get(interface)
            if stats is None:
                # A configured interface may be absent (down, removed, typo).
                # Skip it instead of raising KeyError, and forget its baseline
                # so a later reappearance starts from fresh counters.
                self.sent_per_interface.pop(interface, None)
                self.recv_per_interface.pop(interface, None)
                continue

            if interface in self.sent_per_interface and interface in self.recv_per_interface:
                # A counter that went backwards would give a negative rate
                # (and drag the combined rate down); treat it as no traffic.
                sent_since_last_check_per_sec = (
                    max(0, stats.bytes_sent - self.sent_per_interface[interface]) / SECONDS_BETWEEN_CHECKS
                )
                recv_since_last_check_per_sec = (
                    max(0, stats.bytes_recv - self.recv_per_interface[interface]) / SECONDS_BETWEEN_CHECKS
                )
                comb_since_last_check_per_sec = sent_since_last_check_per_sec + recv_since_last_check_per_sec

                measurements = (
                    (
                        sent_since_last_check_per_sec,
                        thresholds.critical_threshold_sent_bytes,
                        thresholds.warning_threshold_sent_bytes,
                        "sent",
                    ),
                    (
                        recv_since_last_check_per_sec,
                        thresholds.critical_threshold_recv_bytes,
                        thresholds.warning_threshold_recv_bytes,
                        "recv",
                    ),
                    (
                        comb_since_last_check_per_sec,
                        thresholds.critical_threshold_comb_bytes,
                        thresholds.warning_threshold_comb_bytes,
                        "comb",
                    ),
                )
                for rate, critical, warning, stat_name in measurements:
                    if alert := self.check_threshold(rate, critical, warning, stat_name, interface):
                        alerts.append(alert)

            # Remember the current counters as the baseline for the next check.
            self.sent_per_interface[interface] = stats.bytes_sent
            self.recv_per_interface[interface] = stats.bytes_recv

        return alerts
|
||||
|
|
@ -10,19 +10,19 @@ IS_TESTING = False
|
|||
def ram_check() -> list[Alert]:
|
||||
percentage = virtual_memory().percent
|
||||
config = cvars.config.get().checks.ram
|
||||
if config.critical_percentage and (IS_TESTING or percentage > config.critical_percentage):
|
||||
if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
|
||||
return [
|
||||
Alert(
|
||||
alert_type=AlertType.RAM,
|
||||
message=f"RAM usage: {percentage:.2f}% > {config.critical_percentage:.2f}%",
|
||||
message=f"RAM usage: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
|
||||
severity=Severity.CRITICAL,
|
||||
)
|
||||
]
|
||||
elif config.warning_percentage and (IS_TESTING or percentage > config.warning_percentage):
|
||||
elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
|
||||
return [
|
||||
Alert(
|
||||
alert_type=AlertType.RAM,
|
||||
message=f"RAM usage: {percentage:.2f}% > {config.warning_percentage:.2f}%",
|
||||
message=f"RAM usage: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
|
||||
severity=Severity.WARNING,
|
||||
)
|
||||
]
|
||||
|
|
|
|||
|
|
@ -11,16 +11,16 @@ def temp_check() -> list[Alert]:
|
|||
temps = sensors.get_readings()
|
||||
for sensor, readings in temps.items():
|
||||
for r in readings:
|
||||
if r.critical_temp is not None and (IS_TESTING or r.current_temp > r.critical_temp):
|
||||
if r.critical_temp is not None and (IS_TESTING or r.current_temp >= r.critical_temp):
|
||||
alert = Alert(
|
||||
alert_type=AlertType.TEMP,
|
||||
message=f"{sensor} {r.label}: {r.current_temp}°C > {r.critical_temp}°C",
|
||||
message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.critical_temp}°C",
|
||||
severity=Severity.CRITICAL,
|
||||
)
|
||||
elif r.warning_temp is not None and (IS_TESTING or r.current_temp > r.warning_temp):
|
||||
elif r.warning_temp is not None and (IS_TESTING or r.current_temp >= r.warning_temp):
|
||||
alert = Alert(
|
||||
alert_type=AlertType.TEMP,
|
||||
message=f"{sensor} {r.label}: {r.current_temp}°C > {r.warning_temp}°C",
|
||||
message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.warning_temp}°C",
|
||||
severity=Severity.WARNING,
|
||||
)
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from alt_utils import NestedDeserializableDataclass
|
|||
|
||||
from . import enums
|
||||
from .checks.cpu import CpuCheckConfig
|
||||
from .checks.net import NetCheckConfig
|
||||
from .checks.ram import RamCheckConfig
|
||||
from .checks.temp import TempCheckConfig
|
||||
from .checks.vulnix import VulnixCheckConfig
|
||||
|
|
@ -17,6 +18,7 @@ class ChecksConfig(NestedDeserializableDataclass):
|
|||
ram: RamCheckConfig = field(default_factory=RamCheckConfig)
|
||||
temp: TempCheckConfig = field(default_factory=TempCheckConfig)
|
||||
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
|
||||
net: NetCheckConfig = field(default_factory=NetCheckConfig)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
|||
19
src/lego_monitoring/config/checks/net.py
Normal file
19
src/lego_monitoring/config/checks/net.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from alt_utils import NestedDeserializableDataclass
|
||||
|
||||
|
||||
@dataclass
class NetInterfaceConfig:
    """Traffic thresholds for a single network interface.

    The network check compares these values against average rates in bytes
    per second. Every threshold is optional; ``None`` leaves it unset.
    """

    # Outgoing traffic.
    warning_threshold_sent_bytes: Optional[int] = None
    critical_threshold_sent_bytes: Optional[int] = None

    # Incoming traffic.
    warning_threshold_recv_bytes: Optional[int] = None
    critical_threshold_recv_bytes: Optional[int] = None

    # Combined traffic (sent + received).
    warning_threshold_comb_bytes: Optional[int] = None
    critical_threshold_comb_bytes: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
class NetCheckConfig(NestedDeserializableDataclass):
    """Configuration section for the network throughput check."""

    # Thresholds keyed by interface name; empty by default.
    interfaces: dict[str, NetInterfaceConfig] = field(default_factory=dict)
|
||||
|
|
@ -9,6 +9,7 @@ class CheckSet(StrEnum):
|
|||
CPU = "cpu"
|
||||
RAM = "ram"
|
||||
TEMP = "temp"
|
||||
NET = "net"
|
||||
|
||||
VULNIX = "vulnix"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue