lvmraid monitoring

This commit is contained in:
Alex Tau 2026-01-18 22:25:05 +03:00
parent 57accba7d7
commit db1c41fb53
13 changed files with 249 additions and 5 deletions

View file

@ -71,7 +71,8 @@
postPatch = '' postPatch = ''
substituteInPlace src/lego_monitoring/core/const.py \ substituteInPlace src/lego_monitoring/core/const.py \
--replace-fail 'VULNIX_PATH: str = ...' 'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \ --replace-fail 'VULNIX_PATH: str = ...' 'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' --replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' \
--replace-fail 'LVS_PATH = "/usr/bin/lvs"' 'LVS_PATH = "${pkgs.lvm2.bin}/bin/lvs"'
''; '';
} }
); );

View file

@ -32,8 +32,7 @@ package:
enabled_check_sets = cfg.enabledCheckSets; enabled_check_sets = cfg.enabledCheckSets;
log_level = cfg.logLevel; log_level = cfg.logLevel;
alert_channels = { alert_channels = {
telegram = with cfg.alertChannels.telegram; if enable then telegram = with cfg.alertChannels.telegram; if enable then {
{
creds_secret_path = credsSecretPath; creds_secret_path = credsSecretPath;
room_id = roomId; room_id = roomId;
} else null; } else null;
@ -81,6 +80,8 @@ package:
ups_to_check = upsToCheck; ups_to_check = upsToCheck;
upsmon_group = upsmonGroup; upsmon_group = upsmonGroup;
}; };
lvmraid.lv_paths = cfg.checks.lvmraid.lvPaths;
}; };
}; };
in lib.mkIf cfg.enable { in lib.mkIf cfg.enable {

View file

@ -35,6 +35,7 @@ in
"temp" "temp"
"net" "net"
"ups" "ups"
"lvmraid"
"vulnix" "vulnix"
]); ]);
@ -48,6 +49,7 @@ in
* temp -- alerts when temperature readings are above thresholds * temp -- alerts when temperature readings are above thresholds
* net -- alerts when network usage is above threshold * net -- alerts when network usage is above threshold
* ups -- alerts on UPS events * ups -- alerts on UPS events
* lvmraid -- alerts when RAID LVs are unhealthy
* vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)''; * vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)'';
}; };
@ -187,6 +189,15 @@ in
description = "Group to allow to send UPS status updates. This should usually include the user upsmon runs as."; description = "Group to allow to send UPS status updates. This should usually include the user upsmon runs as.";
}; };
}; };
lvmraid = {
  # Settings for the "lvmraid" check set.
  lvPaths = lib.mkOption {
    type = with lib.types; listOf str;
    default = [ ];
    description = "List of LV paths to monitor.";
    # Entries must be quoted: an unquoted Data/lvol0 is a Nix path literal,
    # which does not satisfy `listOf str`.
    example = lib.literalExpression ''[ "Data/lvol0" ]'';
  };
};
}; };
}; };
} }

View file

@ -12,11 +12,11 @@ class AlertType(StrEnum):
RAM = "RAM" RAM = "RAM"
TEMP = "TEMP" TEMP = "TEMP"
UPS = "UPS" UPS = "UPS"
RAID = "RAID"
VULN = "VULN" VULN = "VULN"
# LOGIN = "LOGIN" # LOGIN = "LOGIN"
# SMART = "SMART" # TODO # SMART = "SMART" # TODO
# RAID = "RAID"
# DISKS = "DISKS" # DISKS = "DISKS"
# UPDATE = "UPDATE" # UPDATE = "UPDATE"

View file

@ -1,4 +1,5 @@
from .cpu import cpu_check from .cpu import cpu_check
from .lvmraid import lvmraid_check
from .net import NetIOTracker from .net import NetIOTracker
from .ram import ram_check from .ram import ram_check
from .remind import remind_check from .remind import remind_check

View file

@ -0,0 +1,79 @@
from socket import gethostname
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
from ..utils import format_for_healthchecks_slug
from .lvattr import LVAttr
def lvmraid_check() -> list[Alert]:
check_config = cvars.config.get().checks.lvmraid
alert_list = []
for lv in check_config.lv_paths:
slug = f"{format_for_healthchecks_slug(gethostname())}-lvmraid-{format_for_healthchecks_slug(lv)}"
try:
lv_attr = LVAttr.from_cli(lv)
except Exception as e:
alert_list.append(
Alert(
alert_type=AlertType.RAID,
message=f"Exception {type(e).__name__} while calling lvs: {e}",
severity=Severity.CRITICAL,
healthchecks_slug=slug,
)
)
continue
# sanity check
if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
alert_list.append(
Alert(
alert_type=AlertType.RAID,
message=f"LV {lv} is not of RAID type",
severity=Severity.CRITICAL,
)
)
continue
match lv_attr.health:
case LVAttr.Health.PARTIAL:
severity_reason = f"[!] LV {lv} operating in partial mode; one of PVs has failed\n\n"
severity = Severity.CRITICAL
case LVAttr.Health.UNKNOWN:
severity_reason = f"[!] LV {lv}'s state is unknown\n\n"
severity = Severity.CRITICAL
case LVAttr.Health.REFRESH_NEEDED:
severity_reason = f"[!] LV {lv} has suffered a write error; run a refresh or replace the failing PV\n\n"
severity = Severity.WARNING
case LVAttr.Health.MISMATCHES:
severity_reason = f"[!] LV {lv} is partially incoherent; run a repairing scrub operation\n\n"
severity = Severity.WARNING
case _:
severity_reason = ""
severity = Severity.OK
status = f"""{severity_reason}LV: {lv}
Type: {lv_attr.vol_type.name}
Permissions: {lv_attr.permissions.name}
Allocation policy: {lv_attr.allocation_policy.name}
Fixed minor: {lv_attr.fixed_minor}
State: {lv_attr.state.name}
Is open: {lv_attr.is_open.name}
Target type: {lv_attr.target_type.name}
Zero before use: {lv_attr.zero_before_use}
Health: {lv_attr.health.name}
Skip activation: {lv_attr.skip_activation}
"""
alert_list.append(
Alert(
alert_type=AlertType.RAID,
message=status,
severity=severity,
healthchecks_slug=slug,
)
)
return alert_list

View file

@ -0,0 +1,138 @@
import json
import subprocess
from dataclasses import dataclass
from enum import StrEnum
from typing import Optional, Self
from lego_monitoring.core.const import LVS_PATH
@dataclass
class LVAttr:
"""https://man.archlinux.org/man/lvs.8#NOTES"""
class VolType(StrEnum):
CACHE = "C"
MIRRORED = "m"
MIRRORED_NOSYNC = "M"
ORIGIN = "o"
ORIGIN_MERGING_SNAPSHOT = "O"
INTEGRITY = "g"
RAID = "r"
RAID_NOSYNC = "R"
SNAPSHOT = "s"
MERGING_SNAPSHOT = "S"
PVMOVE = "p"
VIRTUAL = "v"
IMAGE = "i"
IMAGE_OUT_OF_SYNC = "I"
MIRROR_LOG = "l"
CONVERTING = "c"
THIN = "V"
THIN_POOL = "t"
THIN_POOL_DATA = "T"
VDO_POOL = "d"
VDO_POOL_DATA = "D"
METADATA = "e"
NORMAL = "-"
class Permissions(StrEnum):
WRITABLE = "w"
READONLY = "r"
READONLY_ACTIVATED = "R"
class AllocationPolicy(StrEnum):
ANYWHERE = "a"
ANYWHERE_LOCKED = "A"
CONTIGUOUS = "c"
CONTIGUOUS_LOCKED = "C"
INHERITED = "i"
INHERITED_LOCKED = "I"
CLING = "l"
CLING_LOCKED = "L"
NORMAL = "n"
NORMAL_LOCKED = "N"
class State(StrEnum):
ACTIVE = "a"
HISTORICAL = "h"
SUSPENDED = "s"
INVALID_SNAPSHOT = "I"
INVALID_SUSPENDED_SNAPSHOT = "S"
SNAPSHOT_MERGE_FAILED = "m"
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
DEVICE_PRESENT_NO_TABLES = "d"
DEVICE_PRESENT_INACTIVE_TABLE = "i"
THIN_POOL_CHECK_NEEDED = "c"
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
UNKNOWN = "X"
class IsOpen(StrEnum):
OPEN = "o"
CLOSED = "-"
UNKNOWN = "X"
class TargetType(StrEnum):
CACHE = "C"
MIRROR = "m"
RAID = "r"
SNAPSHOT = "s"
THIN = "t"
UNKNOWN = "u"
VIRTUAL = "v"
NORMAL = "-"
class Health(StrEnum):
# for all
PARTIAL = "p"
UNKNOWN = "X"
OK = "-"
# for RAID
REFRESH_NEEDED = "r"
MISMATCHES = "m"
WRITEMOSTLY = "w"
RESHAPING = "s"
REMOVE = "R"
# for thin pools and LVs
FAILED = "F"
OUT_OF_SPACE = "D"
METADATA_READ_ONLY = "M"
# for writecache
ERROR = "E"
vol_type: VolType
permissions: Permissions
allocation_policy: AllocationPolicy
fixed_minor: bool
state: State
is_open: IsOpen
target_type: TargetType
zero_before_use: bool
health: Health
skip_activation: bool
name: Optional[str] = None
@classmethod
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
kwargs = {}
kwargs["vol_type"] = cls.VolType(attr_str[0])
kwargs["permissions"] = cls.Permissions(attr_str[1])
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
kwargs["state"] = cls.State(attr_str[4])
kwargs["is_open"] = cls.IsOpen(attr_str[5])
kwargs["target_type"] = cls.TargetType(attr_str[6])
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
kwargs["health"] = cls.Health(attr_str[8])
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
kwargs["name"] = name
return cls(**kwargs)
@classmethod
def from_cli(cls, name: str) -> Self:
json_obj = json.loads(subprocess.run([LVS_PATH, "--reportformat=json", name], capture_output=True).stdout)
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
return cls.from_str(attr_str, name)

View file

@ -7,6 +7,7 @@ from alt_utils import NestedDeserializableDataclass
from . import enums from . import enums
from .alert_channels import AlertChannelsConfig from .alert_channels import AlertChannelsConfig
from .checks.cpu import CpuCheckConfig from .checks.cpu import CpuCheckConfig
from .checks.lvmraid import LvmRaidCheckConfig
from .checks.net import NetCheckConfig from .checks.net import NetCheckConfig
from .checks.ram import RamCheckConfig from .checks.ram import RamCheckConfig
from .checks.temp import TempCheckConfig from .checks.temp import TempCheckConfig
@ -22,6 +23,7 @@ class ChecksConfig(NestedDeserializableDataclass):
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
net: NetCheckConfig = field(default_factory=NetCheckConfig) net: NetCheckConfig = field(default_factory=NetCheckConfig)
ups: UPSCheckConfig = field(default_factory=UPSCheckConfig) ups: UPSCheckConfig = field(default_factory=UPSCheckConfig)
lvmraid: LvmRaidCheckConfig = field(default_factory=LvmRaidCheckConfig)
@dataclass @dataclass

View file

@ -0,0 +1,6 @@
from dataclasses import dataclass, field
@dataclass
class LvmRaidCheckConfig:
    """Configuration section for the lvmraid check."""

    # LVs to inspect; the check passes each entry to lvs unchanged.
    # Defaults to an empty list, i.e. nothing is checked.
    lv_paths: list = field(default_factory=list)

View file

@ -10,6 +10,7 @@ class CheckSet(StrEnum):
TEMP = "temp" TEMP = "temp"
NET = "net" NET = "net"
UPS = "ups" UPS = "ups"
LVMRAID = "lvmraid"
VULNIX = "vulnix" VULNIX = "vulnix"

View file

@ -1,3 +1,4 @@
VULNIX_PATH: str = ... # path to vulnix executable VULNIX_PATH: str = ... # path to vulnix executable
UPSC_PATH = "/usr/bin/upsc" UPSC_PATH = "/usr/bin/upsc"
UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status" UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status"
LVS_PATH = "/usr/bin/lvs"

View file

@ -101,6 +101,9 @@ async def async_main():
owner_group=config.checks.ups.upsmon_group, owner_group=config.checks.ups.upsmon_group,
) )
], ],
check_sets.LVMRAID: [
IntervalChecker(checks.lvmraid_check, interval=datetime.timedelta(minutes=5), persistent=True)
],
} }
checkers = [] checkers = []

2
uv.lock generated
View file

@ -278,7 +278,7 @@ wheels = [
[[package]] [[package]]
name = "lego-monitoring" name = "lego-monitoring"
version = "1.1.1" version = "1.2.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "aiodns" }, { name = "aiodns" },