lvmraid monitoring

This commit is contained in:
Alex Tau 2026-01-18 22:25:05 +03:00
parent 57accba7d7
commit db1c41fb53
13 changed files with 249 additions and 5 deletions

View file

@ -71,7 +71,8 @@
postPatch = ''
substituteInPlace src/lego_monitoring/core/const.py \
--replace-fail 'VULNIX_PATH: str = ...' 'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"'
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' \
--replace-fail 'LVS_PATH = "/usr/bin/lvs"' 'LVS_PATH = "${pkgs.lvm2.bin}/bin/lvs"'
'';
}
);

View file

@ -32,8 +32,7 @@ package:
enabled_check_sets = cfg.enabledCheckSets;
log_level = cfg.logLevel;
alert_channels = {
telegram = with cfg.alertChannels.telegram; if enable then
{
telegram = with cfg.alertChannels.telegram; if enable then {
creds_secret_path = credsSecretPath;
room_id = roomId;
} else null;
@ -81,6 +80,8 @@ package:
ups_to_check = upsToCheck;
upsmon_group = upsmonGroup;
};
lvmraid.lv_paths = cfg.checks.lvmraid.lvPaths;
};
};
in lib.mkIf cfg.enable {

View file

@ -35,6 +35,7 @@ in
"temp"
"net"
"ups"
"lvmraid"
"vulnix"
]);
@ -48,6 +49,7 @@ in
* temp -- alerts when temperature readings are above thresholds
* net -- alerts when network usage is above threshold
* ups -- alerts on UPS events
* lvmraid -- alerts when RAID LVs are unhealthy
* vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)'';
};
@ -187,6 +189,15 @@ in
description = "Group to allow to send UPS status updates. This should usually include the user upsmon runs as.";
};
};
lvmraid = {
  lvPaths = lib.mkOption {
    type = with lib.types; listOf str;
    default = [ ];
    description = "List of LV paths to monitor.";
    # Entries are strings (listOf str); an unquoted Data/lvol0 would be
    # parsed by Nix as a path literal, so the example quotes it.
    example = lib.literalExpression ''[ "Data/lvol0" ]'';
  };
};
};
};
}

View file

@ -12,11 +12,11 @@ class AlertType(StrEnum):
RAM = "RAM"
TEMP = "TEMP"
UPS = "UPS"
RAID = "RAID"
VULN = "VULN"
# LOGIN = "LOGIN"
# SMART = "SMART" # TODO
# RAID = "RAID"
# DISKS = "DISKS"
# UPDATE = "UPDATE"

View file

@ -1,4 +1,5 @@
from .cpu import cpu_check
from .lvmraid import lvmraid_check
from .net import NetIOTracker
from .ram import ram_check
from .remind import remind_check

View file

@ -0,0 +1,79 @@
from socket import gethostname
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
from ..utils import format_for_healthchecks_slug
from .lvattr import LVAttr
def lvmraid_check() -> list[Alert]:
check_config = cvars.config.get().checks.lvmraid
alert_list = []
for lv in check_config.lv_paths:
slug = f"{format_for_healthchecks_slug(gethostname())}-lvmraid-{format_for_healthchecks_slug(lv)}"
try:
lv_attr = LVAttr.from_cli(lv)
except Exception as e:
alert_list.append(
Alert(
alert_type=AlertType.RAID,
message=f"Exception {type(e).__name__} while calling lvs: {e}",
severity=Severity.CRITICAL,
healthchecks_slug=slug,
)
)
continue
# sanity check
if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
alert_list.append(
Alert(
alert_type=AlertType.RAID,
message=f"LV {lv} is not of RAID type",
severity=Severity.CRITICAL,
)
)
continue
match lv_attr.health:
case LVAttr.Health.PARTIAL:
severity_reason = f"[!] LV {lv} operating in partial mode; one of PVs has failed\n\n"
severity = Severity.CRITICAL
case LVAttr.Health.UNKNOWN:
severity_reason = f"[!] LV {lv}'s state is unknown\n\n"
severity = Severity.CRITICAL
case LVAttr.Health.REFRESH_NEEDED:
severity_reason = f"[!] LV {lv} has suffered a write error; run a refresh or replace the failing PV\n\n"
severity = Severity.WARNING
case LVAttr.Health.MISMATCHES:
severity_reason = f"[!] LV {lv} is partially incoherent; run a repairing scrub operation\n\n"
severity = Severity.WARNING
case _:
severity_reason = ""
severity = Severity.OK
status = f"""{severity_reason}LV: {lv}
Type: {lv_attr.vol_type.name}
Permissions: {lv_attr.permissions.name}
Allocation policy: {lv_attr.allocation_policy.name}
Fixed minor: {lv_attr.fixed_minor}
State: {lv_attr.state.name}
Is open: {lv_attr.is_open.name}
Target type: {lv_attr.target_type.name}
Zero before use: {lv_attr.zero_before_use}
Health: {lv_attr.health.name}
Skip activation: {lv_attr.skip_activation}
"""
alert_list.append(
Alert(
alert_type=AlertType.RAID,
message=status,
severity=severity,
healthchecks_slug=slug,
)
)
return alert_list

View file

@ -0,0 +1,138 @@
import json
import subprocess
from dataclasses import dataclass
from enum import StrEnum
from typing import Optional, Self
from lego_monitoring.core.const import LVS_PATH
@dataclass
class LVAttr:
"""https://man.archlinux.org/man/lvs.8#NOTES"""
class VolType(StrEnum):
CACHE = "C"
MIRRORED = "m"
MIRRORED_NOSYNC = "M"
ORIGIN = "o"
ORIGIN_MERGING_SNAPSHOT = "O"
INTEGRITY = "g"
RAID = "r"
RAID_NOSYNC = "R"
SNAPSHOT = "s"
MERGING_SNAPSHOT = "S"
PVMOVE = "p"
VIRTUAL = "v"
IMAGE = "i"
IMAGE_OUT_OF_SYNC = "I"
MIRROR_LOG = "l"
CONVERTING = "c"
THIN = "V"
THIN_POOL = "t"
THIN_POOL_DATA = "T"
VDO_POOL = "d"
VDO_POOL_DATA = "D"
METADATA = "e"
NORMAL = "-"
class Permissions(StrEnum):
WRITABLE = "w"
READONLY = "r"
READONLY_ACTIVATED = "R"
class AllocationPolicy(StrEnum):
ANYWHERE = "a"
ANYWHERE_LOCKED = "A"
CONTIGUOUS = "c"
CONTIGUOUS_LOCKED = "C"
INHERITED = "i"
INHERITED_LOCKED = "I"
CLING = "l"
CLING_LOCKED = "L"
NORMAL = "n"
NORMAL_LOCKED = "N"
class State(StrEnum):
ACTIVE = "a"
HISTORICAL = "h"
SUSPENDED = "s"
INVALID_SNAPSHOT = "I"
INVALID_SUSPENDED_SNAPSHOT = "S"
SNAPSHOT_MERGE_FAILED = "m"
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
DEVICE_PRESENT_NO_TABLES = "d"
DEVICE_PRESENT_INACTIVE_TABLE = "i"
THIN_POOL_CHECK_NEEDED = "c"
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
UNKNOWN = "X"
class IsOpen(StrEnum):
OPEN = "o"
CLOSED = "-"
UNKNOWN = "X"
class TargetType(StrEnum):
CACHE = "C"
MIRROR = "m"
RAID = "r"
SNAPSHOT = "s"
THIN = "t"
UNKNOWN = "u"
VIRTUAL = "v"
NORMAL = "-"
class Health(StrEnum):
# for all
PARTIAL = "p"
UNKNOWN = "X"
OK = "-"
# for RAID
REFRESH_NEEDED = "r"
MISMATCHES = "m"
WRITEMOSTLY = "w"
RESHAPING = "s"
REMOVE = "R"
# for thin pools and LVs
FAILED = "F"
OUT_OF_SPACE = "D"
METADATA_READ_ONLY = "M"
# for writecache
ERROR = "E"
vol_type: VolType
permissions: Permissions
allocation_policy: AllocationPolicy
fixed_minor: bool
state: State
is_open: IsOpen
target_type: TargetType
zero_before_use: bool
health: Health
skip_activation: bool
name: Optional[str] = None
@classmethod
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
kwargs = {}
kwargs["vol_type"] = cls.VolType(attr_str[0])
kwargs["permissions"] = cls.Permissions(attr_str[1])
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
kwargs["state"] = cls.State(attr_str[4])
kwargs["is_open"] = cls.IsOpen(attr_str[5])
kwargs["target_type"] = cls.TargetType(attr_str[6])
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
kwargs["health"] = cls.Health(attr_str[8])
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
kwargs["name"] = name
return cls(**kwargs)
@classmethod
def from_cli(cls, name: str) -> Self:
json_obj = json.loads(subprocess.run([LVS_PATH, "--reportformat=json", name], capture_output=True).stdout)
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
return cls.from_str(attr_str, name)

View file

@ -7,6 +7,7 @@ from alt_utils import NestedDeserializableDataclass
from . import enums
from .alert_channels import AlertChannelsConfig
from .checks.cpu import CpuCheckConfig
from .checks.lvmraid import LvmRaidCheckConfig
from .checks.net import NetCheckConfig
from .checks.ram import RamCheckConfig
from .checks.temp import TempCheckConfig
@ -22,6 +23,7 @@ class ChecksConfig(NestedDeserializableDataclass):
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
net: NetCheckConfig = field(default_factory=NetCheckConfig)
ups: UPSCheckConfig = field(default_factory=UPSCheckConfig)
lvmraid: LvmRaidCheckConfig = field(default_factory=LvmRaidCheckConfig)
@dataclass

View file

@ -0,0 +1,6 @@
from dataclasses import dataclass, field
@dataclass
class LvmRaidCheckConfig:
    """Settings for the lvmraid check set."""

    # LV paths to inspect; every instance gets its own empty list by default.
    lv_paths: list = field(default_factory=list)

View file

@ -10,6 +10,7 @@ class CheckSet(StrEnum):
TEMP = "temp"
NET = "net"
UPS = "ups"
LVMRAID = "lvmraid"
VULNIX = "vulnix"

View file

@ -1,3 +1,4 @@
VULNIX_PATH: str = ... # path to vulnix executable
UPSC_PATH = "/usr/bin/upsc"
UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status"
LVS_PATH = "/usr/bin/lvs"

View file

@ -101,6 +101,9 @@ async def async_main():
owner_group=config.checks.ups.upsmon_group,
)
],
check_sets.LVMRAID: [
IntervalChecker(checks.lvmraid_check, interval=datetime.timedelta(minutes=5), persistent=True)
],
}
checkers = []

2
uv.lock generated
View file

@ -278,7 +278,7 @@ wheels = [
[[package]]
name = "lego-monitoring"
version = "1.1.1"
version = "1.2.0"
source = { editable = "." }
dependencies = [
{ name = "aiodns" },