From db1c41fb5373e482da40dafa7c2e1b1af7ad9d5c Mon Sep 17 00:00:00 2001 From: Alex Tau Date: Sun, 18 Jan 2026 22:25:05 +0300 Subject: [PATCH] lvmraid monitoring --- flake.nix | 3 +- modules/default.nix | 5 +- modules/options.nix | 11 ++ src/lego_monitoring/alerting/enum.py | 2 +- src/lego_monitoring/checks/__init__.py | 1 + .../checks/lvmraid/__init__.py | 79 ++++++++++ src/lego_monitoring/checks/lvmraid/lvattr.py | 138 ++++++++++++++++++ src/lego_monitoring/config/__init__.py | 2 + src/lego_monitoring/config/checks/lvmraid.py | 6 + src/lego_monitoring/config/enums.py | 1 + src/lego_monitoring/core/const.py | 1 + src/lego_monitoring/main.py | 3 + uv.lock | 2 +- 13 files changed, 249 insertions(+), 5 deletions(-) create mode 100644 src/lego_monitoring/checks/lvmraid/__init__.py create mode 100644 src/lego_monitoring/checks/lvmraid/lvattr.py create mode 100644 src/lego_monitoring/config/checks/lvmraid.py diff --git a/flake.nix b/flake.nix index 2edc46b..845329a 100644 --- a/flake.nix +++ b/flake.nix @@ -71,7 +71,8 @@ postPatch = '' substituteInPlace src/lego_monitoring/core/const.py \ --replace-fail 'VULNIX_PATH: str = ...' 
'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \ - --replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' + --replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' \ + --replace-fail 'LVS_PATH = "/usr/bin/lvs"' 'LVS_PATH = "${pkgs.lvm2.bin}/bin/lvs"' ''; } ); diff --git a/modules/default.nix b/modules/default.nix index b99764a..d3ac839 100644 --- a/modules/default.nix +++ b/modules/default.nix @@ -32,8 +32,7 @@ package: enabled_check_sets = cfg.enabledCheckSets; log_level = cfg.logLevel; alert_channels = { - telegram = with cfg.alertChannels.telegram; if enable then - { + telegram = with cfg.alertChannels.telegram; if enable then { creds_secret_path = credsSecretPath; room_id = roomId; } else null; @@ -81,6 +80,8 @@ package: ups_to_check = upsToCheck; upsmon_group = upsmonGroup; }; + + lvmraid.lv_paths = cfg.checks.lvmraid.lvPaths; }; }; in lib.mkIf cfg.enable { diff --git a/modules/options.nix b/modules/options.nix index ad78efe..38578af 100644 --- a/modules/options.nix +++ b/modules/options.nix @@ -35,6 +35,7 @@ in "temp" "net" "ups" + "lvmraid" "vulnix" ]); @@ -48,6 +49,7 @@ in * temp -- alerts when temperature readings are above thresholds * net -- alerts when network usage is above threshold * ups -- alerts on UPS events + * lvmraid -- alerts when RAID LVs are unhealthy * vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)''; }; @@ -187,6 +189,15 @@ in description = "Group to allow to send UPS status updates. 
This should usually include the user upsmon runs as."; }; }; + + lvmraid = { + lvPaths = lib.mkOption { + type = with lib.types; listOf str; + default = [ ]; + description = "List of LV paths to monitor."; + example = lib.literalExpression ''[ Data/lvol0 ]''; + }; + }; }; }; } diff --git a/src/lego_monitoring/alerting/enum.py b/src/lego_monitoring/alerting/enum.py index 905a995..43d4f0c 100644 --- a/src/lego_monitoring/alerting/enum.py +++ b/src/lego_monitoring/alerting/enum.py @@ -12,11 +12,11 @@ class AlertType(StrEnum): RAM = "RAM" TEMP = "TEMP" UPS = "UPS" + RAID = "RAID" VULN = "VULN" # LOGIN = "LOGIN" # SMART = "SMART" # TODO - # RAID = "RAID" # DISKS = "DISKS" # UPDATE = "UPDATE" diff --git a/src/lego_monitoring/checks/__init__.py b/src/lego_monitoring/checks/__init__.py index 86cecfc..b1acf2b 100644 --- a/src/lego_monitoring/checks/__init__.py +++ b/src/lego_monitoring/checks/__init__.py @@ -1,4 +1,5 @@ from .cpu import cpu_check +from .lvmraid import lvmraid_check from .net import NetIOTracker from .ram import ram_check from .remind import remind_check diff --git a/src/lego_monitoring/checks/lvmraid/__init__.py b/src/lego_monitoring/checks/lvmraid/__init__.py new file mode 100644 index 0000000..66d8e0e --- /dev/null +++ b/src/lego_monitoring/checks/lvmraid/__init__.py @@ -0,0 +1,79 @@ +from socket import gethostname + +from lego_monitoring.alerting.alert import Alert +from lego_monitoring.alerting.enum import AlertType, Severity +from lego_monitoring.core import cvars + +from ..utils import format_for_healthchecks_slug +from .lvattr import LVAttr + + +def lvmraid_check() -> list[Alert]: + check_config = cvars.config.get().checks.lvmraid + alert_list = [] + for lv in check_config.lv_paths: + slug = f"{format_for_healthchecks_slug(gethostname())}-lvmraid-{format_for_healthchecks_slug(lv)}" + try: + lv_attr = LVAttr.from_cli(lv) + except Exception as e: + alert_list.append( + Alert( + alert_type=AlertType.RAID, + message=f"Exception {type(e).__name__} while 
calling lvs: {e}", + severity=Severity.CRITICAL, + healthchecks_slug=slug, + ) + ) + continue + + # sanity check + if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]: + alert_list.append( + Alert( + alert_type=AlertType.RAID, + message=f"LV {lv} is not of RAID type", + severity=Severity.CRITICAL, + ) + ) + continue + + match lv_attr.health: + case LVAttr.Health.PARTIAL: + severity_reason = f"[!] LV {lv} operating in partial mode; one of PVs has failed\n\n" + severity = Severity.CRITICAL + case LVAttr.Health.UNKNOWN: + severity_reason = f"[!] LV {lv}'s state is unknown\n\n" + severity = Severity.CRITICAL + case LVAttr.Health.REFRESH_NEEDED: + severity_reason = f"[!] LV {lv} has suffered a write error; run a refresh or replace the failing PV\n\n" + severity = Severity.WARNING + case LVAttr.Health.MISMATCHES: + severity_reason = f"[!] LV {lv} is partially incoherent; run a repairing scrub operation\n\n" + severity = Severity.WARNING + case _: + severity_reason = "" + severity = Severity.OK + + status = f"""{severity_reason}LV: {lv} +Type: {lv_attr.vol_type.name} +Permissions: {lv_attr.permissions.name} +Allocation policy: {lv_attr.allocation_policy.name} +Fixed minor: {lv_attr.fixed_minor} +State: {lv_attr.state.name} +Is open: {lv_attr.is_open.name} +Target type: {lv_attr.target_type.name} +Zero before use: {lv_attr.zero_before_use} +Health: {lv_attr.health.name} +Skip activation: {lv_attr.skip_activation} +""" + + alert_list.append( + Alert( + alert_type=AlertType.RAID, + message=status, + severity=severity, + healthchecks_slug=slug, + ) + ) + + return alert_list diff --git a/src/lego_monitoring/checks/lvmraid/lvattr.py b/src/lego_monitoring/checks/lvmraid/lvattr.py new file mode 100644 index 0000000..af0bd7c --- /dev/null +++ b/src/lego_monitoring/checks/lvmraid/lvattr.py @@ -0,0 +1,138 @@ +import json +import subprocess +from dataclasses import dataclass +from enum import StrEnum +from typing import Optional, Self + +from 
lego_monitoring.core.const import LVS_PATH + + +@dataclass +class LVAttr: + """https://man.archlinux.org/man/lvs.8#NOTES""" + + class VolType(StrEnum): + CACHE = "C" + MIRRORED = "m" + MIRRORED_NOSYNC = "M" + ORIGIN = "o" + ORIGIN_MERGING_SNAPSHOT = "O" + INTEGRITY = "g" + RAID = "r" + RAID_NOSYNC = "R" + SNAPSHOT = "s" + MERGING_SNAPSHOT = "S" + PVMOVE = "p" + VIRTUAL = "v" + IMAGE = "i" + IMAGE_OUT_OF_SYNC = "I" + MIRROR_LOG = "l" + CONVERTING = "c" + THIN = "V" + THIN_POOL = "t" + THIN_POOL_DATA = "T" + VDO_POOL = "d" + VDO_POOL_DATA = "D" + METADATA = "e" + NORMAL = "-" + + class Permissions(StrEnum): + WRITABLE = "w" + READONLY = "r" + READONLY_ACTIVATED = "R" + + class AllocationPolicy(StrEnum): + ANYWHERE = "a" + ANYWHERE_LOCKED = "A" + CONTIGUOUS = "c" + CONTIGUOUS_LOCKED = "C" + INHERITED = "i" + INHERITED_LOCKED = "I" + CLING = "l" + CLING_LOCKED = "L" + NORMAL = "n" + NORMAL_LOCKED = "N" + + class State(StrEnum): + ACTIVE = "a" + HISTORICAL = "h" + SUSPENDED = "s" + INVALID_SNAPSHOT = "I" + INVALID_SUSPENDED_SNAPSHOT = "S" + SNAPSHOT_MERGE_FAILED = "m" + SUSPENDED_SNAPSHOT_MERGE_FAILED = "M" + DEVICE_PRESENT_NO_TABLES = "d" + DEVICE_PRESENT_INACTIVE_TABLE = "i" + THIN_POOL_CHECK_NEEDED = "c" + SUSPENDED_THIN_POOL_CHECK_NEEDED = "C" + UNKNOWN = "X" + + class IsOpen(StrEnum): + OPEN = "o" + CLOSED = "-" + UNKNOWN = "X" + + class TargetType(StrEnum): + CACHE = "C" + MIRROR = "m" + RAID = "r" + SNAPSHOT = "s" + THIN = "t" + UNKNOWN = "u" + VIRTUAL = "v" + NORMAL = "-" + + class Health(StrEnum): + # for all + PARTIAL = "p" + UNKNOWN = "X" + OK = "-" + + # for RAID + REFRESH_NEEDED = "r" + MISMATCHES = "m" + WRITEMOSTLY = "w" + RESHAPING = "s" + REMOVE = "R" + + # for thin pools and LVs + FAILED = "F" + OUT_OF_SPACE = "D" + METADATA_READ_ONLY = "M" + + # for writecache + ERROR = "E" + + vol_type: VolType + permissions: Permissions + allocation_policy: AllocationPolicy + fixed_minor: bool + state: State + is_open: IsOpen + target_type: TargetType + 
zero_before_use: bool + health: Health + skip_activation: bool + name: Optional[str] = None + + @classmethod + def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self: + kwargs = {} + kwargs["vol_type"] = cls.VolType(attr_str[0]) + kwargs["permissions"] = cls.Permissions(attr_str[1]) + kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2]) + kwargs["fixed_minor"] = True if attr_str[3] == "m" else False + kwargs["state"] = cls.State(attr_str[4]) + kwargs["is_open"] = cls.IsOpen(attr_str[5]) + kwargs["target_type"] = cls.TargetType(attr_str[6]) + kwargs["zero_before_use"] = True if attr_str[7] == "z" else False + kwargs["health"] = cls.Health(attr_str[8]) + kwargs["skip_activation"] = True if attr_str[9] == "k" else False + kwargs["name"] = name + return cls(**kwargs) + + @classmethod + def from_cli(cls, name: str) -> Self: + json_obj = json.loads(subprocess.run([LVS_PATH, "--reportformat=json", name], capture_output=True).stdout) + attr_str = json_obj["report"][0]["lv"][0]["lv_attr"] + return cls.from_str(attr_str, name) diff --git a/src/lego_monitoring/config/__init__.py b/src/lego_monitoring/config/__init__.py index 70662c0..2024688 100644 --- a/src/lego_monitoring/config/__init__.py +++ b/src/lego_monitoring/config/__init__.py @@ -7,6 +7,7 @@ from alt_utils import NestedDeserializableDataclass from . 
@dataclass
class LvmRaidCheckConfig:
    """Configuration section for the lvmraid check set."""

    # LV paths to monitor (for example "Data/lvol0"); the lvmraid check
    # queries each entry in turn. Empty by default, i.e. nothing is checked.
    lv_paths: list[str] = field(default_factory=list)
# path to vulnix executable UPSC_PATH = "/usr/bin/upsc" UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status" +LVS_PATH = "/usr/bin/lvs" diff --git a/src/lego_monitoring/main.py b/src/lego_monitoring/main.py index 3986280..948b897 100644 --- a/src/lego_monitoring/main.py +++ b/src/lego_monitoring/main.py @@ -101,6 +101,9 @@ async def async_main(): owner_group=config.checks.ups.upsmon_group, ) ], + check_sets.LVMRAID: [ + IntervalChecker(checks.lvmraid_check, interval=datetime.timedelta(minutes=5), persistent=True) + ], } checkers = [] diff --git a/uv.lock b/uv.lock index 1148674..b5d011c 100644 --- a/uv.lock +++ b/uv.lock @@ -278,7 +278,7 @@ wheels = [ [[package]] name = "lego-monitoring" -version = "1.1.1" +version = "1.2.0" source = { editable = "." } dependencies = [ { name = "aiodns" },