mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
lvmraid monitoring
This commit is contained in:
parent
57accba7d7
commit
db1c41fb53
13 changed files with 249 additions and 5 deletions
|
|
@ -71,7 +71,8 @@
|
|||
postPatch = ''
|
||||
substituteInPlace src/lego_monitoring/core/const.py \
|
||||
--replace-fail 'VULNIX_PATH: str = ...' 'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \
|
||||
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"'
|
||||
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' \
|
||||
--replace-fail 'LVS_PATH = "/usr/bin/lvs"' 'LVS_PATH = "${pkgs.lvm2.bin}/bin/lvs"'
|
||||
'';
|
||||
}
|
||||
);
|
||||
|
|
|
|||
|
|
@ -32,8 +32,7 @@ package:
|
|||
enabled_check_sets = cfg.enabledCheckSets;
|
||||
log_level = cfg.logLevel;
|
||||
alert_channels = {
|
||||
telegram = with cfg.alertChannels.telegram; if enable then
|
||||
{
|
||||
telegram = with cfg.alertChannels.telegram; if enable then {
|
||||
creds_secret_path = credsSecretPath;
|
||||
room_id = roomId;
|
||||
} else null;
|
||||
|
|
@ -81,6 +80,8 @@ package:
|
|||
ups_to_check = upsToCheck;
|
||||
upsmon_group = upsmonGroup;
|
||||
};
|
||||
|
||||
lvmraid.lv_paths = cfg.checks.lvmraid.lvPaths;
|
||||
};
|
||||
};
|
||||
in lib.mkIf cfg.enable {
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ in
|
|||
"temp"
|
||||
"net"
|
||||
"ups"
|
||||
"lvmraid"
|
||||
|
||||
"vulnix"
|
||||
]);
|
||||
|
|
@ -48,6 +49,7 @@ in
|
|||
* temp -- alerts when temperature readings are above thresholds
|
||||
* net -- alerts when network usage is above threshold
|
||||
* ups -- alerts on UPS events
|
||||
* lvmraid -- alerts when RAID LVs are unhealthy
|
||||
* vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)'';
|
||||
};
|
||||
|
||||
|
|
@ -187,6 +189,15 @@ in
|
|||
description = "Group to allow to send UPS status updates. This should usually include the user upsmon runs as.";
|
||||
};
|
||||
};
|
||||
|
||||
lvmraid = {
  lvPaths = lib.mkOption {
    type = with lib.types; listOf str;
    default = [ ];
    description = "List of LV paths to monitor.";
    # Entries must be quoted strings: an unquoted Data/lvol0 is a Nix path
    # literal and would not satisfy `listOf str`.
    example = lib.literalExpression ''[ "Data/lvol0" ]'';
  };
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,11 +12,11 @@ class AlertType(StrEnum):
|
|||
RAM = "RAM"
|
||||
TEMP = "TEMP"
|
||||
UPS = "UPS"
|
||||
RAID = "RAID"
|
||||
|
||||
VULN = "VULN"
|
||||
# LOGIN = "LOGIN"
|
||||
# SMART = "SMART" # TODO
|
||||
# RAID = "RAID"
|
||||
# DISKS = "DISKS"
|
||||
# UPDATE = "UPDATE"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from .cpu import cpu_check
|
||||
from .lvmraid import lvmraid_check
|
||||
from .net import NetIOTracker
|
||||
from .ram import ram_check
|
||||
from .remind import remind_check
|
||||
|
|
|
|||
79
src/lego_monitoring/checks/lvmraid/__init__.py
Normal file
79
src/lego_monitoring/checks/lvmraid/__init__.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
from socket import gethostname
|
||||
|
||||
from lego_monitoring.alerting.alert import Alert
|
||||
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||
from lego_monitoring.core import cvars
|
||||
|
||||
from ..utils import format_for_healthchecks_slug
|
||||
from .lvattr import LVAttr
|
||||
|
||||
|
||||
def lvmraid_check() -> list[Alert]:
|
||||
check_config = cvars.config.get().checks.lvmraid
|
||||
alert_list = []
|
||||
for lv in check_config.lv_paths:
|
||||
slug = f"{format_for_healthchecks_slug(gethostname())}-lvmraid-{format_for_healthchecks_slug(lv)}"
|
||||
try:
|
||||
lv_attr = LVAttr.from_cli(lv)
|
||||
except Exception as e:
|
||||
alert_list.append(
|
||||
Alert(
|
||||
alert_type=AlertType.RAID,
|
||||
message=f"Exception {type(e).__name__} while calling lvs: {e}",
|
||||
severity=Severity.CRITICAL,
|
||||
healthchecks_slug=slug,
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
# sanity check
|
||||
if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
|
||||
alert_list.append(
|
||||
Alert(
|
||||
alert_type=AlertType.RAID,
|
||||
message=f"LV {lv} is not of RAID type",
|
||||
severity=Severity.CRITICAL,
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
match lv_attr.health:
|
||||
case LVAttr.Health.PARTIAL:
|
||||
severity_reason = f"[!] LV {lv} operating in partial mode; one of PVs has failed\n\n"
|
||||
severity = Severity.CRITICAL
|
||||
case LVAttr.Health.UNKNOWN:
|
||||
severity_reason = f"[!] LV {lv}'s state is unknown\n\n"
|
||||
severity = Severity.CRITICAL
|
||||
case LVAttr.Health.REFRESH_NEEDED:
|
||||
severity_reason = f"[!] LV {lv} has suffered a write error; run a refresh or replace the failing PV\n\n"
|
||||
severity = Severity.WARNING
|
||||
case LVAttr.Health.MISMATCHES:
|
||||
severity_reason = f"[!] LV {lv} is partially incoherent; run a repairing scrub operation\n\n"
|
||||
severity = Severity.WARNING
|
||||
case _:
|
||||
severity_reason = ""
|
||||
severity = Severity.OK
|
||||
|
||||
status = f"""{severity_reason}LV: {lv}
|
||||
Type: {lv_attr.vol_type.name}
|
||||
Permissions: {lv_attr.permissions.name}
|
||||
Allocation policy: {lv_attr.allocation_policy.name}
|
||||
Fixed minor: {lv_attr.fixed_minor}
|
||||
State: {lv_attr.state.name}
|
||||
Is open: {lv_attr.is_open.name}
|
||||
Target type: {lv_attr.target_type.name}
|
||||
Zero before use: {lv_attr.zero_before_use}
|
||||
Health: {lv_attr.health.name}
|
||||
Skip activation: {lv_attr.skip_activation}
|
||||
"""
|
||||
|
||||
alert_list.append(
|
||||
Alert(
|
||||
alert_type=AlertType.RAID,
|
||||
message=status,
|
||||
severity=severity,
|
||||
healthchecks_slug=slug,
|
||||
)
|
||||
)
|
||||
|
||||
return alert_list
|
||||
138
src/lego_monitoring/checks/lvmraid/lvattr.py
Normal file
138
src/lego_monitoring/checks/lvmraid/lvattr.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
import json
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
from typing import Optional, Self
|
||||
|
||||
from lego_monitoring.core.const import LVS_PATH
|
||||
|
||||
|
||||
@dataclass
|
||||
class LVAttr:
|
||||
"""https://man.archlinux.org/man/lvs.8#NOTES"""
|
||||
|
||||
class VolType(StrEnum):
|
||||
CACHE = "C"
|
||||
MIRRORED = "m"
|
||||
MIRRORED_NOSYNC = "M"
|
||||
ORIGIN = "o"
|
||||
ORIGIN_MERGING_SNAPSHOT = "O"
|
||||
INTEGRITY = "g"
|
||||
RAID = "r"
|
||||
RAID_NOSYNC = "R"
|
||||
SNAPSHOT = "s"
|
||||
MERGING_SNAPSHOT = "S"
|
||||
PVMOVE = "p"
|
||||
VIRTUAL = "v"
|
||||
IMAGE = "i"
|
||||
IMAGE_OUT_OF_SYNC = "I"
|
||||
MIRROR_LOG = "l"
|
||||
CONVERTING = "c"
|
||||
THIN = "V"
|
||||
THIN_POOL = "t"
|
||||
THIN_POOL_DATA = "T"
|
||||
VDO_POOL = "d"
|
||||
VDO_POOL_DATA = "D"
|
||||
METADATA = "e"
|
||||
NORMAL = "-"
|
||||
|
||||
class Permissions(StrEnum):
|
||||
WRITABLE = "w"
|
||||
READONLY = "r"
|
||||
READONLY_ACTIVATED = "R"
|
||||
|
||||
class AllocationPolicy(StrEnum):
|
||||
ANYWHERE = "a"
|
||||
ANYWHERE_LOCKED = "A"
|
||||
CONTIGUOUS = "c"
|
||||
CONTIGUOUS_LOCKED = "C"
|
||||
INHERITED = "i"
|
||||
INHERITED_LOCKED = "I"
|
||||
CLING = "l"
|
||||
CLING_LOCKED = "L"
|
||||
NORMAL = "n"
|
||||
NORMAL_LOCKED = "N"
|
||||
|
||||
class State(StrEnum):
|
||||
ACTIVE = "a"
|
||||
HISTORICAL = "h"
|
||||
SUSPENDED = "s"
|
||||
INVALID_SNAPSHOT = "I"
|
||||
INVALID_SUSPENDED_SNAPSHOT = "S"
|
||||
SNAPSHOT_MERGE_FAILED = "m"
|
||||
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
|
||||
DEVICE_PRESENT_NO_TABLES = "d"
|
||||
DEVICE_PRESENT_INACTIVE_TABLE = "i"
|
||||
THIN_POOL_CHECK_NEEDED = "c"
|
||||
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
|
||||
UNKNOWN = "X"
|
||||
|
||||
class IsOpen(StrEnum):
|
||||
OPEN = "o"
|
||||
CLOSED = "-"
|
||||
UNKNOWN = "X"
|
||||
|
||||
class TargetType(StrEnum):
|
||||
CACHE = "C"
|
||||
MIRROR = "m"
|
||||
RAID = "r"
|
||||
SNAPSHOT = "s"
|
||||
THIN = "t"
|
||||
UNKNOWN = "u"
|
||||
VIRTUAL = "v"
|
||||
NORMAL = "-"
|
||||
|
||||
class Health(StrEnum):
|
||||
# for all
|
||||
PARTIAL = "p"
|
||||
UNKNOWN = "X"
|
||||
OK = "-"
|
||||
|
||||
# for RAID
|
||||
REFRESH_NEEDED = "r"
|
||||
MISMATCHES = "m"
|
||||
WRITEMOSTLY = "w"
|
||||
RESHAPING = "s"
|
||||
REMOVE = "R"
|
||||
|
||||
# for thin pools and LVs
|
||||
FAILED = "F"
|
||||
OUT_OF_SPACE = "D"
|
||||
METADATA_READ_ONLY = "M"
|
||||
|
||||
# for writecache
|
||||
ERROR = "E"
|
||||
|
||||
vol_type: VolType
|
||||
permissions: Permissions
|
||||
allocation_policy: AllocationPolicy
|
||||
fixed_minor: bool
|
||||
state: State
|
||||
is_open: IsOpen
|
||||
target_type: TargetType
|
||||
zero_before_use: bool
|
||||
health: Health
|
||||
skip_activation: bool
|
||||
name: Optional[str] = None
|
||||
|
||||
@classmethod
|
||||
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
|
||||
kwargs = {}
|
||||
kwargs["vol_type"] = cls.VolType(attr_str[0])
|
||||
kwargs["permissions"] = cls.Permissions(attr_str[1])
|
||||
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
|
||||
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
|
||||
kwargs["state"] = cls.State(attr_str[4])
|
||||
kwargs["is_open"] = cls.IsOpen(attr_str[5])
|
||||
kwargs["target_type"] = cls.TargetType(attr_str[6])
|
||||
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
|
||||
kwargs["health"] = cls.Health(attr_str[8])
|
||||
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
|
||||
kwargs["name"] = name
|
||||
return cls(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_cli(cls, name: str) -> Self:
|
||||
json_obj = json.loads(subprocess.run([LVS_PATH, "--reportformat=json", name], capture_output=True).stdout)
|
||||
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
|
||||
return cls.from_str(attr_str, name)
|
||||
|
|
@ -7,6 +7,7 @@ from alt_utils import NestedDeserializableDataclass
|
|||
from . import enums
|
||||
from .alert_channels import AlertChannelsConfig
|
||||
from .checks.cpu import CpuCheckConfig
|
||||
from .checks.lvmraid import LvmRaidCheckConfig
|
||||
from .checks.net import NetCheckConfig
|
||||
from .checks.ram import RamCheckConfig
|
||||
from .checks.temp import TempCheckConfig
|
||||
|
|
@ -22,6 +23,7 @@ class ChecksConfig(NestedDeserializableDataclass):
|
|||
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
|
||||
net: NetCheckConfig = field(default_factory=NetCheckConfig)
|
||||
ups: UPSCheckConfig = field(default_factory=UPSCheckConfig)
|
||||
lvmraid: LvmRaidCheckConfig = field(default_factory=LvmRaidCheckConfig)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
|||
6
src/lego_monitoring/config/checks/lvmraid.py
Normal file
6
src/lego_monitoring/config/checks/lvmraid.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class LvmRaidCheckConfig:
    """Configuration section for the lvmraid check."""

    # LV paths to monitor; empty by default.
    lv_paths: list = field(default_factory=list)
|
||||
|
|
@ -10,6 +10,7 @@ class CheckSet(StrEnum):
|
|||
TEMP = "temp"
|
||||
NET = "net"
|
||||
UPS = "ups"
|
||||
LVMRAID = "lvmraid"
|
||||
|
||||
VULNIX = "vulnix"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
# NOTE: the Nix package's postPatch rewrites the VULNIX_PATH, UPSC_PATH and
# LVS_PATH assignments in this file with `substituteInPlace --replace-fail`,
# which matches them as exact text. Changing the spelling of these lines
# (quotes, spacing, default value) requires updating the patterns there too.
VULNIX_PATH: str = ... # path to vulnix executable
UPSC_PATH = "/usr/bin/upsc"
UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status"
LVS_PATH = "/usr/bin/lvs"
|
||||
|
|
|
|||
|
|
@ -101,6 +101,9 @@ async def async_main():
|
|||
owner_group=config.checks.ups.upsmon_group,
|
||||
)
|
||||
],
|
||||
check_sets.LVMRAID: [
|
||||
IntervalChecker(checks.lvmraid_check, interval=datetime.timedelta(minutes=5), persistent=True)
|
||||
],
|
||||
}
|
||||
|
||||
checkers = []
|
||||
|
|
|
|||
2
uv.lock
generated
2
uv.lock
generated
|
|
@ -278,7 +278,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "lego-monitoring"
|
||||
version = "1.1.1"
|
||||
version = "1.2.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "aiodns" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue