mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
lvmraid monitoring
This commit is contained in:
parent
57accba7d7
commit
db1c41fb53
13 changed files with 249 additions and 5 deletions
|
|
@ -71,7 +71,8 @@
|
||||||
postPatch = ''
|
postPatch = ''
|
||||||
substituteInPlace src/lego_monitoring/core/const.py \
|
substituteInPlace src/lego_monitoring/core/const.py \
|
||||||
--replace-fail 'VULNIX_PATH: str = ...' 'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \
|
--replace-fail 'VULNIX_PATH: str = ...' 'VULNIX_PATH = "${lib.getExe pkgs.vulnix}"' \
|
||||||
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"'
|
--replace-fail 'UPSC_PATH = "/usr/bin/upsc"' 'UPSC_PATH = "${pkgs.nut}/bin/upsc"' \
|
||||||
|
--replace-fail 'LVS_PATH = "/usr/bin/lvs"' 'LVS_PATH = "${pkgs.lvm2.bin}/bin/lvs"'
|
||||||
'';
|
'';
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
|
||||||
|
|
@ -32,8 +32,7 @@ package:
|
||||||
enabled_check_sets = cfg.enabledCheckSets;
|
enabled_check_sets = cfg.enabledCheckSets;
|
||||||
log_level = cfg.logLevel;
|
log_level = cfg.logLevel;
|
||||||
alert_channels = {
|
alert_channels = {
|
||||||
telegram = with cfg.alertChannels.telegram; if enable then
|
telegram = with cfg.alertChannels.telegram; if enable then {
|
||||||
{
|
|
||||||
creds_secret_path = credsSecretPath;
|
creds_secret_path = credsSecretPath;
|
||||||
room_id = roomId;
|
room_id = roomId;
|
||||||
} else null;
|
} else null;
|
||||||
|
|
@ -81,6 +80,8 @@ package:
|
||||||
ups_to_check = upsToCheck;
|
ups_to_check = upsToCheck;
|
||||||
upsmon_group = upsmonGroup;
|
upsmon_group = upsmonGroup;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
lvmraid.lv_paths = cfg.checks.lvmraid.lvPaths;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
in lib.mkIf cfg.enable {
|
in lib.mkIf cfg.enable {
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ in
|
||||||
"temp"
|
"temp"
|
||||||
"net"
|
"net"
|
||||||
"ups"
|
"ups"
|
||||||
|
"lvmraid"
|
||||||
|
|
||||||
"vulnix"
|
"vulnix"
|
||||||
]);
|
]);
|
||||||
|
|
@ -48,6 +49,7 @@ in
|
||||||
* temp -- alerts when temperature readings are above thresholds
|
* temp -- alerts when temperature readings are above thresholds
|
||||||
* net -- alerts when network usage is above threshold
|
* net -- alerts when network usage is above threshold
|
||||||
* ups -- alerts on UPS events
|
* ups -- alerts on UPS events
|
||||||
|
* lvmraid -- alerts when RAID LVs are unhealthy
|
||||||
* vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)'';
|
* vulnix -- periodically scans system for known CVEs, alerts if any are found (NixOS only)'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -187,6 +189,15 @@ in
|
||||||
description = "Group to allow to send UPS status updates. This should usually include the user upsmon runs as.";
|
description = "Group to allow to send UPS status updates. This should usually include the user upsmon runs as.";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Options for the "lvmraid" check set.
lvmraid = {
  lvPaths = lib.mkOption {
    type = with lib.types; listOf str;
    default = [ ];
    description = "List of LV paths to monitor.";
    # Elements must be quoted: an unquoted Data/lvol0 is a Nix path literal,
    # which does not satisfy `listOf str`.
    example = lib.literalExpression ''[ "Data/lvol0" ]'';
  };
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,11 +12,11 @@ class AlertType(StrEnum):
|
||||||
RAM = "RAM"
|
RAM = "RAM"
|
||||||
TEMP = "TEMP"
|
TEMP = "TEMP"
|
||||||
UPS = "UPS"
|
UPS = "UPS"
|
||||||
|
RAID = "RAID"
|
||||||
|
|
||||||
VULN = "VULN"
|
VULN = "VULN"
|
||||||
# LOGIN = "LOGIN"
|
# LOGIN = "LOGIN"
|
||||||
# SMART = "SMART" # TODO
|
# SMART = "SMART" # TODO
|
||||||
# RAID = "RAID"
|
|
||||||
# DISKS = "DISKS"
|
# DISKS = "DISKS"
|
||||||
# UPDATE = "UPDATE"
|
# UPDATE = "UPDATE"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from .cpu import cpu_check
|
from .cpu import cpu_check
|
||||||
|
from .lvmraid import lvmraid_check
|
||||||
from .net import NetIOTracker
|
from .net import NetIOTracker
|
||||||
from .ram import ram_check
|
from .ram import ram_check
|
||||||
from .remind import remind_check
|
from .remind import remind_check
|
||||||
|
|
|
||||||
79
src/lego_monitoring/checks/lvmraid/__init__.py
Normal file
79
src/lego_monitoring/checks/lvmraid/__init__.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
from socket import gethostname
|
||||||
|
|
||||||
|
from lego_monitoring.alerting.alert import Alert
|
||||||
|
from lego_monitoring.alerting.enum import AlertType, Severity
|
||||||
|
from lego_monitoring.core import cvars
|
||||||
|
|
||||||
|
from ..utils import format_for_healthchecks_slug
|
||||||
|
from .lvattr import LVAttr
|
||||||
|
|
||||||
|
|
||||||
|
def lvmraid_check() -> list[Alert]:
|
||||||
|
check_config = cvars.config.get().checks.lvmraid
|
||||||
|
alert_list = []
|
||||||
|
for lv in check_config.lv_paths:
|
||||||
|
slug = f"{format_for_healthchecks_slug(gethostname())}-lvmraid-{format_for_healthchecks_slug(lv)}"
|
||||||
|
try:
|
||||||
|
lv_attr = LVAttr.from_cli(lv)
|
||||||
|
except Exception as e:
|
||||||
|
alert_list.append(
|
||||||
|
Alert(
|
||||||
|
alert_type=AlertType.RAID,
|
||||||
|
message=f"Exception {type(e).__name__} while calling lvs: {e}",
|
||||||
|
severity=Severity.CRITICAL,
|
||||||
|
healthchecks_slug=slug,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# sanity check
|
||||||
|
if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
|
||||||
|
alert_list.append(
|
||||||
|
Alert(
|
||||||
|
alert_type=AlertType.RAID,
|
||||||
|
message=f"LV {lv} is not of RAID type",
|
||||||
|
severity=Severity.CRITICAL,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
match lv_attr.health:
|
||||||
|
case LVAttr.Health.PARTIAL:
|
||||||
|
severity_reason = f"[!] LV {lv} operating in partial mode; one of PVs has failed\n\n"
|
||||||
|
severity = Severity.CRITICAL
|
||||||
|
case LVAttr.Health.UNKNOWN:
|
||||||
|
severity_reason = f"[!] LV {lv}'s state is unknown\n\n"
|
||||||
|
severity = Severity.CRITICAL
|
||||||
|
case LVAttr.Health.REFRESH_NEEDED:
|
||||||
|
severity_reason = f"[!] LV {lv} has suffered a write error; run a refresh or replace the failing PV\n\n"
|
||||||
|
severity = Severity.WARNING
|
||||||
|
case LVAttr.Health.MISMATCHES:
|
||||||
|
severity_reason = f"[!] LV {lv} is partially incoherent; run a repairing scrub operation\n\n"
|
||||||
|
severity = Severity.WARNING
|
||||||
|
case _:
|
||||||
|
severity_reason = ""
|
||||||
|
severity = Severity.OK
|
||||||
|
|
||||||
|
status = f"""{severity_reason}LV: {lv}
|
||||||
|
Type: {lv_attr.vol_type.name}
|
||||||
|
Permissions: {lv_attr.permissions.name}
|
||||||
|
Allocation policy: {lv_attr.allocation_policy.name}
|
||||||
|
Fixed minor: {lv_attr.fixed_minor}
|
||||||
|
State: {lv_attr.state.name}
|
||||||
|
Is open: {lv_attr.is_open.name}
|
||||||
|
Target type: {lv_attr.target_type.name}
|
||||||
|
Zero before use: {lv_attr.zero_before_use}
|
||||||
|
Health: {lv_attr.health.name}
|
||||||
|
Skip activation: {lv_attr.skip_activation}
|
||||||
|
"""
|
||||||
|
|
||||||
|
alert_list.append(
|
||||||
|
Alert(
|
||||||
|
alert_type=AlertType.RAID,
|
||||||
|
message=status,
|
||||||
|
severity=severity,
|
||||||
|
healthchecks_slug=slug,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return alert_list
|
||||||
138
src/lego_monitoring/checks/lvmraid/lvattr.py
Normal file
138
src/lego_monitoring/checks/lvmraid/lvattr.py
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import StrEnum
|
||||||
|
from typing import Optional, Self
|
||||||
|
|
||||||
|
from lego_monitoring.core.const import LVS_PATH
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LVAttr:
|
||||||
|
"""https://man.archlinux.org/man/lvs.8#NOTES"""
|
||||||
|
|
||||||
|
class VolType(StrEnum):
|
||||||
|
CACHE = "C"
|
||||||
|
MIRRORED = "m"
|
||||||
|
MIRRORED_NOSYNC = "M"
|
||||||
|
ORIGIN = "o"
|
||||||
|
ORIGIN_MERGING_SNAPSHOT = "O"
|
||||||
|
INTEGRITY = "g"
|
||||||
|
RAID = "r"
|
||||||
|
RAID_NOSYNC = "R"
|
||||||
|
SNAPSHOT = "s"
|
||||||
|
MERGING_SNAPSHOT = "S"
|
||||||
|
PVMOVE = "p"
|
||||||
|
VIRTUAL = "v"
|
||||||
|
IMAGE = "i"
|
||||||
|
IMAGE_OUT_OF_SYNC = "I"
|
||||||
|
MIRROR_LOG = "l"
|
||||||
|
CONVERTING = "c"
|
||||||
|
THIN = "V"
|
||||||
|
THIN_POOL = "t"
|
||||||
|
THIN_POOL_DATA = "T"
|
||||||
|
VDO_POOL = "d"
|
||||||
|
VDO_POOL_DATA = "D"
|
||||||
|
METADATA = "e"
|
||||||
|
NORMAL = "-"
|
||||||
|
|
||||||
|
class Permissions(StrEnum):
|
||||||
|
WRITABLE = "w"
|
||||||
|
READONLY = "r"
|
||||||
|
READONLY_ACTIVATED = "R"
|
||||||
|
|
||||||
|
class AllocationPolicy(StrEnum):
|
||||||
|
ANYWHERE = "a"
|
||||||
|
ANYWHERE_LOCKED = "A"
|
||||||
|
CONTIGUOUS = "c"
|
||||||
|
CONTIGUOUS_LOCKED = "C"
|
||||||
|
INHERITED = "i"
|
||||||
|
INHERITED_LOCKED = "I"
|
||||||
|
CLING = "l"
|
||||||
|
CLING_LOCKED = "L"
|
||||||
|
NORMAL = "n"
|
||||||
|
NORMAL_LOCKED = "N"
|
||||||
|
|
||||||
|
class State(StrEnum):
|
||||||
|
ACTIVE = "a"
|
||||||
|
HISTORICAL = "h"
|
||||||
|
SUSPENDED = "s"
|
||||||
|
INVALID_SNAPSHOT = "I"
|
||||||
|
INVALID_SUSPENDED_SNAPSHOT = "S"
|
||||||
|
SNAPSHOT_MERGE_FAILED = "m"
|
||||||
|
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
|
||||||
|
DEVICE_PRESENT_NO_TABLES = "d"
|
||||||
|
DEVICE_PRESENT_INACTIVE_TABLE = "i"
|
||||||
|
THIN_POOL_CHECK_NEEDED = "c"
|
||||||
|
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
|
||||||
|
UNKNOWN = "X"
|
||||||
|
|
||||||
|
class IsOpen(StrEnum):
|
||||||
|
OPEN = "o"
|
||||||
|
CLOSED = "-"
|
||||||
|
UNKNOWN = "X"
|
||||||
|
|
||||||
|
class TargetType(StrEnum):
|
||||||
|
CACHE = "C"
|
||||||
|
MIRROR = "m"
|
||||||
|
RAID = "r"
|
||||||
|
SNAPSHOT = "s"
|
||||||
|
THIN = "t"
|
||||||
|
UNKNOWN = "u"
|
||||||
|
VIRTUAL = "v"
|
||||||
|
NORMAL = "-"
|
||||||
|
|
||||||
|
class Health(StrEnum):
|
||||||
|
# for all
|
||||||
|
PARTIAL = "p"
|
||||||
|
UNKNOWN = "X"
|
||||||
|
OK = "-"
|
||||||
|
|
||||||
|
# for RAID
|
||||||
|
REFRESH_NEEDED = "r"
|
||||||
|
MISMATCHES = "m"
|
||||||
|
WRITEMOSTLY = "w"
|
||||||
|
RESHAPING = "s"
|
||||||
|
REMOVE = "R"
|
||||||
|
|
||||||
|
# for thin pools and LVs
|
||||||
|
FAILED = "F"
|
||||||
|
OUT_OF_SPACE = "D"
|
||||||
|
METADATA_READ_ONLY = "M"
|
||||||
|
|
||||||
|
# for writecache
|
||||||
|
ERROR = "E"
|
||||||
|
|
||||||
|
vol_type: VolType
|
||||||
|
permissions: Permissions
|
||||||
|
allocation_policy: AllocationPolicy
|
||||||
|
fixed_minor: bool
|
||||||
|
state: State
|
||||||
|
is_open: IsOpen
|
||||||
|
target_type: TargetType
|
||||||
|
zero_before_use: bool
|
||||||
|
health: Health
|
||||||
|
skip_activation: bool
|
||||||
|
name: Optional[str] = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
|
||||||
|
kwargs = {}
|
||||||
|
kwargs["vol_type"] = cls.VolType(attr_str[0])
|
||||||
|
kwargs["permissions"] = cls.Permissions(attr_str[1])
|
||||||
|
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
|
||||||
|
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
|
||||||
|
kwargs["state"] = cls.State(attr_str[4])
|
||||||
|
kwargs["is_open"] = cls.IsOpen(attr_str[5])
|
||||||
|
kwargs["target_type"] = cls.TargetType(attr_str[6])
|
||||||
|
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
|
||||||
|
kwargs["health"] = cls.Health(attr_str[8])
|
||||||
|
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
|
||||||
|
kwargs["name"] = name
|
||||||
|
return cls(**kwargs)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_cli(cls, name: str) -> Self:
|
||||||
|
json_obj = json.loads(subprocess.run([LVS_PATH, "--reportformat=json", name], capture_output=True).stdout)
|
||||||
|
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
|
||||||
|
return cls.from_str(attr_str, name)
|
||||||
|
|
@ -7,6 +7,7 @@ from alt_utils import NestedDeserializableDataclass
|
||||||
from . import enums
|
from . import enums
|
||||||
from .alert_channels import AlertChannelsConfig
|
from .alert_channels import AlertChannelsConfig
|
||||||
from .checks.cpu import CpuCheckConfig
|
from .checks.cpu import CpuCheckConfig
|
||||||
|
from .checks.lvmraid import LvmRaidCheckConfig
|
||||||
from .checks.net import NetCheckConfig
|
from .checks.net import NetCheckConfig
|
||||||
from .checks.ram import RamCheckConfig
|
from .checks.ram import RamCheckConfig
|
||||||
from .checks.temp import TempCheckConfig
|
from .checks.temp import TempCheckConfig
|
||||||
|
|
@ -22,6 +23,7 @@ class ChecksConfig(NestedDeserializableDataclass):
|
||||||
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
|
vulnix: Optional[VulnixCheckConfig] = None # vulnix check WILL raise if this config section is None
|
||||||
net: NetCheckConfig = field(default_factory=NetCheckConfig)
|
net: NetCheckConfig = field(default_factory=NetCheckConfig)
|
||||||
ups: UPSCheckConfig = field(default_factory=UPSCheckConfig)
|
ups: UPSCheckConfig = field(default_factory=UPSCheckConfig)
|
||||||
|
lvmraid: LvmRaidCheckConfig = field(default_factory=LvmRaidCheckConfig)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
6
src/lego_monitoring/config/checks/lvmraid.py
Normal file
6
src/lego_monitoring/config/checks/lvmraid.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LvmRaidCheckConfig:
    """Configuration section for the lvmraid check."""

    # LV paths to monitor; a fresh empty list per instance when not configured.
    lv_paths: list = field(default_factory=list)
|
||||||
|
|
@ -10,6 +10,7 @@ class CheckSet(StrEnum):
|
||||||
TEMP = "temp"
|
TEMP = "temp"
|
||||||
NET = "net"
|
NET = "net"
|
||||||
UPS = "ups"
|
UPS = "ups"
|
||||||
|
LVMRAID = "lvmraid"
|
||||||
|
|
||||||
VULNIX = "vulnix"
|
VULNIX = "vulnix"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
VULNIX_PATH: str = ... # path to vulnix executable
|
VULNIX_PATH: str = ... # path to vulnix executable
|
||||||
UPSC_PATH = "/usr/bin/upsc"
|
UPSC_PATH = "/usr/bin/upsc"
|
||||||
UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status"
|
UPS_PIPE_NAME = "/tmp/lego-monitoring-ups-status"
|
||||||
|
LVS_PATH = "/usr/bin/lvs"
|
||||||
|
|
|
||||||
|
|
@ -101,6 +101,9 @@ async def async_main():
|
||||||
owner_group=config.checks.ups.upsmon_group,
|
owner_group=config.checks.ups.upsmon_group,
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
|
check_sets.LVMRAID: [
|
||||||
|
IntervalChecker(checks.lvmraid_check, interval=datetime.timedelta(minutes=5), persistent=True)
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
checkers = []
|
checkers = []
|
||||||
|
|
|
||||||
2
uv.lock
generated
2
uv.lock
generated
|
|
@ -278,7 +278,7 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lego-monitoring"
|
name = "lego-monitoring"
|
||||||
version = "1.1.1"
|
version = "1.2.0"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "aiodns" },
|
{ name = "aiodns" },
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue