check disk health

This commit is contained in:
Alex 2024-11-09 12:22:12 +03:00
parent df9647708d
commit 47c110f83e
6 changed files with 239 additions and 2 deletions

View file

@ -19,7 +19,7 @@ class AlertType(StrEnum):
VULN = "VULN" VULN = "VULN"
LOGIN = "LOGIN" # TODO LOGIN = "LOGIN" # TODO
SMART = "SMART" # TODO SMART = "SMART" # TODO
RAID = "RAID" # TODO RAID = "RAID"
UPS = "UPS" UPS = "UPS"
UPDATE = "UPDATE" UPDATE = "UPDATE"

View file

@ -5,6 +5,9 @@
"images": [ "images": [
"gitlab/gitlab-ce" "gitlab/gitlab-ce"
] ]
},
"raid": {
"lvs": ["Data/lvol0"]
} }
} }
} }

View file

@ -1,7 +1,10 @@
import logging
import traceback
from datetime import timedelta from datetime import timedelta
from alerting import alerts from alerting import alerts
from misc import docker_registry, sensors, vuln from misc import cvars, docker_registry, sensors, vuln
from misc.disks import LVAttr
from misc.enums import UPSStatus from misc.enums import UPSStatus
IS_TESTING = False IS_TESTING = False
@ -169,3 +172,72 @@ async def docker_registry_check() -> list[alerts.Alert]:
) )
) )
return alert_list return alert_list
def raid_check() -> list[alerts.Alert]:
    """Check the health of every RAID logical volume listed in the config.

    The LV names are read from ``checks.raid.lvs`` in the loaded config.
    For each one the ``lv_attr`` flags are fetched through
    ``LVAttr.from_cli`` and translated into alerts:

    * the attributes could not be obtained -> ERROR / CRITICAL
      (the traceback is written to the log);
    * the volume is not a RAID volume -> ERROR / CRITICAL;
    * health is partial or unknown -> RAID / CRITICAL;
    * health is refresh-needed or mismatches -> RAID / WARNING.

    Any other health value produces no alert.

    Returns:
        The alerts collected for all configured LVs (possibly empty).
    """
    lv_names = cvars.config.get()["checks"]["raid"]["lvs"]
    raid_vol_types = [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]
    collected = []

    def report(alert_type, message, severity):
        # Single place where Alert objects are built and queued.
        collected.append(alerts.Alert(alert_type=alert_type, message=message, severity=severity))

    for lv in lv_names:
        try:
            attrs = LVAttr.from_cli(lv)
        except Exception as exc:
            report(
                alerts.AlertType.ERROR,
                f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
                alerts.Severity.CRITICAL,
            )
            logging.error(traceback.format_exc())
            continue

        # Sanity check: the configured LV must actually be a RAID volume.
        if attrs.vol_type not in raid_vol_types:
            report(
                alerts.AlertType.ERROR,
                f"LV {lv} is not of RAID type",
                alerts.Severity.CRITICAL,
            )
            continue

        health = attrs.health
        if health == LVAttr.Health.PARTIAL:
            message = f"LV {lv} operating in partial mode; one of PVs has failed"
            severity = alerts.Severity.CRITICAL
        elif health == LVAttr.Health.UNKNOWN:
            message = f"LV {lv}'s state is unknown"
            severity = alerts.Severity.CRITICAL
        elif health == LVAttr.Health.REFRESH_NEEDED:
            message = f"LV {lv} has suffered a write error; run a refresh or replace the failing PV"
            severity = alerts.Severity.WARNING
        elif health == LVAttr.Health.MISMATCHES:
            message = f"LV {lv} is partially incoherent; run a repairing scrub operation"
            severity = alerts.Severity.WARNING
        else:
            # Every other health value is not reported.
            continue

        report(alerts.AlertType.RAID, message, severity)

    return collected

134
misc/disks.py Normal file
View file

@ -0,0 +1,134 @@
import json
import subprocess
from dataclasses import dataclass
from enum import StrEnum
from typing import Optional, Self
@dataclass
class LVAttr:
"""https://man.archlinux.org/man/lvs.8#NOTES"""
class VolType(StrEnum):
CACHE = "C"
MIRRORED = "m"
MIRRORED_NOSYNC = "M"
ORIGIN = "o"
ORIGIN_MERGING_SNAPSHOT = "O"
INTEGRITY = "g"
RAID = "r"
RAID_NOSYNC = "R"
SNAPSHOT = "s"
MERGING_SNAPSHOT = "S"
PVMOVE = "p"
VIRTUAL = "v"
IMAGE = "i"
IMAGE_OUT_OF_SYNC = "I"
MIRROR_LOG = "l"
CONVERTING = "c"
THIN = "V"
THIN_POOL = "t"
THIN_POOL_DATA = "T"
VDO_POOL = "d"
VDO_POOL_DATA = "D"
METADATA = "e"
class Permissions(StrEnum):
WRITABLE = "w"
READONLY = "r"
READONLY_ACTIVATED = "R"
class AllocationPolicy(StrEnum):
ANYWHERE = "a"
ANYWHERE_LOCKED = "A"
CONTIGUOUS = "c"
CONTIGUOUS_LOCKED = "C"
INHERITED = "i"
INHERITED_LOCKED = "I"
CLING = "l"
CLING_LOCKED = "L"
NORMAL = "n"
NORMAL_LOCKED = "N"
class State(StrEnum):
ACTIVE = "a"
HISTORICAL = "h"
SUSPENDED = "s"
INVALID_SNAPSHOT = "I"
INVALID_SUSPENDED_SNAPSHOT = "S"
SNAPSHOT_MERGE_FAILED = "m"
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
DEVICE_PRESENT_NO_TABLES = "d"
DEVICE_PRESENT_INACTIVE_TABLE = "i"
THIN_POOL_CHECK_NEEDED = "c"
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
UNKNOWN = "X"
class IsOpen(StrEnum):
OPEN = "o"
CLOSED = "-"
UNKNOWN = "X"
class TargetType(StrEnum):
CACHE = "C"
MIRROR = "m"
RAID = "r"
SNAPSHOT = "s"
THIN = "t"
UNKNOWN = "u"
VIRTUAL = "v"
class Health(StrEnum):
# for all
PARTIAL = "p"
UNKNOWN = "X"
OK = "-"
# for RAID
REFRESH_NEEDED = "r"
MISMATCHES = "m"
WRITEMOSTLY = "w"
RESHAPING = "s"
REMOVE = "R"
# for thin pools and LVs
FAILED = "F"
OUT_OF_SPACE = "D"
METADATA_READ_ONLY = "M"
# for writecache
ERROR = "E"
vol_type: VolType
permissions: Permissions
allocation_policy: AllocationPolicy
fixed_minor: bool
state: State
is_open: IsOpen
target_type: TargetType
zero_before_use: bool
health: Health
skip_activation: bool
name: Optional[str] = None
@classmethod
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
kwargs = {}
kwargs["vol_type"] = cls.VolType(attr_str[0])
kwargs["permissions"] = cls.Permissions(attr_str[1])
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
kwargs["state"] = cls.State(attr_str[4])
kwargs["is_open"] = cls.IsOpen(attr_str[5])
kwargs["target_type"] = cls.TargetType(attr_str[6])
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
kwargs["health"] = cls.Health(attr_str[8])
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
kwargs["name"] = name
return cls(**kwargs)
@classmethod
def from_cli(cls, name: str) -> Self:
json_obj = json.loads(subprocess.run(["lvs", "--reportformat=json", name], capture_output=True).stdout)
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
return cls.from_str(attr_str, name)

View file

@ -37,6 +37,7 @@ async def main():
interval_checker(checks.ups_check, datetime.timedelta(minutes=5)), interval_checker(checks.ups_check, datetime.timedelta(minutes=5)),
interval_checker(checks.ram_check, datetime.timedelta(minutes=1)), interval_checker(checks.ram_check, datetime.timedelta(minutes=1)),
interval_checker(checks.vuln_check, datetime.timedelta(days=1)), interval_checker(checks.vuln_check, datetime.timedelta(days=1)),
interval_checker(checks.raid_check, datetime.timedelta(days=1)),
scheduled_checker( scheduled_checker(
checks.docker_registry_check, period=datetime.timedelta(days=1), when=datetime.time(hour=0, minute=0) checks.docker_registry_check, period=datetime.timedelta(days=1), when=datetime.time(hour=0, minute=0)
), ),

27
tests/test_disks.py Normal file
View file

@ -0,0 +1,27 @@
import unittest
from misc.disks import LVAttr
class TestDisks(unittest.TestCase):
    """Unit tests for the lv_attr parsing in ``misc.disks``."""

    def test_lv_attr_declaration(self):
        """Parsing a RAID attribute string yields the matching LVAttr."""
        lv_name = "Data/lvol0"
        parsed = LVAttr.from_str("rwi-aor---", lv_name)
        expected = LVAttr(
            vol_type=LVAttr.VolType.RAID,
            permissions=LVAttr.Permissions.WRITABLE,
            allocation_policy=LVAttr.AllocationPolicy.INHERITED,
            fixed_minor=False,
            state=LVAttr.State.ACTIVE,
            is_open=LVAttr.IsOpen.OPEN,
            target_type=LVAttr.TargetType.RAID,
            zero_before_use=False,
            health=LVAttr.Health.OK,
            skip_activation=False,
            name=lv_name,
        )
        self.assertEqual(parsed, expected)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()