diff --git a/alerting/alerts.py b/alerting/alerts.py
index ae57492..a658b46 100644
--- a/alerting/alerts.py
+++ b/alerting/alerts.py
@@ -19,7 +19,7 @@ class AlertType(StrEnum):
     VULN = "VULN"
     LOGIN = "LOGIN"  # TODO
     SMART = "SMART"  # TODO
-    RAID = "RAID"  # TODO
+    RAID = "RAID"
     UPS = "UPS"
     UPDATE = "UPDATE"
 
diff --git a/config.example.json b/config.example.json
index 21a8897..a3ee930 100644
--- a/config.example.json
+++ b/config.example.json
@@ -5,6 +5,9 @@
             "images": [
                 "gitlab/gitlab-ce"
             ]
+        },
+        "raid": {
+            "lvs": ["Data/lvol0"]
         }
     }
 }
diff --git a/misc/checks.py b/misc/checks.py
index ed4458b..64fe817 100644
--- a/misc/checks.py
+++ b/misc/checks.py
@@ -1,7 +1,9 @@
+import logging
 from datetime import timedelta
 
 from alerting import alerts
-from misc import docker_registry, sensors, vuln
+from misc import cvars, docker_registry, sensors, vuln
+from misc.disks import LVAttr
 from misc.enums import UPSStatus
 
 IS_TESTING = False
@@ -169,3 +171,81 @@ async def docker_registry_check() -> list[alerts.Alert]:
                 )
             )
     return alert_list
+
+
+# Health states of a RAID LV that raise an alert, mapped to the message
+# template (formatted with ``lv``) and the severity of that alert.
+_RAID_HEALTH_ALERTS = {
+    LVAttr.Health.PARTIAL: (
+        "LV {lv} operating in partial mode; one of PVs has failed",
+        alerts.Severity.CRITICAL,
+    ),
+    LVAttr.Health.UNKNOWN: (
+        "LV {lv}'s state is unknown",
+        alerts.Severity.CRITICAL,
+    ),
+    LVAttr.Health.REFRESH_NEEDED: (
+        "LV {lv} has suffered a write error; run a refresh or replace the failing PV",
+        alerts.Severity.WARNING,
+    ),
+    LVAttr.Health.MISMATCHES: (
+        "LV {lv} is partially incoherent; run a repairing scrub operation",
+        alerts.Severity.WARNING,
+    ),
+}
+
+
+def raid_check() -> list[alerts.Alert]:
+    """Check the health of every RAID logical volume listed in the config.
+
+    The LVs are read from ``checks.raid.lvs``; a missing ``raid`` section or
+    ``lvs`` key means there is nothing to check. A failure on one LV does not
+    stop the remaining LVs from being checked.
+
+    Returns:
+        One alert per LV that could not be inspected, is not a RAID volume
+        or reports a degraded health state; an empty list otherwise.
+    """
+    check_config = cvars.config.get()["checks"].get("raid", {})
+    alert_list = []
+    for lv in check_config.get("lvs", []):
+        try:
+            lv_attr = LVAttr.from_cli(lv)
+        except Exception as exc:
+            # Boundary for this LV: keep the traceback in the logs and report
+            # the failure as an alert instead of aborting the whole check.
+            logging.exception("Could not check RAID LV %s", lv)
+            alert_list.append(
+                alerts.Alert(
+                    alert_type=alerts.AlertType.ERROR,
+                    message=f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
+                    severity=alerts.Severity.CRITICAL,
+                )
+            )
+            continue
+
+        # sanity check
+        if lv_attr.vol_type not in (LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC):
+            alert_list.append(
+                alerts.Alert(
+                    alert_type=alerts.AlertType.ERROR,
+                    message=f"LV {lv} is not of RAID type",
+                    severity=alerts.Severity.CRITICAL,
+                )
+            )
+            continue
+
+        # States without an entry (OK, writemostly, reshaping, ...) are not
+        # reported.
+        health_alert = _RAID_HEALTH_ALERTS.get(lv_attr.health)
+        if health_alert is not None:
+            message, severity = health_alert
+            alert_list.append(
+                alerts.Alert(
+                    alert_type=alerts.AlertType.RAID,
+                    message=message.format(lv=lv),
+                    severity=severity,
+                )
+            )
+
+    return alert_list
diff --git a/misc/disks.py b/misc/disks.py
new file mode 100644
index 0000000..f5a74df
--- /dev/null
+++ b/misc/disks.py
@@ -0,0 +1,175 @@
+import json
+import subprocess
+from dataclasses import dataclass
+from enum import StrEnum
+from typing import Self
+
+# Number of characters in an ``lv_attr`` string, one per attribute.
+_LV_ATTR_LEN = 10
+
+
+@dataclass
+class LVAttr:
+    """Parsed form of the ``lv_attr`` string reported by ``lvs``.
+
+    Every character encodes one property of the logical volume; ``-`` means
+    that no flag applies at that position.
+
+    https://man.archlinux.org/man/lvs.8#NOTES
+    """
+
+    class VolType(StrEnum):
+        NONE = "-"  # plain LV without a special type
+        CACHE = "C"
+        MIRRORED = "m"
+        MIRRORED_NOSYNC = "M"
+        ORIGIN = "o"
+        ORIGIN_MERGING_SNAPSHOT = "O"
+        INTEGRITY = "g"
+        RAID = "r"
+        RAID_NOSYNC = "R"
+        SNAPSHOT = "s"
+        MERGING_SNAPSHOT = "S"
+        PVMOVE = "p"
+        VIRTUAL = "v"
+        IMAGE = "i"
+        IMAGE_OUT_OF_SYNC = "I"
+        MIRROR_LOG = "l"
+        CONVERTING = "c"
+        THIN = "V"
+        THIN_POOL = "t"
+        THIN_POOL_DATA = "T"
+        VDO_POOL = "d"
+        VDO_POOL_DATA = "D"
+        METADATA = "e"
+
+    class Permissions(StrEnum):
+        NONE = "-"
+        WRITABLE = "w"
+        READONLY = "r"
+        READONLY_ACTIVATED = "R"
+
+    class AllocationPolicy(StrEnum):
+        ANYWHERE = "a"
+        ANYWHERE_LOCKED = "A"
+        CONTIGUOUS = "c"
+        CONTIGUOUS_LOCKED = "C"
+        INHERITED = "i"
+        INHERITED_LOCKED = "I"
+        CLING = "l"
+        CLING_LOCKED = "L"
+        NORMAL = "n"
+        NORMAL_LOCKED = "N"
+
+    class State(StrEnum):
+        INACTIVE = "-"
+        ACTIVE = "a"
+        HISTORICAL = "h"
+        SUSPENDED = "s"
+        INVALID_SNAPSHOT = "I"
+        INVALID_SUSPENDED_SNAPSHOT = "S"
+        SNAPSHOT_MERGE_FAILED = "m"
+        SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
+        DEVICE_PRESENT_NO_TABLES = "d"
+        DEVICE_PRESENT_INACTIVE_TABLE = "i"
+        THIN_POOL_CHECK_NEEDED = "c"
+        SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
+        UNKNOWN = "X"
+
+    class IsOpen(StrEnum):
+        OPEN = "o"
+        CLOSED = "-"
+        UNKNOWN = "X"
+
+    class TargetType(StrEnum):
+        NONE = "-"  # no dedicated kernel target, e.g. a linear LV
+        CACHE = "C"
+        MIRROR = "m"
+        RAID = "r"
+        SNAPSHOT = "s"
+        THIN = "t"
+        UNKNOWN = "u"
+        VIRTUAL = "v"
+
+    class Health(StrEnum):
+        # for all
+        PARTIAL = "p"
+        UNKNOWN = "X"
+        OK = "-"
+
+        # for RAID
+        REFRESH_NEEDED = "r"
+        MISMATCHES = "m"
+        WRITEMOSTLY = "w"
+        RESHAPING = "s"
+        REMOVE = "R"
+
+        # for thin pools and LVs
+        FAILED = "F"
+        OUT_OF_SPACE = "D"
+        METADATA_READ_ONLY = "M"
+
+        # for writecache
+        ERROR = "E"
+
+    vol_type: VolType
+    permissions: Permissions
+    allocation_policy: AllocationPolicy
+    fixed_minor: bool
+    state: State
+    is_open: IsOpen
+    target_type: TargetType
+    zero_before_use: bool
+    health: Health
+    skip_activation: bool
+    name: str | None = None
+
+    @classmethod
+    def from_str(cls, attr_str: str, name: str | None = None) -> Self:
+        """Parse an ``lv_attr`` string such as ``"rwi-aor---"``.
+
+        Args:
+            attr_str: Attribute string; surrounding whitespace is ignored.
+            name: Optional LV name to store on the result.
+
+        Raises:
+            ValueError: If the string is too short or holds a character
+                that is not known for its position.
+        """
+        attr_str = attr_str.strip()
+        if len(attr_str) < _LV_ATTR_LEN:
+            raise ValueError(f"lv_attr needs {_LV_ATTR_LEN} characters, got {attr_str!r}")
+        return cls(
+            vol_type=cls.VolType(attr_str[0]),
+            permissions=cls.Permissions(attr_str[1]),
+            allocation_policy=cls.AllocationPolicy(attr_str[2]),
+            fixed_minor=attr_str[3] == "m",
+            state=cls.State(attr_str[4]),
+            is_open=cls.IsOpen(attr_str[5]),
+            target_type=cls.TargetType(attr_str[6]),
+            zero_before_use=attr_str[7] == "z",
+            health=cls.Health(attr_str[8]),
+            skip_activation=attr_str[9] == "k",
+            name=name,
+        )
+
+    @classmethod
+    def from_cli(cls, name: str) -> Self:
+        """Run ``lvs`` for the LV ``name`` and parse the reported attributes.
+
+        Raises:
+            subprocess.CalledProcessError: If ``lvs`` exits with an error.
+            ValueError: If the report does not contain any LV.
+        """
+        # check=True: a failing ``lvs`` (unknown LV, missing privileges) must
+        # surface as such, not as a JSON decoding error on empty output.
+        result = subprocess.run(
+            ["lvs", "--reportformat=json", name],
+            capture_output=True,
+            check=True,
+            text=True,
+        )
+        reported = json.loads(result.stdout)["report"][0]["lv"]
+        if not reported:
+            raise ValueError(f"lvs did not report any LV named {name!r}")
+        return cls.from_str(reported[0]["lv_attr"], name)
diff --git a/service.py b/service.py
index 2d843fa..017d6bf 100755
--- a/service.py
+++ b/service.py
@@ -37,6 +37,7 @@ async def main():
         interval_checker(checks.ups_check, datetime.timedelta(minutes=5)),
         interval_checker(checks.ram_check, datetime.timedelta(minutes=1)),
         interval_checker(checks.vuln_check, datetime.timedelta(days=1)),
+        interval_checker(checks.raid_check, datetime.timedelta(days=1)),
         scheduled_checker(
             checks.docker_registry_check, period=datetime.timedelta(days=1), when=datetime.time(hour=0, minute=0)
         ),
diff --git a/tests/test_disks.py b/tests/test_disks.py
new file mode 100644
index 0000000..e431a9d
--- /dev/null
+++ b/tests/test_disks.py
@@ -0,0 +1,37 @@
+import unittest
+
+from misc.disks import LVAttr
+
+
+class TestDisks(unittest.TestCase):
+    def test_lv_attr_declaration(self):
+        self.assertEqual(
+            LVAttr.from_str("rwi-aor---", "Data/lvol0"),
+            LVAttr(
+                vol_type=LVAttr.VolType.RAID,
+                permissions=LVAttr.Permissions.WRITABLE,
+                allocation_policy=LVAttr.AllocationPolicy.INHERITED,
+                fixed_minor=False,
+                state=LVAttr.State.ACTIVE,
+                is_open=LVAttr.IsOpen.OPEN,
+                target_type=LVAttr.TargetType.RAID,
+                zero_before_use=False,
+                health=LVAttr.Health.OK,
+                skip_activation=False,
+                name="Data/lvol0",
+            ),
+        )
+
+    def test_lv_attr_plain_inactive_lv(self):
+        lv_attr = LVAttr.from_str("-wi-------")
+        self.assertEqual(lv_attr.vol_type, LVAttr.VolType.NONE)
+        self.assertEqual(lv_attr.state, LVAttr.State.INACTIVE)
+        self.assertEqual(lv_attr.target_type, LVAttr.TargetType.NONE)
+
+    def test_lv_attr_too_short(self):
+        with self.assertRaises(ValueError):
+            LVAttr.from_str("rwi-a")
+
+
+if __name__ == "__main__":
+    unittest.main()