check disk health

This commit is contained in:
Alex 2024-11-09 12:22:12 +03:00
parent df9647708d
commit 47c110f83e
6 changed files with 239 additions and 2 deletions

View file

@ -1,7 +1,10 @@
import logging
import traceback
from datetime import timedelta
from alerting import alerts
from misc import docker_registry, sensors, vuln
from misc import cvars, docker_registry, sensors, vuln
from misc.disks import LVAttr
from misc.enums import UPSStatus
IS_TESTING = False
@ -169,3 +172,72 @@ async def docker_registry_check() -> list[alerts.Alert]:
)
)
return alert_list
def raid_check() -> list[alerts.Alert]:
    """Collect alerts about the RAID logical volumes listed in the config.

    The LV names are read from ``cvars.config.get()["checks"]["raid"]["lvs"]``.
    Each LV is inspected through ``LVAttr.from_cli`` and contributes at most
    one alert:

    * inspection raised an exception -> ERROR alert, CRITICAL severity
      (the traceback is also written to the log);
    * volume type is neither RAID nor RAID_NOSYNC -> ERROR alert, CRITICAL;
    * health is PARTIAL or UNKNOWN -> RAID alert, CRITICAL;
    * health is REFRESH_NEEDED or MISMATCHES -> RAID alert, WARNING;
    * any other health value -> no alert.

    Returns:
        The alerts gathered for all configured LVs, in configuration order.
    """
    raid_settings = cvars.config.get()["checks"]["raid"]
    found: list[alerts.Alert] = []

    for lv in raid_settings["lvs"]:
        try:
            attrs = LVAttr.from_cli(lv)
        except Exception as exc:
            # One unreadable LV must not stop the remaining LVs from being
            # checked: report it, log the traceback, and move on.
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.ERROR,
                    message=f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
                    severity=alerts.Severity.CRITICAL,
                )
            )
            logging.error(traceback.format_exc())
            continue

        # Sanity check: the health states below are only interpreted for
        # volumes that report a RAID volume type.
        is_raid = attrs.vol_type in (LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC)
        if not is_raid:
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.ERROR,
                    message=f"LV {lv} is not of RAID type",
                    severity=alerts.Severity.CRITICAL,
                )
            )
            continue

        # Pick the message and severity for the reported health state.
        health = attrs.health
        if health == LVAttr.Health.PARTIAL:
            message = f"LV {lv} operating in partial mode; one of PVs has failed"
            severity = alerts.Severity.CRITICAL
        elif health == LVAttr.Health.UNKNOWN:
            message = f"LV {lv}'s state is unknown"
            severity = alerts.Severity.CRITICAL
        elif health == LVAttr.Health.REFRESH_NEEDED:
            message = f"LV {lv} has suffered a write error; run a refresh or replace the failing PV"
            severity = alerts.Severity.WARNING
        elif health == LVAttr.Health.MISMATCHES:
            message = f"LV {lv} is partially incoherent; run a repairing scrub operation"
            severity = alerts.Severity.WARNING
        else:
            # Any other health value produces no alert.
            continue

        found.append(
            alerts.Alert(
                alert_type=alerts.AlertType.RAID,
                message=message,
                severity=severity,
            )
        )

    return found