mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
check disk health
This commit is contained in:
parent
df9647708d
commit
47c110f83e
6 changed files with 239 additions and 2 deletions
|
|
@ -1,7 +1,10 @@
|
|||
import logging
|
||||
import traceback
|
||||
from datetime import timedelta
|
||||
|
||||
from alerting import alerts
|
||||
from misc import docker_registry, sensors, vuln
|
||||
from misc import cvars, docker_registry, sensors, vuln
|
||||
from misc.disks import LVAttr
|
||||
from misc.enums import UPSStatus
|
||||
|
||||
IS_TESTING = False
|
||||
|
|
@ -169,3 +172,72 @@ async def docker_registry_check() -> list[alerts.Alert]:
|
|||
)
|
||||
)
|
||||
return alert_list
|
||||
|
||||
|
||||
def raid_check() -> list[alerts.Alert]:
    """Inspect every configured RAID logical volume and report problems.

    The LV names are read from ``cvars.config.get()["checks"]["raid"]["lvs"]``
    and each one is queried through ``LVAttr.from_cli``.

    Returns:
        A list of alerts, possibly empty. For each LV at most one alert is
        produced:

        * ``ERROR`` / ``CRITICAL`` when ``LVAttr.from_cli`` raises (the
          traceback is written to the log as well);
        * ``ERROR`` / ``CRITICAL`` when the volume type is neither ``RAID``
          nor ``RAID_NOSYNC``;
        * ``RAID`` / ``CRITICAL`` for health ``PARTIAL`` or ``UNKNOWN``;
        * ``RAID`` / ``WARNING`` for health ``REFRESH_NEEDED`` or
          ``MISMATCHES``.

        Any other health value yields no alert.
    """
    found: list[alerts.Alert] = []

    def report(kind, text, level) -> None:
        # Single place where alerts for this check are built and collected.
        found.append(alerts.Alert(alert_type=kind, message=text, severity=level))

    for lv in cvars.config.get()["checks"]["raid"]["lvs"]:
        try:
            attr = LVAttr.from_cli(lv)
        except Exception as exc:
            # A failed query must not abort the checks for the remaining LVs.
            report(
                alerts.AlertType.ERROR,
                f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
                alerts.Severity.CRITICAL,
            )
            logging.error(traceback.format_exc())
            continue

        # Sanity check: the health flags below are only interpreted for RAID volumes.
        if attr.vol_type not in (LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC):
            report(
                alerts.AlertType.ERROR,
                f"LV {lv} is not of RAID type",
                alerts.Severity.CRITICAL,
            )
            continue

        health = attr.health
        if health == LVAttr.Health.PARTIAL:
            report(
                alerts.AlertType.RAID,
                f"LV {lv} operating in partial mode; one of PVs has failed",
                alerts.Severity.CRITICAL,
            )
        elif health == LVAttr.Health.UNKNOWN:
            report(
                alerts.AlertType.RAID,
                f"LV {lv}'s state is unknown",
                alerts.Severity.CRITICAL,
            )
        elif health == LVAttr.Health.REFRESH_NEEDED:
            report(
                alerts.AlertType.RAID,
                f"LV {lv} has suffered a write error; run a refresh or replace the failing PV",
                alerts.Severity.WARNING,
            )
        elif health == LVAttr.Health.MISMATCHES:
            report(
                alerts.AlertType.RAID,
                f"LV {lv} is partially incoherent; run a repairing scrub operation",
                alerts.Severity.WARNING,
            )

    return found
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue