check disk wearout levels

This commit is contained in:
Alex 2024-11-09 13:57:32 +03:00
parent 93f5404bc1
commit 92ce59d6a3
9 changed files with 1560 additions and 8 deletions

View file

@@ -4,7 +4,7 @@ from datetime import timedelta
from alerting import alerts
from misc import cvars, docker_registry, sensors, vuln
from misc.disks import LVAttr
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
IS_TESTING = False
@@ -236,3 +236,35 @@ def raid_check() -> list[alerts.Alert]:
)
return alert_list
def disk_wearout_check() -> list[alerts.Alert]:
check_config = cvars.config.get()["checks"]["wearout"]
alert_list = []
for disk in check_config["disks"]:
try:
wearout_reading = get_wearout_reading(disk["name"])
except Exception as exc:
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.ERROR,
message=f"Could not check wearout for disk {disk['name']}: {repr(exc)}, see logs",
severity=alerts.Severity.CRITICAL,
)
)
logging.error(traceback.format_exc())
continue
if wearout_reading.current_reading < wearout_reading.threshold_reading:
match wearout_reading.indicator:
case WearoutIndicator.REALLOCATED_SECTORS:
message = f"Disk {disk['name']} has reallocated sectors (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
case WearoutIndicator.SPARE_BLOCKS:
message = f"Disk {disk['name']} has too few spare blocks (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.DISKS, message=message, severity=alerts.Severity[disk["severity"]]
)
)
return alert_list