check disk wearout levels

This commit is contained in:
Alex 2024-11-09 13:57:32 +03:00
parent 93f5404bc1
commit 92ce59d6a3
9 changed files with 1560 additions and 8 deletions

View file

@ -4,7 +4,7 @@ from datetime import timedelta
from alerting import alerts
from misc import cvars, docker_registry, sensors, vuln
from misc.disks import LVAttr
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
IS_TESTING = False
@ -236,3 +236,35 @@ def raid_check() -> list[alerts.Alert]:
)
return alert_list
def disk_wearout_check() -> list[alerts.Alert]:
check_config = cvars.config.get()["checks"]["wearout"]
alert_list = []
for disk in check_config["disks"]:
try:
wearout_reading = get_wearout_reading(disk["name"])
except Exception as exc:
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.ERROR,
message=f"Could not check wearout for disk {disk['name']}: {repr(exc)}, see logs",
severity=alerts.Severity.CRITICAL,
)
)
logging.error(traceback.format_exc())
continue
if wearout_reading.current_reading < wearout_reading.threshold_reading:
match wearout_reading.indicator:
case WearoutIndicator.REALLOCATED_SECTORS:
message = f"Disk {disk['name']} has reallocated sectors (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
case WearoutIndicator.SPARE_BLOCKS:
message = f"Disk {disk['name']} has too few spare blocks (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.DISKS, message=message, severity=alerts.Severity[disk["severity"]]
)
)
return alert_list

View file

@ -1,7 +1,7 @@
import json
import subprocess
from dataclasses import dataclass
from enum import StrEnum
from enum import Enum, StrEnum
from typing import Optional, Self
@ -132,3 +132,58 @@ class LVAttr:
json_obj = json.loads(subprocess.run(["lvs", "--reportformat=json", name], capture_output=True).stdout)
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
return cls.from_str(attr_str, name)
class WearoutIndicator(Enum):
    """Kind of SMART figure that a wearout reading is based on."""

    # Reallocated sector attribute; chosen for rotating disks.
    REALLOCATED_SECTORS = 0

    # Remaining spare capacity; chosen for non-rotating disks.
    SPARE_BLOCKS = 1
@dataclass
class WearoutReading:
    """One wearout measurement extracted from a disk's SMART report."""

    # Which SMART figure the two readings below belong to.
    indicator: WearoutIndicator

    # Value the disk currently reports for that figure.
    current_reading: int

    # Threshold the disk reports for that figure; the wearout check raises an
    # alert when current_reading is below it.
    threshold_reading: int
def _get_wearout_reading_from_smartctl_output(smartctl_output: dict) -> WearoutReading:
disk_protocol = smartctl_output["device"]["protocol"]
rotation_rate = smartctl_output.get("rotation_rate", 0)
match rotation_rate:
case 0: # assuming non-rotating media is an SSD
indicator = WearoutIndicator.SPARE_BLOCKS
match disk_protocol:
case "ATA":
attr_table = smartctl_output["ata_smart_attributes"]["table"]
for a in attr_table:
if a["name"] == "Available_Reservd_Space":
value = a["value"]
threshold = a["thresh"]
break
else:
raise Exception(f"no Available_Reservd_Space on ATA SSD")
case "NVMe":
value = smartctl_output["nvme_smart_health_information_log"]["available_spare"]
threshold = smartctl_output["nvme_smart_health_information_log"]["available_spare_threshold"]
case _:
indicator = WearoutIndicator.REALLOCATED_SECTORS
match disk_protocol:
case "ATA":
attr_table = smartctl_output["ata_smart_attributes"]["table"]
for a in attr_table:
if a["name"] == "Reallocated_Sector_Ct":
value = a["value"]
threshold = a["thresh"]
break
else:
raise Exception(f"no Reallocated_Sector_Ct on ATA HDD")
case "NVMe": # ? NVMe HDDs are very rare, if they even exist
raise NotImplementedError
return WearoutReading(indicator, current_reading=value, threshold_reading=threshold)
def get_wearout_reading(disk: str) -> WearoutReading:
    """Run ``smartctl -ja`` on *disk* and convert its JSON report into a WearoutReading.

    Args:
        disk: The disk argument passed to ``smartctl`` unchanged.

    Returns:
        The reading produced by ``_get_wearout_reading_from_smartctl_output``.
    """
    completed = subprocess.run(["smartctl", "-ja", disk], capture_output=True)
    report = json.loads(completed.stdout.decode("utf-8"))
    return _get_wearout_reading_from_smartctl_output(report)