check disk wearout levels

2026-03-10 04:41:10 +00:00 · 2024-11-09 13:57:32 +03:00 · 2024-11-09 13:57:32 +03:00 · 92ce59d6a3
commit 92ce59d6a3
parent 93f5404bc1
9 changed files with 1560 additions and 8 deletions
--- a/misc/checks.py
+++ b/misc/checks.py
@ -4,7 +4,7 @@ from datetime import timedelta

 from alerting import alerts
 from misc import cvars, docker_registry, sensors, vuln
-from misc.disks import LVAttr
+from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading

 IS_TESTING = False

@ -236,3 +236,35 @@ def raid_check() -> list[alerts.Alert]:
                )

    return alert_list
+
+
+def disk_wearout_check() -> list[alerts.Alert]:
+    """Check the wearout level of every disk listed under ``checks.wearout`` in the config.
+
+    A reading is fetched for each configured disk with ``get_wearout_reading``.
+    If fetching fails, a CRITICAL error alert is recorded, the traceback is
+    logged, and the next disk is processed. Otherwise a DISKS alert with the
+    disk's configured severity is recorded when the current reading is
+    strictly below the threshold reading.
+
+    Returns:
+        The alerts collected for all configured disks (empty if none apply).
+    """
+    wearout_config = cvars.config.get()["checks"]["wearout"]
+    found_alerts = []
+    for disk_entry in wearout_config["disks"]:
+        try:
+            reading = get_wearout_reading(disk_entry["name"])
+        except Exception as exc:
+            # A failed read is reported as an alert of its own rather than aborting the whole check.
+            failure_alert = alerts.Alert(
+                alert_type=alerts.AlertType.ERROR,
+                message=f"Could not check wearout for disk {disk_entry['name']}: {repr(exc)}, see logs",
+                severity=alerts.Severity.CRITICAL,
+            )
+            found_alerts.append(failure_alert)
+            logging.error(traceback.format_exc())
+            continue
+
+        # Nothing to report unless the reading is strictly below its threshold.
+        if not reading.current_reading < reading.threshold_reading:
+            continue
+
+        if reading.indicator == WearoutIndicator.REALLOCATED_SECTORS:
+            message = f"Disk {disk_entry['name']} has reallocated sectors (curr {reading.current_reading}, thresh {reading.threshold_reading})"
+        elif reading.indicator == WearoutIndicator.SPARE_BLOCKS:
+            message = f"Disk {disk_entry['name']} has too few spare blocks (curr {reading.current_reading}, thresh {reading.threshold_reading})"
+        found_alerts.append(
+            alerts.Alert(
+                alert_type=alerts.AlertType.DISKS,
+                message=message,
+                severity=alerts.Severity[disk_entry["severity"]],
+            )
+        )
+
+    return found_alerts
--- a/misc/disks.py
+++ b/misc/disks.py
@ -1,7 +1,7 @@
 import json
 import subprocess
 from dataclasses import dataclass
-from enum import StrEnum
+from enum import Enum, StrEnum
 from typing import Optional, Self


@ -132,3 +132,58 @@ class LVAttr:
        json_obj = json.loads(subprocess.run(["lvs", "--reportformat=json", name], capture_output=True).stdout)
        attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
        return cls.from_str(attr_str, name)
+
+
+class WearoutIndicator(Enum):
+    """Which SMART metric a wearout reading was taken from."""
+
+    REALLOCATED_SECTORS = 0  # used for rotating media (ATA "Reallocated_Sector_Ct" attribute)
+    SPARE_BLOCKS = 1  # used for non-rotating media (ATA "Available_Reservd_Space" / NVMe "available_spare")
+
+
+@dataclass
+class WearoutReading:
+    """One wearout measurement: the indicator that was read, its current value and its threshold."""
+
+    indicator: WearoutIndicator  # which metric the two readings below refer to
+    current_reading: int  # current value of the indicator, taken from the smartctl output
+    threshold_reading: int  # threshold reported by smartctl for the same indicator
+
+
+def _get_wearout_reading_from_smartctl_output(smartctl_output: dict) -> WearoutReading:
+    """Extract the wearout reading from parsed ``smartctl`` JSON output.
+
+    A ``rotation_rate`` of 0 (or a missing one) is taken to mean an SSD, whose
+    reading is the available spare space. Any other rotation rate is treated
+    as an HDD, whose reading is the reallocated sector count attribute.
+
+    Args:
+        smartctl_output: Parsed JSON document. It must contain
+            ``device.protocol`` plus either ``ata_smart_attributes.table``
+            (ATA) or ``nvme_smart_health_information_log`` (NVMe SSD).
+
+    Returns:
+        The indicator kind together with its current and threshold values.
+
+    Raises:
+        NotImplementedError: For rotating NVMe media, or for any protocol
+            other than ATA/NVMe (previously this surfaced as an opaque
+            ``UnboundLocalError``).
+        Exception: If the ATA attribute table lacks the expected attribute.
+        KeyError: If a required key is missing from ``smartctl_output``.
+    """
+    disk_protocol = smartctl_output["device"]["protocol"]
+    is_ssd = smartctl_output.get("rotation_rate", 0) == 0  # assuming non-rotating media is an SSD
+
+    if is_ssd:
+        indicator = WearoutIndicator.SPARE_BLOCKS
+        ata_attr_name, media_kind = "Available_Reservd_Space", "SSD"
+    else:
+        indicator = WearoutIndicator.REALLOCATED_SECTORS
+        ata_attr_name, media_kind = "Reallocated_Sector_Ct", "HDD"
+
+    match disk_protocol:
+        case "ATA":
+            for attr in smartctl_output["ata_smart_attributes"]["table"]:
+                if attr["name"] == ata_attr_name:
+                    return WearoutReading(
+                        indicator,
+                        current_reading=attr["value"],
+                        threshold_reading=attr["thresh"],
+                    )
+            raise Exception(f"no {ata_attr_name} on ATA {media_kind}")
+        case "NVMe" if is_ssd:
+            health_log = smartctl_output["nvme_smart_health_information_log"]
+            return WearoutReading(
+                indicator,
+                current_reading=health_log["available_spare"],
+                threshold_reading=health_log["available_spare_threshold"],
+            )
+        case "NVMe":  # ? NVMe HDDs are very rare, if they even exist
+            raise NotImplementedError("wearout reading is not implemented for rotating NVMe media")
+        case _:
+            # Fail with a clear message instead of reaching the return with unbound locals.
+            raise NotImplementedError(f"wearout reading is not implemented for protocol {disk_protocol!r}")
+
+
+def get_wearout_reading(disk: str) -> WearoutReading:
+    """Run ``smartctl -ja`` on *disk* and return the wearout reading parsed from its output.
+
+    The command's stdout is decoded as UTF-8 and parsed as JSON; the exit
+    status of ``smartctl`` is not inspected.
+    """
+    completed = subprocess.run(["smartctl", "-ja", disk], capture_output=True)
+    report = json.loads(completed.stdout.decode("utf-8"))
+    return _get_wearout_reading_from_smartctl_output(report)