mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
check disk wearout levels
This commit is contained in:
parent
93f5404bc1
commit
92ce59d6a3
9 changed files with 1560 additions and 8 deletions
|
|
@ -4,7 +4,7 @@ from datetime import timedelta
|
|||
|
||||
from alerting import alerts
|
||||
from misc import cvars, docker_registry, sensors, vuln
|
||||
from misc.disks import LVAttr
|
||||
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
|
||||
|
||||
IS_TESTING = False
|
||||
|
||||
|
|
@ -236,3 +236,35 @@ def raid_check() -> list[alerts.Alert]:
|
|||
)
|
||||
|
||||
return alert_list
|
||||
|
||||
|
||||
def disk_wearout_check() -> list[alerts.Alert]:
    """Check every configured disk for wearout and collect the resulting alerts.

    The disks to inspect come from the ``checks -> wearout -> disks`` section
    of the configuration; each entry supplies a ``name`` and a ``severity``.

    Returns:
        A list of alerts: a CRITICAL ``ERROR`` alert for each disk whose
        reading could not be obtained (the traceback is logged), and a
        ``DISKS`` alert with the configured severity for each disk whose
        current reading is below its threshold reading.
    """
    check_config = cvars.config.get()["checks"]["wearout"]
    alert_list = []

    for disk in check_config["disks"]:
        try:
            wearout_reading = get_wearout_reading(disk["name"])
        except Exception as exc:
            # A failed reading must not abort the checks for the remaining disks.
            failure_alert = alerts.Alert(
                alert_type=alerts.AlertType.ERROR,
                message=f"Could not check wearout for disk {disk['name']}: {repr(exc)}, see logs",
                severity=alerts.Severity.CRITICAL,
            )
            alert_list.append(failure_alert)
            logging.error(traceback.format_exc())
            continue

        # Only readings strictly below the threshold are reported.
        if not (wearout_reading.current_reading < wearout_reading.threshold_reading):
            continue

        indicator = wearout_reading.indicator
        if indicator == WearoutIndicator.REALLOCATED_SECTORS:
            message = f"Disk {disk['name']} has reallocated sectors (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
        elif indicator == WearoutIndicator.SPARE_BLOCKS:
            message = f"Disk {disk['name']} has too few spare blocks (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"

        wearout_alert = alerts.Alert(
            alert_type=alerts.AlertType.DISKS,
            message=message,
            severity=alerts.Severity[disk["severity"]],
        )
        alert_list.append(wearout_alert)

    return alert_list
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import json
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
from enum import Enum, StrEnum
|
||||
from typing import Optional, Self
|
||||
|
||||
|
||||
|
|
@ -132,3 +132,58 @@ class LVAttr:
|
|||
json_obj = json.loads(subprocess.run(["lvs", "--reportformat=json", name], capture_output=True).stdout)
|
||||
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
|
||||
return cls.from_str(attr_str, name)
|
||||
|
||||
|
||||
class WearoutIndicator(Enum):
    """Kind of SMART measurement that a wearout reading is based on."""

    # Reading taken from the ATA "Reallocated_Sector_Ct" attribute.
    REALLOCATED_SECTORS = 0
    # Reading taken from the spare-capacity figures of a non-rotating disk.
    SPARE_BLOCKS = 1
|
||||
|
||||
|
||||
@dataclass
class WearoutReading:
    """A single wearout measurement for one disk, paired with its threshold."""

    # Which kind of measurement the two readings below refer to.
    indicator: WearoutIndicator
    # Value currently reported for the indicator.
    current_reading: int
    # Threshold reported for the same indicator.
    threshold_reading: int
|
||||
|
||||
|
||||
def _get_wearout_reading_from_smartctl_output(smartctl_output: dict) -> WearoutReading:
|
||||
disk_protocol = smartctl_output["device"]["protocol"]
|
||||
rotation_rate = smartctl_output.get("rotation_rate", 0)
|
||||
match rotation_rate:
|
||||
case 0: # assuming non-rotating media is an SSD
|
||||
indicator = WearoutIndicator.SPARE_BLOCKS
|
||||
match disk_protocol:
|
||||
case "ATA":
|
||||
attr_table = smartctl_output["ata_smart_attributes"]["table"]
|
||||
for a in attr_table:
|
||||
if a["name"] == "Available_Reservd_Space":
|
||||
value = a["value"]
|
||||
threshold = a["thresh"]
|
||||
break
|
||||
else:
|
||||
raise Exception(f"no Available_Reservd_Space on ATA SSD")
|
||||
case "NVMe":
|
||||
value = smartctl_output["nvme_smart_health_information_log"]["available_spare"]
|
||||
threshold = smartctl_output["nvme_smart_health_information_log"]["available_spare_threshold"]
|
||||
case _:
|
||||
indicator = WearoutIndicator.REALLOCATED_SECTORS
|
||||
match disk_protocol:
|
||||
case "ATA":
|
||||
attr_table = smartctl_output["ata_smart_attributes"]["table"]
|
||||
for a in attr_table:
|
||||
if a["name"] == "Reallocated_Sector_Ct":
|
||||
value = a["value"]
|
||||
threshold = a["thresh"]
|
||||
break
|
||||
else:
|
||||
raise Exception(f"no Reallocated_Sector_Ct on ATA HDD")
|
||||
case "NVMe": # ? NVMe HDDs are very rare, if they even exist
|
||||
raise NotImplementedError
|
||||
|
||||
return WearoutReading(indicator, current_reading=value, threshold_reading=threshold)
|
||||
|
||||
|
||||
def get_wearout_reading(disk: str) -> WearoutReading:
    """Run ``smartctl -ja`` on *disk* and turn its JSON output into a WearoutReading.

    Args:
        disk: The disk identifier passed to smartctl as its device argument.

    Returns:
        The wearout reading parsed from smartctl's standard output.
    """
    completed = subprocess.run(["smartctl", "-ja", disk], capture_output=True)
    raw_json = completed.stdout.decode("utf-8")
    parsed_output = json.loads(raw_json)
    return _get_wearout_reading_from_smartctl_output(parsed_output)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue