Merge branch 'disk-health' into 'main'

Disk health monitoring (wearout and RAID PV failures)

See merge request lego/lego-monitoring!6
This commit is contained in:
Alex Tau 2024-11-09 15:59:23 +00:00
commit 37d1720758
11 changed files with 1825 additions and 34 deletions

View file

@ -19,15 +19,16 @@ class AlertType(StrEnum):
VULN = "VULN"
LOGIN = "LOGIN" # TODO
SMART = "SMART" # TODO
RAID = "RAID" # TODO
RAID = "RAID"
DISKS = "DISKS"
UPS = "UPS"
UPDATE = "UPDATE"
class Severity(Enum):
    """Alert severity levels, identified by integer values."""

    INFO = 1
    WARNING = 2
    CRITICAL = 3
class Severity(StrEnum):
    """Alert severity levels; each member's string value equals its name."""

    INFO = "INFO"
    WARNING = "WARNING"
    CRITICAL = "CRITICAL"
@dataclass

View file

@ -5,6 +5,23 @@
"images": [
"gitlab/gitlab-ce"
]
},
"raid": {
"lvs": [
"Data/lvol0"
]
},
"wearout": {
"disks": [
{
"name": "/dev/sda",
"severity": "WARNING"
},
{
"name": "/dev/nvme0",
"severity": "CRITICAL"
}
]
}
}
}

View file

@ -1,8 +1,10 @@
import logging
import traceback
from datetime import timedelta
from alerting import alerts
from misc import docker_registry, sensors, vuln
from misc.enums import UPSStatus
from misc import cvars, docker_registry, sensors, vuln
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
IS_TESTING = False
@ -123,13 +125,13 @@ async def ups_check() -> list[alerts.Alert]:
)
for status in sensor.ups_status:
if IS_TESTING or status == UPSStatus.UPS_OVERLOAD:
if IS_TESTING or status == sensors.UPSStatus.UPS_OVERLOAD:
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.UPS, message=f"UPS is overloaded!", severity=alerts.Severity.CRITICAL
)
)
elif IS_TESTING or status == UPSStatus.ON_BATTERY:
elif IS_TESTING or status == sensors.UPSStatus.ON_BATTERY:
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.UPS,
@ -137,7 +139,7 @@ async def ups_check() -> list[alerts.Alert]:
severity=alerts.Severity.INFO,
)
)
elif IS_TESTING or status == UPSStatus.UPS_TRIM:
elif IS_TESTING or status == sensors.UPSStatus.UPS_TRIM:
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.UPS,
@ -145,7 +147,7 @@ async def ups_check() -> list[alerts.Alert]:
severity=alerts.Severity.INFO,
)
)
elif IS_TESTING or status == UPSStatus.UPS_BOOST:
elif IS_TESTING or status == sensors.UPSStatus.UPS_BOOST:
alert_list.append(
alerts.Alert(
alert_type=alerts.AlertType.UPS,
@ -169,3 +171,109 @@ async def docker_registry_check() -> list[alerts.Alert]:
)
)
return alert_list
def raid_check() -> list[alerts.Alert]:
    """Check the health of every RAID logical volume listed in the config.

    The LV names are read from the ``checks`` -> ``raid`` -> ``lvs`` config
    entry and each one is queried through ``LVAttr.from_cli``.

    Returns:
        A list of alerts: an ERROR alert for every LV that could not be
        queried or is not of a RAID volume type, and a RAID alert for every
        LV whose health flag is partial, unknown, refresh-needed or
        mismatched. When ``IS_TESTING`` is set, an additional informational
        RAID alert is produced for each successfully queried RAID LV.
    """
    raid_config = cvars.config.get()["checks"]["raid"]
    raid_vol_types = (LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC)
    # Health flags worth alerting on -> (message template, severity).
    # Any other health value produces no alert.
    health_alerts = {
        LVAttr.Health.PARTIAL: (
            "LV {lv} operating in partial mode; one of PVs has failed",
            alerts.Severity.CRITICAL,
        ),
        LVAttr.Health.UNKNOWN: (
            "LV {lv}'s state is unknown",
            alerts.Severity.CRITICAL,
        ),
        LVAttr.Health.REFRESH_NEEDED: (
            "LV {lv} has suffered a write error; run a refresh or replace the failing PV",
            alerts.Severity.WARNING,
        ),
        LVAttr.Health.MISMATCHES: (
            "LV {lv} is partially incoherent; run a repairing scrub operation",
            alerts.Severity.WARNING,
        ),
    }

    found = []
    for lv in raid_config["lvs"]:
        try:
            lv_attr = LVAttr.from_cli(lv)
        except Exception as exc:
            # Report the failure as an alert and keep the traceback in the log.
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.ERROR,
                    message=f"Could not check RAID LV {lv}: {exc!r}, see logs",
                    severity=alerts.Severity.CRITICAL,
                )
            )
            logging.error(traceback.format_exc())
            continue

        # Sanity check: the configured LV must actually be a RAID volume.
        if lv_attr.vol_type not in raid_vol_types:
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.ERROR,
                    message=f"LV {lv} is not of RAID type",
                    severity=alerts.Severity.CRITICAL,
                )
            )
            continue

        if IS_TESTING:
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.RAID,
                    message=f"Test alert: LV {lv} health is {lv_attr.health}",
                    severity=alerts.Severity.INFO,
                )
            )

        health_alert = health_alerts.get(lv_attr.health)
        if health_alert is not None:
            template, severity = health_alert
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.RAID,
                    message=template.format(lv=lv),
                    severity=severity,
                )
            )
    return found
def disk_wearout_check() -> list[alerts.Alert]:
    """Check the wear-out indicator of every disk listed in the config.

    Disks are read from the ``checks`` -> ``wearout`` -> ``disks`` config
    entry; each entry provides the device ``name`` and the ``severity`` name
    to use when alerting about that disk.

    Returns:
        A list of alerts: an ERROR alert for every disk whose reading could
        not be obtained, and a DISKS alert for every disk whose current
        reading is below its threshold (or for every disk when
        ``IS_TESTING`` is set).
    """
    wearout_config = cvars.config.get()["checks"]["wearout"]
    found = []
    for disk in wearout_config["disks"]:
        try:
            reading = get_wearout_reading(disk["name"])
        except Exception as exc:
            # Report the failure as an alert and keep the traceback in the log.
            found.append(
                alerts.Alert(
                    alert_type=alerts.AlertType.ERROR,
                    message=f"Could not check wearout for disk {disk['name']}: {exc!r}, see logs",
                    severity=alerts.Severity.CRITICAL,
                )
            )
            logging.error(traceback.format_exc())
            continue

        # Nothing to report while the reading has not dropped below the threshold.
        if not IS_TESTING and not (reading.current_reading < reading.threshold_reading):
            continue

        detail = f"(curr {reading.current_reading}, thresh {reading.threshold_reading})"
        if reading.indicator == WearoutIndicator.REALLOCATED_SECTORS:
            message = f"Disk {disk['name']} has reallocated sectors {detail}"
        elif reading.indicator == WearoutIndicator.SPARE_BLOCKS:
            message = f"Disk {disk['name']} has too few spare blocks {detail}"

        # The configured severity string is looked up by member name.
        found.append(
            alerts.Alert(
                alert_type=alerts.AlertType.DISKS,
                message=message,
                severity=alerts.Severity[disk["severity"]],
            )
        )
    return found

189
misc/disks.py Normal file
View file

@ -0,0 +1,189 @@
import json
import subprocess
from dataclasses import dataclass
from enum import Enum, StrEnum
from typing import Optional, Self
@dataclass
class LVAttr:
"""https://man.archlinux.org/man/lvs.8#NOTES"""
class VolType(StrEnum):
CACHE = "C"
MIRRORED = "m"
MIRRORED_NOSYNC = "M"
ORIGIN = "o"
ORIGIN_MERGING_SNAPSHOT = "O"
INTEGRITY = "g"
RAID = "r"
RAID_NOSYNC = "R"
SNAPSHOT = "s"
MERGING_SNAPSHOT = "S"
PVMOVE = "p"
VIRTUAL = "v"
IMAGE = "i"
IMAGE_OUT_OF_SYNC = "I"
MIRROR_LOG = "l"
CONVERTING = "c"
THIN = "V"
THIN_POOL = "t"
THIN_POOL_DATA = "T"
VDO_POOL = "d"
VDO_POOL_DATA = "D"
METADATA = "e"
class Permissions(StrEnum):
WRITABLE = "w"
READONLY = "r"
READONLY_ACTIVATED = "R"
class AllocationPolicy(StrEnum):
ANYWHERE = "a"
ANYWHERE_LOCKED = "A"
CONTIGUOUS = "c"
CONTIGUOUS_LOCKED = "C"
INHERITED = "i"
INHERITED_LOCKED = "I"
CLING = "l"
CLING_LOCKED = "L"
NORMAL = "n"
NORMAL_LOCKED = "N"
class State(StrEnum):
ACTIVE = "a"
HISTORICAL = "h"
SUSPENDED = "s"
INVALID_SNAPSHOT = "I"
INVALID_SUSPENDED_SNAPSHOT = "S"
SNAPSHOT_MERGE_FAILED = "m"
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
DEVICE_PRESENT_NO_TABLES = "d"
DEVICE_PRESENT_INACTIVE_TABLE = "i"
THIN_POOL_CHECK_NEEDED = "c"
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
UNKNOWN = "X"
class IsOpen(StrEnum):
OPEN = "o"
CLOSED = "-"
UNKNOWN = "X"
class TargetType(StrEnum):
CACHE = "C"
MIRROR = "m"
RAID = "r"
SNAPSHOT = "s"
THIN = "t"
UNKNOWN = "u"
VIRTUAL = "v"
class Health(StrEnum):
# for all
PARTIAL = "p"
UNKNOWN = "X"
OK = "-"
# for RAID
REFRESH_NEEDED = "r"
MISMATCHES = "m"
WRITEMOSTLY = "w"
RESHAPING = "s"
REMOVE = "R"
# for thin pools and LVs
FAILED = "F"
OUT_OF_SPACE = "D"
METADATA_READ_ONLY = "M"
# for writecache
ERROR = "E"
vol_type: VolType
permissions: Permissions
allocation_policy: AllocationPolicy
fixed_minor: bool
state: State
is_open: IsOpen
target_type: TargetType
zero_before_use: bool
health: Health
skip_activation: bool
name: Optional[str] = None
@classmethod
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
kwargs = {}
kwargs["vol_type"] = cls.VolType(attr_str[0])
kwargs["permissions"] = cls.Permissions(attr_str[1])
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
kwargs["state"] = cls.State(attr_str[4])
kwargs["is_open"] = cls.IsOpen(attr_str[5])
kwargs["target_type"] = cls.TargetType(attr_str[6])
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
kwargs["health"] = cls.Health(attr_str[8])
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
kwargs["name"] = name
return cls(**kwargs)
@classmethod
def from_cli(cls, name: str) -> Self:
json_obj = json.loads(subprocess.run(["lvs", "--reportformat=json", name], capture_output=True).stdout)
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
return cls.from_str(attr_str, name)
class WearoutIndicator(Enum):
    """Which SMART figure a wear-out reading was taken from."""

    # Used for rotating disks (read from the Reallocated_Sector_Ct attribute).
    REALLOCATED_SECTORS = 0
    # Used for non-rotating disks (ATA Available_Reservd_Space or NVMe available_spare).
    SPARE_BLOCKS = 1
@dataclass
class WearoutReading:
    """A single wear-out measurement of a disk together with its threshold."""

    # Which SMART figure the two readings below come from.
    indicator: WearoutIndicator
    # Current value as reported in the smartctl output.
    current_reading: int
    # Threshold value reported in the smartctl output for the same figure.
    threshold_reading: int
def _find_ata_attribute(smartctl_output: dict, attr_name: str, media: str) -> tuple[int, int]:
    """Return ``(value, thresh)`` of the named entry in the ATA SMART attribute table.

    Raises:
        Exception: If the table has no attribute called ``attr_name``;
            ``media`` ("SSD"/"HDD") is only used in the error message.
    """
    for attr in smartctl_output["ata_smart_attributes"]["table"]:
        if attr["name"] == attr_name:
            return attr["value"], attr["thresh"]
    raise Exception(f"no {attr_name} on ATA {media}")


def _get_wearout_reading_from_smartctl_output(smartctl_output: dict) -> WearoutReading:
    """Extract the wear-out reading from parsed ``smartctl -ja`` JSON output.

    Non-rotating media (``rotation_rate`` missing or 0) is assumed to be an
    SSD and is judged by its spare blocks; rotating media is judged by its
    reallocated sector count.

    Args:
        smartctl_output: The decoded JSON document printed by smartctl.

    Returns:
        The indicator used together with its current and threshold values.

    Raises:
        NotImplementedError: If the device protocol is not supported for the
            detected media type (anything but ATA/NVMe for SSDs, anything but
            ATA for HDDs).
        Exception: If the expected attribute is absent from the ATA table.
        KeyError: If an expected section is missing from the smartctl output.
    """
    disk_protocol = smartctl_output["device"]["protocol"]
    rotation_rate = smartctl_output.get("rotation_rate", 0)

    if rotation_rate == 0:  # assuming non-rotating media is an SSD
        indicator = WearoutIndicator.SPARE_BLOCKS
        if disk_protocol == "ATA":
            value, threshold = _find_ata_attribute(smartctl_output, "Available_Reservd_Space", "SSD")
        elif disk_protocol == "NVMe":
            health_log = smartctl_output["nvme_smart_health_information_log"]
            value = health_log["available_spare"]
            threshold = health_log["available_spare_threshold"]
        else:
            # Previously fell through to an UnboundLocalError on `value`.
            raise NotImplementedError(f"unsupported protocol {disk_protocol!r} for non-rotating disk")
    else:
        indicator = WearoutIndicator.REALLOCATED_SECTORS
        if disk_protocol == "ATA":
            value, threshold = _find_ata_attribute(smartctl_output, "Reallocated_Sector_Ct", "HDD")
        else:
            # Covers NVMe HDDs (very rare, if they even exist) and any other
            # protocol, which previously fell through to an UnboundLocalError.
            raise NotImplementedError(f"unsupported protocol {disk_protocol!r} for rotating disk")

    return WearoutReading(indicator, current_reading=value, threshold_reading=threshold)
def get_wearout_reading(disk: str) -> WearoutReading:
    """Run ``smartctl -ja`` on ``disk`` and return its wear-out reading.

    Args:
        disk: Device path handed to smartctl (e.g. ``"/dev/sda"``).

    Returns:
        The reading extracted from the JSON document smartctl prints.
    """
    completed = subprocess.run(["smartctl", "-ja", disk], capture_output=True)
    report = json.loads(completed.stdout.decode("utf-8"))
    return _get_wearout_reading_from_smartctl_output(report)

View file

@ -1,21 +0,0 @@
from enum import StrEnum
class UPSStatus(StrEnum):
"""https://networkupstools.org/docs/developer-guide.chunked/new-drivers.html#_status_data"""
ON_LINE = "OL"
ON_BATTERY = "OB"
BATTERY_LOW = "LB"
BATTERY_HIGH = "HB"
BATTERY_REPLACE = "RB"
BATTERY_CHARGING = "CHRG"
BATTERY_DISCHARGING = "DISCHRG"
UPS_BYPASS = "BYPASS"
"""Battery and connected devices are not protected from power outage!"""
UPS_OFFLINE = "OFF"
UPS_OVERLOAD = "OVER"
UPS_CALIBRATION = "CAL"
UPS_TRIM = "TRIM"
UPS_BOOST = "BOOST"
UPS_FSD = "FSD"

View file

@ -1,12 +1,11 @@
import subprocess
from dataclasses import dataclass
from enum import StrEnum
from psutil import cpu_percent, sensors_temperatures, virtual_memory
from alerting import alerts
from .enums import UPSStatus
@dataclass
class TemperatureSensor:
@ -32,6 +31,26 @@ class RamSensor:
critical_avail: int = 2 * 1024**3
class UPSStatus(StrEnum):
"""https://networkupstools.org/docs/developer-guide.chunked/new-drivers.html#_status_data"""
ON_LINE = "OL"
ON_BATTERY = "OB"
BATTERY_LOW = "LB"
BATTERY_HIGH = "HB"
BATTERY_REPLACE = "RB"
BATTERY_CHARGING = "CHRG"
BATTERY_DISCHARGING = "DISCHRG"
UPS_BYPASS = "BYPASS"
"""Battery and connected devices are not protected from power outage!"""
UPS_OFFLINE = "OFF"
UPS_OVERLOAD = "OVER"
UPS_CALIBRATION = "CAL"
UPS_TRIM = "TRIM"
UPS_BOOST = "BOOST"
UPS_FSD = "FSD"
@dataclass
class UPSSensor:
ups_status: list[UPSStatus] = None

View file

@ -37,6 +37,8 @@ async def main():
interval_checker(checks.ups_check, datetime.timedelta(minutes=5)),
interval_checker(checks.ram_check, datetime.timedelta(minutes=1)),
interval_checker(checks.vuln_check, datetime.timedelta(days=1)),
interval_checker(checks.raid_check, datetime.timedelta(days=1)),
interval_checker(checks.disk_wearout_check, datetime.timedelta(days=1)),
scheduled_checker(
checks.docker_registry_check, period=datetime.timedelta(days=1), when=datetime.time(hour=0, minute=0)
),

594
tests/smartctl_ata_hdd.json Normal file
View file

@ -0,0 +1,594 @@
{
"json_format_version": [
1,
0
],
"smartctl": {
"version": [
7,
4
],
"pre_release": false,
"svn_revision": "5530",
"platform_info": "x86_64-linux-6.11.3-arch1-1",
"build_info": "(local build)",
"argv": [
"smartctl",
"-ja",
"/dev/sda"
],
"drive_database_version": {
"string": "7.3/5528"
},
"exit_status": 0
},
"local_time": {
"time_t": 1731149584,
"asctime": "Sat Nov 9 13:53:04 2024 MSK"
},
"device": {
"name": "/dev/sda",
"info_name": "/dev/sda [SAT]",
"type": "sat",
"protocol": "ATA"
},
"model_name": "WDC WD20EARZ-00C5XB0",
"serial_number": "WD-WX32D83C15U7",
"wwn": {
"naa": 5,
"oui": 5358,
"id": 8959374949
},
"firmware_version": "01.01A01",
"user_capacity": {
"blocks": 3907029168,
"bytes": 2000398934016
},
"logical_block_size": 512,
"physical_block_size": 4096,
"rotation_rate": 5400,
"form_factor": {
"ata_value": 2,
"name": "3.5 inches"
},
"trim": {
"supported": false
},
"in_smartctl_database": false,
"ata_version": {
"string": "ACS-3 T13/2161-D revision 5",
"major_value": 2046,
"minor_value": 109
},
"sata_version": {
"string": "SATA 3.1",
"value": 126
},
"interface_speed": {
"max": {
"sata_value": 14,
"string": "6.0 Gb/s",
"units_per_second": 60,
"bits_per_unit": 100000000
},
"current": {
"sata_value": 3,
"string": "6.0 Gb/s",
"units_per_second": 60,
"bits_per_unit": 100000000
}
},
"smart_support": {
"available": true,
"enabled": true
},
"smart_status": {
"passed": true
},
"ata_smart_data": {
"offline_data_collection": {
"status": {
"value": 0,
"string": "was never started"
},
"completion_seconds": 19380
},
"self_test": {
"status": {
"value": 0,
"string": "completed without error",
"passed": true
},
"polling_minutes": {
"short": 2,
"extended": 208,
"conveyance": 5
}
},
"capabilities": {
"values": [
123,
3
],
"exec_offline_immediate_supported": true,
"offline_is_aborted_upon_new_cmd": false,
"offline_surface_scan_supported": true,
"self_tests_supported": true,
"conveyance_self_test_supported": true,
"selective_self_test_supported": true,
"attribute_autosave_enabled": true,
"error_logging_supported": true,
"gp_logging_supported": true
}
},
"ata_sct_capabilities": {
"value": 12341,
"error_recovery_control_supported": false,
"feature_control_supported": true,
"data_table_supported": true
},
"ata_smart_attributes": {
"revision": 16,
"table": [
{
"id": 1,
"name": "Raw_Read_Error_Rate",
"value": 200,
"worst": 200,
"thresh": 51,
"when_failed": "",
"flags": {
"value": 47,
"string": "POSR-K ",
"prefailure": true,
"updated_online": true,
"performance": true,
"error_rate": true,
"event_count": false,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 3,
"name": "Spin_Up_Time",
"value": 175,
"worst": 175,
"thresh": 21,
"when_failed": "",
"flags": {
"value": 39,
"string": "POS--K ",
"prefailure": true,
"updated_online": true,
"performance": true,
"error_rate": false,
"event_count": false,
"auto_keep": true
},
"raw": {
"value": 2241,
"string": "2241"
}
},
{
"id": 4,
"name": "Start_Stop_Count",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 11,
"string": "11"
}
},
{
"id": 5,
"name": "Reallocated_Sector_Ct",
"value": 200,
"worst": 200,
"thresh": 140,
"when_failed": "",
"flags": {
"value": 51,
"string": "PO--CK ",
"prefailure": true,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 7,
"name": "Seek_Error_Rate",
"value": 100,
"worst": 253,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 46,
"string": "-OSR-K ",
"prefailure": false,
"updated_online": true,
"performance": true,
"error_rate": true,
"event_count": false,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 9,
"name": "Power_On_Hours",
"value": 96,
"worst": 96,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 3275,
"string": "3275"
}
},
{
"id": 10,
"name": "Spin_Retry_Count",
"value": 100,
"worst": 253,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 11,
"name": "Calibration_Retry_Count",
"value": 100,
"worst": 253,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 12,
"name": "Power_Cycle_Count",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 10,
"string": "10"
}
},
{
"id": 192,
"name": "Power-Off_Retract_Count",
"value": 200,
"worst": 200,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 6,
"string": "6"
}
},
{
"id": 193,
"name": "Load_Cycle_Count",
"value": 200,
"worst": 200,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 28,
"string": "28"
}
},
{
"id": 194,
"name": "Temperature_Celsius",
"value": 112,
"worst": 105,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 34,
"string": "-O---K ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": false,
"auto_keep": true
},
"raw": {
"value": 31,
"string": "31"
}
},
{
"id": 196,
"name": "Reallocated_Event_Count",
"value": 200,
"worst": 200,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 197,
"name": "Current_Pending_Sector",
"value": 200,
"worst": 200,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 198,
"name": "Offline_Uncorrectable",
"value": 100,
"worst": 253,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 48,
"string": "----CK ",
"prefailure": false,
"updated_online": false,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 199,
"name": "UDMA_CRC_Error_Count",
"value": 200,
"worst": 200,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 200,
"name": "Multi_Zone_Error_Rate",
"value": 100,
"worst": 253,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 8,
"string": "---R-- ",
"prefailure": false,
"updated_online": false,
"performance": false,
"error_rate": true,
"event_count": false,
"auto_keep": false
},
"raw": {
"value": 0,
"string": "0"
}
}
]
},
"power_on_time": {
"hours": 3275
},
"power_cycle_count": 10,
"temperature": {
"current": 31
},
"ata_smart_error_log": {
"summary": {
"revision": 1,
"count": 0
}
},
"ata_smart_self_test_log": {
"standard": {
"revision": 1,
"table": [
{
"type": {
"value": 3,
"string": "Conveyance offline"
},
"status": {
"value": 0,
"string": "Completed without error",
"passed": true
},
"lifetime_hours": 0
}
],
"count": 1,
"error_count_total": 0,
"error_count_outdated": 0
}
},
"ata_smart_selective_self_test_log": {
"revision": 1,
"table": [
{
"lba_min": 0,
"lba_max": 0,
"status": {
"value": 0,
"string": "Not_testing"
}
},
{
"lba_min": 0,
"lba_max": 0,
"status": {
"value": 0,
"string": "Not_testing"
}
},
{
"lba_min": 0,
"lba_max": 0,
"status": {
"value": 0,
"string": "Not_testing"
}
},
{
"lba_min": 0,
"lba_max": 0,
"status": {
"value": 0,
"string": "Not_testing"
}
},
{
"lba_min": 0,
"lba_max": 0,
"status": {
"value": 0,
"string": "Not_testing"
}
}
],
"flags": {
"value": 0,
"remainder_scan_enabled": false
},
"power_up_scan_resume_minutes": 0
}
}

680
tests/smartctl_ata_ssd.json Normal file
View file

@ -0,0 +1,680 @@
{
"json_format_version": [
1,
0
],
"smartctl": {
"version": [
7,
4
],
"pre_release": false,
"svn_revision": "5530",
"platform_info": "x86_64-linux-6.11.6-arch1-1",
"build_info": "(local build)",
"argv": [
"smartctl",
"-ja",
"/dev/sda"
],
"drive_database_version": {
"string": "7.3/5528"
},
"exit_status": 0
},
"local_time": {
"time_t": 1731149676,
"asctime": "Sat Nov 9 13:54:36 2024 MSK"
},
"device": {
"name": "/dev/sda",
"info_name": "/dev/sda [SAT]",
"type": "sat",
"protocol": "ATA"
},
"model_family": "WD Blue / Red / Green SSDs",
"model_name": "WDC WDS100T2G0A-00JH30",
"serial_number": "20299A802244",
"wwn": {
"naa": 5,
"oui": 6980,
"id": 37501727029
},
"firmware_version": "UH510000",
"user_capacity": {
"blocks": 1953529856,
"bytes": 1000207286272
},
"logical_block_size": 512,
"physical_block_size": 512,
"rotation_rate": 0,
"form_factor": {
"ata_value": 3,
"name": "2.5 inches"
},
"trim": {
"supported": true,
"deterministic": true,
"zeroed": false
},
"in_smartctl_database": true,
"ata_version": {
"string": "ACS-2 T13/2015-D revision 3",
"major_value": 1008,
"minor_value": 272
},
"sata_version": {
"string": "SATA 3.2",
"value": 255
},
"interface_speed": {
"max": {
"sata_value": 14,
"string": "6.0 Gb/s",
"units_per_second": 60,
"bits_per_unit": 100000000
},
"current": {
"sata_value": 3,
"string": "6.0 Gb/s",
"units_per_second": 60,
"bits_per_unit": 100000000
}
},
"smart_support": {
"available": true,
"enabled": true
},
"smart_status": {
"passed": true
},
"ata_smart_data": {
"offline_data_collection": {
"status": {
"value": 0,
"string": "was never started"
},
"completion_seconds": 120
},
"self_test": {
"status": {
"value": 0,
"string": "completed without error",
"passed": true
},
"polling_minutes": {
"short": 2,
"extended": 182
}
},
"capabilities": {
"values": [
21,
3
],
"exec_offline_immediate_supported": true,
"offline_is_aborted_upon_new_cmd": true,
"offline_surface_scan_supported": false,
"self_tests_supported": true,
"conveyance_self_test_supported": false,
"selective_self_test_supported": false,
"attribute_autosave_enabled": true,
"error_logging_supported": true,
"gp_logging_supported": true
}
},
"ata_smart_attributes": {
"revision": 1,
"table": [
{
"id": 5,
"name": "Reallocated_Sector_Ct",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 9,
"name": "Power_On_Hours",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 7976,
"string": "7976"
}
},
{
"id": 12,
"name": "Power_Cycle_Count",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 1218,
"string": "1218"
}
},
{
"id": 165,
"name": "Block_Erase_Count",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 2063,
"string": "2063"
}
},
{
"id": 166,
"name": "Minimum_PE_Cycles_TLC",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 15,
"string": "15"
}
},
{
"id": 167,
"name": "Max_Bad_Blocks_per_Die",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 168,
"name": "Maximum_PE_Cycles_TLC",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 36,
"string": "36"
}
},
{
"id": 169,
"name": "Total_Bad_Blocks",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 1075,
"string": "1075"
}
},
{
"id": 170,
"name": "Grown_Bad_Blocks",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 171,
"name": "Program_Fail_Count",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 172,
"name": "Erase_Fail_Count",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 173,
"name": "Average_PE_Cycles_TLC",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 15,
"string": "15"
}
},
{
"id": 174,
"name": "Unexpected_Power_Loss",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 144,
"string": "144"
}
},
{
"id": 184,
"name": "End-to-End_Error",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 187,
"name": "Reported_Uncorrect",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 188,
"name": "Command_Timeout",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 194,
"name": "Temperature_Celsius",
"value": 70,
"worst": 58,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 34,
"string": "-O---K ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": false,
"auto_keep": true
},
"raw": {
"value": 249108103198,
"string": "30 (Min/Max 0/58)"
}
},
{
"id": 199,
"name": "UDMA_CRC_Error_Count",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
},
{
"id": 230,
"name": "Media_Wearout_Indicator",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 5879860561241,
"string": "0x055903000559"
}
},
{
"id": 232,
"name": "Available_Reservd_Space",
"value": 100,
"worst": 100,
"thresh": 5,
"when_failed": "",
"flags": {
"value": 51,
"string": "PO--CK ",
"prefailure": true,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 100,
"string": "100"
}
},
{
"id": 233,
"name": "NAND_GB_Written_TLC",
"value": 100,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 15654,
"string": "15654"
}
},
{
"id": 234,
"name": "NAND_GB_Written_SLC",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 32244,
"string": "32244"
}
},
{
"id": 241,
"name": "Host_Writes_GiB",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 48,
"string": "----CK ",
"prefailure": false,
"updated_online": false,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 13847,
"string": "13847"
}
},
{
"id": 242,
"name": "Host_Reads_GiB",
"value": 100,
"worst": 100,
"thresh": 0,
"when_failed": "",
"flags": {
"value": 48,
"string": "----CK ",
"prefailure": false,
"updated_online": false,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 31195,
"string": "31195"
}
},
{
"id": 244,
"name": "Temp_Throttle_Status",
"value": 0,
"worst": 100,
"flags": {
"value": 50,
"string": "-O--CK ",
"prefailure": false,
"updated_online": true,
"performance": false,
"error_rate": false,
"event_count": true,
"auto_keep": true
},
"raw": {
"value": 0,
"string": "0"
}
}
]
},
"power_on_time": {
"hours": 7976
},
"power_cycle_count": 1218,
"temperature": {
"current": 30
},
"ata_smart_error_log": {
"summary": {
"revision": 1,
"count": 0
}
},
"ata_smart_self_test_log": {
"standard": {
"revision": 1,
"count": 0
}
}
}

View file

@ -0,0 +1,145 @@
{
"json_format_version": [
1,
0
],
"smartctl": {
"version": [
7,
4
],
"pre_release": false,
"svn_revision": "5530",
"platform_info": "x86_64-linux-6.11.3-arch1-1",
"build_info": "(local build)",
"argv": [
"smartctl",
"-ja",
"/dev/nvme0"
],
"exit_status": 0
},
"local_time": {
"time_t": 1731149045,
"asctime": "Sat Nov 9 13:44:05 2024 MSK"
},
"device": {
"name": "/dev/nvme0",
"info_name": "/dev/nvme0",
"type": "nvme",
"protocol": "NVMe"
},
"model_name": "Samsung SSD 970 EVO Plus 1TB",
"serial_number": "S4EWNM0W921977B",
"firmware_version": "2B2QEXM7",
"nvme_pci_vendor": {
"id": 5197,
"subsystem_id": 5197
},
"nvme_ieee_oui_identifier": 9528,
"nvme_total_capacity": 1000204886016,
"nvme_unallocated_capacity": 0,
"nvme_controller_id": 4,
"nvme_version": {
"string": "1.3",
"value": 66304
},
"nvme_number_of_namespaces": 1,
"nvme_namespaces": [
{
"id": 1,
"size": {
"blocks": 1953525168,
"bytes": 1000204886016
},
"capacity": {
"blocks": 1953525168,
"bytes": 1000204886016
},
"utilization": {
"blocks": 686279048,
"bytes": 351374872576
},
"formatted_lba_size": 512,
"eui64": {
"oui": 9528,
"ext_id": 383083641036
}
}
],
"user_capacity": {
"blocks": 1953525168,
"bytes": 1000204886016
},
"logical_block_size": 512,
"smart_support": {
"available": true,
"enabled": true
},
"smart_status": {
"passed": true,
"nvme": {
"value": 0
}
},
"nvme_smart_health_information_log": {
"critical_warning": 0,
"temperature": 47,
"available_spare": 100,
"available_spare_threshold": 10,
"percentage_used": 0,
"data_units_read": 111588,
"data_units_written": 1802957,
"host_reads": 2570341,
"host_writes": 36266417,
"controller_busy_time": 133,
"power_cycles": 31,
"power_on_hours": 432,
"unsafe_shutdowns": 18,
"media_errors": 0,
"num_err_log_entries": 63,
"warning_temp_time": 0,
"critical_comp_time": 0,
"temperature_sensors": [
47,
51
]
},
"temperature": {
"current": 47
},
"power_cycle_count": 31,
"power_on_time": {
"hours": 432
},
"nvme_error_information_log": {
"size": 64,
"read": 16,
"unread": 0,
"table": [
{
"error_count": 63,
"submission_queue_id": 0,
"command_id": 8,
"status_field": {
"value": 8194,
"do_not_retry": false,
"status_code_type": 0,
"status_code": 2,
"string": "Invalid Field in Command"
},
"phase_tag": false,
"lba": {
"value": 0
},
"nsid": 0
}
]
},
"nvme_self_test_log": {
"current_self_test_operation": {
"value": 0,
"string": "No self-test in progress"
}
}
}

57
tests/test_disks.py Normal file
View file

@ -0,0 +1,57 @@
import json
import unittest
from misc.disks import (
LVAttr,
WearoutIndicator,
WearoutReading,
_get_wearout_reading_from_smartctl_output,
)
class TestDisks(unittest.TestCase):
    """Tests for lv_attr parsing and for wear-out extraction from smartctl JSON."""

    @staticmethod
    def _reading_from_fixture(path):
        """Load the smartctl JSON fixture at ``path`` and extract its wear-out reading."""
        with open(path) as fixture:
            smartctl_output = json.load(fixture)
        return _get_wearout_reading_from_smartctl_output(smartctl_output)

    def test_lv_attr_declaration(self):
        """A healthy, active, open RAID LV attribute string decodes field by field."""
        expected = LVAttr(
            vol_type=LVAttr.VolType.RAID,
            permissions=LVAttr.Permissions.WRITABLE,
            allocation_policy=LVAttr.AllocationPolicy.INHERITED,
            fixed_minor=False,
            state=LVAttr.State.ACTIVE,
            is_open=LVAttr.IsOpen.OPEN,
            target_type=LVAttr.TargetType.RAID,
            zero_before_use=False,
            health=LVAttr.Health.OK,
            skip_activation=False,
            name="Data/lvol0",
        )
        self.assertEqual(LVAttr.from_str("rwi-aor---", "Data/lvol0"), expected)

    def test_wearout_reading_nvme_ssd(self):
        """NVMe SSDs report spare blocks from the NVMe health log."""
        expected = WearoutReading(
            indicator=WearoutIndicator.SPARE_BLOCKS, current_reading=100, threshold_reading=10
        )
        self.assertEqual(self._reading_from_fixture("tests/smartctl_nvme_ssd.json"), expected)

    def test_wearout_reading_ata_hdd(self):
        """ATA HDDs report the reallocated sector count attribute."""
        expected = WearoutReading(
            indicator=WearoutIndicator.REALLOCATED_SECTORS, current_reading=200, threshold_reading=140
        )
        self.assertEqual(self._reading_from_fixture("tests/smartctl_ata_hdd.json"), expected)

    def test_wearout_reading_ata_ssd(self):
        """ATA SSDs report the available reserved space attribute."""
        expected = WearoutReading(
            indicator=WearoutIndicator.SPARE_BLOCKS, current_reading=100, threshold_reading=5
        )
        self.assertEqual(self._reading_from_fixture("tests/smartctl_ata_ssd.json"), expected)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()