mirror of
https://forgejo.altau.su/lego/lego-monitoring.git
synced 2026-03-10 04:41:10 +00:00
move existing stuff to archive dir (for now)
This commit is contained in:
parent
ae1204449c
commit
4fc491f61a
32 changed files with 0 additions and 0 deletions
53
archive-arch/misc/checkers.py
Normal file
53
archive-arch/misc/checkers.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import asyncio
|
||||
import datetime
|
||||
import logging
|
||||
from typing import Callable, Coroutine
|
||||
|
||||
from alerting import alerts
|
||||
|
||||
|
||||
async def _call_check(check: Callable | Coroutine, *args, **kwargs) -> list[alerts.Alert]:
    """Invoke a check — plain function, async function, or ready coroutine —
    and return the list of alerts it produced.

    Raises TypeError when *check* is neither callable nor a coroutine.
    """
    if isinstance(check, Coroutine):
        # Pre-built coroutine object: arguments are already bound, just await it.
        return await check
    if not isinstance(check, Callable):
        raise TypeError(f"check is {type(check)}, neither function nor coroutine")
    outcome = check(*args, **kwargs)
    if isinstance(outcome, Coroutine):
        # `check` was an async function; calling it produced a coroutine.
        outcome = await outcome
    return outcome
|
||||
|
||||
|
||||
async def interval_checker(check: Callable | Coroutine, interval: datetime.timedelta, *args, **kwargs):
    """Run *check* forever, once per *interval*, sending every alert it returns.

    Extra positional/keyword arguments are forwarded to the check.
    Never returns; intended to be spawned as an asyncio task.
    """
    interval_secs = interval.total_seconds()
    while True:
        logging.info(f"Calling {check.__name__}")
        result = await _call_check(check, *args, **kwargs)
        logging.info(f"Got {len(result)} alerts")
        for alert in result:
            await alerts.send_alert(alert)
        # Sleep starts only after the check finishes, so the effective period
        # is interval + check duration (drifts; not aligned to wall clock).
        await asyncio.sleep(interval_secs)
|
||||
|
||||
|
||||
async def scheduled_checker(
    check: Callable | Coroutine, period: datetime.timedelta, when: datetime.time, *args, **kwargs
):
    """Run *check* forever at local wall-clock time *when*, once per *period*.

    Only a daily period (timedelta(days=1)) is implemented; any other period
    raises NotImplementedError.
    """
    match period:
        # Class pattern: matches any timedelta whose `.days` attribute == 1.
        # NOTE(review): timedelta(days=1, hours=5) would also match this
        # pattern — confirm callers only ever pass exactly one day.
        case datetime.timedelta(days=1):
            while True:
                now = datetime.datetime.now()
                next_datetime = datetime.datetime.combine(datetime.date.today(), when)
                if next_datetime < now:
                    # Today's slot already passed; schedule for tomorrow.
                    next_datetime += datetime.timedelta(days=1)
                logging.info(f"Scheduled to call {check.__name__} at {next_datetime.isoformat()}")
                await asyncio.sleep(
                    (next_datetime - now).total_seconds()
                )  # might be negative at this point, asyncio doesn't care

                logging.info(f"Calling {check.__name__}")
                result = await _call_check(check, *args, **kwargs)
                logging.info(f"Got {len(result)} alerts")
                for alert in result:
                    await alerts.send_alert(alert)
        case _:
            raise NotImplementedError
|
||||
276
archive-arch/misc/checks.py
Normal file
276
archive-arch/misc/checks.py
Normal file
|
|
@ -0,0 +1,276 @@
|
|||
import logging
|
||||
import traceback
|
||||
from datetime import timedelta
|
||||
|
||||
from alerting import alerts
|
||||
from alerting.enum import AlertType, Severity
|
||||
from misc import cvars, docker_registry, sensors, vuln
|
||||
from misc.disks import LVAttr, WearoutIndicator, get_wearout_reading
|
||||
|
||||
IS_TESTING = False
|
||||
|
||||
|
||||
def temp_check() -> list[alerts.Alert]:
    """Check all temperature sensors, returning an alert per breached limit.

    Exceeding `critical_temp` yields a CRITICAL alert, exceeding
    `highest_temp` a WARNING. With IS_TESTING set, every sensor that has a
    threshold produces an alert regardless of the current reading.
    """
    alert_list = []
    temps = sensors.Sensors.get_temperatures()
    for _, sensor_list in temps.items():
        for sensor in sensor_list:
            if sensor.sensor_type == "nct6687":
                continue  # little valuable info and too low limits there, might as well ignore it
            if sensor.critical_temp is not None and (IS_TESTING or sensor.current_temp > sensor.critical_temp):
                alert = alerts.Alert(
                    alert_type=AlertType("TEMP"),
                    message=f"{sensor.sensor_type} {sensor.sensor_label}: {sensor.current_temp}°C > {sensor.critical_temp}°C",
                    severity=Severity.CRITICAL,
                )
            elif sensor.highest_temp is not None and (IS_TESTING or sensor.current_temp > sensor.highest_temp):
                alert = alerts.Alert(
                    alert_type=AlertType("TEMP"),
                    message=f"{sensor.sensor_type} {sensor.sensor_label}: {sensor.current_temp}°C > {sensor.highest_temp}°C",
                    severity=Severity.WARNING,
                )
            else:
                # Within limits (or no thresholds reported): nothing to append.
                continue
            alert_list.append(alert)
    return alert_list
|
||||
|
||||
|
||||
def cpu_check() -> list[alerts.Alert]:
    """Return one alert when current CPU load exceeds a threshold, else []."""
    sensor = sensors.Sensors.get_cpu()
    # Pick the severity and the threshold that was crossed; bail out early
    # when the load is within limits.
    if IS_TESTING or sensor.current_load > sensor.critical_load:
        severity, limit = Severity.CRITICAL, sensor.critical_load
    elif IS_TESTING or sensor.current_load > sensor.highest_load:
        severity, limit = Severity.WARNING, sensor.highest_load
    else:
        return []
    return [
        alerts.Alert(
            alert_type=AlertType("CPU"),
            message=f"{sensor.current_load}% > {limit}%",
            severity=severity,
        )
    ]
|
||||
|
||||
|
||||
def ram_check() -> list[alerts.Alert]:
    """Return one alert when available RAM drops below a threshold, else [].

    Thresholds on the sensor are absolute byte counts; messages render GiB.
    """
    sensor = sensors.Sensors.get_ram()
    if IS_TESTING or sensor.current_avail < sensor.critical_avail:
        alert = alerts.Alert(
            alert_type=AlertType("RAM"),
            message=f"{(sensor.current_avail / 1024**3):.2f} GiB < {(sensor.critical_avail / 1024**3):.2f} GiB",
            severity=Severity.CRITICAL,
        )
    elif IS_TESTING or sensor.current_avail < sensor.warning_avail:
        alert = alerts.Alert(
            alert_type=AlertType("RAM"),
            message=f"{(sensor.current_avail / 1024**3):.2f} GiB < {(sensor.warning_avail / 1024**3):.2f} GiB",
            severity=Severity.WARNING,
        )
    else:
        return []
    return [alert]
|
||||
|
||||
|
||||
async def vuln_check() -> list[alerts.Alert]:
|
||||
vulns = await vuln.get_vulns()
|
||||
alert_list = []
|
||||
for v in vulns:
|
||||
if IS_TESTING or v.fixed or v.severity in (vuln.Severity.HIGH, vuln.Severity.CRITICAL):
|
||||
match v.severity:
|
||||
case vuln.Severity.LOW:
|
||||
severity = Severity.INFO
|
||||
case vuln.Severity.MEDIUM:
|
||||
severity = Severity.WARNING
|
||||
case vuln.Severity.HIGH | vuln.Severity.CRITICAL:
|
||||
severity = Severity.CRITICAL
|
||||
message = f"{v.id}: {v.vuln_type} in {','.join(v.packages)}"
|
||||
html_message = f"<a href='{v.link}'>{v.id}</a>: {v.vuln_type} in {','.join(v.packages)}"
|
||||
if v.fixed:
|
||||
message.append(f" -- update to {v.fixed} ASAP")
|
||||
html_message.append(f" -- update to {v.fixed} ASAP")
|
||||
alert = alerts.Alert(
|
||||
alert_type=AlertType.VULN,
|
||||
message=message,
|
||||
html_message=html_message,
|
||||
severity=severity,
|
||||
)
|
||||
alert_list.append(alert)
|
||||
return alert_list
|
||||
|
||||
|
||||
async def ups_check() -> list[alerts.Alert]:
    """Check UPS battery charge and status flags, returning matching alerts.

    Battery charge below the critical/warning percentage yields a
    CRITICAL/WARNING alert; overload, on-battery, trim and boost status
    flags yield individual alerts. Returns [] when no UPS data is available.
    """
    sensor = await sensors.Sensors.get_ups()

    if not sensor:
        # BUG FIX: previously returned bare None despite the list annotation;
        # callers iterate the result and call len() on it, which crashed.
        return []

    alert_list = []

    if IS_TESTING or sensor.battery_charge_percentage < sensor.battery_critical_percentage:
        alert_list.append(
            alerts.Alert(
                alert_type=AlertType.UPS,
                message=f"Battery is under {sensor.battery_critical_percentage}%\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
                severity=Severity.CRITICAL,
            )
        )
    elif IS_TESTING or sensor.battery_charge_percentage < sensor.battery_warning_percentage:
        alert_list.append(
            alerts.Alert(
                alert_type=AlertType.UPS,
                message=f"Battery is under {sensor.battery_warning_percentage}%\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
                severity=Severity.WARNING,
            )
        )

    # ups_status stays None when upsc printed no "ups.status" line; treat
    # that as "no flags" rather than crashing on iteration.
    for status in sensor.ups_status or []:
        if IS_TESTING or status == sensors.UPSStatus.UPS_OVERLOAD:
            alert_list.append(
                alerts.Alert(alert_type=AlertType.UPS, message="UPS is overloaded!", severity=Severity.CRITICAL)
            )
        elif IS_TESTING or status == sensors.UPSStatus.ON_BATTERY:
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.UPS,
                    message=f"UPS is on battery.\n{sensor.battery_charge_percentage}% ({timedelta(seconds=sensor.battery_runtime)}) remaining.",
                    severity=Severity.INFO,
                )
            )
        elif IS_TESTING or status == sensors.UPSStatus.UPS_TRIM:
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.UPS,
                    message="Overvoltage detected: trimming voltage to nominal.",
                    severity=Severity.INFO,
                )
            )
        elif IS_TESTING or status == sensors.UPSStatus.UPS_BOOST:
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.UPS,
                    message="Undervoltage detected: boosting voltage to nominal.",
                    severity=Severity.INFO,
                )
            )

    return alert_list
|
||||
|
||||
|
||||
async def docker_registry_check() -> list[alerts.Alert]:
    """Emit one INFO alert for every watched docker image with a new version."""
    updated_images = await docker_registry.get_updated_images()
    return [
        alerts.Alert(
            alert_type=AlertType.UPDATE,
            message=f"{image} docker image: new version available",
            severity=Severity.INFO,
        )
        for image in updated_images
    ]
|
||||
|
||||
|
||||
def raid_check() -> list[alerts.Alert]:
    """Check the health of every configured RAID LV via `lvs`.

    Emits CRITICAL alerts for partial/unknown health, WARNING for states that
    need maintenance, and a CRITICAL ERROR alert when an LV cannot be
    inspected at all. Healthy LVs contribute nothing.
    """
    check_config = cvars.config.get().checks.raid
    alert_list = []
    for lv in check_config.lvs:
        try:
            lv_attr = LVAttr.from_cli(lv)
        except Exception as exc:
            # lvs failed or its output was unparsable for this LV; report and
            # move on to the next one.
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.ERROR,
                    message=f"Could not check RAID LV {lv}: {repr(exc)}, see logs",
                    severity=Severity.CRITICAL,
                )
            )
            logging.error(traceback.format_exc())
            continue

        # sanity check
        if lv_attr.vol_type not in [LVAttr.VolType.RAID, LVAttr.VolType.RAID_NOSYNC]:
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.ERROR,
                    message=f"LV {lv} is not of RAID type",
                    severity=Severity.CRITICAL,
                )
            )
            continue

        if IS_TESTING:
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.RAID,
                    message=f"Test alert: LV {lv} health is {lv_attr.health}",
                    severity=Severity.INFO,
                )
            )

        # Healthy ("-") and any health codes not listed below fall through
        # without an alert.
        match lv_attr.health:
            case LVAttr.Health.PARTIAL:
                alert_list.append(
                    alerts.Alert(
                        alert_type=AlertType.RAID,
                        message=f"LV {lv} operating in partial mode; one of PVs has failed",
                        severity=Severity.CRITICAL,
                    )
                )
            case LVAttr.Health.UNKNOWN:
                alert_list.append(
                    alerts.Alert(
                        alert_type=AlertType.RAID,
                        message=f"LV {lv}'s state is unknown",
                        severity=Severity.CRITICAL,
                    )
                )
            case LVAttr.Health.REFRESH_NEEDED:
                alert_list.append(
                    alerts.Alert(
                        alert_type=AlertType.RAID,
                        message=f"LV {lv} has suffered a write error; run a refresh or replace the failing PV",
                        severity=Severity.WARNING,
                    )
                )
            case LVAttr.Health.MISMATCHES:
                alert_list.append(
                    alerts.Alert(
                        alert_type=AlertType.RAID,
                        message=f"LV {lv} is partially incoherent; run a repairing scrub operation",
                        severity=Severity.WARNING,
                    )
                )

    return alert_list
|
||||
|
||||
|
||||
def disk_wearout_check() -> list[alerts.Alert]:
    """Check SMART wearout for each configured disk.

    Per-disk alert severity comes from the config (disk.severity). Disks
    whose SMART data cannot be read produce a CRITICAL ERROR alert instead.
    """
    check_config = cvars.config.get().checks.wearout
    alert_list = []
    for disk in check_config.disks:
        try:
            wearout_reading = get_wearout_reading(disk.name)
        except Exception as exc:
            alert_list.append(
                alerts.Alert(
                    alert_type=AlertType.ERROR,
                    message=f"Could not check wearout for disk {disk.name}: {repr(exc)}, see logs",
                    severity=Severity.CRITICAL,
                )
            )
            logging.error(traceback.format_exc())
            continue

        # Normalized SMART values count down: current below the vendor
        # threshold means the disk is worn out.
        if IS_TESTING or wearout_reading.current_reading < wearout_reading.threshold_reading:
            match wearout_reading.indicator:
                case WearoutIndicator.REALLOCATED_SECTORS:
                    message = f"Disk {disk.name} has reallocated sectors (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
                case WearoutIndicator.SPARE_BLOCKS:
                    message = f"Disk {disk.name} has too few spare blocks (curr {wearout_reading.current_reading}, thresh {wearout_reading.threshold_reading})"
            alert_list.append(
                alerts.Alert(alert_type=AlertType.DISKS, message=message, severity=Severity[disk.severity])
            )

    return alert_list
|
||||
6
archive-arch/misc/common.py
Normal file
6
archive-arch/misc/common.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import os
import tempfile
from pathlib import Path

# Absolute path of config.json in the repository root (one level above this
# package), with symlinks resolved.
CONFIG_FILE = (Path(os.path.realpath(__file__)).parent / ".." / "config.json").resolve()
# Scratch directory for this tool under the system temp dir.
TMP_DIR = Path(tempfile.gettempdir()) / "lego-monitoring"
|
||||
65
archive-arch/misc/config.py
Normal file
65
archive-arch/misc/config.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
import json
|
||||
from dataclasses import dataclass
|
||||
|
||||
from alt_utils import NestedDeserializableDataclass
|
||||
|
||||
from alerting.enum import Severity
|
||||
from misc.common import CONFIG_FILE
|
||||
|
||||
|
||||
@dataclass
class MatrixConfig:
    """Credentials and target room for the Matrix alerting client."""

    homeserver: str
    user_id: str
    device_id: str
    access_token: str
    room_id: str
|
||||
|
||||
|
||||
@dataclass
class CheckDockerRegistryConfig:
    """Settings for the docker image update check."""

    hub_url: str  # Docker Hub API base URL
    images: list[str]  # "namespace/repo" or "registry/namespace/repo" specs
|
||||
|
||||
|
||||
@dataclass
class CheckRaidConfig:
    """Settings for the RAID LV health check."""

    lvs: list[str]  # LV names as understood by `lvs`
|
||||
|
||||
|
||||
@dataclass
class CheckWearoutDiskConfig:
    """One disk entry for the SMART wearout check."""

    name: str  # device path passed to smartctl
    severity: Severity  # alert severity to use when the disk is worn out
|
||||
|
||||
|
||||
@dataclass
class CheckWearoutConfig(NestedDeserializableDataclass):
    """Settings for the SMART wearout check."""

    disks: list[CheckWearoutDiskConfig]
|
||||
|
||||
|
||||
@dataclass
class CheckLoginConfig:
    """Settings for the login check."""

    hostname: str
|
||||
|
||||
|
||||
@dataclass
class ChecksConfig(NestedDeserializableDataclass):
    """Per-check configuration sections."""

    docker_registry: CheckDockerRegistryConfig
    raid: CheckRaidConfig
    wearout: CheckWearoutConfig
    login: CheckLoginConfig
|
||||
|
||||
|
||||
@dataclass
class Config(NestedDeserializableDataclass):
    """Top-level application configuration (deserialized from config.json)."""

    matrix: MatrixConfig
    checks: ChecksConfig
    disabled_checks: list[str]  # names of checks to skip entirely
|
||||
|
||||
|
||||
def get_config() -> Config:
    """Read CONFIG_FILE and deserialize it into a Config instance."""
    with open(CONFIG_FILE) as config_file:
        return Config.from_dict(json.load(config_file))
|
||||
8
archive-arch/misc/cvars.py
Normal file
8
archive-arch/misc/cvars.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
from contextvars import ContextVar

import nio

from misc.config import Config

# Application-wide configuration, set once at startup and read by the checks.
config: ContextVar[Config] = ContextVar("config")
# Shared Matrix client used by the alerting layer to deliver messages.
matrix_client: ContextVar[nio.AsyncClient] = ContextVar("matrix_client")
|
||||
191
archive-arch/misc/disks.py
Normal file
191
archive-arch/misc/disks.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
import json
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, StrEnum
|
||||
from typing import Optional, Self
|
||||
|
||||
|
||||
@dataclass
|
||||
class LVAttr:
|
||||
"""https://man.archlinux.org/man/lvs.8#NOTES"""
|
||||
|
||||
class VolType(StrEnum):
|
||||
CACHE = "C"
|
||||
MIRRORED = "m"
|
||||
MIRRORED_NOSYNC = "M"
|
||||
ORIGIN = "o"
|
||||
ORIGIN_MERGING_SNAPSHOT = "O"
|
||||
INTEGRITY = "g"
|
||||
RAID = "r"
|
||||
RAID_NOSYNC = "R"
|
||||
SNAPSHOT = "s"
|
||||
MERGING_SNAPSHOT = "S"
|
||||
PVMOVE = "p"
|
||||
VIRTUAL = "v"
|
||||
IMAGE = "i"
|
||||
IMAGE_OUT_OF_SYNC = "I"
|
||||
MIRROR_LOG = "l"
|
||||
CONVERTING = "c"
|
||||
THIN = "V"
|
||||
THIN_POOL = "t"
|
||||
THIN_POOL_DATA = "T"
|
||||
VDO_POOL = "d"
|
||||
VDO_POOL_DATA = "D"
|
||||
METADATA = "e"
|
||||
NORMAL = "-"
|
||||
|
||||
class Permissions(StrEnum):
|
||||
WRITABLE = "w"
|
||||
READONLY = "r"
|
||||
READONLY_ACTIVATED = "R"
|
||||
|
||||
class AllocationPolicy(StrEnum):
|
||||
ANYWHERE = "a"
|
||||
ANYWHERE_LOCKED = "A"
|
||||
CONTIGUOUS = "c"
|
||||
CONTIGUOUS_LOCKED = "C"
|
||||
INHERITED = "i"
|
||||
INHERITED_LOCKED = "I"
|
||||
CLING = "l"
|
||||
CLING_LOCKED = "L"
|
||||
NORMAL = "n"
|
||||
NORMAL_LOCKED = "N"
|
||||
|
||||
class State(StrEnum):
|
||||
ACTIVE = "a"
|
||||
HISTORICAL = "h"
|
||||
SUSPENDED = "s"
|
||||
INVALID_SNAPSHOT = "I"
|
||||
INVALID_SUSPENDED_SNAPSHOT = "S"
|
||||
SNAPSHOT_MERGE_FAILED = "m"
|
||||
SUSPENDED_SNAPSHOT_MERGE_FAILED = "M"
|
||||
DEVICE_PRESENT_NO_TABLES = "d"
|
||||
DEVICE_PRESENT_INACTIVE_TABLE = "i"
|
||||
THIN_POOL_CHECK_NEEDED = "c"
|
||||
SUSPENDED_THIN_POOL_CHECK_NEEDED = "C"
|
||||
UNKNOWN = "X"
|
||||
|
||||
class IsOpen(StrEnum):
|
||||
OPEN = "o"
|
||||
CLOSED = "-"
|
||||
UNKNOWN = "X"
|
||||
|
||||
class TargetType(StrEnum):
|
||||
CACHE = "C"
|
||||
MIRROR = "m"
|
||||
RAID = "r"
|
||||
SNAPSHOT = "s"
|
||||
THIN = "t"
|
||||
UNKNOWN = "u"
|
||||
VIRTUAL = "v"
|
||||
NORMAL = "-"
|
||||
|
||||
class Health(StrEnum):
|
||||
# for all
|
||||
PARTIAL = "p"
|
||||
UNKNOWN = "X"
|
||||
OK = "-"
|
||||
|
||||
# for RAID
|
||||
REFRESH_NEEDED = "r"
|
||||
MISMATCHES = "m"
|
||||
WRITEMOSTLY = "w"
|
||||
RESHAPING = "s"
|
||||
REMOVE = "R"
|
||||
|
||||
# for thin pools and LVs
|
||||
FAILED = "F"
|
||||
OUT_OF_SPACE = "D"
|
||||
METADATA_READ_ONLY = "M"
|
||||
|
||||
# for writecache
|
||||
ERROR = "E"
|
||||
|
||||
vol_type: VolType
|
||||
permissions: Permissions
|
||||
allocation_policy: AllocationPolicy
|
||||
fixed_minor: bool
|
||||
state: State
|
||||
is_open: IsOpen
|
||||
target_type: TargetType
|
||||
zero_before_use: bool
|
||||
health: Health
|
||||
skip_activation: bool
|
||||
name: Optional[str] = None
|
||||
|
||||
@classmethod
|
||||
def from_str(cls, attr_str: str, name: Optional[str] = None) -> Self:
|
||||
kwargs = {}
|
||||
kwargs["vol_type"] = cls.VolType(attr_str[0])
|
||||
kwargs["permissions"] = cls.Permissions(attr_str[1])
|
||||
kwargs["allocation_policy"] = cls.AllocationPolicy(attr_str[2])
|
||||
kwargs["fixed_minor"] = True if attr_str[3] == "m" else False
|
||||
kwargs["state"] = cls.State(attr_str[4])
|
||||
kwargs["is_open"] = cls.IsOpen(attr_str[5])
|
||||
kwargs["target_type"] = cls.TargetType(attr_str[6])
|
||||
kwargs["zero_before_use"] = True if attr_str[7] == "z" else False
|
||||
kwargs["health"] = cls.Health(attr_str[8])
|
||||
kwargs["skip_activation"] = True if attr_str[9] == "k" else False
|
||||
kwargs["name"] = name
|
||||
return cls(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_cli(cls, name: str) -> Self:
|
||||
json_obj = json.loads(subprocess.run(["lvs", "--reportformat=json", name], capture_output=True).stdout)
|
||||
attr_str = json_obj["report"][0]["lv"][0]["lv_attr"]
|
||||
return cls.from_str(attr_str, name)
|
||||
|
||||
|
||||
class WearoutIndicator(Enum):
    """Which SMART metric a wearout reading is based on."""

    REALLOCATED_SECTORS = 0  # used for rotating disks
    SPARE_BLOCKS = 1  # used for SSDs
|
||||
|
||||
|
||||
@dataclass
class WearoutReading:
    """Normalized SMART wearout value and its vendor threshold.

    Values count down with wear: current_reading below threshold_reading
    means the disk should be replaced.
    """

    indicator: WearoutIndicator
    current_reading: int
    threshold_reading: int
||||
|
||||
|
||||
def _get_wearout_reading_from_smartctl_output(smartctl_output: dict) -> WearoutReading:
|
||||
disk_protocol = smartctl_output["device"]["protocol"]
|
||||
rotation_rate = smartctl_output.get("rotation_rate", 0)
|
||||
match rotation_rate:
|
||||
case 0: # assuming non-rotating media is an SSD
|
||||
indicator = WearoutIndicator.SPARE_BLOCKS
|
||||
match disk_protocol:
|
||||
case "ATA":
|
||||
attr_table = smartctl_output["ata_smart_attributes"]["table"]
|
||||
for a in attr_table:
|
||||
if a["name"] == "Available_Reservd_Space":
|
||||
value = a["value"]
|
||||
threshold = a["thresh"]
|
||||
break
|
||||
else:
|
||||
raise Exception(f"no Available_Reservd_Space on ATA SSD")
|
||||
case "NVMe":
|
||||
value = smartctl_output["nvme_smart_health_information_log"]["available_spare"]
|
||||
threshold = smartctl_output["nvme_smart_health_information_log"]["available_spare_threshold"]
|
||||
case _:
|
||||
indicator = WearoutIndicator.REALLOCATED_SECTORS
|
||||
match disk_protocol:
|
||||
case "ATA":
|
||||
attr_table = smartctl_output["ata_smart_attributes"]["table"]
|
||||
for a in attr_table:
|
||||
if a["name"] == "Reallocated_Sector_Ct":
|
||||
value = a["value"]
|
||||
threshold = a["thresh"]
|
||||
break
|
||||
else:
|
||||
raise Exception(f"no Reallocated_Sector_Ct on ATA HDD")
|
||||
case "NVMe": # ? NVMe HDDs are very rare, if they even exist
|
||||
raise NotImplementedError
|
||||
|
||||
return WearoutReading(indicator, current_reading=value, threshold_reading=threshold)
|
||||
|
||||
|
||||
def get_wearout_reading(disk: str) -> WearoutReading:
    """Run `smartctl -ja` on *disk* (a device path) and extract its wearout reading.

    NOTE(review): subprocess.run is not check=True'd; a smartctl failure
    surfaces as a JSON decode error instead — confirm acceptable.
    """
    smartctl_output = json.loads(subprocess.run(["smartctl", "-ja", disk], capture_output=True).stdout.decode("utf-8"))
    wearout_reading = _get_wearout_reading_from_smartctl_output(smartctl_output)
    return wearout_reading
|
||||
129
archive-arch/misc/docker_registry.py
Normal file
129
archive-arch/misc/docker_registry.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
import datetime
|
||||
import ipaddress
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import socket
|
||||
import traceback
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import uplink
|
||||
|
||||
from alerting import alerts
|
||||
from alerting.enum import AlertType, Severity
|
||||
from misc import cvars
|
||||
|
||||
|
||||
class DockerHubClient(uplink.Consumer):
    """Minimal Docker Hub API client: fetches metadata of a repo's `latest` tag."""

    @uplink.returns.json
    @uplink.get("v2/namespaces/{namespace}/repositories/{repository}/tags/latest")
    def get_latest_tag(self, namespace: uplink.Path, repository: uplink.Path): ...
|
||||
|
||||
|
||||
class DockerRegistryAuthorizer(uplink.Consumer):
    """Fetches bearer tokens from a registry's token endpoint.

    The auth host is resolved and validated first: non-global addresses are
    rejected, preventing a registry from pointing its auth realm at an
    internal address (SSRF).
    """

    @uplink.returns.json
    @uplink.get()
    def _get_token_unprotected(self, service: uplink.Query, scope: uplink.Query): ...

    async def get_token(self, service: Optional[str], scope: Optional[str]) -> str:
        """Return a bearer token after checking the auth host resolves globally.

        Raises when any resolved address is not globally routable.
        """
        host = urlparse(self.session.base_url).hostname
        ips = set()
        try:
            # Host may already be a literal IP address.
            ips.add(ipaddress.ip_address(host))
        except ValueError:
            # BUG FIX: was a bare `except:`; ip_address raises ValueError for
            # hostnames, so catch exactly that and resolve via DNS instead.
            addrinfo = socket.getaddrinfo(host, None)
            for t in addrinfo:
                ips.add(ipaddress.ip_address(t[4][0]))
        for ip in ips:
            if not ip.is_global:
                raise Exception(f"{host} resolved to {ip} which is not global")
        return (await self._get_token_unprotected(service, scope))["token"]
|
||||
|
||||
|
||||
class DockerRegistryClient(uplink.Consumer):
    """Client for a generic Docker Registry HTTP API v2 endpoint."""

    @uplink.get("v2/{namespace}/{repository}/manifests/latest")
    def _test_manifest(self, namespace: uplink.Path, repository: uplink.Path): ...

    @uplink.returns.json
    @uplink.get("v2/{namespace}/{repository}/manifests/latest")
    def _get_manifest(self, namespace: uplink.Path, repository: uplink.Path): ...

    @uplink.get("v2/{namespace}/{repository}/blobs/{digest}")
    def _get_blob(self, namespace: uplink.Path, repository: uplink.Path, digest: uplink.Path): ...

    async def get_auth_requirements(self, namespace: str, repository: str) -> Optional[tuple[str, str, str]]:
        """Probe the manifest endpoint and return auth parameters, or None.

        On a non-2xx response, parses the Www-Authenticate challenge and
        returns (realm, service, scope); service/scope may be None.
        """
        response = await self._test_manifest(namespace, repository)
        if 200 <= response.status_code < 300:
            return None
        # Extract key="value" pairs from the auth challenge header.
        auth_regex = re.compile(r"([^\s,]+) ?[=] ?\"?([^\s,\"]+)\"?")
        auth_keys = dict(auth_regex.findall(response.headers["Www-Authenticate"]))
        return (auth_keys["realm"], auth_keys.get("service", None), auth_keys.get("scope", None))

    async def get_updated_datetime_iso(self, namespace: str, repository: str) -> str:
        """Return the image config blob's "created" timestamp (ISO 8601 string)."""
        manifest = await self._get_manifest(namespace, repository)
        config_digest = manifest["config"]["digest"]
        blob = json.loads(await (await self._get_blob(namespace, repository, digest=config_digest)).content.read())
        return blob["created"]
|
||||
|
||||
|
||||
async def get_updated_images() -> list[str]:
    """Return the configured images that were updated within the last day.

    Image specs are "namespace/repo" (looked up on Docker Hub) or
    "registry/namespace/repo" (looked up on that registry, authenticating
    via a bearer token when challenged). Any query failure sends an ERROR
    alert and aborts the whole check with an empty list.
    """
    check_config = cvars.config.get().checks.docker_registry
    hub_client = DockerHubClient(base_url=check_config.hub_url, client=uplink.AiohttpClient())
    now = datetime.datetime.now(datetime.timezone.utc)

    updated_images = []
    for image in check_config.images:
        image_split = image.split("/")
        match len(image_split):
            case 2:
                # Docker Hub image: use the latest tag's push time.
                namespace, repository = image_split
                try:
                    last_updated_iso = (await hub_client.get_latest_tag(namespace=namespace, repository=repository))[
                        "tag_last_pushed"
                    ]
                except Exception as exc:
                    await alerts.send_alert(
                        alerts.Alert(
                            alert_type=AlertType.ERROR,
                            message=f"Could not query Docker Hub: {repr(exc)}, see logs",
                            severity=Severity.CRITICAL,
                        )
                    )
                    logging.error(traceback.format_exc())
                    # NOTE(review): one failure abandons the entire check and
                    # discards results gathered so far — confirm intended.
                    return []

            case 3:
                # Third-party registry: authenticate when challenged, then
                # read the image config blob's "created" timestamp.
                registry, namespace, repository = image_split
                registry_client = DockerRegistryClient(base_url=f"https://{registry}/", client=uplink.AiohttpClient())
                try:
                    requirements = await registry_client.get_auth_requirements(namespace, repository)

                    if requirements is not None:
                        registry_authorizer = DockerRegistryAuthorizer(
                            base_url=requirements[0], client=uplink.AiohttpClient()
                        )
                        token = await registry_authorizer.get_token(requirements[1], requirements[2])
                        registry_client.session.headers["Authorization"] = f"Bearer {token}"

                    last_updated_iso = await registry_client.get_updated_datetime_iso(
                        namespace=namespace, repository=repository
                    )
                except Exception as exc:
                    await alerts.send_alert(
                        alerts.Alert(
                            alert_type=AlertType.ERROR,
                            message=f"Could not query Docker registry {registry}: {repr(exc)}, see logs",
                            severity=Severity.CRITICAL,
                        )
                    )
                    logging.error(traceback.format_exc())
                    return []
            case _:
                raise Exception(f"Invalid image spec: {image}")
        last_updated = datetime.datetime.fromisoformat(last_updated_iso)
        logging.info(f"Image {image} last updated at {last_updated}")
        if now - last_updated <= datetime.timedelta(days=1):
            updated_images.append(image)

    return updated_images
|
||||
167
archive-arch/misc/sensors.py
Normal file
167
archive-arch/misc/sensors.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
|
||||
from psutil import cpu_percent, sensors_temperatures, virtual_memory
|
||||
|
||||
from alerting import alerts
|
||||
from alerting.enum import AlertType, Severity
|
||||
|
||||
|
||||
@dataclass
class TemperatureSensor:
    """One temperature reading (°C) with optional warning/critical limits."""

    sensor_type: str  # psutil sensor group, e.g. "nvme", "k10temp"
    sensor_label: str
    current_temp: float
    highest_temp: float | None = None  # warning threshold, if the chip reports one
    critical_temp: float | None = None
|
||||
|
||||
|
||||
@dataclass
class CpuSensor:
    """System-wide CPU load percentage with alert thresholds."""

    current_load: float
    highest_load: float = 90  # warning threshold (%)
    critical_load: float = 95  # critical threshold (%)
|
||||
|
||||
|
||||
@dataclass
class RamSensor:
    """Available RAM reading with alert thresholds (bytes)."""

    current_avail: int
    current_avail_percentage: float
    warning_avail: int = 4 * 1024**3  # 4 GiB
    critical_avail: int = 2 * 1024**3  # 2 GiB
|
||||
|
||||
|
||||
class UPSStatus(StrEnum):
    """https://networkupstools.org/docs/developer-guide.chunked/new-drivers.html#_status_data"""

    ON_LINE = "OL"
    ON_BATTERY = "OB"
    BATTERY_LOW = "LB"
    BATTERY_HIGH = "HB"
    BATTERY_REPLACE = "RB"
    BATTERY_CHARGING = "CHRG"
    BATTERY_DISCHARGING = "DISCHRG"
    UPS_BYPASS = "BYPASS"
    """Battery and connected devices are not protected from power outage!"""
    UPS_OFFLINE = "OFF"
    UPS_OVERLOAD = "OVER"
    UPS_CALIBRATION = "CAL"
    UPS_TRIM = "TRIM"  # trimming incoming overvoltage
    UPS_BOOST = "BOOST"  # boosting incoming undervoltage
    UPS_FSD = "FSD"
|
||||
|
||||
|
||||
@dataclass
class UPSSensor:
    """UPS state snapshot parsed from `upsc` output.

    Fields keep their defaults when the corresponding upsc line is absent.
    """

    ups_status: list[UPSStatus] | None = None  # None until a ups.status line is parsed
    battery_charge_percentage: int | None = None
    battery_warning_percentage: int = 20
    battery_critical_percentage: int = 10
    battery_runtime: int = 1000  # seconds
|
||||
|
||||
|
||||
class Sensors:
    """Static facade over psutil / upsc hardware readings."""

    @staticmethod
    def get_temperatures() -> dict[str, list[TemperatureSensor]]:
        """Collect temperature readings, grouped by psutil sensor type.

        Only the sensor types matched below are translated; anything else
        reported by psutil yields an empty list for its type.
        """
        psutil_temp_sensors = sensors_temperatures()

        temp_sensors = {}

        for s_type, sensors in psutil_temp_sensors.items():
            if s_type not in temp_sensors.keys():
                temp_sensors[s_type] = []
            match (s_type):
                case "nvme":
                    for sensor in sensors:
                        temp_sensors[s_type].append(
                            TemperatureSensor(
                                sensor_type=s_type,
                                sensor_label=sensor.label,
                                current_temp=sensor.current,
                                highest_temp=sensor.high,
                                critical_temp=sensor.critical,
                            )
                        )
                case "amdgpu":
                    temp_sensors[s_type].append(
                        TemperatureSensor(
                            sensor_type=s_type,
                            sensor_label="Integrated GPU",
                            current_temp=sensors[0].current,
                        )
                    )
                case "k10temp":
                    temp_sensors[s_type].append(
                        TemperatureSensor(
                            sensor_type=s_type,
                            sensor_label="AMD CPU",
                            current_temp=sensors[0].current,
                            critical_temp=95.0,  # hardcoded because we have R9 7900X
                        )
                    )
                case "nct6687":
                    # Map the motherboard chip's raw labels to readable names.
                    # ("lables" is a typo kept as-is; it is purely local.)
                    lables = {
                        "AMD TSI Addr 98h": "CPU",
                        "Diode 0 (curr)": "System",
                        "Thermistor 15": "VRM MOSFET",
                        "Thermistor 1": "Platform Controller Hub (Peripherals)",
                        "Thermistor 16": "CPU Socket",
                    }

                    # NOTE(review): assumes the last two entries are the ones
                    # without a mapped label — confirm against this board.
                    for sensor in sensors[:-2]:
                        real_label = lables[sensor.label]
                        temp_sensors[s_type].append(
                            TemperatureSensor(
                                sensor_type=s_type,
                                sensor_label=real_label,
                                current_temp=sensor.current,
                                # psutil reports 0 for missing limits; coerce to None.
                                highest_temp=sensor.high or None,
                                critical_temp=sensor.critical or None,
                            )
                        )

        return temp_sensors

    @staticmethod
    def get_cpu() -> CpuSensor:
        """Return the current system-wide CPU load."""
        return CpuSensor(current_load=cpu_percent())

    @staticmethod
    def get_ram() -> RamSensor:
        """Return current available RAM (bytes and percent)."""
        ram = virtual_memory()
        return RamSensor(current_avail=ram.available, current_avail_percentage=ram.percent)

    @staticmethod
    async def get_ups() -> None | UPSSensor:
        """Query the UPS via `upsc cp1300` and parse its key-value output.

        Returns None (after sending an ERROR alert) when upsc is missing.
        """
        try:
            raw_data = subprocess.run(["upsc", "cp1300"], stdout=subprocess.PIPE, encoding="utf-8")
        except FileNotFoundError:
            await alerts.send_alert(
                alerts.Alert(
                    alert_type=AlertType.ERROR,
                    message="upsc is not installed!",
                    severity=Severity.CRITICAL,
                )
            )
            return None

        sensor_data = UPSSensor()

        for line in raw_data.stdout.splitlines():
            # Lines look like "battery.charge: 100". NOTE(review): a line
            # without ": " would raise ValueError here — confirm upsc output
            # is always well-formed.
            sensor, value = line.split(": ")[:2]
            match sensor:
                case "battery.charge":
                    sensor_data.battery_charge_percentage = int(value)
                case "battery.charge.low":
                    sensor_data.battery_critical_percentage = int(value)
                case "battery.charge.warning":
                    sensor_data.battery_warning_percentage = int(value)
                case "battery.runtime":
                    sensor_data.battery_runtime = int(value)
                case "ups.status":
                    sensor_data.ups_status = [UPSStatus(status) for status in value.split()]
                case _:
                    ...  # ignore all other upsc variables

        return sensor_data
|
||||
68
archive-arch/misc/vuln.py
Normal file
68
archive-arch/misc/vuln.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
from typing import Optional
|
||||
|
||||
from alerting import alerts
|
||||
from alerting.enum import AlertType, Severity
|
||||
|
||||
|
||||
class Severity(StrEnum):
    """arch-audit vulnerability severity levels.

    NOTE(review): this shadows the `Severity` imported from alerting.enum at
    the top of this module — all later references in this file resolve to
    THIS enum; confirm that is intended wherever an alert severity is meant.
    """

    LOW = "Low"
    MEDIUM = "Medium"
    HIGH = "High"
    CRITICAL = "Critical"
|
||||
|
||||
|
||||
@dataclass
class Vulnerability:
    """One vulnerability entry reported by arch-audit."""

    id: str  # AVG identifier
    link: str  # URL to the advisory page
    vuln_type: str
    packages: list[str]  # affected package names
    severity: Severity
    fixed: Optional[str]  # version that fixes it, or None if unfixed
|
||||
|
||||
|
||||
def _parse_arch_audit_output(output: str) -> list[Vulnerability]:
    """Convert `arch-audit --json` output into Vulnerability records."""
    arch_audit_json = json.loads(output)
    vulnerabilities = []
    for v in arch_audit_json:
        vulnerability = Vulnerability(
            id=v["name"],
            link=f"https://security.archlinux.org/{v['name']}",
            vuln_type=v["type"],
            packages=v["packages"],
            # Coerce to the enum so the field matches its annotation; the raw
            # str previously slipped through (StrEnum equality hid it).
            severity=Severity(v["severity"]),
            fixed=v["fixed"],
        )
        vulnerabilities.append(vulnerability)
    return vulnerabilities
|
||||
|
||||
|
||||
async def get_vulns() -> list[Vulnerability]:
    """Run `arch-audit --json` and return the parsed vulnerability list.

    Sends an ERROR alert and returns [] when arch-audit is missing or fails.
    """
    # BUG FIX: the module-level alerting Severity is shadowed by the vuln
    # Severity enum defined above, so the alerts below were built with the
    # wrong enum. Re-import the alerting one under an unambiguous name.
    from alerting.enum import Severity as AlertingSeverity

    try:
        arch_audit_output = subprocess.check_output(["arch-audit", "--json"])
    except FileNotFoundError:
        await alerts.send_alert(
            alerts.Alert(
                alert_type=AlertType.ERROR,
                message="arch-audit not installed!",
                severity=AlertingSeverity.CRITICAL,
            )
        )
        return []
    except Exception as exc:
        await alerts.send_alert(
            alerts.Alert(
                alert_type=AlertType.ERROR,
                message=f"arch-audit failed to run: {repr(exc)}, see logs",
                severity=AlertingSeverity.CRITICAL,
            )
        )
        logging.error(traceback.format_exc())
        return []
    return _parse_arch_audit_output(arch_audit_output)
|
||||
Loading…
Add table
Add a link
Reference in a new issue