diff --git a/src/lego_monitoring/alerting/sender.py b/src/lego_monitoring/alerting/sender.py
index b9508f9..edd35c0 100644
--- a/src/lego_monitoring/alerting/sender.py
+++ b/src/lego_monitoring/alerting/sender.py
@@ -54,9 +54,28 @@ async def send_alert(alert: Alert, note: str = "") -> None:
# if temp_client:
# await client.close()
- # TODO ping healthchecks if enabled
- if alert.healthchecks_slug is not None:
- raise NotImplementedError
+
+async def send_healthchecks_status(alert: Alert) -> None:
+    """Ping the healthchecks client with the state carried by *alert*.
+
+    Does nothing when the alert has no ``healthchecks_slug`` or when the
+    client stored in ``cvars.healthchecks_client`` is ``None``.
+
+    The pinging key is looked up in ``config.alert_channels.healthchecks.pinging_keys``
+    under the alert's slug, falling back to the ``"default"`` entry.
+
+    An alert with ``Severity.OK`` is reported through ``hc_client.success``;
+    every other severity is reported through ``hc_client.failure``. Both calls
+    receive ``create=True`` and the alert's ``plain_message`` as ``log``.
+
+    Raises:
+        NotImplementedError: if ``cvars.healthchecks_client`` has no value set
+            (placeholder, see TODO below).
+        KeyError: if ``pinging_keys`` has neither an entry for the slug nor a
+            ``"default"`` entry.
+    """
+    slug = alert.healthchecks_slug
+    if slug is None:
+        return
+    try:
+        hc_client = cvars.healthchecks_client.get()
+    except LookupError:
+        raise NotImplementedError  # TODO
+    if hc_client is None:
+        return
+
+    pinging_keys: dict[str, str] = cvars.config.get().alert_channels.healthchecks.pinging_keys
+    if slug in pinging_keys:
+        key = pinging_keys[slug]
+    elif "default" in pinging_keys:
+        key = pinging_keys["default"]
+    else:
+        # Still a KeyError (as before), but say what is actually missing from the config.
+        raise KeyError(f"no healthchecks pinging key configured for slug {slug!r} and no 'default' key")
+
+    # Same arguments either way; only the reported state differs.
+    ping = hc_client.success if alert.severity == Severity.OK else hc_client.failure
+    await ping(key, slug, create=True, log=alert.plain_message)
# TODO service itself has to be monitored like everything else - with regular pinging - if we're
diff --git a/src/lego_monitoring/checks/cpu.py b/src/lego_monitoring/checks/cpu.py
index 594c1d6..f31cb7b 100644
--- a/src/lego_monitoring/checks/cpu.py
+++ b/src/lego_monitoring/checks/cpu.py
@@ -1,21 +1,27 @@
+from socket import gethostname
+
from psutil import cpu_percent
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
+from .utils import format_for_healthchecks_slug
+
IS_TESTING = False
def cpu_check() -> list[Alert]:
percentage = cpu_percent()
config = cvars.config.get().checks.cpu
+ slug = f"{format_for_healthchecks_slug(gethostname())}-cpu"
if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
return [
Alert(
alert_type=AlertType.CPU,
message=f"CPU load: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
severity=Severity.CRITICAL,
+ healthchecks_slug=slug,
)
]
elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
@@ -24,7 +30,15 @@ def cpu_check() -> list[Alert]:
alert_type=AlertType.CPU,
message=f"CPU load: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
severity=Severity.WARNING,
+ healthchecks_slug=slug,
)
]
else:
- return [Alert(alert_type=AlertType.CPU, message=f"CPU load: {percentage:.2f}% (nominal)", severity=Severity.OK)]
+ return [
+ Alert(
+ alert_type=AlertType.CPU,
+ message=f"CPU load: {percentage:.2f}% (nominal)",
+ severity=Severity.OK,
+ healthchecks_slug=slug,
+ )
+ ]
diff --git a/src/lego_monitoring/checks/net.py b/src/lego_monitoring/checks/net.py
index 073bd8f..10c6d03 100644
--- a/src/lego_monitoring/checks/net.py
+++ b/src/lego_monitoring/checks/net.py
@@ -1,4 +1,5 @@
from dataclasses import dataclass, field
+from socket import gethostname
from typing import Optional
from humanize import naturalsize
@@ -8,6 +9,8 @@ from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
+from .utils import format_for_healthchecks_slug
+
IS_TESTING = False
SECONDS_BETWEEN_CHECKS = 5 * 60
@@ -25,6 +28,9 @@ class NetIOTracker:
stat_name: str,
interface: str,
) -> Optional[Alert]:
+ slug = (
+ f"{format_for_healthchecks_slug(gethostname())}-net-{format_for_healthchecks_slug(interface)}-{stat_name}"
+ )
current_stat_natural = naturalsize(current_stat_bytes_per_sec, binary=True)
if critical_threshold and (IS_TESTING or current_stat_bytes_per_sec >= critical_threshold):
critical_threshold_natural = naturalsize(critical_threshold, binary=True)
@@ -32,6 +38,7 @@ class NetIOTracker:
alert_type=AlertType.NET,
message=f"Interface {interface} {stat_name} {current_stat_natural}/s >= {critical_threshold_natural}/s",
severity=Severity.CRITICAL,
+ healthchecks_slug=slug,
)
elif warning_threshold and (IS_TESTING or current_stat_bytes_per_sec >= warning_threshold):
warning_threshold_natural = naturalsize(warning_threshold, binary=True)
@@ -39,12 +46,14 @@ class NetIOTracker:
alert_type=AlertType.NET,
message=f"Interface {interface} {stat_name} {current_stat_natural}/s >= {warning_threshold_natural}/s",
severity=Severity.WARNING,
+ healthchecks_slug=slug,
)
else:
return Alert(
alert_type=AlertType.NET,
message=f"Interface {interface} {stat_name} {current_stat_natural}/s (nominal)",
severity=Severity.OK,
+ healthchecks_slug=slug,
)
def net_check(self) -> list[Alert]:
diff --git a/src/lego_monitoring/checks/ram.py b/src/lego_monitoring/checks/ram.py
index e91c891..07e8878 100644
--- a/src/lego_monitoring/checks/ram.py
+++ b/src/lego_monitoring/checks/ram.py
@@ -1,21 +1,27 @@
+from socket import gethostname
+
from psutil import virtual_memory
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.core import cvars
+from .utils import format_for_healthchecks_slug
+
IS_TESTING = False
def ram_check() -> list[Alert]:
percentage = virtual_memory().percent
config = cvars.config.get().checks.ram
+ slug = f"{format_for_healthchecks_slug(gethostname())}-ram"
if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
return [
Alert(
alert_type=AlertType.RAM,
message=f"RAM usage: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
severity=Severity.CRITICAL,
+ healthchecks_slug=slug,
)
]
elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
@@ -24,9 +30,15 @@ def ram_check() -> list[Alert]:
alert_type=AlertType.RAM,
message=f"RAM usage: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
severity=Severity.WARNING,
+ healthchecks_slug=slug,
)
]
else:
return [
- Alert(alert_type=AlertType.RAM, message=f"RAM usage: {percentage:.2f}% (nominal)", severity=Severity.OK)
+ Alert(
+ alert_type=AlertType.RAM,
+ message=f"RAM usage: {percentage:.2f}% (nominal)",
+ severity=Severity.OK,
+ healthchecks_slug=slug,
+ )
]
diff --git a/src/lego_monitoring/checks/temp/__init__.py b/src/lego_monitoring/checks/temp/__init__.py
index e461148..4cb74bc 100644
--- a/src/lego_monitoring/checks/temp/__init__.py
+++ b/src/lego_monitoring/checks/temp/__init__.py
@@ -1,6 +1,9 @@
+from socket import gethostname
+
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
+from ..utils import format_for_healthchecks_slug
from . import sensors
IS_TESTING = False
@@ -11,23 +14,29 @@ def temp_check() -> list[Alert]:
temps = sensors.get_readings()
for sensor, readings in temps.items():
for r in readings:
+ sensor_slug = format_for_healthchecks_slug(sensor)
+ label_slug = format_for_healthchecks_slug(r.label)
+ slug = f"{format_for_healthchecks_slug(gethostname())}-temp-{sensor_slug}-{label_slug}"
if r.critical_temp is not None and (IS_TESTING or r.current_temp >= r.critical_temp):
alert = Alert(
alert_type=AlertType.TEMP,
message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.critical_temp}°C",
severity=Severity.CRITICAL,
+ healthchecks_slug=slug,
)
elif r.warning_temp is not None and (IS_TESTING or r.current_temp >= r.warning_temp):
alert = Alert(
alert_type=AlertType.TEMP,
message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.warning_temp}°C",
severity=Severity.WARNING,
+ healthchecks_slug=slug,
)
else:
alert = Alert(
alert_type=AlertType.TEMP,
message=f"{sensor} {r.label}: {r.current_temp}°C (nominal)",
severity=Severity.OK,
+ healthchecks_slug=slug,
)
alert_list.append(alert)
diff --git a/src/lego_monitoring/checks/utils.py b/src/lego_monitoring/checks/utils.py
new file mode 100644
index 0000000..140c3be
--- /dev/null
+++ b/src/lego_monitoring/checks/utils.py
@@ -0,0 +1,5 @@
+import re
+
+
+def format_for_healthchecks_slug(s: str) -> str:
+    """Return *s* lowercased, with every character outside ``a-z0-9_-`` replaced by ``_``."""
+    disallowed = r"[^a-z0-9_-]"
+    lowered = s.lower()
+    return re.sub(disallowed, "_", lowered)
diff --git a/src/lego_monitoring/checks/vulnix/__init__.py b/src/lego_monitoring/checks/vulnix/__init__.py
index cd43c57..2e866e6 100644
--- a/src/lego_monitoring/checks/vulnix/__init__.py
+++ b/src/lego_monitoring/checks/vulnix/__init__.py
@@ -1,7 +1,10 @@
+from socket import gethostname
+
from lego_monitoring.alerting.alert import Alert
from lego_monitoring.alerting.enum import AlertType, Severity
from lego_monitoring.alerting.sender import send_alert
+from ..utils import format_for_healthchecks_slug
from .vulnix import get_vulnix_output
IS_TESTING = False
@@ -9,6 +12,7 @@ IS_TESTING = False
async def vulnix_check() -> list[Alert]:
alert_list = []
+ slug = f"{format_for_healthchecks_slug(gethostname())}-vulnix"
try:
vulnix_output = get_vulnix_output(IS_TESTING)
except Exception as e:
@@ -17,6 +21,7 @@ async def vulnix_check() -> list[Alert]:
alert_type=AlertType.ERROR,
message=f"Exception {type(e).__name__} while calling vulnix: {e}",
severity=Severity.CRITICAL,
+ healthchecks_slug=slug,
)
)
return []
@@ -29,6 +34,7 @@ async def vulnix_check() -> list[Alert]:
continue
message = f"New findings in derivation {finding.derivation}:"
short_message = f"New findings in {finding.derivation} (short ver):"
+ plain_message = f"New findings in derivation {finding.derivation}:"
for cve in non_whitelisted_cves:
if cve in finding.cvssv3_basescore:
score_str = f"(CVSSv3 = {finding.cvssv3_basescore[cve]})"
@@ -36,6 +42,7 @@ async def vulnix_check() -> list[Alert]:
score_str = "(not scored by CVSSv3)"
message += f'\n* {cve} - {finding.description[cve]} {score_str}'
short_message += f'\n * {cve}'
+ plain_message += f"\n* https://nvd.nist.gov/vuln/detail/{cve} - {finding.description[cve]} {score_str}"
if len(message) > 3700:
message = short_message
@@ -44,6 +51,8 @@ async def vulnix_check() -> list[Alert]:
alert_type=AlertType.VULN,
message=message,
severity=Severity.WARNING,
+ healthchecks_slug=slug,
+ plain_message=plain_message,
)
alert_list.append(alert)
@@ -51,6 +60,6 @@ async def vulnix_check() -> list[Alert]:
alert_list[0].message += "\n(just testing)"
return [alert_list[0]]
elif len(alert_list) == 0:
- return [Alert(AlertType.VULN, message="No vulnerabilities found", severity=Severity.OK)]
+ return [Alert(AlertType.VULN, message="No vulnerabilities found", severity=Severity.OK, healthchecks_slug=slug)]
else:
return alert_list
diff --git a/src/lego_monitoring/core/checkers.py b/src/lego_monitoring/core/checkers.py
index ccb6c41..8b4bb5a 100644
--- a/src/lego_monitoring/core/checkers.py
+++ b/src/lego_monitoring/core/checkers.py
@@ -7,7 +7,7 @@ from typing import Any, Callable, Coroutine
from ..alerting.alert import Alert
from ..alerting.current import CurrentAlerts
from ..alerting.enum import Severity
-from ..alerting.sender import send_alert
+from ..alerting.sender import send_alert, send_healthchecks_status
@dataclass
@@ -75,6 +75,9 @@ class BaseChecker:
for alert in alerts:
await send_alert(alert, note="ongoing")
+ for alert in alerts:
+ await send_healthchecks_status(alert)
+
async def run_checker(self) -> None:
raise NotImplementedError