diff --git a/src/lego_monitoring/alerting/sender.py b/src/lego_monitoring/alerting/sender.py
index b9508f9..edd35c0 100644
--- a/src/lego_monitoring/alerting/sender.py
+++ b/src/lego_monitoring/alerting/sender.py
@@ -54,9 +54,28 @@ async def send_alert(alert: Alert, note: str = "") -> None:
     # if temp_client:
     #     await client.close()
 
-    # TODO ping healthchecks if enabled
-    if alert.healthchecks_slug is not None:
-        raise NotImplementedError
+
+async def send_healthchecks_status(alert: Alert) -> None:
+    def get_pinging_key(keys: dict[str, str]):
+        if alert.healthchecks_slug in keys:
+            return keys[alert.healthchecks_slug]
+        else:
+            return keys["default"]
+
+    if alert.healthchecks_slug is None:
+        return
+    try:
+        hc_client = cvars.healthchecks_client.get()
+    except LookupError:
+        raise NotImplementedError  # TODO
+    if hc_client is None:
+        return
+    config = cvars.config.get()
+    key = get_pinging_key(config.alert_channels.healthchecks.pinging_keys)
+    if alert.severity == Severity.OK:
+        await hc_client.success(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
+    else:
+        await hc_client.failure(key, alert.healthchecks_slug, create=True, log=alert.plain_message)
 
 
 # TODO service itself has to be monitored like everything else - with regular pinging - if we're
diff --git a/src/lego_monitoring/checks/cpu.py b/src/lego_monitoring/checks/cpu.py
index 594c1d6..f31cb7b 100644
--- a/src/lego_monitoring/checks/cpu.py
+++ b/src/lego_monitoring/checks/cpu.py
@@ -1,21 +1,27 @@
+from socket import gethostname
+
 from psutil import cpu_percent
 
 from lego_monitoring.alerting.alert import Alert
 from lego_monitoring.alerting.enum import AlertType, Severity
 from lego_monitoring.core import cvars
 
+from .utils import format_for_healthchecks_slug
+
 IS_TESTING = False
 
 
 def cpu_check() -> list[Alert]:
     percentage = cpu_percent()
     config = cvars.config.get().checks.cpu
+    slug = f"{format_for_healthchecks_slug(gethostname())}-cpu"
     if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
         return [
             Alert(
                 alert_type=AlertType.CPU,
                 message=f"CPU load: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
                 severity=Severity.CRITICAL,
+                healthchecks_slug=slug,
             )
         ]
     elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
@@ -24,7 +30,15 @@ def cpu_check() -> list[Alert]:
                 alert_type=AlertType.CPU,
                 message=f"CPU load: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
                 severity=Severity.WARNING,
+                healthchecks_slug=slug,
             )
         ]
     else:
-        return [Alert(alert_type=AlertType.CPU, message=f"CPU load: {percentage:.2f}% (nominal)", severity=Severity.OK)]
+        return [
+            Alert(
+                alert_type=AlertType.CPU,
+                message=f"CPU load: {percentage:.2f}% (nominal)",
+                severity=Severity.OK,
+                healthchecks_slug=slug,
+            )
+        ]
diff --git a/src/lego_monitoring/checks/net.py b/src/lego_monitoring/checks/net.py
index 073bd8f..10c6d03 100644
--- a/src/lego_monitoring/checks/net.py
+++ b/src/lego_monitoring/checks/net.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, field
+from socket import gethostname
 from typing import Optional
 
 from humanize import naturalsize
@@ -8,6 +9,8 @@ from lego_monitoring.alerting.alert import Alert
 from lego_monitoring.alerting.enum import AlertType, Severity
 from lego_monitoring.core import cvars
 
+from .utils import format_for_healthchecks_slug
+
 IS_TESTING = False
 SECONDS_BETWEEN_CHECKS = 5 * 60
 
@@ -25,6 +28,9 @@ class NetIOTracker:
         stat_name: str,
         interface: str,
     ) -> Optional[Alert]:
+        slug = (
+            f"{format_for_healthchecks_slug(gethostname())}-net-{format_for_healthchecks_slug(interface)}-{stat_name}"
+        )
         current_stat_natural = naturalsize(current_stat_bytes_per_sec, binary=True)
         if critical_threshold and (IS_TESTING or current_stat_bytes_per_sec >= critical_threshold):
             critical_threshold_natural = naturalsize(critical_threshold, binary=True)
@@ -32,6 +38,7 @@ class NetIOTracker:
                 alert_type=AlertType.NET,
                 message=f"Interface {interface} {stat_name} {current_stat_natural}/s >= {critical_threshold_natural}/s",
                 severity=Severity.CRITICAL,
+                healthchecks_slug=slug,
             )
         elif warning_threshold and (IS_TESTING or current_stat_bytes_per_sec >= warning_threshold):
             warning_threshold_natural = naturalsize(warning_threshold, binary=True)
@@ -39,12 +46,14 @@ class NetIOTracker:
                 alert_type=AlertType.NET,
                 message=f"Interface {interface} {stat_name} {current_stat_natural}/s >= {warning_threshold_natural}/s",
                 severity=Severity.WARNING,
+                healthchecks_slug=slug,
             )
         else:
             return Alert(
                 alert_type=AlertType.NET,
                 message=f"Interface {interface} {stat_name} {current_stat_natural}/s (nominal)",
                 severity=Severity.OK,
+                healthchecks_slug=slug,
             )
 
     def net_check(self) -> list[Alert]:
diff --git a/src/lego_monitoring/checks/ram.py b/src/lego_monitoring/checks/ram.py
index e91c891..07e8878 100644
--- a/src/lego_monitoring/checks/ram.py
+++ b/src/lego_monitoring/checks/ram.py
@@ -1,21 +1,27 @@
+from socket import gethostname
+
 from psutil import virtual_memory
 
 from lego_monitoring.alerting.alert import Alert
 from lego_monitoring.alerting.enum import AlertType, Severity
 from lego_monitoring.core import cvars
 
+from .utils import format_for_healthchecks_slug
+
 IS_TESTING = False
 
 
 def ram_check() -> list[Alert]:
     percentage = virtual_memory().percent
     config = cvars.config.get().checks.ram
+    slug = f"{format_for_healthchecks_slug(gethostname())}-ram"
     if config.critical_percentage and (IS_TESTING or percentage >= config.critical_percentage):
         return [
             Alert(
                 alert_type=AlertType.RAM,
                 message=f"RAM usage: {percentage:.2f}% >= {config.critical_percentage:.2f}%",
                 severity=Severity.CRITICAL,
+                healthchecks_slug=slug,
             )
         ]
     elif config.warning_percentage and (IS_TESTING or percentage >= config.warning_percentage):
@@ -24,9 +30,15 @@ def ram_check() -> list[Alert]:
                 alert_type=AlertType.RAM,
                 message=f"RAM usage: {percentage:.2f}% >= {config.warning_percentage:.2f}%",
                 severity=Severity.WARNING,
+                healthchecks_slug=slug,
             )
         ]
     else:
         return [
-            Alert(alert_type=AlertType.RAM, message=f"RAM usage: {percentage:.2f}% (nominal)", severity=Severity.OK)
+            Alert(
+                alert_type=AlertType.RAM,
+                message=f"RAM usage: {percentage:.2f}% (nominal)",
+                severity=Severity.OK,
+                healthchecks_slug=slug,
+            )
         ]
diff --git a/src/lego_monitoring/checks/temp/__init__.py b/src/lego_monitoring/checks/temp/__init__.py
index e461148..4cb74bc 100644
--- a/src/lego_monitoring/checks/temp/__init__.py
+++ b/src/lego_monitoring/checks/temp/__init__.py
@@ -1,6 +1,9 @@
+from socket import gethostname
+
 from lego_monitoring.alerting.alert import Alert
 from lego_monitoring.alerting.enum import AlertType, Severity
 
+from ..utils import format_for_healthchecks_slug
 from . import sensors
 
 IS_TESTING = False
@@ -11,23 +14,29 @@ def temp_check() -> list[Alert]:
     temps = sensors.get_readings()
     for sensor, readings in temps.items():
         for r in readings:
+            sensor_slug = format_for_healthchecks_slug(sensor)
+            label_slug = format_for_healthchecks_slug(r.label)
+            slug = f"{format_for_healthchecks_slug(gethostname())}-temp-{sensor_slug}-{label_slug}"
             if r.critical_temp is not None and (IS_TESTING or r.current_temp >= r.critical_temp):
                 alert = Alert(
                     alert_type=AlertType.TEMP,
                     message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.critical_temp}°C",
                     severity=Severity.CRITICAL,
+                    healthchecks_slug=slug,
                 )
             elif r.warning_temp is not None and (IS_TESTING or r.current_temp >= r.warning_temp):
                 alert = Alert(
                     alert_type=AlertType.TEMP,
                     message=f"{sensor} {r.label}: {r.current_temp}°C >= {r.warning_temp}°C",
                     severity=Severity.WARNING,
+                    healthchecks_slug=slug,
                 )
             else:
                 alert = Alert(
                     alert_type=AlertType.TEMP,
                     message=f"{sensor} {r.label}: {r.current_temp}°C (nominal)",
                     severity=Severity.OK,
+                    healthchecks_slug=slug,
                 )
             alert_list.append(alert)
 
diff --git a/src/lego_monitoring/checks/utils.py b/src/lego_monitoring/checks/utils.py
new file mode 100644
index 0000000..140c3be
--- /dev/null
+++ b/src/lego_monitoring/checks/utils.py
@@ -0,0 +1,5 @@
+import re
+
+
+def format_for_healthchecks_slug(s: str) -> str:
+    return re.sub(r"[^a-z0-9_-]", "_", s.lower())
diff --git a/src/lego_monitoring/checks/vulnix/__init__.py b/src/lego_monitoring/checks/vulnix/__init__.py
index cd43c57..2e866e6 100644
--- a/src/lego_monitoring/checks/vulnix/__init__.py
+++ b/src/lego_monitoring/checks/vulnix/__init__.py
@@ -1,7 +1,10 @@
+from socket import gethostname
+
 from lego_monitoring.alerting.alert import Alert
 from lego_monitoring.alerting.enum import AlertType, Severity
 from lego_monitoring.alerting.sender import send_alert
 
+from ..utils import format_for_healthchecks_slug
 from .vulnix import get_vulnix_output
 
 IS_TESTING = False
@@ -9,6 +12,7 @@ IS_TESTING = False
 
 async def vulnix_check() -> list[Alert]:
     alert_list = []
+    slug = f"{format_for_healthchecks_slug(gethostname())}-vulnix"
     try:
         vulnix_output = get_vulnix_output(IS_TESTING)
     except Exception as e:
@@ -17,6 +21,7 @@ async def vulnix_check() -> list[Alert]:
                 alert_type=AlertType.ERROR,
                 message=f"Exception {type(e).__name__} while calling vulnix: {e}",
                 severity=Severity.CRITICAL,
+                healthchecks_slug=slug,
             )
         )
         return []
@@ -29,6 +34,7 @@ async def vulnix_check() -> list[Alert]:
             continue
         message = f"New findings in derivation {finding.derivation}:"
         short_message = f"New findings in {finding.derivation} (short ver):"
+        plain_message = f"New findings in derivation {finding.derivation}:"
         for cve in non_whitelisted_cves:
             if cve in finding.cvssv3_basescore:
                 score_str = f"(CVSSv3 = {finding.cvssv3_basescore[cve]})"
@@ -36,6 +42,7 @@ async def vulnix_check() -> list[Alert]:
                 score_str = "(not scored by CVSSv3)"
             message += f'\n* {cve} - {finding.description[cve]} {score_str}'
             short_message += f'\n * {cve}'
+            plain_message += f"\n* https://nvd.nist.gov/vuln/detail/{cve} - {finding.description[cve]} {score_str}"
 
         if len(message) > 3700:
             message = short_message
@@ -44,6 +51,8 @@ async def vulnix_check() -> list[Alert]:
             alert_type=AlertType.VULN,
             message=message,
             severity=Severity.WARNING,
+            healthchecks_slug=slug,
+            plain_message=plain_message,
         )
         alert_list.append(alert)
 
@@ -51,6 +60,6 @@ async def vulnix_check() -> list[Alert]:
         alert_list[0].message += "\n(just testing)"
         return [alert_list[0]]
     elif len(alert_list) == 0:
-        return [Alert(AlertType.VULN, message="No vulnerabilities found", severity=Severity.OK)]
+        return [Alert(AlertType.VULN, message="No vulnerabilities found", severity=Severity.OK, healthchecks_slug=slug)]
     else:
         return alert_list
diff --git a/src/lego_monitoring/core/checkers.py b/src/lego_monitoring/core/checkers.py
index ccb6c41..8b4bb5a 100644
--- a/src/lego_monitoring/core/checkers.py
+++ b/src/lego_monitoring/core/checkers.py
@@ -7,7 +7,7 @@ from typing import Any, Callable, Coroutine
 from ..alerting.alert import Alert
 from ..alerting.current import CurrentAlerts
 from ..alerting.enum import Severity
-from ..alerting.sender import send_alert
+from ..alerting.sender import send_alert, send_healthchecks_status
 
 
 @dataclass
@@ -75,6 +75,9 @@ class BaseChecker:
             for alert in alerts:
                 await send_alert(alert, note="ongoing")
 
+        for alert in alerts:
+            await send_healthchecks_status(alert)
+
     async def run_checker(self) -> None:
         raise NotImplementedError
 