Add alert tools, mutes, short status, and backup summary

This commit is contained in:
2026-02-08 22:43:16 +03:00
parent ae2d085214
commit 972c8eb6a7
12 changed files with 280 additions and 11 deletions

42
services/alert_mute.py Normal file
View File

@@ -0,0 +1,42 @@
import time
from typing import Dict
# Active mutes: alert category -> Unix timestamp (seconds, on the time.time()
# scale) at which the mute for that category expires.
_MUTES: Dict[str, float] = {}
def _cleanup() -> None:
    """Remove every mute whose expiry timestamp is not in the future."""
    cutoff = time.time()
    # Iterate over a snapshot so the dict is never mutated mid-iteration.
    for category, deadline in list(_MUTES.items()):
        if deadline <= cutoff:
            _MUTES.pop(category, None)
def set_mute(category: str, seconds: int) -> float:
    """Mute *category* for *seconds* seconds and return the expiry timestamp.

    Negative durations are clamped to zero, so the stored expiry is never
    earlier than the moment of the call. An existing mute for the same
    category is overwritten.
    """
    _cleanup()
    duration = seconds if seconds > 0 else 0
    deadline = time.time() + duration
    _MUTES[category] = deadline
    return deadline
def clear_mute(category: str) -> None:
    """Remove the mute for *category*; do nothing if it is not muted."""
    if category in _MUTES:
        del _MUTES[category]
def is_muted(category: str | None) -> bool:
    """Return True if *category* currently has an unexpired mute.

    A falsy category (``None`` or an empty string) is never muted. Expired
    entries are purged as a side effect of the lookup.
    """
    if category:
        _cleanup()
        deadline = _MUTES.get(category)
        if deadline is not None:
            if deadline <= time.time():
                # Expired between the cleanup pass and this check.
                _MUTES.pop(category, None)
            else:
                return True
    return False
def list_mutes() -> dict[str, int]:
    """Return the active mutes as ``{category: remaining whole seconds}``.

    Expired entries are purged first. The remaining time is truncated
    toward zero by ``int()``.
    """
    _cleanup()
    reference = time.time()
    remaining: dict[str, int] = {}
    for category, deadline in _MUTES.items():
        remaining[category] = int(deadline - reference)
    return remaining

View File

@@ -28,27 +28,27 @@ async def monitor_resources(cfg, notify, bot, chat_id):
usage, mount = worst_disk_usage()
if usage is None:
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na")
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na", category="disk")
state["disk_na"] = True
last_sent["disk_na"] = now
else:
if state["disk_na"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
if state["disk_na"] and notify_recovery and not load_only_critical:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
state["disk_na"] = False
if usage >= disk_warn:
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high")
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high", category="disk")
state["disk_high"] = True
last_sent["disk"] = now
else:
if state["disk_high"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
if state["disk_high"] and notify_recovery and not load_only_critical:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
state["disk_high"] = False
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
report = await build_disk_report(cfg, mount or "/", usage)
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot")
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot", category="disk")
last_sent["disk_report"] = now
load = psutil.getloadavg()[0]
@@ -63,14 +63,14 @@ async def monitor_resources(cfg, notify, bot, chat_id):
if level == 0:
if state["load_level"] > 0 and notify_recovery and not load_only_critical:
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok")
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok", category="load")
state["load_level"] = 0
else:
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
icon = "🔴" if level == 2 else "🟡"
level_name = "critical" if level == 2 else "warn"
key = "load_high_crit" if level == 2 else "load_high_warn"
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key)
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
last_sent["load"] = now
state["load_level"] = level
@@ -102,6 +102,7 @@ async def monitor_smart(cfg, notify, bot, chat_id):
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
level="critical",
key=f"smart_fail:{dev}",
category="smart",
)
last_sent[key] = now
continue
@@ -118,6 +119,7 @@ async def monitor_smart(cfg, notify, bot, chat_id):
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
level="warn",
key=f"smart_hot:{dev}",
category="smart",
)
last_sent[key] = now
continue

33
services/config_check.py Normal file
View File

@@ -0,0 +1,33 @@
import os
from typing import Any, Tuple, List
def _cfg_section(cfg: dict[str, Any], name: str) -> dict[str, Any]:
    """Return section *name* of *cfg*, or ``{}`` if it is absent or not a mapping."""
    section = cfg.get(name)
    return section if isinstance(section, dict) else {}


def validate_cfg(cfg: dict[str, Any]) -> Tuple[List[str], List[str]]:
    """Check the loaded configuration and report problems without raising.

    Args:
        cfg: Parsed configuration mapping. A section that is missing, null
            (for example an empty ``telegram:`` key in YAML) or not a
            mapping is treated as empty instead of crashing the validator.

    Returns:
        A ``(errors, warnings)`` tuple of human-readable messages. Errors
        cover the required ``telegram.token`` / ``telegram.admin_id``
        settings; warnings cover the optional settings checked below.
    """
    errors: List[str] = []
    warnings: List[str] = []

    # An empty config file may load as None; validate it as an empty mapping.
    if not isinstance(cfg, dict):
        cfg = {}

    tg = _cfg_section(cfg, "telegram")
    if not tg.get("token"):
        errors.append("telegram.token is missing")
    if not tg.get("admin_id"):
        errors.append("telegram.admin_id is missing")

    thresholds = _cfg_section(cfg, "thresholds")
    for key in ("disk_warn", "load_warn", "high_load_warn"):
        if key not in thresholds:
            warnings.append(f"thresholds.{key} not set")

    paths = _cfg_section(cfg, "paths")
    env_path = paths.get("restic_env")
    if env_path and not os.path.exists(env_path):
        warnings.append(f"paths.restic_env not found: {env_path}")

    # npmplus needs either a token or a complete identity/secret pair.
    npm = _cfg_section(cfg, "npmplus")
    has_credentials = bool(npm.get("identity") and npm.get("secret"))
    if npm and not npm.get("token") and not has_credentials:
        warnings.append("npmplus: token missing and identity/secret missing")

    ow = _cfg_section(cfg, "openwrt")
    if ow and not ow.get("host"):
        warnings.append("openwrt.host is missing")

    return errors, warnings

View File

@@ -150,6 +150,7 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
f"⚠️ {alias} health: {health}",
level="warn",
key=f"docker_health:{alias}",
category="docker",
)
else:
await notify(
@@ -158,6 +159,7 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
f"🐳 {alias}: {status}",
level="info",
key=f"docker_status:{alias}:{status}",
category="docker",
)
last[alias] = (status, health)
await asyncio.sleep(120)

View File

@@ -2,6 +2,7 @@ import time
from datetime import datetime
from aiogram import Bot
from app import cfg
from services.alert_mute import is_muted
from services.incidents import log_incident
@@ -37,8 +38,17 @@ def _in_quiet_hours(alerts_cfg: dict) -> bool:
return now_min >= start_min or now_min < end_min
async def notify(bot: Bot, chat_id: int, text: str, level: str = "info", key: str | None = None):
async def notify(
bot: Bot,
chat_id: int,
text: str,
level: str = "info",
key: str | None = None,
category: str | None = None,
):
alerts_cfg = cfg.get("alerts", {})
if category and is_muted(category):
return
if _in_quiet_hours(alerts_cfg):
allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
if not (allow_critical and level == "critical"):

View File

@@ -53,6 +53,7 @@ async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
level=level,
key=f"ssl:{name}:{threshold}",
category="ssl",
)
last_sent[key] = time.time()
break