Add alert tools, mutes, short status, and backup summary
This commit is contained in:
9
deploy.sh
Normal file
9
deploy.sh
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/usr/bin/env bash
# Deploy the bot: fast-forward the checkout on the remote host and restart
# the tg-bot systemd unit.
#
# The target can be overridden without editing this file:
#   DEPLOY_SSH_HOST  user@host to connect to   (default: root@10.10.10.10)
#   DEPLOY_SSH_PORT  SSH port                  (default: 1090)
#   DEPLOY_APP_DIR   checkout path on the host (default: /opt/tg-bot)
set -euo pipefail

# ${VAR:-default} keeps the previous hard-coded values when nothing is set
# and stays safe under `set -u`.
SSH_HOST="${DEPLOY_SSH_HOST:-root@10.10.10.10}"
SSH_PORT="${DEPLOY_SSH_PORT:-1090}"
APP_DIR="${DEPLOY_APP_DIR:-/opt/tg-bot}"

# --ff-only makes the pull fail (and, via &&, skips the restart) if the remote
# checkout has diverged from upstream.
ssh -p "$SSH_PORT" "$SSH_HOST" \
  "cd \"$APP_DIR\" && git pull --ff-only && systemctl restart tg-bot"
|
||||
95
handlers/alerts_admin.py
Normal file
95
handlers/alerts_admin.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from aiogram import F
|
||||
from aiogram.types import Message
|
||||
from app import dp, bot, cfg, ADMIN_ID
|
||||
from auth import is_admin_msg
|
||||
from services.alert_mute import set_mute, clear_mute, list_mutes
|
||||
from services.incidents import read_recent
|
||||
from services.notify import notify
|
||||
|
||||
|
||||
# Usage text sent for a bare "/alerts" and for unknown sub-commands.
# Optional arguments are shown in [brackets]; the handler falls back to
# 60 minutes for "mute" and 24 hours for "recent" when they are omitted.
HELP_TEXT = (
    "Alerts:\n"
    "/alerts test <critical|warn|info> - send test alert\n"
    "/alerts mute <category> [minutes] - mute alerts for category (default 60m)\n"
    "/alerts unmute <category> - unmute category\n"
    "/alerts list - show active mutes (alias: mutes)\n"
    "/alerts recent [hours] - show incidents log (default 24h)\n"
    "Categories: load, disk, smart, ssl, docker, test\n"
)
|
||||
|
||||
|
||||
# Telegram rejects messages longer than 4096 characters; stay well below that
# so multi-unit characters (emoji) cannot push a chunk over the limit.
_ALERTS_MSG_LIMIT = 3500


def _pack_lines(lines: list[str], limit: int = _ALERTS_MSG_LIMIT) -> list[str]:
    """Group lines into newline-joined chunks of at most ``limit`` characters."""
    chunks: list[str] = []
    current = ""
    for line in lines:
        # A single over-long line cannot be kept whole: hard-split it.
        while len(line) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(line[:limit])
            line = line[limit:]
        candidate = f"{current}\n{line}" if current else line
        if len(candidate) > limit:
            chunks.append(current)
            current = line
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


@dp.message(F.text.startswith("/alerts"))
async def alerts_cmd(msg: Message):
    """Handle the admin-only ``/alerts`` command family.

    Sub-commands (second word of the message, case-insensitive):

    * ``test [level]``   - send a test alert through ``notify``.
    * ``mute <category> [minutes]`` - mute a category (default 60 minutes).
    * ``unmute <category>`` - remove a mute.
    * ``list`` / ``mutes`` - show active mutes with remaining time.
    * ``recent [hours]`` - show the incident log (default 24 hours).

    Anything else, including a bare ``/alerts``, answers with ``HELP_TEXT``.
    Messages from non-admins are ignored silently.
    """
    if not is_admin_msg(msg):
        return

    parts = msg.text.split()
    if len(parts) < 2:
        await msg.answer(HELP_TEXT)
        return

    action = parts[1].lower()

    if action == "test":
        level = parts[2].lower() if len(parts) >= 3 else "info"
        if level not in ("critical", "warn", "info"):
            level = "info"
        # The timestamp makes every test key distinct between invocations.
        key = f"test:{level}:{int(time.time())}"
        await notify(bot, msg.chat.id, f"[TEST] {level.upper()} alert", level=level, key=key, category="test")
        await msg.answer(f"Sent test alert: {level}")
        return

    if action == "mute":
        if len(parts) < 3:
            await msg.answer("Usage: /alerts mute <category> [minutes]")
            return
        category = parts[2].lower()
        minutes = 60
        if len(parts) >= 4:
            try:
                minutes = max(1, int(parts[3]))
            except ValueError:
                # Unparseable duration: keep the default; the reply below
                # states the duration that was actually applied.
                minutes = 60
        until = set_mute(category, minutes * 60)
        # Show the deadline in the host's local timezone.
        dt = datetime.fromtimestamp(until, tz=timezone.utc).astimezone()
        await msg.answer(f"🔕 Muted {category} for {minutes}m (until {dt:%Y-%m-%d %H:%M:%S})")
        return

    if action == "unmute":
        if len(parts) < 3:
            await msg.answer("Usage: /alerts unmute <category>")
            return
        category = parts[2].lower()
        clear_mute(category)
        await msg.answer(f"🔔 Unmuted {category}")
        return

    if action in ("list", "mutes"):
        mutes = list_mutes()
        if not mutes:
            await msg.answer("🔔 No active mutes")
            return
        lines = ["🔕 Active mutes:"]
        for cat, secs in mutes.items():
            # Round up so a mute with under a minute left is not shown as 0m.
            mins = (max(0, secs) + 59) // 60
            lines.append(f"- {cat}: {mins}m left")
        await msg.answer("\n".join(lines))
        return

    if action == "recent":
        hours = 24
        if len(parts) >= 3:
            try:
                hours = max(1, int(parts[2]))
            except ValueError:
                hours = 24
        rows = read_recent(cfg, hours, limit=50)
        if not rows:
            await msg.answer(f"No incidents in last {hours}h")
            return
        # Up to 50 rows may not fit into one Telegram message; send as many
        # messages as needed, splitting on line boundaries.
        for chunk in _pack_lines(["🧾 Incidents:", *rows]):
            await msg.answer(chunk)
        return

    await msg.answer(HELP_TEXT)
|
||||
@@ -37,6 +37,16 @@ def _sudo_cmd(cmd: list[str]) -> list[str]:
|
||||
return ["sudo", "-E"] + cmd
|
||||
|
||||
|
||||
def _format_backup_result(rc: int, out: str) -> str:
|
||||
log_hint = "log: /var/log/backup-auto.log"
|
||||
header = "✅ Backup finished" if rc == 0 else "❌ Backup failed"
|
||||
lines = out.strip().splitlines()
|
||||
body = "\n".join(lines[:20])
|
||||
if len(lines) > 20:
|
||||
body += f"\n… trimmed {len(lines) - 20} lines"
|
||||
return f"{header} (rc={rc})\n{log_hint}\n\n{body}" if body else f"{header} (rc={rc})\n{log_hint}"
|
||||
|
||||
|
||||
def _load_json(raw: str, label: str) -> tuple[bool, object | None, str]:
|
||||
if not raw or not raw.strip():
|
||||
return False, None, f"? {label} returned empty output"
|
||||
@@ -215,7 +225,7 @@ async def cmd_backup_now(msg: Message):
|
||||
use_restic_env=True,
|
||||
timeout=6 * 3600,
|
||||
)
|
||||
await msg.answer(("✅ OK\n" if rc == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
||||
await msg.answer(_format_backup_result(rc, out), reply_markup=backup_kb)
|
||||
finally:
|
||||
release_lock("backup")
|
||||
|
||||
|
||||
24
handlers/config_check.py
Normal file
24
handlers/config_check.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from aiogram import F
|
||||
from aiogram.types import Message
|
||||
from app import dp, cfg
|
||||
from auth import is_admin_msg
|
||||
from services.config_check import validate_cfg
|
||||
|
||||
|
||||
@dp.message(F.text == "/config_check")
async def config_check(msg: Message):
    """Answer ``/config_check`` with the result of ``validate_cfg(cfg)``.

    Errors are listed first, then warnings (separated by a blank line when
    both exist). If neither is reported, a single OK line is sent.
    Non-admin senders get no reply.
    """
    if not is_admin_msg(msg):
        return

    errors, warnings = validate_cfg(cfg)
    report = []

    if errors:
        report.append("❌ Config errors:")
        report.extend(f"- {problem}" for problem in errors)

    if warnings:
        # Visually separate the warning section from the errors above it.
        if report:
            report.append("")
        report.append("⚠️ Warnings:")
        report.extend(f"- {note}" for note in warnings)

    text = "\n".join(report) if report else "✅ Config looks OK"
    await msg.answer(text)
|
||||
@@ -76,6 +76,45 @@ async def st(msg: Message):
|
||||
await cmd_status(msg)
|
||||
|
||||
|
||||
@dp.message(F.text == "/status_short")
async def st_short(msg: Message):
    """Answer ``/status_short`` with a compact host summary.

    The reply contains the hostname, uptime, load averages, RAM usage and
    the second line of ``format_disks()`` output, sent with Markdown parsing
    and the main menu keyboard. Non-admin senders get no reply.
    """
    if not is_admin_msg(msg):
        return

    # Seconds since boot, broken down into days / hours / minutes.
    uptime_total = int(time.time() - psutil.boot_time())
    days, leftover = divmod(uptime_total, 86400)
    hours, leftover = divmod(leftover, 3600)
    minutes = leftover // 60

    load1, load5, load15 = psutil.getloadavg()
    mem = psutil.virtual_memory()

    # presumably line 0 of format_disks() is a heading and line 1 the first
    # disk entry - confirm against format_disks().
    disk_rows = format_disks().splitlines()
    disk_line = "Disks: n/a" if len(disk_rows) <= 1 else disk_rows[1]

    report = "\n".join(
        [
            "📋 **Status (short)**",
            f"🖥 `{socket.gethostname()}`",
            f"⏱ Uptime: {days}d {hours}h {minutes}m",
            f"⚙️ Load: {load1:.2f} {load5:.2f} {load15:.2f}",
            f"🧠 RAM: {mem.percent}% ({mem.used // (1024**3)} / {mem.total // (1024**3)} GiB)",
            f"💾 {disk_line}",
        ]
    )
    await msg.answer(report, reply_markup=menu_kb, parse_mode="Markdown")
|
||||
|
||||
|
||||
@dp.message(F.text == "/health_short")
async def health_short(msg: Message):
    """Answer ``/health_short`` with a one-line digest of the health report.

    ``health(cfg, DOCKER_MAP)`` runs in a worker thread. Of its non-blank
    output lines, lines 2-5 are joined with `` | ``; if there is at most one
    such line the raw report is sent instead. Any exception from the health
    call is reported to the chat rather than raised. Non-admin senders get
    no reply.
    """
    if not is_admin_msg(msg):
        return

    try:
        report = await asyncio.to_thread(health, cfg, DOCKER_MAP)
    except Exception as exc:
        # Handler boundary: surface the failure to the admin instead of raising.
        await msg.answer(f"❌ Health failed: {type(exc).__name__}: {exc}", reply_markup=menu_kb)
        return

    meaningful = [row for row in report.splitlines() if row.strip()]
    if len(meaningful) > 1:
        # Skip the first line and keep at most the next four.
        brief = " | ".join(meaningful[1:5])
    else:
        brief = report

    await msg.answer(f"🩺 Health (short)\n{brief}", reply_markup=menu_kb)
|
||||
|
||||
|
||||
def _rate_str(value: float) -> str:
|
||||
if value >= 1024 * 1024:
|
||||
return f"{value / (1024 * 1024):.2f} MiB/s"
|
||||
|
||||
2
main.py
2
main.py
@@ -25,6 +25,8 @@ import handlers.help
|
||||
import handlers.callbacks
|
||||
import handlers.arcane
|
||||
import handlers.processes
|
||||
import handlers.alerts_admin
|
||||
import handlers.config_check
|
||||
|
||||
|
||||
def _handle_async_exception(_loop, context):
|
||||
|
||||
42
services/alert_mute.py
Normal file
42
services/alert_mute.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import time
|
||||
from typing import Dict
|
||||
|
||||
# category -> unix timestamp until muted
|
||||
_MUTES: Dict[str, float] = {}
|
||||
|
||||
|
||||
def _cleanup() -> None:
|
||||
now = time.time()
|
||||
expired = [k for k, until in _MUTES.items() if until <= now]
|
||||
for k in expired:
|
||||
_MUTES.pop(k, None)
|
||||
|
||||
|
||||
def set_mute(category: str, seconds: int) -> float:
|
||||
_cleanup()
|
||||
until = time.time() + max(0, seconds)
|
||||
_MUTES[category] = until
|
||||
return until
|
||||
|
||||
|
||||
def clear_mute(category: str) -> None:
|
||||
_MUTES.pop(category, None)
|
||||
|
||||
|
||||
def is_muted(category: str | None) -> bool:
|
||||
if not category:
|
||||
return False
|
||||
_cleanup()
|
||||
until = _MUTES.get(category)
|
||||
if until is None:
|
||||
return False
|
||||
if until <= time.time():
|
||||
_MUTES.pop(category, None)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def list_mutes() -> dict[str, int]:
|
||||
_cleanup()
|
||||
now = time.time()
|
||||
return {k: int(until - now) for k, until in _MUTES.items()}
|
||||
@@ -28,27 +28,27 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
||||
usage, mount = worst_disk_usage()
|
||||
if usage is None:
|
||||
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
|
||||
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na")
|
||||
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na", category="disk")
|
||||
state["disk_na"] = True
|
||||
last_sent["disk_na"] = now
|
||||
else:
|
||||
if state["disk_na"] and notify_recovery:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
|
||||
if state["disk_na"] and notify_recovery and not load_only_critical:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
|
||||
state["disk_na"] = False
|
||||
|
||||
if usage >= disk_warn:
|
||||
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
|
||||
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high")
|
||||
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high", category="disk")
|
||||
state["disk_high"] = True
|
||||
last_sent["disk"] = now
|
||||
else:
|
||||
if state["disk_high"] and notify_recovery:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
|
||||
if state["disk_high"] and notify_recovery and not load_only_critical:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
|
||||
state["disk_high"] = False
|
||||
|
||||
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
|
||||
report = await build_disk_report(cfg, mount or "/", usage)
|
||||
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot")
|
||||
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot", category="disk")
|
||||
last_sent["disk_report"] = now
|
||||
|
||||
load = psutil.getloadavg()[0]
|
||||
@@ -63,14 +63,14 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
||||
|
||||
if level == 0:
|
||||
if state["load_level"] > 0 and notify_recovery and not load_only_critical:
|
||||
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok")
|
||||
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok", category="load")
|
||||
state["load_level"] = 0
|
||||
else:
|
||||
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
|
||||
icon = "🔴" if level == 2 else "🟡"
|
||||
level_name = "critical" if level == 2 else "warn"
|
||||
key = "load_high_crit" if level == 2 else "load_high_warn"
|
||||
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key)
|
||||
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
|
||||
last_sent["load"] = now
|
||||
state["load_level"] = level
|
||||
|
||||
@@ -102,6 +102,7 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
||||
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
|
||||
level="critical",
|
||||
key=f"smart_fail:{dev}",
|
||||
category="smart",
|
||||
)
|
||||
last_sent[key] = now
|
||||
continue
|
||||
@@ -118,6 +119,7 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
||||
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
|
||||
level="warn",
|
||||
key=f"smart_hot:{dev}",
|
||||
category="smart",
|
||||
)
|
||||
last_sent[key] = now
|
||||
continue
|
||||
|
||||
33
services/config_check.py
Normal file
33
services/config_check.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
from typing import Any, Tuple, List
|
||||
|
||||
|
||||
def _cfg_section(cfg: dict[str, Any], name: str, errors: List[str]) -> dict[str, Any]:
    """Return ``cfg[name]`` as a dict, treating a missing or null section as empty.

    A section that is present but not a mapping is recorded in ``errors`` and
    also treated as empty, so validation can continue instead of crashing.
    """
    section = cfg.get(name)
    if section is None:
        return {}
    if not isinstance(section, dict):
        errors.append(f"{name} must be a mapping")
        return {}
    return section


def validate_cfg(cfg: dict[str, Any]) -> Tuple[List[str], List[str]]:
    """Check the bot configuration and report problems without raising.

    Args:
        cfg: Parsed configuration mapping.

    Returns:
        ``(errors, warnings)``: two lists of human-readable messages. Errors
        cover missing Telegram credentials and malformed sections; warnings
        cover unset thresholds, a missing restic env file, incomplete
        ``npmplus`` credentials and a missing ``openwrt.host``.
    """
    errors: List[str] = []
    warnings: List[str] = []

    tg = _cfg_section(cfg, "telegram", errors)
    if not tg.get("token"):
        errors.append("telegram.token is missing")
    if not tg.get("admin_id"):
        errors.append("telegram.admin_id is missing")

    thresholds = _cfg_section(cfg, "thresholds", errors)
    for key in ("disk_warn", "load_warn", "high_load_warn"):
        if key not in thresholds:
            warnings.append(f"thresholds.{key} not set")

    paths = _cfg_section(cfg, "paths", errors)
    env_path = paths.get("restic_env")
    if env_path and not os.path.exists(env_path):
        warnings.append(f"paths.restic_env not found: {env_path}")

    # Optional integrations are only checked when their section is non-empty.
    npm = _cfg_section(cfg, "npmplus", errors)
    if npm and not npm.get("token") and (not npm.get("identity") or not npm.get("secret")):
        warnings.append("npmplus: token missing and identity/secret missing")

    ow = _cfg_section(cfg, "openwrt", errors)
    if ow and not ow.get("host"):
        warnings.append("openwrt.host is missing")

    return errors, warnings
|
||||
@@ -150,6 +150,7 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
|
||||
f"⚠️ {alias} health: {health}",
|
||||
level="warn",
|
||||
key=f"docker_health:{alias}",
|
||||
category="docker",
|
||||
)
|
||||
else:
|
||||
await notify(
|
||||
@@ -158,6 +159,7 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
|
||||
f"🐳 {alias}: {status}",
|
||||
level="info",
|
||||
key=f"docker_status:{alias}:{status}",
|
||||
category="docker",
|
||||
)
|
||||
last[alias] = (status, health)
|
||||
await asyncio.sleep(120)
|
||||
|
||||
@@ -2,6 +2,7 @@ import time
|
||||
from datetime import datetime
|
||||
from aiogram import Bot
|
||||
from app import cfg
|
||||
from services.alert_mute import is_muted
|
||||
from services.incidents import log_incident
|
||||
|
||||
|
||||
@@ -37,8 +38,17 @@ def _in_quiet_hours(alerts_cfg: dict) -> bool:
|
||||
return now_min >= start_min or now_min < end_min
|
||||
|
||||
|
||||
async def notify(bot: Bot, chat_id: int, text: str, level: str = "info", key: str | None = None):
|
||||
async def notify(
|
||||
bot: Bot,
|
||||
chat_id: int,
|
||||
text: str,
|
||||
level: str = "info",
|
||||
key: str | None = None,
|
||||
category: str | None = None,
|
||||
):
|
||||
alerts_cfg = cfg.get("alerts", {})
|
||||
if category and is_muted(category):
|
||||
return
|
||||
if _in_quiet_hours(alerts_cfg):
|
||||
allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
|
||||
if not (allow_critical and level == "critical"):
|
||||
|
||||
@@ -53,6 +53,7 @@ async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
|
||||
f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
|
||||
level=level,
|
||||
key=f"ssl:{name}:{threshold}",
|
||||
category="ssl",
|
||||
)
|
||||
last_sent[key] = time.time()
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user