From 972c8eb6a71b02751856a978636a5581bbeb5bf9 Mon Sep 17 00:00:00 2001 From: benya Date: Sun, 8 Feb 2026 22:43:16 +0300 Subject: [PATCH] Add alert tools, mutes, short status, and backup summary --- deploy.sh | 9 ++++ handlers/alerts_admin.py | 95 ++++++++++++++++++++++++++++++++++++++++ handlers/backup.py | 12 ++++- handlers/config_check.py | 24 ++++++++++ handlers/status.py | 39 +++++++++++++++++ main.py | 2 + services/alert_mute.py | 42 ++++++++++++++++++ services/alerts.py | 20 +++++---- services/config_check.py | 33 ++++++++++++++ services/docker.py | 2 + services/notify.py | 12 ++++- services/ssl_alerts.py | 1 + 12 files changed, 280 insertions(+), 11 deletions(-) create mode 100644 deploy.sh create mode 100644 handlers/alerts_admin.py create mode 100644 handlers/config_check.py create mode 100644 services/alert_mute.py create mode 100644 services/config_check.py diff --git a/deploy.sh b/deploy.sh new file mode 100644 index 0000000..2c6a031 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euo pipefail + +SSH_HOST="root@10.10.10.10" +SSH_PORT="1090" +APP_DIR="/opt/tg-bot" + +ssh -p "$SSH_PORT" "$SSH_HOST" \ + "cd \"$APP_DIR\" && git pull --ff-only && systemctl restart tg-bot" diff --git a/handlers/alerts_admin.py b/handlers/alerts_admin.py new file mode 100644 index 0000000..03f9a2e --- /dev/null +++ b/handlers/alerts_admin.py @@ -0,0 +1,95 @@ +import time +from datetime import datetime, timedelta, timezone +from aiogram import F +from aiogram.types import Message +from app import dp, bot, cfg, ADMIN_ID +from auth import is_admin_msg +from services.alert_mute import set_mute, clear_mute, list_mutes +from services.incidents import read_recent +from services.notify import notify + + +HELP_TEXT = ( + "Alerts:\n" + "/alerts test - send test alert\n" + "/alerts mute - mute alerts for category\n" + "/alerts unmute - unmute category\n" + "/alerts list - show active mutes\n" + "/alerts recent [hours] - show incidents log (default 24h)\n" + "Categories: load, disk, smart, ssl, docker, test\n" +) + + +@dp.message(F.text.startswith("/alerts")) +async def alerts_cmd(msg: Message): + if not is_admin_msg(msg): + return + + parts = msg.text.split() + if len(parts) < 2: + await msg.answer(HELP_TEXT) + return + + action = parts[1].lower() + + if action == "test": + level = parts[2].lower() if len(parts) >= 3 else "info" + if level not in ("critical", "warn", "info"): + level = "info" + key = f"test:{level}:{int(time.time())}" + await notify(bot, msg.chat.id, f"[TEST] {level.upper()} alert", level=level, key=key, category="test") + await msg.answer(f"Sent test alert: {level}") + return + + if action == "mute": + if len(parts) < 3: + await msg.answer("Usage: /alerts mute ") + return + category = parts[2].lower() + minutes = 60 + if len(parts) >= 4: + try: + minutes = max(1, int(parts[3])) + except ValueError: + minutes = 60 + until = set_mute(category, minutes * 60) + dt = datetime.fromtimestamp(until, tz=timezone.utc).astimezone() + await msg.answer(f"๐Ÿ”• Muted {category} for {minutes}m (until {dt:%Y-%m-%d %H:%M:%S})") + return + + if action == "unmute": + if len(parts) < 3: + await msg.answer("Usage: /alerts unmute ") + return + category = parts[2].lower() + clear_mute(category) + await msg.answer(f"๐Ÿ”” Unmuted {category}") + return + + if action in ("list", "mutes"): + mutes = list_mutes() + if not mutes: + await msg.answer("๐Ÿ”” No active mutes") + return + lines = ["๐Ÿ”• Active mutes:"] + for cat, secs in mutes.items(): + mins = max(0, secs) // 60 + lines.append(f"- {cat}: {mins}m left") + await msg.answer("\n".join(lines)) + return + + if action == "recent": + hours = 24 + if len(parts) >= 3: + try: + hours = max(1, int(parts[2])) + except ValueError: + hours = 24 + rows = read_recent(cfg, hours, limit=50) + if not rows: + await msg.answer(f"No incidents in last {hours}h") + return + await msg.answer("๐Ÿงพ Incidents:\n" + "\n".join(rows)) + return + + await msg.answer(HELP_TEXT) diff --git a/handlers/backup.py b/handlers/backup.py index 0981591..0230528 100644 --- a/handlers/backup.py +++ b/handlers/backup.py @@ -37,6 +37,16 @@ def _sudo_cmd(cmd: list[str]) -> list[str]: return ["sudo", "-E"] + cmd +def _format_backup_result(rc: int, out: str) -> str: + log_hint = "log: /var/log/backup-auto.log" + header = "โœ… Backup finished" if rc == 0 else "โŒ Backup failed" + lines = out.strip().splitlines() + body = "\n".join(lines[:20]) + if len(lines) > 20: + body += f"\nโ€ฆ trimmed {len(lines) - 20} lines" + return f"{header} (rc={rc})\n{log_hint}\n\n{body}" if body else f"{header} (rc={rc})\n{log_hint}" + + def _load_json(raw: str, label: str) -> tuple[bool, object | None, str]: if not raw or not raw.strip(): return False, None, f"? {label} returned empty output" @@ -215,7 +225,7 @@ async def cmd_backup_now(msg: Message): use_restic_env=True, timeout=6 * 3600, ) - await msg.answer(("โœ… OK\n" if rc == 0 else "โŒ FAIL\n") + out, reply_markup=backup_kb) + await msg.answer(_format_backup_result(rc, out), reply_markup=backup_kb) finally: release_lock("backup") diff --git a/handlers/config_check.py b/handlers/config_check.py new file mode 100644 index 0000000..3cbe31f --- /dev/null +++ b/handlers/config_check.py @@ -0,0 +1,24 @@ +from aiogram import F +from aiogram.types import Message +from app import dp, cfg +from auth import is_admin_msg +from services.config_check import validate_cfg + + +@dp.message(F.text == "/config_check") +async def config_check(msg: Message): + if not is_admin_msg(msg): + return + errors, warnings = validate_cfg(cfg) + lines = [] + if errors: + lines.append("โŒ Config errors:") + lines += [f"- {e}" for e in errors] + if warnings: + if lines: + lines.append("") + lines.append("โš ๏ธ Warnings:") + lines += [f"- {w}" for w in warnings] + if not lines: + lines.append("โœ… Config looks OK") + await msg.answer("\n".join(lines)) diff --git a/handlers/status.py b/handlers/status.py index d8fe503..e2a967d 100644 --- a/handlers/status.py +++ b/handlers/status.py @@ -76,6 +76,45 @@ async def st(msg: Message): await cmd_status(msg) +@dp.message(F.text == "/status_short") +async def st_short(msg: Message): + if not is_admin_msg(msg): + return + now = time.time() + uptime_sec = int(now - psutil.boot_time()) + days, rem = divmod(uptime_sec, 86400) + hours, rem = divmod(rem, 3600) + minutes, _ = divmod(rem, 60) + load1, load5, load15 = psutil.getloadavg() + mem = psutil.virtual_memory() + disks = format_disks().splitlines() + disk_line = disks[1] if len(disks) > 1 else "Disks: n/a" + await msg.answer( + "๐Ÿ“‹ **Status (short)**\n" + f"๐Ÿ–ฅ `{socket.gethostname()}`\n" + f"โฑ Uptime: {days}d {hours}h {minutes}m\n" + f"โš™๏ธ Load: {load1:.2f} {load5:.2f} {load15:.2f}\n" + f"๐Ÿง  RAM: {mem.percent}% ({mem.used // (1024**3)} / {mem.total // (1024**3)} GiB)\n" + f"๐Ÿ’พ {disk_line}", + reply_markup=menu_kb, + parse_mode="Markdown", + ) + + +@dp.message(F.text == "/health_short") +async def health_short(msg: Message): + if not is_admin_msg(msg): + return + try: + text = await asyncio.to_thread(health, cfg, DOCKER_MAP) + except Exception as e: + await msg.answer(f"โŒ Health failed: {type(e).__name__}: {e}", reply_markup=menu_kb) + return + lines = [ln for ln in text.splitlines() if ln.strip()] + brief = " | ".join(lines[1:5]) if len(lines) > 1 else text + await msg.answer(f"๐Ÿฉบ Health (short)\n{brief}", reply_markup=menu_kb) + + def _rate_str(value: float) -> str: if value >= 1024 * 1024: return f"{value / (1024 * 1024):.2f} MiB/s" diff --git a/main.py b/main.py index 1055127..cbb0e07 100644 --- a/main.py +++ b/main.py @@ -25,6 +25,8 @@ import handlers.help import handlers.callbacks import handlers.arcane import handlers.processes +import handlers.alerts_admin +import handlers.config_check def _handle_async_exception(_loop, context): diff --git a/services/alert_mute.py b/services/alert_mute.py new file mode 100644 index 0000000..131c61c --- /dev/null +++ b/services/alert_mute.py @@ -0,0 +1,42 @@ +import time +from typing import Dict + +# category -> unix timestamp until muted +_MUTES: Dict[str, float] = {} + + +def _cleanup() -> None: + now = time.time() + expired = [k for k, until in _MUTES.items() if until <= now] + for k in expired: + _MUTES.pop(k, None) + + +def set_mute(category: str, seconds: int) -> float: + _cleanup() + until = time.time() + max(0, seconds) + _MUTES[category] = until + return until + + +def clear_mute(category: str) -> None: + _MUTES.pop(category, None) + + +def is_muted(category: str | None) -> bool: + if not category: + return False + _cleanup() + until = _MUTES.get(category) + if until is None: + return False + if until <= time.time(): + _MUTES.pop(category, None) + return False + return True + + +def list_mutes() -> dict[str, int]: + _cleanup() + now = time.time() + return {k: int(until - now) for k, until in _MUTES.items()} diff --git a/services/alerts.py b/services/alerts.py index f4f2298..2a41b21 100644 --- a/services/alerts.py +++ b/services/alerts.py @@ -28,27 +28,27 @@ async def monitor_resources(cfg, notify, bot, chat_id): usage, mount = worst_disk_usage() if usage is None: if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown: - await notify(bot, chat_id, "โš ๏ธ Disk usage n/a", level="warn", key="disk_na") + await notify(bot, chat_id, "โš ๏ธ Disk usage n/a", level="warn", key="disk_na", category="disk") state["disk_na"] = True last_sent["disk_na"] = now else: - if state["disk_na"] and notify_recovery: - await notify(bot, chat_id, f"๐ŸŸข Disk usage OK ({usage}% {mount})", level="info", key="disk_ok") + if state["disk_na"] and notify_recovery and not load_only_critical: + await notify(bot, chat_id, f"๐ŸŸข Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk") state["disk_na"] = False if usage >= disk_warn: if not state["disk_high"] or now - last_sent["disk"] >= cooldown: - await notify(bot, chat_id, f"๐ŸŸก Disk usage {usage}% ({mount})", level="warn", key="disk_high") + await notify(bot, chat_id, f"๐ŸŸก Disk usage {usage}% ({mount})", level="warn", key="disk_high", category="disk") state["disk_high"] = True last_sent["disk"] = now else: - if state["disk_high"] and notify_recovery: - await notify(bot, chat_id, f"๐ŸŸข Disk usage OK ({usage}% {mount})", level="info", key="disk_ok") + if state["disk_high"] and notify_recovery and not load_only_critical: + await notify(bot, chat_id, f"๐ŸŸข Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk") state["disk_high"] = False if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown: report = await build_disk_report(cfg, mount or "/", usage) - await notify(bot, chat_id, f"๐Ÿ“ฆ Disk snapshot\n\n{report}", level="info", key="disk_snapshot") + await notify(bot, chat_id, f"๐Ÿ“ฆ Disk snapshot\n\n{report}", level="info", key="disk_snapshot", category="disk") last_sent["disk_report"] = now load = psutil.getloadavg()[0] @@ -63,14 +63,14 @@ async def monitor_resources(cfg, notify, bot, chat_id): if level == 0: if state["load_level"] > 0 and notify_recovery and not load_only_critical: - await notify(bot, chat_id, f"๐ŸŸข Load OK: {load:.2f}", level="info", key="load_ok") + await notify(bot, chat_id, f"๐ŸŸข Load OK: {load:.2f}", level="info", key="load_ok", category="load") state["load_level"] = 0 else: if level != state["load_level"] or now - last_sent["load"] >= cooldown: icon = "๐Ÿ”ด" if level == 2 else "๐ŸŸก" level_name = "critical" if level == 2 else "warn" key = "load_high_crit" if level == 2 else "load_high_warn" - await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key) + await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load") last_sent["load"] = now state["load_level"] = level @@ -102,6 +102,7 @@ async def monitor_smart(cfg, notify, bot, chat_id): f"๐Ÿ”ด SMART FAIL {dev}: {health}, ๐ŸŒก {temp}", level="critical", key=f"smart_fail:{dev}", + category="smart", ) last_sent[key] = now continue @@ -118,6 +119,7 @@ async def monitor_smart(cfg, notify, bot, chat_id): f"๐ŸŸก SMART HOT {dev}: {health}, ๐ŸŒก {temp}", level="warn", key=f"smart_hot:{dev}", + category="smart", ) last_sent[key] = now continue diff --git a/services/config_check.py b/services/config_check.py new file mode 100644 index 0000000..6925815 --- /dev/null +++ b/services/config_check.py @@ -0,0 +1,33 @@ +import os +from typing import Any, Tuple, List + + +def validate_cfg(cfg: dict[str, Any]) -> Tuple[List[str], List[str]]: + errors: List[str] = [] + warnings: List[str] = [] + + tg = cfg.get("telegram", {}) + if not tg.get("token"): + errors.append("telegram.token is missing") + if not tg.get("admin_id"): + errors.append("telegram.admin_id is missing") + + thresholds = cfg.get("thresholds", {}) + for key in ("disk_warn", "load_warn", "high_load_warn"): + if key not in thresholds: + warnings.append(f"thresholds.{key} not set") + + paths = cfg.get("paths", {}) + env_path = paths.get("restic_env") + if env_path and not os.path.exists(env_path): + warnings.append(f"paths.restic_env not found: {env_path}") + + npm = cfg.get("npmplus", {}) + if npm and not npm.get("token") and (not npm.get("identity") or not npm.get("secret")): + warnings.append("npmplus: token missing and identity/secret missing") + + ow = cfg.get("openwrt", {}) + if ow and not ow.get("host"): + warnings.append("openwrt.host is missing") + + return errors, warnings diff --git a/services/docker.py b/services/docker.py index a7ceb86..c492df2 100644 --- a/services/docker.py +++ b/services/docker.py @@ -150,6 +150,7 @@ async def docker_watchdog(container_map, notify, bot, chat_id): f"โš ๏ธ {alias} health: {health}", level="warn", key=f"docker_health:{alias}", + category="docker", ) else: await notify( @@ -158,6 +159,7 @@ async def docker_watchdog(container_map, notify, bot, chat_id): f"๐Ÿณ {alias}: {status}", level="info", key=f"docker_status:{alias}:{status}", + category="docker", ) last[alias] = (status, health) await asyncio.sleep(120) diff --git a/services/notify.py b/services/notify.py index 8b79461..a1d200c 100644 --- a/services/notify.py +++ b/services/notify.py @@ -2,6 +2,7 @@ import time from datetime import datetime from aiogram import Bot from app import cfg +from services.alert_mute import is_muted from services.incidents import log_incident @@ -37,8 +38,17 @@ def _in_quiet_hours(alerts_cfg: dict) -> bool: return now_min >= start_min or now_min < end_min -async def notify(bot: Bot, chat_id: int, text: str, level: str = "info", key: str | None = None): +async def notify( + bot: Bot, + chat_id: int, + text: str, + level: str = "info", + key: str | None = None, + category: str | None = None, +): alerts_cfg = cfg.get("alerts", {}) + if category and is_muted(category): + return if _in_quiet_hours(alerts_cfg): allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True)) if not (allow_critical and level == "critical"): diff --git a/services/ssl_alerts.py b/services/ssl_alerts.py index 439aa93..b1915ba 100644 --- a/services/ssl_alerts.py +++ b/services/ssl_alerts.py @@ -53,6 +53,7 @@ async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int): f"โš ๏ธ SSL `{name}` expires in {days_left}d (threshold {threshold}d)", level=level, key=f"ssl:{name}:{threshold}", + category="ssl", ) last_sent[key] = time.time() break