From 65682ca162093248a4f4793d3223f6b3c9802ea3 Mon Sep 17 00:00:00 2001 From: benya Date: Sun, 8 Feb 2026 04:19:28 +0300 Subject: [PATCH] Add quiet hours, health checks, and logging --- CONFIG.en.md | 18 ++++++++++++ CONFIG.md | 18 ++++++++++++ README.en.md | 4 ++- README.md | 4 ++- config.example.yaml | 19 +++++++++++++ handlers/backup.py | 5 +++- handlers/system.py | 6 ++++ main.py | 20 ++++++++++++++ services/alerts.py | 32 +++++++++++++++------ services/docker.py | 16 +++++++++-- services/health.py | 58 +++++++++++++++++++++++++++++++++++++++ services/logging_setup.py | 35 +++++++++++++++++++++++ services/notify.py | 51 +++++++++++++++++++++++++++++++++- services/openwrt.py | 26 +++++++++++++++++- services/ssl_alerts.py | 3 ++ 15 files changed, 299 insertions(+), 16 deletions(-) create mode 100644 services/logging_setup.py diff --git a/CONFIG.en.md b/CONFIG.en.md index 6bb07d1..a991ba9 100644 --- a/CONFIG.en.md +++ b/CONFIG.en.md @@ -23,6 +23,12 @@ This project uses `config.yaml`. Start from `config.example.yaml`. - `enabled` (bool): Enable resource alerts. - `interval_sec` (int): Poll interval. - `cooldown_sec` (int): Cooldown between alerts. +- `notify_cooldown_sec` (int): Global alert dedup cooldown (defaults to `cooldown_sec`). +- `quiet_hours` (object): Quiet hours for non‑critical alerts. + - `enabled` (bool): Enable quiet hours. + - `start` (string): Start time `HH:MM` (e.g. `23:00`). + - `end` (string): End time `HH:MM` (e.g. `08:00`). + - `allow_critical` (bool): Allow critical alerts during quiet hours. - `notify_recovery` (bool): Send recovery notifications. - `smart_enabled` (bool): Enable SMART health polling. - `smart_interval_sec` (int): SMART poll interval. @@ -51,6 +57,18 @@ This project uses `config.yaml`. Start from `config.example.yaml`. - `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday. - `backup_count` (int): How many rotated files to keep. +## logging + +- `enabled` (bool): Enable bot logging. +- `path` (string): Log file path. Default `/var/server-bot/bot.log`. +- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday. +- `backup_count` (int): How many rotated files to keep. +- `level` (string): Log level (`INFO`, `WARNING`, `ERROR`). + +## safety + +- `dry_run` (bool): If `true`, dangerous actions (upgrade/reboot/backup) are skipped. + ## external_checks - `enabled` (bool): Enable background checks. diff --git a/CONFIG.md b/CONFIG.md index 43e8059..98ad097 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -23,6 +23,12 @@ - `enabled` (bool): включить алерты. - `interval_sec` (int): интервал опроса. - `cooldown_sec` (int): кулдаун между алертами. +- `notify_cooldown_sec` (int): глобальный дедуп алертов (по умолчанию `cooldown_sec`). +- `quiet_hours` (object): тихие часы для не‑критичных уведомлений. + - `enabled` (bool): включить тихие часы. + - `start` (string): начало, формат `HH:MM` (например `23:00`). + - `end` (string): конец, формат `HH:MM` (например `08:00`). + - `allow_critical` (bool): слать критичные алерты в тишину. - `notify_recovery` (bool): уведомлять о восстановлении. - `smart_enabled` (bool): SMART проверки. - `smart_interval_sec` (int): интервал SMART. @@ -51,6 +57,18 @@ - `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`. - `backup_count` (int): сколько файлов хранить. +## logging + +- `enabled` (bool): включить лог бота. +- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/bot.log`. +- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`. +- `backup_count` (int): сколько файлов хранить. +- `level` (string): уровень логирования (`INFO`, `WARNING`, `ERROR`). + +## safety + +- `dry_run` (bool): если `true`, опасные действия (upgrade/reboot/backup) не выполняются. + ## external_checks - `enabled` (bool): включить фоновые проверки. diff --git a/README.en.md b/README.en.md index 6bb54c2..143ff23 100644 --- a/README.en.md +++ b/README.en.md @@ -8,8 +8,9 @@ Telegram admin bot for Linux servers. Provides quick status checks, backup contr - Arcane: list projects, refresh, up/down, restart. - Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report. - System: disks, security, URLs health, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus). -- Alerts: disk/load and SMART monitoring with cooldown. +- Alerts: disk/load/SMART with cooldown and quiet hours. - Audit log: all button presses and messages (weekly rotation). +- Logs: bot log rotation and incidents. ## Requirements @@ -68,4 +69,5 @@ GNU GPL v3.0. Full text in `LICENSE`. - For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`. - The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions. +- Enable `safety.dry_run` if you want a safe mode without actions. - Audit log default path is `/var/server-bot/audit.log`. diff --git a/README.md b/README.md index bfc12aa..10f6019 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,9 @@ Telegram-бот администратора для Linux-серверов. Да - Arcane: список проектов, refresh, up/down, restart. - Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report. - Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus). -- Алерты: диск/нагрузка и SMART с cooldown. +- Алерты: диск/нагрузка/SMART с cooldown и quiet hours. - Аудит: все нажатия и сообщения (ротация раз в неделю). +- Логи: ротация логов бота и инциденты. ## Требования @@ -68,4 +69,5 @@ GNU GPL v3.0. Полный текст в `LICENSE`. - Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`. - Бот использует `sudo` для части операций — настрой права. +- Включи `safety.dry_run`, если хочешь безопасный режим без действий. - Аудит по умолчанию пишется в `/var/server-bot/audit.log`. diff --git a/config.example.yaml b/config.example.yaml index 19b6d2c..bc8e4d0 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -17,6 +17,14 @@ alerts: enabled: true interval_sec: 60 cooldown_sec: 900 + # Optional global dedup cooldown for notify() calls + notify_cooldown_sec: 900 + quiet_hours: + enabled: false + start: "23:00" + end: "08:00" + # Allow critical alerts during quiet hours + allow_critical: true notify_recovery: true smart_enabled: true smart_interval_sec: 3600 @@ -42,6 +50,17 @@ incidents: rotate_when: "W0" backup_count: 8 +logging: + enabled: true + path: "/var/server-bot/bot.log" + rotate_when: "W0" + backup_count: 8 + level: "INFO" + +safety: + # If true, dangerous actions will be skipped + dry_run: false + external_checks: enabled: true state_path: "/var/server-bot/external_checks.json" diff --git a/handlers/backup.py b/handlers/backup.py index 6abbf09..0981591 100644 --- a/handlers/backup.py +++ b/handlers/backup.py @@ -4,7 +4,7 @@ import os from datetime import datetime from aiogram import F from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery -from app import dp +from app import dp, cfg from auth import is_admin_msg, is_admin_cb from keyboards import backup_kb from lock_utils import acquire_lock, release_lock @@ -200,6 +200,9 @@ async def cmd_backup_status(msg: Message): async def cmd_backup_now(msg: Message): async def job(): + if cfg.get("safety", {}).get("dry_run", False): + await msg.answer("🧪 Dry-run: backup skipped", reply_markup=backup_kb) + return if not acquire_lock("backup"): await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb) return diff --git a/handlers/system.py b/handlers/system.py index 454bd02..128cc33 100644 --- a/handlers/system.py +++ b/handlers/system.py @@ -438,6 +438,9 @@ async def upgrade_confirm(cb: CallbackQuery): await cb.answer() async def job(): + if cfg.get("safety", {}).get("dry_run", False): + await cb.message.answer("🧪 Dry-run: upgrade skipped", reply_markup=system_ops_kb) + return text = await apply_updates() await cb.message.answer(text, reply_markup=system_ops_kb, parse_mode="Markdown") @@ -506,6 +509,9 @@ async def reboot_password(msg: Message): return async def job(): + if cfg.get("safety", {}).get("dry_run", False): + await msg.answer("🧪 Dry-run: reboot skipped", reply_markup=system_ops_kb) + return await msg.answer("🔄 Rebooting…", reply_markup=system_ops_kb) await run_cmd(["sudo", "reboot"], timeout=10) diff --git a/main.py b/main.py index 5d2a23d..1055127 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import asyncio +import logging import socket from datetime import datetime from app import bot, dp, cfg, ADMIN_ID @@ -11,6 +12,8 @@ from services.notify import notify from services.audit import AuditMiddleware, audit_start from services.ssl_alerts import monitor_ssl from services.external_checks import monitor_external +from services.incidents import log_incident +from services.logging_setup import setup_logging import state import handlers.menu import handlers.status @@ -24,6 +27,20 @@ import handlers.arcane import handlers.processes +def _handle_async_exception(_loop, context): + msg = context.get("message") or "Unhandled exception" + exc = context.get("exception") + if exc: + text = f"❌ {msg}: {type(exc).__name__}: {exc}" + else: + text = f"❌ {msg}" + try: + log_incident(cfg, text) + except Exception: + pass + logging.getLogger("asyncio").error(text) + + async def notify_start(): await bot.send_message( ADMIN_ID, @@ -33,6 +50,7 @@ async def notify_start(): async def main(): + setup_logging(cfg) dp.message.middleware(AuditMiddleware(cfg)) dp.callback_query.middleware(AuditMiddleware(cfg)) audit_start(cfg) @@ -51,6 +69,8 @@ async def main(): state.METRICS_STORE = MetricsStore() asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5)) asyncio.create_task(queue_worker()) + loop = asyncio.get_running_loop() + loop.set_exception_handler(_handle_async_exception) await notify_start() await dp.start_polling(bot) diff --git a/services/alerts.py b/services/alerts.py index c4cb8af..f95eb9b 100644 --- a/services/alerts.py +++ b/services/alerts.py @@ -27,27 +27,27 @@ async def monitor_resources(cfg, notify, bot, chat_id): usage, mount = worst_disk_usage() if usage is None: if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown: - await notify(bot, chat_id, "⚠️ Disk usage n/a") + await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na") state["disk_na"] = True last_sent["disk_na"] = now else: if state["disk_na"] and notify_recovery: - await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})") + await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok") state["disk_na"] = False if usage >= disk_warn: if not state["disk_high"] or now - last_sent["disk"] >= cooldown: - await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})") + await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high") state["disk_high"] = True last_sent["disk"] = now else: if state["disk_high"] and notify_recovery: - await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})") + await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok") state["disk_high"] = False if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown: report = await build_disk_report(cfg, mount or "/", usage) - await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}") + await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot") last_sent["disk_report"] = now load = psutil.getloadavg()[0] @@ -60,12 +60,14 @@ async def monitor_resources(cfg, notify, bot, chat_id): if level == 0: if state["load_level"] > 0 and notify_recovery: - await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}") + await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok") state["load_level"] = 0 else: if level != state["load_level"] or now - last_sent["load"] >= cooldown: icon = "🔴" if level == 2 else "🟡" - await notify(bot, chat_id, f"{icon} Load high: {load:.2f}") + level_name = "critical" if level == 2 else "warn" + key = "load_high_crit" if level == 2 else "load_high_warn" + await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key) last_sent["load"] = now state["load_level"] = level @@ -91,7 +93,13 @@ async def monitor_smart(cfg, notify, bot, chat_id): continue if "FAILED" in health: - await notify(bot, chat_id, f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}") + await notify( + bot, + chat_id, + f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}", + level="critical", + key=f"smart_fail:{dev}", + ) last_sent[key] = now continue @@ -101,7 +109,13 @@ async def monitor_smart(cfg, notify, bot, chat_id): except ValueError: t = None if t is not None and t >= temp_warn: - await notify(bot, chat_id, f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}") + await notify( + bot, + chat_id, + f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}", + level="warn", + key=f"smart_hot:{dev}", + ) last_sent[key] = now continue diff --git a/services/docker.py b/services/docker.py index a14517d..a7ceb86 100644 --- a/services/docker.py +++ b/services/docker.py @@ -144,8 +144,20 @@ async def docker_watchdog(container_map, notify, bot, chat_id): reply_markup=kb, ) elif health not in ("healthy", "n/a"): - await notify(bot, chat_id, f"⚠️ {alias} health: {health}") + await notify( + bot, + chat_id, + f"⚠️ {alias} health: {health}", + level="warn", + key=f"docker_health:{alias}", + ) else: - await notify(bot, chat_id, f"🐳 {alias}: {status}") + await notify( + bot, + chat_id, + f"🐳 {alias}: {status}", + level="info", + key=f"docker_status:{alias}:{status}", + ) last[alias] = (status, health) await asyncio.sleep(120) diff --git a/services/health.py b/services/health.py index 2ddd834..8986448 100644 --- a/services/health.py +++ b/services/health.py @@ -1,6 +1,9 @@ import os +import ssl import subprocess import psutil +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen from app import RESTIC_ENV from services.system import worst_disk_usage @@ -9,6 +12,30 @@ def _containers_from_cfg(cfg) -> dict: return cfg.get("docker", {}).get("containers", {}) +def _request_status(url: str, verify_tls: bool) -> int | None: + context = None + if not verify_tls: + context = ssl._create_unverified_context() # nosec - config-controlled + req = Request(url, headers={"User-Agent": "tg-admin-bot"}) + try: + with urlopen(req, timeout=8, context=context) as resp: + return int(resp.status) + except HTTPError as e: + return int(e.code) + except URLError: + return None + + +def _npm_api_base(cfg) -> str | None: + npm_cfg = cfg.get("npmplus", {}) + base = (npm_cfg.get("base_url") or "").rstrip("/") + if not base: + return None + if not base.endswith("/api"): + base = f"{base}/api" + return base + + def health(cfg, container_map: dict | None = None) -> str: lines = ["🩺 Health check\n"] @@ -30,6 +57,37 @@ def health(cfg, container_map: dict | None = None) -> str: else: lines.append(f"🟢 {alias} OK") + npm_cfg = cfg.get("npmplus", {}) + npm_base = _npm_api_base(cfg) + if npm_base: + npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True)) + if npm_status == 200: + lines.append("🟢 NPMplus API OK") + elif npm_status is None: + lines.append("🔴 NPMplus API unreachable") + else: + lines.append(f"🟡 NPMplus API HTTP {npm_status}") + + g_cfg = cfg.get("gitea", {}) + g_base = (g_cfg.get("base_url") or "").rstrip("/") + if g_base: + health_paths = ["/api/healthz", "/api/v1/healthz"] + g_status = None + for path in health_paths: + status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True)) + if status == 200: + g_status = status + break + if status not in (404, 405): + g_status = status + break + if g_status == 200: + lines.append("🟢 Gitea API OK") + elif g_status is None: + lines.append("🔴 Gitea API unreachable") + else: + lines.append(f"🟡 Gitea API HTTP {g_status}") + usage, mount = worst_disk_usage() if usage is None: lines.append("⚠️ Disk n/a") diff --git a/services/logging_setup.py b/services/logging_setup.py new file mode 100644 index 0000000..bb32acd --- /dev/null +++ b/services/logging_setup.py @@ -0,0 +1,35 @@ +import logging +import os +from logging.handlers import TimedRotatingFileHandler + + +def setup_logging(cfg: dict) -> None: + log_cfg = cfg.get("logging", {}) + if not log_cfg.get("enabled", True): + return + + path = log_cfg.get("path", "/var/server-bot/bot.log") + rotate_when = log_cfg.get("rotate_when", "W0") + backup_count = int(log_cfg.get("backup_count", 8)) + level = str(log_cfg.get("level", "INFO")).upper() + + os.makedirs(os.path.dirname(path), exist_ok=True) + + root = logging.getLogger() + for handler in root.handlers: + if isinstance(handler, TimedRotatingFileHandler) and handler.baseFilename == path: + return + + handler = TimedRotatingFileHandler( + path, + when=rotate_when, + interval=1, + backupCount=backup_count, + encoding="utf-8", + utc=True, + ) + formatter = logging.Formatter("%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s") + handler.setFormatter(formatter) + + root.setLevel(level) + root.addHandler(handler) diff --git a/services/notify.py b/services/notify.py index 2a4b623..8b79461 100644 --- a/services/notify.py +++ b/services/notify.py @@ -1,9 +1,58 @@ +import time +from datetime import datetime from aiogram import Bot from app import cfg from services.incidents import log_incident -async def notify(bot: Bot, chat_id: int, text: str): +_LAST_SENT: dict[str, float] = {} + + +def _parse_hhmm(value: str) -> int | None: + try: + hours, minutes = value.strip().split(":", 1) + h = int(hours) + m = int(minutes) + except Exception: + return None + if not (0 <= h <= 23 and 0 <= m <= 59): + return None + return h * 60 + m + + +def _in_quiet_hours(alerts_cfg: dict) -> bool: + quiet = alerts_cfg.get("quiet_hours", {}) + if not quiet.get("enabled", False): + return False + start_min = _parse_hhmm(quiet.get("start", "23:00")) + end_min = _parse_hhmm(quiet.get("end", "08:00")) + if start_min is None or end_min is None: + return False + if start_min == end_min: + return False + now = datetime.now() + now_min = now.hour * 60 + now.minute + if start_min < end_min: + return start_min <= now_min < end_min + return now_min >= start_min or now_min < end_min + + +async def notify(bot: Bot, chat_id: int, text: str, level: str = "info", key: str | None = None): + alerts_cfg = cfg.get("alerts", {}) + if _in_quiet_hours(alerts_cfg): + allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True)) + if not (allow_critical and level == "critical"): + return + + dedup_sec = int(alerts_cfg.get("notify_cooldown_sec", alerts_cfg.get("cooldown_sec", 900))) + if dedup_sec > 0: + dedup_key = key or text + now = time.time() + last_time = _LAST_SENT.get(dedup_key, 0) + if now - last_time < dedup_sec: + return + _LAST_SENT[dedup_key] = now + try: await bot.send_message(chat_id, text) except Exception: diff --git a/services/openwrt.py b/services/openwrt.py index e070eab..b07caa3 100644 --- a/services/openwrt.py +++ b/services/openwrt.py @@ -205,6 +205,19 @@ def _extract_hostapd_ifnames(raw: str) -> list[str]: return ifnames +def _net_label_for_ifname(ifname: str, ifname_meta: dict[str, dict[str, str]]) -> str: + meta = ifname_meta.get(ifname, {}) + ssid = meta.get("ssid") or "" + band = meta.get("band") or "" + if ssid and band: + return f"{ssid} ({band})" + if ssid: + return ssid + if band: + return band + return ifname + + def _safe_json_load(raw: str) -> Any | None: if not raw: return None @@ -378,6 +391,7 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str: lease_name_map = _extract_lease_name_map(leases or {}) if leases_fallback: lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback)) + wifi_net_counts: dict[str, int] = {} if ifnames: for ifname in ifnames: cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"] @@ -387,6 +401,10 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str: if rc2 == 0: payload = _safe_json_load(out2) if payload: + clients_payload = payload.get("clients") if isinstance(payload, dict) else None + if isinstance(clients_payload, dict): + label = _net_label_for_ifname(ifname, ifname_meta) + wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload) wifi_clients.extend( _parse_hostapd_clients( payload, @@ -407,8 +425,14 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str: f"⚙️ Load: {load}", f"🌐 WAN: {wan_ip} ({wan_state})", "", - f"📶 Wi-Fi clients: {len(wifi_clients)}", ] + if wifi_net_counts: + lines.append("📶 Wi-Fi networks:") + for label, count in sorted(wifi_net_counts.items()): + lines.append(f" - {label}: {count}") + lines.append("") + + lines.append(f"📶 Wi-Fi clients: {len(wifi_clients)}") if wifi_clients: for line in wifi_clients[:20]: lines.append(f" - {line}") diff --git a/services/ssl_alerts.py b/services/ssl_alerts.py index 5131dc8..439aa93 100644 --- a/services/ssl_alerts.py +++ b/services/ssl_alerts.py @@ -46,10 +46,13 @@ async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int): key = f"{name}:{threshold}" last_time = last_sent.get(key, 0) if time.time() - last_time >= cooldown: + level = "critical" if days_left <= 1 else "warn" await notify( bot, chat_id, f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)", + level=level, + key=f"ssl:{name}:{threshold}", ) last_sent[key] = time.time() break