Add quiet hours, health checks, and logging
This commit is contained in:
18
CONFIG.en.md
18
CONFIG.en.md
@@ -23,6 +23,12 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `enabled` (bool): Enable resource alerts.
|
- `enabled` (bool): Enable resource alerts.
|
||||||
- `interval_sec` (int): Poll interval.
|
- `interval_sec` (int): Poll interval.
|
||||||
- `cooldown_sec` (int): Cooldown between alerts.
|
- `cooldown_sec` (int): Cooldown between alerts.
|
||||||
|
- `notify_cooldown_sec` (int): Global alert dedup cooldown (defaults to `cooldown_sec`).
|
||||||
|
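- `quiet_hours` (object): Quiet hours during which non‑critical alerts are suppressed. Times are in the server's local time, and the window may span midnight (e.g. `23:00`–`08:00`).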
|
||||||
|
- `enabled` (bool): Enable quiet hours.
|
||||||
|
- `start` (string): Start time `HH:MM` (e.g. `23:00`).
|
||||||
|
- `end` (string): End time `HH:MM` (e.g. `08:00`).
|
||||||
|
- `allow_critical` (bool): Still send alerts with level `critical` during quiet hours (default `true`).
|
||||||
- `notify_recovery` (bool): Send recovery notifications.
|
- `notify_recovery` (bool): Send recovery notifications.
|
||||||
- `smart_enabled` (bool): Enable SMART health polling.
|
- `smart_enabled` (bool): Enable SMART health polling.
|
||||||
- `smart_interval_sec` (int): SMART poll interval.
|
- `smart_interval_sec` (int): SMART poll interval.
|
||||||
@@ -51,6 +57,18 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||||
- `backup_count` (int): How many rotated files to keep.
|
- `backup_count` (int): How many rotated files to keep.
|
||||||
|
|
||||||
|
## logging
|
||||||
|
|
||||||
|
- `enabled` (bool): Enable bot logging.
|
||||||
|
- `path` (string): Log file path. Default `/var/server-bot/bot.log`.
|
||||||
|
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||||
|
- `backup_count` (int): How many rotated files to keep.
|
||||||
|
- `level` (string): Log level name, e.g. `INFO`, `WARNING`, `ERROR` (case-insensitive; default `INFO`).
|
||||||
|
|
||||||
|
## safety
|
||||||
|
|
||||||
|
- `dry_run` (bool): If `true`, dangerous actions (upgrade/reboot/backup) are skipped.
|
||||||
|
|
||||||
## external_checks
|
## external_checks
|
||||||
|
|
||||||
- `enabled` (bool): Enable background checks.
|
- `enabled` (bool): Enable background checks.
|
||||||
|
|||||||
18
CONFIG.md
18
CONFIG.md
@@ -23,6 +23,12 @@
|
|||||||
- `enabled` (bool): включить алерты.
|
- `enabled` (bool): включить алерты.
|
||||||
- `interval_sec` (int): интервал опроса.
|
- `interval_sec` (int): интервал опроса.
|
||||||
- `cooldown_sec` (int): кулдаун между алертами.
|
- `cooldown_sec` (int): кулдаун между алертами.
|
||||||
|
- `notify_cooldown_sec` (int): глобальный дедуп алертов (по умолчанию `cooldown_sec`).
|
||||||
|
- `quiet_hours` (object): тихие часы для не‑критичных уведомлений.
|
||||||
|
- `enabled` (bool): включить тихие часы.
|
||||||
|
- `start` (string): начало, формат `HH:MM` (например `23:00`).
|
||||||
|
- `end` (string): конец, формат `HH:MM` (например `08:00`).
|
||||||
|
- `allow_critical` (bool): слать критичные алерты в тишину.
|
||||||
- `notify_recovery` (bool): уведомлять о восстановлении.
|
- `notify_recovery` (bool): уведомлять о восстановлении.
|
||||||
- `smart_enabled` (bool): SMART проверки.
|
- `smart_enabled` (bool): SMART проверки.
|
||||||
- `smart_interval_sec` (int): интервал SMART.
|
- `smart_interval_sec` (int): интервал SMART.
|
||||||
@@ -51,6 +57,18 @@
|
|||||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||||
- `backup_count` (int): сколько файлов хранить.
|
- `backup_count` (int): сколько файлов хранить.
|
||||||
|
|
||||||
|
## logging
|
||||||
|
|
||||||
|
- `enabled` (bool): включить лог бота.
|
||||||
|
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/bot.log`.
|
||||||
|
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||||
|
- `backup_count` (int): сколько файлов хранить.
|
||||||
|
- `level` (string): уровень логирования (`INFO`, `WARNING`, `ERROR`).
|
||||||
|
|
||||||
|
## safety
|
||||||
|
|
||||||
|
- `dry_run` (bool): если `true`, опасные действия (upgrade/reboot/backup) не выполняются.
|
||||||
|
|
||||||
## external_checks
|
## external_checks
|
||||||
|
|
||||||
- `enabled` (bool): включить фоновые проверки.
|
- `enabled` (bool): включить фоновые проверки.
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ Telegram admin bot for Linux servers. Provides quick status checks, backup contr
|
|||||||
- Arcane: list projects, refresh, up/down, restart.
|
- Arcane: list projects, refresh, up/down, restart.
|
||||||
- Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report.
|
- Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report.
|
||||||
- System: disks, security, URLs health, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus).
|
- System: disks, security, URLs health, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus).
|
||||||
- Alerts: disk/load and SMART monitoring with cooldown.
|
- Alerts: disk/load/SMART with cooldown and quiet hours.
|
||||||
- Audit log: all button presses and messages (weekly rotation).
|
- Audit log: all button presses and messages (weekly rotation).
|
||||||
|
- Logs: bot log rotation and incidents.
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
@@ -68,4 +69,5 @@ GNU GPL v3.0. Full text in `LICENSE`.
|
|||||||
|
|
||||||
- For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`.
|
- For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`.
|
||||||
- The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions.
|
- The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions.
|
||||||
|
- Enable `safety.dry_run` if you want a safe mode without actions.
|
||||||
- Audit log default path is `/var/server-bot/audit.log`.
|
- Audit log default path is `/var/server-bot/audit.log`.
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ Telegram-бот администратора для Linux-серверов. Да
|
|||||||
- Arcane: список проектов, refresh, up/down, restart.
|
- Arcane: список проектов, refresh, up/down, restart.
|
||||||
- Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report.
|
- Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report.
|
||||||
- Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus).
|
- Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus).
|
||||||
- Алерты: диск/нагрузка и SMART с cooldown.
|
- Алерты: диск/нагрузка/SMART с cooldown и quiet hours.
|
||||||
- Аудит: все нажатия и сообщения (ротация раз в неделю).
|
- Аудит: все нажатия и сообщения (ротация раз в неделю).
|
||||||
|
- Логи: ротация логов бота и инциденты.
|
||||||
|
|
||||||
## Требования
|
## Требования
|
||||||
|
|
||||||
@@ -68,4 +69,5 @@ GNU GPL v3.0. Полный текст в `LICENSE`.
|
|||||||
|
|
||||||
- Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`.
|
- Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`.
|
||||||
- Бот использует `sudo` для части операций — настрой права.
|
- Бот использует `sudo` для части операций — настрой права.
|
||||||
|
- Включи `safety.dry_run`, если хочешь безопасный режим без действий.
|
||||||
- Аудит по умолчанию пишется в `/var/server-bot/audit.log`.
|
- Аудит по умолчанию пишется в `/var/server-bot/audit.log`.
|
||||||
|
|||||||
@@ -17,6 +17,14 @@ alerts:
|
|||||||
enabled: true
|
enabled: true
|
||||||
interval_sec: 60
|
interval_sec: 60
|
||||||
cooldown_sec: 900
|
cooldown_sec: 900
|
||||||
|
# Optional global dedup cooldown for notify() calls
|
||||||
|
notify_cooldown_sec: 900
|
||||||
|
quiet_hours:
|
||||||
|
enabled: false
|
||||||
|
start: "23:00"
|
||||||
|
end: "08:00"
|
||||||
|
# Allow critical alerts during quiet hours
|
||||||
|
allow_critical: true
|
||||||
notify_recovery: true
|
notify_recovery: true
|
||||||
smart_enabled: true
|
smart_enabled: true
|
||||||
smart_interval_sec: 3600
|
smart_interval_sec: 3600
|
||||||
@@ -42,6 +50,17 @@ incidents:
|
|||||||
rotate_when: "W0"
|
rotate_when: "W0"
|
||||||
backup_count: 8
|
backup_count: 8
|
||||||
|
|
||||||
|
logging:
|
||||||
|
enabled: true
|
||||||
|
path: "/var/server-bot/bot.log"
|
||||||
|
rotate_when: "W0"
|
||||||
|
backup_count: 8
|
||||||
|
level: "INFO"
|
||||||
|
|
||||||
|
safety:
|
||||||
|
# If true, dangerous actions will be skipped
|
||||||
|
dry_run: false
|
||||||
|
|
||||||
external_checks:
|
external_checks:
|
||||||
enabled: true
|
enabled: true
|
||||||
state_path: "/var/server-bot/external_checks.json"
|
state_path: "/var/server-bot/external_checks.json"
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import os
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from aiogram import F
|
from aiogram import F
|
||||||
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
||||||
from app import dp
|
from app import dp, cfg
|
||||||
from auth import is_admin_msg, is_admin_cb
|
from auth import is_admin_msg, is_admin_cb
|
||||||
from keyboards import backup_kb
|
from keyboards import backup_kb
|
||||||
from lock_utils import acquire_lock, release_lock
|
from lock_utils import acquire_lock, release_lock
|
||||||
@@ -200,6 +200,9 @@ async def cmd_backup_status(msg: Message):
|
|||||||
|
|
||||||
async def cmd_backup_now(msg: Message):
|
async def cmd_backup_now(msg: Message):
|
||||||
async def job():
|
async def job():
|
||||||
|
if cfg.get("safety", {}).get("dry_run", False):
|
||||||
|
await msg.answer("🧪 Dry-run: backup skipped", reply_markup=backup_kb)
|
||||||
|
return
|
||||||
if not acquire_lock("backup"):
|
if not acquire_lock("backup"):
|
||||||
await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb)
|
await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -438,6 +438,9 @@ async def upgrade_confirm(cb: CallbackQuery):
|
|||||||
await cb.answer()
|
await cb.answer()
|
||||||
|
|
||||||
async def job():
|
async def job():
|
||||||
|
if cfg.get("safety", {}).get("dry_run", False):
|
||||||
|
await cb.message.answer("🧪 Dry-run: upgrade skipped", reply_markup=system_ops_kb)
|
||||||
|
return
|
||||||
text = await apply_updates()
|
text = await apply_updates()
|
||||||
await cb.message.answer(text, reply_markup=system_ops_kb, parse_mode="Markdown")
|
await cb.message.answer(text, reply_markup=system_ops_kb, parse_mode="Markdown")
|
||||||
|
|
||||||
@@ -506,6 +509,9 @@ async def reboot_password(msg: Message):
|
|||||||
return
|
return
|
||||||
|
|
||||||
async def job():
|
async def job():
|
||||||
|
if cfg.get("safety", {}).get("dry_run", False):
|
||||||
|
await msg.answer("🧪 Dry-run: reboot skipped", reply_markup=system_ops_kb)
|
||||||
|
return
|
||||||
await msg.answer("🔄 Rebooting…", reply_markup=system_ops_kb)
|
await msg.answer("🔄 Rebooting…", reply_markup=system_ops_kb)
|
||||||
await run_cmd(["sudo", "reboot"], timeout=10)
|
await run_cmd(["sudo", "reboot"], timeout=10)
|
||||||
|
|
||||||
|
|||||||
20
main.py
20
main.py
@@ -1,4 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
import socket
|
import socket
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from app import bot, dp, cfg, ADMIN_ID
|
from app import bot, dp, cfg, ADMIN_ID
|
||||||
@@ -11,6 +12,8 @@ from services.notify import notify
|
|||||||
from services.audit import AuditMiddleware, audit_start
|
from services.audit import AuditMiddleware, audit_start
|
||||||
from services.ssl_alerts import monitor_ssl
|
from services.ssl_alerts import monitor_ssl
|
||||||
from services.external_checks import monitor_external
|
from services.external_checks import monitor_external
|
||||||
|
from services.incidents import log_incident
|
||||||
|
from services.logging_setup import setup_logging
|
||||||
import state
|
import state
|
||||||
import handlers.menu
|
import handlers.menu
|
||||||
import handlers.status
|
import handlers.status
|
||||||
@@ -24,6 +27,20 @@ import handlers.arcane
|
|||||||
import handlers.processes
|
import handlers.processes
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_async_exception(_loop, context):
|
||||||
|
msg = context.get("message") or "Unhandled exception"
|
||||||
|
exc = context.get("exception")
|
||||||
|
if exc:
|
||||||
|
text = f"❌ {msg}: {type(exc).__name__}: {exc}"
|
||||||
|
else:
|
||||||
|
text = f"❌ {msg}"
|
||||||
|
try:
|
||||||
|
log_incident(cfg, text)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
logging.getLogger("asyncio").error(text)
|
||||||
|
|
||||||
|
|
||||||
async def notify_start():
|
async def notify_start():
|
||||||
await bot.send_message(
|
await bot.send_message(
|
||||||
ADMIN_ID,
|
ADMIN_ID,
|
||||||
@@ -33,6 +50,7 @@ async def notify_start():
|
|||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
setup_logging(cfg)
|
||||||
dp.message.middleware(AuditMiddleware(cfg))
|
dp.message.middleware(AuditMiddleware(cfg))
|
||||||
dp.callback_query.middleware(AuditMiddleware(cfg))
|
dp.callback_query.middleware(AuditMiddleware(cfg))
|
||||||
audit_start(cfg)
|
audit_start(cfg)
|
||||||
@@ -51,6 +69,8 @@ async def main():
|
|||||||
state.METRICS_STORE = MetricsStore()
|
state.METRICS_STORE = MetricsStore()
|
||||||
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
||||||
asyncio.create_task(queue_worker())
|
asyncio.create_task(queue_worker())
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
loop.set_exception_handler(_handle_async_exception)
|
||||||
await notify_start()
|
await notify_start()
|
||||||
await dp.start_polling(bot)
|
await dp.start_polling(bot)
|
||||||
|
|
||||||
|
|||||||
@@ -27,27 +27,27 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
usage, mount = worst_disk_usage()
|
usage, mount = worst_disk_usage()
|
||||||
if usage is None:
|
if usage is None:
|
||||||
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
|
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
|
||||||
await notify(bot, chat_id, "⚠️ Disk usage n/a")
|
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na")
|
||||||
state["disk_na"] = True
|
state["disk_na"] = True
|
||||||
last_sent["disk_na"] = now
|
last_sent["disk_na"] = now
|
||||||
else:
|
else:
|
||||||
if state["disk_na"] and notify_recovery:
|
if state["disk_na"] and notify_recovery:
|
||||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
|
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
|
||||||
state["disk_na"] = False
|
state["disk_na"] = False
|
||||||
|
|
||||||
if usage >= disk_warn:
|
if usage >= disk_warn:
|
||||||
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
|
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
|
||||||
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})")
|
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high")
|
||||||
state["disk_high"] = True
|
state["disk_high"] = True
|
||||||
last_sent["disk"] = now
|
last_sent["disk"] = now
|
||||||
else:
|
else:
|
||||||
if state["disk_high"] and notify_recovery:
|
if state["disk_high"] and notify_recovery:
|
||||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
|
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
|
||||||
state["disk_high"] = False
|
state["disk_high"] = False
|
||||||
|
|
||||||
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
|
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
|
||||||
report = await build_disk_report(cfg, mount or "/", usage)
|
report = await build_disk_report(cfg, mount or "/", usage)
|
||||||
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}")
|
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot")
|
||||||
last_sent["disk_report"] = now
|
last_sent["disk_report"] = now
|
||||||
|
|
||||||
load = psutil.getloadavg()[0]
|
load = psutil.getloadavg()[0]
|
||||||
@@ -60,12 +60,14 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
|
|
||||||
if level == 0:
|
if level == 0:
|
||||||
if state["load_level"] > 0 and notify_recovery:
|
if state["load_level"] > 0 and notify_recovery:
|
||||||
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}")
|
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok")
|
||||||
state["load_level"] = 0
|
state["load_level"] = 0
|
||||||
else:
|
else:
|
||||||
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
|
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
|
||||||
icon = "🔴" if level == 2 else "🟡"
|
icon = "🔴" if level == 2 else "🟡"
|
||||||
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}")
|
level_name = "critical" if level == 2 else "warn"
|
||||||
|
key = "load_high_crit" if level == 2 else "load_high_warn"
|
||||||
|
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key)
|
||||||
last_sent["load"] = now
|
last_sent["load"] = now
|
||||||
state["load_level"] = level
|
state["load_level"] = level
|
||||||
|
|
||||||
@@ -91,7 +93,13 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if "FAILED" in health:
|
if "FAILED" in health:
|
||||||
await notify(bot, chat_id, f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
|
||||||
|
level="critical",
|
||||||
|
key=f"smart_fail:{dev}",
|
||||||
|
)
|
||||||
last_sent[key] = now
|
last_sent[key] = now
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -101,7 +109,13 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
t = None
|
t = None
|
||||||
if t is not None and t >= temp_warn:
|
if t is not None and t >= temp_warn:
|
||||||
await notify(bot, chat_id, f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
|
||||||
|
level="warn",
|
||||||
|
key=f"smart_hot:{dev}",
|
||||||
|
)
|
||||||
last_sent[key] = now
|
last_sent[key] = now
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
@@ -144,8 +144,20 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
|
|||||||
reply_markup=kb,
|
reply_markup=kb,
|
||||||
)
|
)
|
||||||
elif health not in ("healthy", "n/a"):
|
elif health not in ("healthy", "n/a"):
|
||||||
await notify(bot, chat_id, f"⚠️ {alias} health: {health}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"⚠️ {alias} health: {health}",
|
||||||
|
level="warn",
|
||||||
|
key=f"docker_health:{alias}",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
await notify(bot, chat_id, f"🐳 {alias}: {status}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🐳 {alias}: {status}",
|
||||||
|
level="info",
|
||||||
|
key=f"docker_status:{alias}:{status}",
|
||||||
|
)
|
||||||
last[alias] = (status, health)
|
last[alias] = (status, health)
|
||||||
await asyncio.sleep(120)
|
await asyncio.sleep(120)
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
|
import ssl
|
||||||
import subprocess
|
import subprocess
|
||||||
import psutil
|
import psutil
|
||||||
|
from urllib.error import HTTPError, URLError
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
from app import RESTIC_ENV
|
from app import RESTIC_ENV
|
||||||
from services.system import worst_disk_usage
|
from services.system import worst_disk_usage
|
||||||
|
|
||||||
@@ -9,6 +12,30 @@ def _containers_from_cfg(cfg) -> dict:
|
|||||||
return cfg.get("docker", {}).get("containers", {})
|
return cfg.get("docker", {}).get("containers", {})
|
||||||
|
|
||||||
|
|
||||||
|
def _request_status(url: str, verify_tls: bool) -> int | None:
|
||||||
|
context = None
|
||||||
|
if not verify_tls:
|
||||||
|
context = ssl._create_unverified_context() # nosec - config-controlled
|
||||||
|
req = Request(url, headers={"User-Agent": "tg-admin-bot"})
|
||||||
|
try:
|
||||||
|
with urlopen(req, timeout=8, context=context) as resp:
|
||||||
|
return int(resp.status)
|
||||||
|
except HTTPError as e:
|
||||||
|
return int(e.code)
|
||||||
|
except URLError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _npm_api_base(cfg) -> str | None:
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
base = (npm_cfg.get("base_url") or "").rstrip("/")
|
||||||
|
if not base:
|
||||||
|
return None
|
||||||
|
if not base.endswith("/api"):
|
||||||
|
base = f"{base}/api"
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
def health(cfg, container_map: dict | None = None) -> str:
|
def health(cfg, container_map: dict | None = None) -> str:
|
||||||
lines = ["🩺 Health check\n"]
|
lines = ["🩺 Health check\n"]
|
||||||
|
|
||||||
@@ -30,6 +57,37 @@ def health(cfg, container_map: dict | None = None) -> str:
|
|||||||
else:
|
else:
|
||||||
lines.append(f"🟢 {alias} OK")
|
lines.append(f"🟢 {alias} OK")
|
||||||
|
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
npm_base = _npm_api_base(cfg)
|
||||||
|
if npm_base:
|
||||||
|
npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
|
||||||
|
if npm_status == 200:
|
||||||
|
lines.append("🟢 NPMplus API OK")
|
||||||
|
elif npm_status is None:
|
||||||
|
lines.append("🔴 NPMplus API unreachable")
|
||||||
|
else:
|
||||||
|
lines.append(f"🟡 NPMplus API HTTP {npm_status}")
|
||||||
|
|
||||||
|
g_cfg = cfg.get("gitea", {})
|
||||||
|
g_base = (g_cfg.get("base_url") or "").rstrip("/")
|
||||||
|
if g_base:
|
||||||
|
health_paths = ["/api/healthz", "/api/v1/healthz"]
|
||||||
|
g_status = None
|
||||||
|
for path in health_paths:
|
||||||
|
status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True))
|
||||||
|
if status == 200:
|
||||||
|
g_status = status
|
||||||
|
break
|
||||||
|
if status not in (404, 405):
|
||||||
|
g_status = status
|
||||||
|
break
|
||||||
|
if g_status == 200:
|
||||||
|
lines.append("🟢 Gitea API OK")
|
||||||
|
elif g_status is None:
|
||||||
|
lines.append("🔴 Gitea API unreachable")
|
||||||
|
else:
|
||||||
|
lines.append(f"🟡 Gitea API HTTP {g_status}")
|
||||||
|
|
||||||
usage, mount = worst_disk_usage()
|
usage, mount = worst_disk_usage()
|
||||||
if usage is None:
|
if usage is None:
|
||||||
lines.append("⚠️ Disk n/a")
|
lines.append("⚠️ Disk n/a")
|
||||||
|
|||||||
35
services/logging_setup.py
Normal file
35
services/logging_setup.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from logging.handlers import TimedRotatingFileHandler
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(cfg: dict) -> None:
    """Attach a time-rotated file handler to the root logger.

    Reads the ``logging`` section of *cfg*: ``enabled`` (default true),
    ``path``, ``rotate_when``, ``backup_count`` and ``level``. Does nothing
    when logging is disabled or when a ``TimedRotatingFileHandler`` for the
    same file is already attached to the root logger.

    Args:
        cfg: Parsed application configuration.
    """
    log_cfg = cfg.get("logging") or {}
    if not log_cfg.get("enabled", True):
        return

    # Normalise to an absolute path: the handler stores an absolute
    # baseFilename, and a bare file name would otherwise yield an empty
    # directory name for os.makedirs().
    path = os.path.abspath(str(log_cfg.get("path") or "/var/server-bot/bot.log"))
    rotate_when = log_cfg.get("rotate_when", "W0")
    backup_count = int(log_cfg.get("backup_count", 8))
    level = str(log_cfg.get("level", "INFO")).upper()

    os.makedirs(os.path.dirname(path), exist_ok=True)

    root = logging.getLogger()
    for handler in root.handlers:
        if isinstance(handler, TimedRotatingFileHandler) and handler.baseFilename == path:
            # Already configured for this file; avoid duplicate log lines.
            return

    handler = TimedRotatingFileHandler(
        path,
        when=rotate_when,
        interval=1,
        backupCount=backup_count,
        encoding="utf-8",
        utc=True,
    )
    # Tab-separated fields: timestamp, level, logger name, message.
    formatter = logging.Formatter("%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s")
    handler.setFormatter(formatter)

    root.setLevel(level)
    root.addHandler(handler)
|
||||||
@@ -1,9 +1,58 @@
|
|||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
from aiogram import Bot
|
from aiogram import Bot
|
||||||
from app import cfg
|
from app import cfg
|
||||||
from services.incidents import log_incident
|
from services.incidents import log_incident
|
||||||
|
|
||||||
|
|
||||||
async def notify(bot: Bot, chat_id: int, text: str):
|
_LAST_SENT: dict[str, float] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hhmm(value: str) -> int | None:
|
||||||
|
try:
|
||||||
|
hours, minutes = value.strip().split(":", 1)
|
||||||
|
h = int(hours)
|
||||||
|
m = int(minutes)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
if not (0 <= h <= 23 and 0 <= m <= 59):
|
||||||
|
return None
|
||||||
|
return h * 60 + m
|
||||||
|
|
||||||
|
|
||||||
|
def _in_quiet_hours(alerts_cfg: dict) -> bool:
|
||||||
|
quiet = alerts_cfg.get("quiet_hours", {})
|
||||||
|
if not quiet.get("enabled", False):
|
||||||
|
return False
|
||||||
|
start_min = _parse_hhmm(quiet.get("start", "23:00"))
|
||||||
|
end_min = _parse_hhmm(quiet.get("end", "08:00"))
|
||||||
|
if start_min is None or end_min is None:
|
||||||
|
return False
|
||||||
|
if start_min == end_min:
|
||||||
|
return False
|
||||||
|
now = datetime.now()
|
||||||
|
now_min = now.hour * 60 + now.minute
|
||||||
|
if start_min < end_min:
|
||||||
|
return start_min <= now_min < end_min
|
||||||
|
return now_min >= start_min or now_min < end_min
|
||||||
|
|
||||||
|
|
||||||
|
async def notify(bot: Bot, chat_id: int, text: str, level: str = "info", key: str | None = None):
|
||||||
|
alerts_cfg = cfg.get("alerts", {})
|
||||||
|
if _in_quiet_hours(alerts_cfg):
|
||||||
|
allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
|
||||||
|
if not (allow_critical and level == "critical"):
|
||||||
|
return
|
||||||
|
|
||||||
|
dedup_sec = int(alerts_cfg.get("notify_cooldown_sec", alerts_cfg.get("cooldown_sec", 900)))
|
||||||
|
if dedup_sec > 0:
|
||||||
|
dedup_key = key or text
|
||||||
|
now = time.time()
|
||||||
|
last_time = _LAST_SENT.get(dedup_key, 0)
|
||||||
|
if now - last_time < dedup_sec:
|
||||||
|
return
|
||||||
|
_LAST_SENT[dedup_key] = now
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await bot.send_message(chat_id, text)
|
await bot.send_message(chat_id, text)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
@@ -205,6 +205,19 @@ def _extract_hostapd_ifnames(raw: str) -> list[str]:
|
|||||||
return ifnames
|
return ifnames
|
||||||
|
|
||||||
|
|
||||||
|
def _net_label_for_ifname(ifname: str, ifname_meta: dict[str, dict[str, str]]) -> str:
|
||||||
|
meta = ifname_meta.get(ifname, {})
|
||||||
|
ssid = meta.get("ssid") or ""
|
||||||
|
band = meta.get("band") or ""
|
||||||
|
if ssid and band:
|
||||||
|
return f"{ssid} ({band})"
|
||||||
|
if ssid:
|
||||||
|
return ssid
|
||||||
|
if band:
|
||||||
|
return band
|
||||||
|
return ifname
|
||||||
|
|
||||||
|
|
||||||
def _safe_json_load(raw: str) -> Any | None:
|
def _safe_json_load(raw: str) -> Any | None:
|
||||||
if not raw:
|
if not raw:
|
||||||
return None
|
return None
|
||||||
@@ -378,6 +391,7 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str:
|
|||||||
lease_name_map = _extract_lease_name_map(leases or {})
|
lease_name_map = _extract_lease_name_map(leases or {})
|
||||||
if leases_fallback:
|
if leases_fallback:
|
||||||
lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
|
lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
|
||||||
|
wifi_net_counts: dict[str, int] = {}
|
||||||
if ifnames:
|
if ifnames:
|
||||||
for ifname in ifnames:
|
for ifname in ifnames:
|
||||||
cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
|
cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
|
||||||
@@ -387,6 +401,10 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str:
|
|||||||
if rc2 == 0:
|
if rc2 == 0:
|
||||||
payload = _safe_json_load(out2)
|
payload = _safe_json_load(out2)
|
||||||
if payload:
|
if payload:
|
||||||
|
clients_payload = payload.get("clients") if isinstance(payload, dict) else None
|
||||||
|
if isinstance(clients_payload, dict):
|
||||||
|
label = _net_label_for_ifname(ifname, ifname_meta)
|
||||||
|
wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
|
||||||
wifi_clients.extend(
|
wifi_clients.extend(
|
||||||
_parse_hostapd_clients(
|
_parse_hostapd_clients(
|
||||||
payload,
|
payload,
|
||||||
@@ -407,8 +425,14 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str:
|
|||||||
f"⚙️ Load: {load}",
|
f"⚙️ Load: {load}",
|
||||||
f"🌐 WAN: {wan_ip} ({wan_state})",
|
f"🌐 WAN: {wan_ip} ({wan_state})",
|
||||||
"",
|
"",
|
||||||
f"📶 Wi-Fi clients: {len(wifi_clients)}",
|
|
||||||
]
|
]
|
||||||
|
if wifi_net_counts:
|
||||||
|
lines.append("📶 Wi-Fi networks:")
|
||||||
|
for label, count in sorted(wifi_net_counts.items()):
|
||||||
|
lines.append(f" - {label}: {count}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
lines.append(f"📶 Wi-Fi clients: {len(wifi_clients)}")
|
||||||
if wifi_clients:
|
if wifi_clients:
|
||||||
for line in wifi_clients[:20]:
|
for line in wifi_clients[:20]:
|
||||||
lines.append(f" - {line}")
|
lines.append(f" - {line}")
|
||||||
|
|||||||
@@ -46,10 +46,13 @@ async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
|
|||||||
key = f"{name}:{threshold}"
|
key = f"{name}:{threshold}"
|
||||||
last_time = last_sent.get(key, 0)
|
last_time = last_sent.get(key, 0)
|
||||||
if time.time() - last_time >= cooldown:
|
if time.time() - last_time >= cooldown:
|
||||||
|
level = "critical" if days_left <= 1 else "warn"
|
||||||
await notify(
|
await notify(
|
||||||
bot,
|
bot,
|
||||||
chat_id,
|
chat_id,
|
||||||
f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
|
f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
|
||||||
|
level=level,
|
||||||
|
key=f"ssl:{name}:{threshold}",
|
||||||
)
|
)
|
||||||
last_sent[key] = time.time()
|
last_sent[key] = time.time()
|
||||||
break
|
break
|
||||||
|
|||||||
Reference in New Issue
Block a user