Add quiet hours, health checks, and logging

This commit is contained in:
2026-02-08 04:19:28 +03:00
parent 8bcc3c6878
commit 65682ca162
15 changed files with 299 additions and 16 deletions

View File

@@ -27,27 +27,27 @@ async def monitor_resources(cfg, notify, bot, chat_id):
usage, mount = worst_disk_usage()
if usage is None:
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
await notify(bot, chat_id, "⚠️ Disk usage n/a")
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na")
state["disk_na"] = True
last_sent["disk_na"] = now
else:
if state["disk_na"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
state["disk_na"] = False
if usage >= disk_warn:
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})")
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high")
state["disk_high"] = True
last_sent["disk"] = now
else:
if state["disk_high"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok")
state["disk_high"] = False
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
report = await build_disk_report(cfg, mount or "/", usage)
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}")
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot")
last_sent["disk_report"] = now
load = psutil.getloadavg()[0]
@@ -60,12 +60,14 @@ async def monitor_resources(cfg, notify, bot, chat_id):
if level == 0:
if state["load_level"] > 0 and notify_recovery:
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}")
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok")
state["load_level"] = 0
else:
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
icon = "🔴" if level == 2 else "🟡"
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}")
level_name = "critical" if level == 2 else "warn"
key = "load_high_crit" if level == 2 else "load_high_warn"
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key)
last_sent["load"] = now
state["load_level"] = level
@@ -91,7 +93,13 @@ async def monitor_smart(cfg, notify, bot, chat_id):
continue
if "FAILED" in health:
await notify(bot, chat_id, f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}")
await notify(
bot,
chat_id,
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
level="critical",
key=f"smart_fail:{dev}",
)
last_sent[key] = now
continue
@@ -101,7 +109,13 @@ async def monitor_smart(cfg, notify, bot, chat_id):
except ValueError:
t = None
if t is not None and t >= temp_warn:
await notify(bot, chat_id, f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}")
await notify(
bot,
chat_id,
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
level="warn",
key=f"smart_hot:{dev}",
)
last_sent[key] = now
continue

View File

@@ -144,8 +144,20 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
reply_markup=kb,
)
elif health not in ("healthy", "n/a"):
await notify(bot, chat_id, f"⚠️ {alias} health: {health}")
await notify(
bot,
chat_id,
f"⚠️ {alias} health: {health}",
level="warn",
key=f"docker_health:{alias}",
)
else:
await notify(bot, chat_id, f"🐳 {alias}: {status}")
await notify(
bot,
chat_id,
f"🐳 {alias}: {status}",
level="info",
key=f"docker_status:{alias}:{status}",
)
last[alias] = (status, health)
await asyncio.sleep(120)

View File

@@ -1,6 +1,9 @@
import os
import ssl
import subprocess
import psutil
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from app import RESTIC_ENV
from services.system import worst_disk_usage
@@ -9,6 +12,30 @@ def _containers_from_cfg(cfg) -> dict:
return cfg.get("docker", {}).get("containers", {})
def _request_status(url: str, verify_tls: bool) -> int | None:
context = None
if not verify_tls:
context = ssl._create_unverified_context() # nosec - config-controlled
req = Request(url, headers={"User-Agent": "tg-admin-bot"})
try:
with urlopen(req, timeout=8, context=context) as resp:
return int(resp.status)
except HTTPError as e:
return int(e.code)
except URLError:
return None
def _npm_api_base(cfg) -> str | None:
npm_cfg = cfg.get("npmplus", {})
base = (npm_cfg.get("base_url") or "").rstrip("/")
if not base:
return None
if not base.endswith("/api"):
base = f"{base}/api"
return base
def health(cfg, container_map: dict | None = None) -> str:
lines = ["🩺 Health check\n"]
@@ -30,6 +57,37 @@ def health(cfg, container_map: dict | None = None) -> str:
else:
lines.append(f"🟢 {alias} OK")
npm_cfg = cfg.get("npmplus", {})
npm_base = _npm_api_base(cfg)
if npm_base:
npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
if npm_status == 200:
lines.append("🟢 NPMplus API OK")
elif npm_status is None:
lines.append("🔴 NPMplus API unreachable")
else:
lines.append(f"🟡 NPMplus API HTTP {npm_status}")
g_cfg = cfg.get("gitea", {})
g_base = (g_cfg.get("base_url") or "").rstrip("/")
if g_base:
health_paths = ["/api/healthz", "/api/v1/healthz"]
g_status = None
for path in health_paths:
status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True))
if status == 200:
g_status = status
break
if status not in (404, 405):
g_status = status
break
if g_status == 200:
lines.append("🟢 Gitea API OK")
elif g_status is None:
lines.append("🔴 Gitea API unreachable")
else:
lines.append(f"🟡 Gitea API HTTP {g_status}")
usage, mount = worst_disk_usage()
if usage is None:
lines.append("⚠️ Disk n/a")

35
services/logging_setup.py Normal file
View File

@@ -0,0 +1,35 @@
import logging
import os
from logging.handlers import TimedRotatingFileHandler
def setup_logging(cfg: dict) -> None:
    """Attach a timed rotating file handler to the root logger.

    Settings are read from ``cfg["logging"]``: ``enabled`` (default true),
    ``path``, ``rotate_when`` (default ``"W0"``), ``backup_count``
    (default 8) and ``level`` (default ``"INFO"``).

    Calling the function again with the same log file is a no-op, so the
    same file never receives duplicate records from this function.

    Args:
        cfg: Application configuration mapping.
    """
    log_cfg = cfg.get("logging") or {}
    if not log_cfg.get("enabled", True):
        return

    # The handler stores an absolute ``baseFilename``; normalise up front so
    # the duplicate check below also matches relative or un-normalised paths.
    path = os.path.abspath(log_cfg.get("path") or "/var/server-bot/bot.log")
    rotate_when = log_cfg.get("rotate_when", "W0")
    backup_count = int(log_cfg.get("backup_count", 8))
    level = str(log_cfg.get("level", "INFO")).upper()

    root = logging.getLogger()
    already_attached = any(
        isinstance(existing, TimedRotatingFileHandler) and existing.baseFilename == path
        for existing in root.handlers
    )
    if already_attached:
        return

    # ``path`` is absolute, so dirname is never empty (a bare filename used
    # to make ``os.makedirs("")`` raise).
    os.makedirs(os.path.dirname(path), exist_ok=True)

    handler = TimedRotatingFileHandler(
        path,
        when=rotate_when,
        interval=1,
        backupCount=backup_count,
        encoding="utf-8",
        utc=True,
    )
    handler.setFormatter(
        logging.Formatter("%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s")
    )
    root.setLevel(level)
    root.addHandler(handler)

View File

@@ -1,9 +1,58 @@
import time
from datetime import datetime
from aiogram import Bot
from app import cfg
from services.incidents import log_incident
async def notify(bot: Bot, chat_id: int, text: str):
_LAST_SENT: dict[str, float] = {}
def _parse_hhmm(value: str) -> int | None:
try:
hours, minutes = value.strip().split(":", 1)
h = int(hours)
m = int(minutes)
except Exception:
return None
if not (0 <= h <= 23 and 0 <= m <= 59):
return None
return h * 60 + m
def _in_quiet_hours(alerts_cfg: dict) -> bool:
quiet = alerts_cfg.get("quiet_hours", {})
if not quiet.get("enabled", False):
return False
start_min = _parse_hhmm(quiet.get("start", "23:00"))
end_min = _parse_hhmm(quiet.get("end", "08:00"))
if start_min is None or end_min is None:
return False
if start_min == end_min:
return False
now = datetime.now()
now_min = now.hour * 60 + now.minute
if start_min < end_min:
return start_min <= now_min < end_min
return now_min >= start_min or now_min < end_min
async def notify(bot: Bot, chat_id: int, text: str, level: str = "info", key: str | None = None):
alerts_cfg = cfg.get("alerts", {})
if _in_quiet_hours(alerts_cfg):
allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
if not (allow_critical and level == "critical"):
return
dedup_sec = int(alerts_cfg.get("notify_cooldown_sec", alerts_cfg.get("cooldown_sec", 900)))
if dedup_sec > 0:
dedup_key = key or text
now = time.time()
last_time = _LAST_SENT.get(dedup_key, 0)
if now - last_time < dedup_sec:
return
_LAST_SENT[dedup_key] = now
try:
await bot.send_message(chat_id, text)
except Exception:

View File

@@ -205,6 +205,19 @@ def _extract_hostapd_ifnames(raw: str) -> list[str]:
return ifnames
def _net_label_for_ifname(ifname: str, ifname_meta: dict[str, dict[str, str]]) -> str:
meta = ifname_meta.get(ifname, {})
ssid = meta.get("ssid") or ""
band = meta.get("band") or ""
if ssid and band:
return f"{ssid} ({band})"
if ssid:
return ssid
if band:
return band
return ifname
def _safe_json_load(raw: str) -> Any | None:
if not raw:
return None
@@ -378,6 +391,7 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str:
lease_name_map = _extract_lease_name_map(leases or {})
if leases_fallback:
lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
wifi_net_counts: dict[str, int] = {}
if ifnames:
for ifname in ifnames:
cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
@@ -387,6 +401,10 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str:
if rc2 == 0:
payload = _safe_json_load(out2)
if payload:
clients_payload = payload.get("clients") if isinstance(payload, dict) else None
if isinstance(clients_payload, dict):
label = _net_label_for_ifname(ifname, ifname_meta)
wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
wifi_clients.extend(
_parse_hostapd_clients(
payload,
@@ -407,8 +425,14 @@ async def get_openwrt_status(cfg: dict[str, Any]) -> str:
f"⚙️ Load: {load}",
f"🌐 WAN: {wan_ip} ({wan_state})",
"",
f"📶 Wi-Fi clients: {len(wifi_clients)}",
]
if wifi_net_counts:
lines.append("📶 Wi-Fi networks:")
for label, count in sorted(wifi_net_counts.items()):
lines.append(f" - {label}: {count}")
lines.append("")
lines.append(f"📶 Wi-Fi clients: {len(wifi_clients)}")
if wifi_clients:
for line in wifi_clients[:20]:
lines.append(f" - {line}")

View File

@@ -46,10 +46,13 @@ async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
key = f"{name}:{threshold}"
last_time = last_sent.get(key, 0)
if time.time() - last_time >= cooldown:
level = "critical" if days_left <= 1 else "warn"
await notify(
bot,
chat_id,
f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
level=level,
key=f"ssl:{name}:{threshold}",
)
last_sent[key] = time.time()
break