Compare commits

..

2 Commits

Author SHA1 Message Date
745a5171a1 Add SMART alert scheduler 2026-02-07 22:46:31 +03:00
e3a1321d3f Add restart confirmation from watchdog alerts 2026-02-07 22:45:10 +03:00
5 changed files with 115 additions and 2 deletions

View File

@@ -17,6 +17,10 @@ alerts:
interval_sec: 60 interval_sec: 60
cooldown_sec: 900 cooldown_sec: 900
notify_recovery: true notify_recovery: true
smart_enabled: true
smart_interval_sec: 3600
smart_cooldown_sec: 21600
smart_temp_warn: 50
docker: docker:
# If true, discover containers by name/label # If true, discover containers by name/label

View File

@@ -150,3 +150,57 @@ async def logs_options(cb: CallbackQuery):
return return
await cb.answer("Bad request") await cb.answer("Bad request")
@dp.callback_query(F.data.startswith("wdrestart:"))
async def watchdog_restart_request(cb: CallbackQuery):
    """Ask for confirmation before restarting a container.

    Handles callback data of the form ``wdrestart:<alias>``. Callers whose
    user id differs from ``ADMIN_ID`` are ignored without a reply. If the
    alias is not a key of ``DOCKER_MAP`` the callback is answered with
    "Container not found". Otherwise a new message is sent with two inline
    buttons: one carrying ``wdconfirm:<alias>`` and one carrying ``wdcancel``.
    """
    # Only the configured admin may trigger a restart flow.
    if cb.from_user.id != ADMIN_ID:
        return

    _prefix, alias = cb.data.split(":", 1)
    if alias not in DOCKER_MAP:
        await cb.answer("Container not found")
        return

    confirm_button = InlineKeyboardButton(
        text="✅ Confirm restart",
        callback_data=f"wdconfirm:{alias}",
    )
    cancel_button = InlineKeyboardButton(
        text="✖ Cancel",
        callback_data="wdcancel",
    )
    markup = InlineKeyboardMarkup(
        inline_keyboard=[[confirm_button, cancel_button]],
    )

    prompt = f"⚠️ Confirm restart `{alias}`?"
    await cb.message.answer(prompt, reply_markup=markup, parse_mode="Markdown")
    # Acknowledge the original button press once the prompt has been sent.
    await cb.answer()
@dp.callback_query(F.data == "wdcancel")
async def watchdog_restart_cancel(cb: CallbackQuery):
    """Acknowledge a ``wdcancel`` button press.

    Nothing is restarted and the confirmation message is left as it is;
    the callback is only answered with a short "Cancelled" notice.
    """
    notice = "Cancelled"
    await cb.answer(notice)
@dp.callback_query(F.data.startswith("wdconfirm:"))
async def watchdog_restart_confirm(cb: CallbackQuery):
    """Restart the container named in ``wdconfirm:<alias>`` and report the outcome.

    Callers whose user id differs from ``ADMIN_ID`` are ignored without a
    reply. The alias is resolved through ``DOCKER_MAP``; an unknown alias is
    answered with "Container not found". Otherwise ``docker_cmd`` is invoked
    with ``["restart", <real name>]`` and a message with the command output
    is sent. The message header reflects whether the command succeeded.
    """
    if cb.from_user.id != ADMIN_ID:
        return

    _, alias = cb.data.split(":", 1)
    real = DOCKER_MAP.get(alias)
    if not real:
        await cb.answer("Container not found")
        return

    # Answer the callback before the restart runs, so the button press is
    # acknowledged without waiting for docker.
    await cb.answer("Restarting…")
    rc, out = await docker_cmd(["restart", real])

    # Previously the return code was discarded and success was always
    # reported. NOTE(review): assumes rc is the docker CLI exit status,
    # with 0 meaning success -- confirm against docker_cmd.
    if rc == 0:
        header = f"🔄 **{alias} restarted**"
    else:
        header = f"❌ **{alias} restart failed** (exit code {rc})"

    await cb.message.answer(
        f"{header}\n```{out}```",
        parse_mode="Markdown",
    )

View File

@@ -4,7 +4,7 @@ from datetime import datetime
from app import bot, dp, cfg, ADMIN_ID from app import bot, dp, cfg, ADMIN_ID
from keyboards import menu_kb from keyboards import menu_kb
from services.docker import discover_containers, docker_watchdog from services.docker import discover_containers, docker_watchdog
from services.alerts import monitor_resources from services.alerts import monitor_resources, monitor_smart
from services.notify import notify from services.notify import notify
import state import state
import handlers.menu import handlers.menu
@@ -32,6 +32,8 @@ async def main():
asyncio.create_task(docker_watchdog(state.DOCKER_MAP, notify, bot, ADMIN_ID)) asyncio.create_task(docker_watchdog(state.DOCKER_MAP, notify, bot, ADMIN_ID))
if cfg.get("alerts", {}).get("enabled", True): if cfg.get("alerts", {}).get("enabled", True):
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID)) asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
if cfg.get("alerts", {}).get("smart_enabled", True):
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
await notify_start() await notify_start()
await dp.start_polling(bot) await dp.start_polling(bot)

View File

@@ -1,6 +1,7 @@
import asyncio import asyncio
import time import time
import psutil import psutil
from system_checks import list_disks, smart_health, disk_temperature
from services.system import worst_disk_usage from services.system import worst_disk_usage
@@ -52,3 +53,39 @@ async def monitor_resources(cfg, notify, bot, chat_id):
state["load_high"] = False state["load_high"] = False
await asyncio.sleep(interval) await asyncio.sleep(interval)
async def monitor_smart(cfg, notify, bot, chat_id):
    """Periodically check every disk and send SMART failure / temperature alerts.

    Settings are read from ``cfg["alerts"]``:

    * ``smart_interval_sec`` -- pause between scans (default 3600).
    * ``smart_cooldown_sec`` -- minimum gap between two alerts of the same
      kind for the same device (default 6 hours).
    * ``smart_temp_warn`` -- temperature in degrees C at or above which a
      "HOT" alert is sent (default 50).

    Args:
        cfg: Application configuration mapping.
        notify: Awaitable called as ``notify(bot, chat_id, text)``.
        bot: Passed through to ``notify`` unchanged.
        chat_id: Passed through to ``notify`` unchanged.

    The coroutine loops forever and never returns.
    """
    alerts_cfg = cfg.get("alerts", {})
    interval = int(alerts_cfg.get("smart_interval_sec", 3600))
    cooldown = int(alerts_cfg.get("smart_cooldown_sec", 6 * 3600))
    temp_warn = int(alerts_cfg.get("smart_temp_warn", 50))

    # Timestamp of the last alert per (device, alert kind). The key must not
    # contain the health string or the temperature reading: a one-degree
    # change would then count as a new alert, bypass the cooldown and make
    # this dict grow without bound.
    last_sent = {}

    while True:
        # NOTE(review): list_disks / smart_health / disk_temperature are
        # called synchronously; if they shell out they block the event loop
        # for the duration of the scan -- consider offloading to a thread.
        for dev in list_disks():
            health = smart_health(dev)
            temp = disk_temperature(dev)

            # A failed health check takes precedence over a temperature
            # warning; the failure message already includes the reading.
            if "FAILED" in health:
                kind = "fail"
                text = f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}"
            else:
                celsius = _smart_temp_celsius(temp)
                if celsius is None or celsius < temp_warn:
                    continue
                kind = "hot"
                text = f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}"

            now = time.time()
            previous = last_sent.get((dev, kind))
            if previous is not None and previous + cooldown > now:
                continue

            await notify(bot, chat_id, text)
            last_sent[(dev, kind)] = now

        await asyncio.sleep(interval)


def _smart_temp_celsius(temp):
    """Return the integer temperature from a string like ``"45°C"``, else ``None``."""
    if temp == "n/a":
        return None
    try:
        return int(temp.replace("°C", ""))
    except ValueError:
        return None

View File

@@ -1,6 +1,7 @@
import asyncio import asyncio
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Dict from typing import Dict
from aiogram.types import InlineKeyboardMarkup, InlineKeyboardButton
from services.runner import run_cmd from services.runner import run_cmd
@@ -109,6 +110,21 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
state = "error" state = "error"
state = state.strip() state = state.strip()
if last.get(alias) != state: if last.get(alias) != state:
if state != "running":
kb = InlineKeyboardMarkup(
inline_keyboard=[[
InlineKeyboardButton(
text="🔄 Restart",
callback_data=f"wdrestart:{alias}"
)
]]
)
await bot.send_message(
chat_id,
f"🐳 {alias}: {state}",
reply_markup=kb,
)
else:
await notify(bot, chat_id, f"🐳 {alias}: {state}") await notify(bot, chat_id, f"🐳 {alias}: {state}")
last[alias] = state last[alias] = state
await asyncio.sleep(120) await asyncio.sleep(120)