Files
tg-admin-bot/services/alerts.py

101 lines
3.7 KiB
Python

import asyncio
import time
import psutil
from system_checks import list_disks, smart_health, disk_temperature
from services.system import worst_disk_usage
async def monitor_resources(cfg, notify, bot, chat_id):
    """Poll disk usage and load average forever and send threshold alerts.

    Each cycle reads ``worst_disk_usage()`` and ``psutil.getloadavg()[0]``,
    compares them with the configured thresholds, reports state changes
    through ``notify`` (repeating a warning once ``cooldown_sec`` has elapsed
    while the problem persists), then sleeps ``interval_sec`` seconds.

    Settings (defaults in parentheses):
        cfg["alerts"]: ``interval_sec`` (60), ``cooldown_sec`` (900),
            ``notify_recovery`` (True).
        cfg["thresholds"]: ``disk_warn`` (80), ``load_warn`` (2.0),
            ``high_load_warn`` (1.5 * ``load_warn``).

    Args:
        cfg: Mapping with optional ``"alerts"`` and ``"thresholds"``
            sub-mappings; a missing or ``None`` section means "use defaults".
        notify: Coroutine function awaited as ``notify(bot, chat_id, text)``.
        bot: Passed through to ``notify`` unchanged.
        chat_id: Passed through to ``notify`` unchanged.

    This coroutine never returns on its own; it ends only when cancelled or
    when one of the calls it makes raises.
    """
    # ``or {}`` also covers a section that is present but set to None.
    alerts_cfg = cfg.get("alerts") or {}
    thresholds = cfg.get("thresholds") or {}

    interval = int(alerts_cfg.get("interval_sec", 60))
    cooldown = int(alerts_cfg.get("cooldown_sec", 900))
    notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
    disk_warn = int(thresholds.get("disk_warn", 80))
    load_warn = float(thresholds.get("load_warn", 2.0))
    high_warn = float(thresholds.get("high_load_warn", load_warn * 1.5))

    # Timestamps come from the monotonic clock so that wall-clock adjustments
    # cannot stretch or shorten a cooldown. "-inf" means "never sent".
    never = float("-inf")
    last_sent = {"disk": never, "load": never, "disk_na": never}
    state = {"disk_high": False, "disk_na": False, "load_level": 0}

    while True:
        now = time.monotonic()

        # --- Disk usage -----------------------------------------------------
        # NOTE(review): usage is formatted with "%" below, so it is presumably
        # a percentage, and None when no reading is available - confirm
        # against services.system.worst_disk_usage.
        usage, mount = worst_disk_usage()
        if usage is None:
            if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
                await notify(bot, chat_id, "⚠️ Disk usage n/a")
                state["disk_na"] = True
                last_sent["disk_na"] = now
        else:
            back_from_na = state["disk_na"]
            state["disk_na"] = False
            # Only announce the end of an "n/a" period when recovery
            # notifications are enabled.
            announce_return = back_from_na and notify_recovery

            if usage >= disk_warn:
                # A reading that comes back above the threshold must not be
                # reported as "OK": send the warning right away instead, even
                # if the regular warning cooldown has not elapsed yet.
                if (
                    announce_return
                    or not state["disk_high"]
                    or now - last_sent["disk"] >= cooldown
                ):
                    await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})")
                    state["disk_high"] = True
                    last_sent["disk"] = now
            else:
                # One recovery message covers both "reading is back" and
                # "usage dropped below the threshold".
                if announce_return or (state["disk_high"] and notify_recovery):
                    await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
                state["disk_high"] = False

        # --- Load average ---------------------------------------------------
        # First element of psutil.getloadavg(); level 0 = normal,
        # 1 = at/above load_warn, 2 = at/above high_warn.
        load = psutil.getloadavg()[0]
        if load >= high_warn:
            level = 2
        elif load >= load_warn:
            level = 1
        else:
            level = 0

        if level == 0:
            if state["load_level"] > 0 and notify_recovery:
                await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}")
            state["load_level"] = 0
        else:
            # Alert immediately on any level change, otherwise at most once
            # per cooldown while the load stays elevated.
            if level != state["load_level"] or now - last_sent["load"] >= cooldown:
                icon = "🔴" if level == 2 else "🟡"
                await notify(bot, chat_id, f"{icon} Load high: {load:.2f}")
                last_sent["load"] = now
            state["load_level"] = level

        await asyncio.sleep(interval)
def _parse_smart_temp(temp):
    """Return the integer parsed from a ``disk_temperature`` string, or None.

    ``"n/a"`` and anything that is not an integer once ``"°C"`` is removed
    yield ``None``.
    """
    if temp == "n/a":
        return None
    try:
        return int(temp.replace("°C", ""))
    except ValueError:
        return None


async def monitor_smart(cfg, notify, bot, chat_id):
    """Poll SMART health and temperature of every disk forever and alert.

    Each cycle iterates over ``list_disks()``. A disk whose health string
    contains ``"FAILED"`` produces a "SMART FAIL" alert; otherwise a disk
    whose parsed temperature is at or above ``smart_temp_warn`` produces a
    "SMART HOT" alert. The same kind of alert for the same disk is repeated
    at most once per ``smart_cooldown_sec``. After the pass the coroutine
    sleeps ``smart_interval_sec`` seconds.

    Settings read from ``cfg["alerts"]`` (defaults in parentheses):
    ``smart_interval_sec`` (3600), ``smart_cooldown_sec`` (6 * 3600),
    ``smart_temp_warn`` (50).

    Args:
        cfg: Mapping with an optional ``"alerts"`` sub-mapping; a missing or
            ``None`` section means "use defaults".
        notify: Coroutine function awaited as ``notify(bot, chat_id, text)``.
        bot: Passed through to ``notify`` unchanged.
        chat_id: Passed through to ``notify`` unchanged.

    This coroutine never returns on its own; it ends only when cancelled or
    when one of the calls it makes raises.
    """
    # ``or {}`` also covers a section that is present but set to None.
    alerts_cfg = cfg.get("alerts") or {}
    interval = int(alerts_cfg.get("smart_interval_sec", 3600))
    cooldown = int(alerts_cfg.get("smart_cooldown_sec", 6 * 3600))
    temp_warn = int(alerts_cfg.get("smart_temp_warn", 50))

    # (device, alert kind) -> monotonic timestamp of the last alert sent.
    # The key deliberately excludes the raw temperature/health text: those
    # values fluctuate between polls, and keying on them would let every
    # one-degree change bypass the cooldown.
    last_sent = {}

    while True:
        for dev in list_disks():
            health = smart_health(dev)
            temp = disk_temperature(dev)

            # A failed health status takes priority over a hot disk.
            if "FAILED" in health:
                kind = "fail"
                text = f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}"
            else:
                degrees = _parse_smart_temp(temp)
                if degrees is None or degrees < temp_warn:
                    continue
                kind = "hot"
                text = f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}"

            # Monotonic clock: wall-clock adjustments must not affect the
            # cooldown. Membership is checked explicitly because monotonic
            # values have no meaningful zero point.
            now = time.monotonic()
            key = (dev, kind)
            if key in last_sent and now - last_sent[key] < cooldown:
                continue

            await notify(bot, chat_id, text)
            last_sent[key] = now

        await asyncio.sleep(interval)