133 lines
5.5 KiB
Python
133 lines
5.5 KiB
Python
import asyncio
|
|
import time
|
|
import psutil
|
|
from system_checks import list_disks, smart_health, disk_temperature
|
|
from services.system import worst_disk_usage
|
|
from services.disk_report import build_disk_report
|
|
|
|
|
|
async def monitor_resources(cfg, notify, bot, chat_id):
    """Poll disk usage and system load forever, reporting through ``notify``.

    Each cycle reads the worst disk usage via ``worst_disk_usage()`` and the
    first value of ``psutil.getloadavg()``, then sleeps for
    ``alerts.interval_sec`` seconds.

    Args:
        cfg: Mapping with optional ``alerts``, ``thresholds`` and
            ``disk_report`` sub-mappings; missing keys fall back to the
            defaults visible below.
        notify: Awaitable callable invoked as
            ``notify(bot, chat_id, text, level=..., key=..., category=...)``.
        bot: Passed through to ``notify`` untouched.
        chat_id: Passed through to ``notify`` untouched.

    The coroutine never returns on its own; it runs until it is cancelled or
    one of the calls it makes raises.
    """
    alert_opts = cfg.get("alerts", {})
    interval = int(alert_opts.get("interval_sec", 60))
    cooldown = int(alert_opts.get("cooldown_sec", 900))
    notify_recovery = bool(alert_opts.get("notify_recovery", True))
    load_only_critical = bool(alert_opts.get("load_only_critical", False))
    auto_mute_high_load_sec = int(alert_opts.get("auto_mute_on_high_load_sec", 0))

    threshold_opts = cfg.get("thresholds", {})
    disk_warn = int(threshold_opts.get("disk_warn", 80))
    report_opts = cfg.get("disk_report", {})
    snapshot_warn = int(report_opts.get("threshold", disk_warn))
    snapshot_cooldown = int(report_opts.get("cooldown_sec", 21600))
    load_warn = float(threshold_opts.get("load_warn", 2.0))
    high_warn = float(threshold_opts.get("high_load_warn", load_warn * 1.5))

    # "Back to normal" messages are sent only when recovery notices are
    # enabled and the critical-only mode is off.
    announce_recovery = notify_recovery and not load_only_critical

    # Timestamps (time.time()) of the most recent alert of each kind.
    disk_sent_at = 0.0
    disk_na_sent_at = 0.0
    snapshot_sent_at = 0.0
    load_sent_at = 0.0

    # Conditions observed on the previous cycle.
    disk_over = False
    disk_unavailable = False
    prev_severity = 0

    while True:
        now = time.time()

        # ---- disk usage -------------------------------------------------
        usage, mount = worst_disk_usage()
        if usage is not None:
            if disk_unavailable and announce_recovery:
                await notify(
                    bot,
                    chat_id,
                    f"🟢 Disk usage OK ({usage}% {mount})",
                    level="info",
                    key="disk_ok",
                    category="disk",
                )
            disk_unavailable = False

            if usage >= disk_warn:
                # Alert when the threshold is first crossed, then again each
                # time the cooldown has elapsed.
                if not disk_over or now - disk_sent_at >= cooldown:
                    await notify(
                        bot,
                        chat_id,
                        f"🟡 Disk usage {usage}% ({mount})",
                        level="warn",
                        key="disk_high",
                        category="disk",
                    )
                    disk_over = True
                    disk_sent_at = now
            else:
                if disk_over and announce_recovery:
                    await notify(
                        bot,
                        chat_id,
                        f"🟢 Disk usage OK ({usage}% {mount})",
                        level="info",
                        key="disk_ok",
                        category="disk",
                    )
                disk_over = False

            # The detailed snapshot has its own threshold and cooldown.
            if usage >= snapshot_warn and now - snapshot_sent_at >= snapshot_cooldown:
                report = await build_disk_report(cfg, mount or "/", usage)
                await notify(
                    bot,
                    chat_id,
                    f"📦 Disk snapshot\n\n{report}",
                    level="info",
                    key="disk_snapshot",
                    category="disk",
                )
                snapshot_sent_at = now
        elif not disk_unavailable or now - disk_na_sent_at >= cooldown:
            # No usage figure could be obtained at all.
            await notify(
                bot,
                chat_id,
                "⚠️ Disk usage n/a",
                level="warn",
                key="disk_na",
                category="disk",
            )
            disk_unavailable = True
            disk_na_sent_at = now

        # ---- load average -----------------------------------------------
        load = psutil.getloadavg()[0]
        # 2 = critical, 1 = warning, 0 = normal. In critical-only mode a
        # warning-level load is treated as normal.
        if load >= high_warn:
            severity = 2
        elif load >= load_warn and not load_only_critical:
            severity = 1
        else:
            severity = 0

        if severity == 0:
            if prev_severity > 0 and announce_recovery:
                await notify(
                    bot,
                    chat_id,
                    f"🟢 Load OK: {load:.2f}",
                    level="info",
                    key="load_ok",
                    category="load",
                )
        elif severity != prev_severity or now - load_sent_at >= cooldown:
            if severity == 2:
                icon, level_name, key = "🔴", "critical", "load_high_crit"
            else:
                icon, level_name, key = "🟡", "warn", "load_high_warn"
            await notify(
                bot,
                chat_id,
                f"{icon} Load high: {load:.2f}",
                level=level_name,
                key=key,
                category="load",
            )
            load_sent_at = now
            if severity == 2 and auto_mute_high_load_sec > 0:
                # Imported lazily, as in the original code.
                from services.alert_mute import set_mute

                set_mute("load", auto_mute_high_load_sec)
        prev_severity = severity

        await asyncio.sleep(interval)
|
|
|
|
|
|
async def monitor_smart(cfg, notify, bot, chat_id):
    """Poll SMART health and temperature of every disk forever.

    Each cycle iterates ``list_disks()``, queries ``smart_health(dev)`` and
    ``disk_temperature(dev)`` and sends at most one alert per device:

    * ``critical`` / key ``smart_fail:<dev>`` when the health string contains
      ``"FAILED"``;
    * otherwise ``warn`` / key ``smart_hot:<dev>`` when the temperature parses
      as an integer that is >= ``alerts.smart_temp_warn``.

    Repeats of the same alert (same kind, same device) are suppressed for
    ``alerts.smart_cooldown_sec`` seconds. The cooldown is tracked per alert
    key, not per reading, so a fluctuating temperature cannot bypass it and
    the bookkeeping dict stays bounded by the number of devices.

    Args:
        cfg: Mapping with an optional ``alerts`` sub-mapping providing
            ``smart_interval_sec`` (default 3600), ``smart_cooldown_sec``
            (default 6 h) and ``smart_temp_warn`` (default 50).
        notify: Awaitable callable invoked as
            ``notify(bot, chat_id, text, level=..., key=..., category=...)``.
        bot: Passed through to ``notify`` untouched.
        chat_id: Passed through to ``notify`` untouched.

    The coroutine never returns on its own; it runs until it is cancelled or
    one of the calls it makes raises.
    """
    alerts_cfg = cfg.get("alerts", {})
    interval = int(alerts_cfg.get("smart_interval_sec", 3600))
    cooldown = int(alerts_cfg.get("smart_cooldown_sec", 6 * 3600))
    temp_warn = int(alerts_cfg.get("smart_temp_warn", 50))

    # alert key -> time.monotonic() value when that alert was last sent.
    # A missing entry means "never sent"; a 0 default would be wrong with a
    # monotonic clock, whose origin is arbitrary.
    last_sent = {}

    while True:
        for dev in list_disks():
            # NOTE(review): both helpers are assumed to return strings
            # (temperature like "45°C" or "n/a") -- confirm in system_checks.
            health = smart_health(dev)
            temp = disk_temperature(dev)

            if "FAILED" in health:
                alert_key = f"smart_fail:{dev}"
                level = "critical"
                text = f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}"
            else:
                degrees = None
                if temp != "n/a":
                    try:
                        degrees = int(temp.replace("°C", ""))
                    except ValueError:
                        # Unparseable reading: treat as "no temperature".
                        degrees = None
                if degrees is None or degrees < temp_warn:
                    continue
                alert_key = f"smart_hot:{dev}"
                level = "warn"
                text = f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}"

            # Monotonic clock: cooldowns must not be affected by wall-clock
            # adjustments.
            now = time.monotonic()
            previous = last_sent.get(alert_key)
            if previous is not None and now - previous < cooldown:
                continue

            await notify(
                bot,
                chat_id,
                text,
                level=level,
                key=alert_key,
                category="smart",
            )
            last_sent[alert_key] = now

        await asyncio.sleep(interval)
|