Add SMART alert scheduler
This commit is contained in:
@@ -17,6 +17,10 @@ alerts:
|
|||||||
interval_sec: 60
|
interval_sec: 60
|
||||||
cooldown_sec: 900
|
cooldown_sec: 900
|
||||||
notify_recovery: true
|
notify_recovery: true
|
||||||
|
smart_enabled: true
|
||||||
|
smart_interval_sec: 3600
|
||||||
|
smart_cooldown_sec: 21600
|
||||||
|
smart_temp_warn: 50
|
||||||
|
|
||||||
docker:
|
docker:
|
||||||
# If true, discover containers by name/label
|
# If true, discover containers by name/label
|
||||||
|
|||||||
4
main.py
4
main.py
@@ -4,7 +4,7 @@ from datetime import datetime
|
|||||||
from app import bot, dp, cfg, ADMIN_ID
|
from app import bot, dp, cfg, ADMIN_ID
|
||||||
from keyboards import menu_kb
|
from keyboards import menu_kb
|
||||||
from services.docker import discover_containers, docker_watchdog
|
from services.docker import discover_containers, docker_watchdog
|
||||||
from services.alerts import monitor_resources
|
from services.alerts import monitor_resources, monitor_smart
|
||||||
from services.notify import notify
|
from services.notify import notify
|
||||||
import state
|
import state
|
||||||
import handlers.menu
|
import handlers.menu
|
||||||
@@ -32,6 +32,8 @@ async def main():
|
|||||||
asyncio.create_task(docker_watchdog(state.DOCKER_MAP, notify, bot, ADMIN_ID))
|
asyncio.create_task(docker_watchdog(state.DOCKER_MAP, notify, bot, ADMIN_ID))
|
||||||
if cfg.get("alerts", {}).get("enabled", True):
|
if cfg.get("alerts", {}).get("enabled", True):
|
||||||
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
||||||
|
if cfg.get("alerts", {}).get("smart_enabled", True):
|
||||||
|
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
||||||
await notify_start()
|
await notify_start()
|
||||||
await dp.start_polling(bot)
|
await dp.start_polling(bot)
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
|
from system_checks import list_disks, smart_health, disk_temperature
|
||||||
from services.system import worst_disk_usage
|
from services.system import worst_disk_usage
|
||||||
|
|
||||||
|
|
||||||
@@ -52,3 +53,39 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
state["load_high"] = False
|
state["load_high"] = False
|
||||||
|
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
async def monitor_smart(cfg, notify, bot, chat_id):
    """Poll SMART health and temperature of every disk forever and send alerts.

    Settings are read once from ``cfg["alerts"]``:

    * ``smart_interval_sec`` -- seconds to sleep between polls (default 3600).
    * ``smart_cooldown_sec`` -- minimum seconds between two alerts of the
      same kind for the same device (default 6 hours).
    * ``smart_temp_warn`` -- temperature at or above which a "hot" alert is
      sent (default 50).

    Args:
        cfg: Configuration mapping; only the ``"alerts"`` section is used.
        notify: Awaitable callable invoked as ``notify(bot, chat_id, text)``.
        bot: Passed through to ``notify`` unchanged.
        chat_id: Passed through to ``notify`` unchanged.

    This coroutine never returns; it is meant to run as a background task.
    """
    alerts_cfg = cfg.get("alerts", {})
    interval = int(alerts_cfg.get("smart_interval_sec", 3600))
    cooldown = int(alerts_cfg.get("smart_cooldown_sec", 6 * 3600))
    temp_warn = int(alerts_cfg.get("smart_temp_warn", 50))

    # Timestamp of the last alert, keyed by (device, alert kind).
    # The key deliberately excludes the raw health/temperature strings:
    # including them would make every new reading (50°C -> 51°C) look like a
    # brand-new alert, defeating the cooldown and growing this dict forever.
    last_sent = {}

    while True:
        # NOTE(review): these helpers are called synchronously; if they shell
        # out to smartctl they block the event loop while running -- confirm
        # and consider asyncio.to_thread if that is a problem.
        for dev in list_disks():
            health = smart_health(dev)
            temp = disk_temperature(dev)

            if "FAILED" in health:
                kind = "fail"
                text = f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}"
            else:
                # Temperature is compared against "n/a" and stripped of a
                # "°C" suffix; anything unparsable is treated as "no reading".
                degrees = None
                if temp != "n/a":
                    try:
                        degrees = int(temp.replace("°C", ""))
                    except ValueError:
                        degrees = None
                if degrees is None or degrees < temp_warn:
                    continue
                kind = "hot"
                text = f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}"

            now = time.time()
            key = (dev, kind)
            # Suppress repeats of the same alert kind for this device until
            # the cooldown has elapsed.
            if last_sent.get(key, 0) + cooldown > now:
                continue

            await notify(bot, chat_id, text)
            last_sent[key] = now

        await asyncio.sleep(interval)
|
||||||
|
|||||||
Reference in New Issue
Block a user