diff --git a/CONFIG.en.md b/CONFIG.en.md index e859f92..c82b1c8 100644 --- a/CONFIG.en.md +++ b/CONFIG.en.md @@ -33,7 +33,7 @@ This project uses `config.yaml`. Start from `config.example.yaml`. - `end` (string): End time `HH:MM` (e.g. `08:00`). - `allow_critical` (bool): Allow critical alerts during quiet hours. - `auto_mute` (list): Per-category auto mutes by time window. - - `category` (string): load/disk/smart/ssl/docker/test. + - `category` (string): load/disk/smart/raid/ssl/docker/test. - `start` (string): Start `HH:MM`. - `end` (string): End `HH:MM` (can wrap over midnight). - `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables). @@ -42,6 +42,9 @@ This project uses `config.yaml`. Start from `config.example.yaml`. - `smart_interval_sec` (int): SMART poll interval. - `smart_cooldown_sec` (int): SMART alert cooldown. - `smart_temp_warn` (int): SMART temperature warning (C). +- `raid_enabled` (bool): Enable md RAID polling (`/proc/mdstat`). +- `raid_interval_sec` (int): RAID poll interval. +- `raid_cooldown_sec` (int): RAID alert cooldown. ## disk_report diff --git a/CONFIG.md b/CONFIG.md index f16f7fc..dcbbc9b 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -33,7 +33,7 @@ - `end` (string): конец, формат `HH:MM` (например `08:00`). - `allow_critical` (bool): слать критичные алерты в тишину. - `auto_mute` (list): авто‑мьюты по категориям и времени. - - `category` (string): load/disk/smart/ssl/docker/test. + - `category` (string): load/disk/smart/raid/ssl/docker/test. - `start` (string): начало `HH:MM`. - `end` (string): конец `HH:MM` (интервал может пересекать ночь). - `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл). @@ -42,6 +42,9 @@ - `smart_interval_sec` (int): интервал SMART. - `smart_cooldown_sec` (int): кулдаун SMART. - `smart_temp_warn` (int): порог температуры (C). +- `raid_enabled` (bool): RAID проверки (`/proc/mdstat`). +- `raid_interval_sec` (int): интервал RAID. +- `raid_cooldown_sec` (int): кулдаун RAID алертов. ## disk_report diff --git a/config.example.yaml b/config.example.yaml index 81f3607..871e76f 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -43,6 +43,9 @@ alerts: smart_interval_sec: 3600 smart_cooldown_sec: 21600 smart_temp_warn: 50 + raid_enabled: true + raid_interval_sec: 300 + raid_cooldown_sec: 1800 disk_report: threshold: 90 diff --git a/handlers/alerts_admin.py b/handlers/alerts_admin.py index 82419c3..ab52403 100644 --- a/handlers/alerts_admin.py +++ b/handlers/alerts_admin.py @@ -16,7 +16,7 @@ HELP_TEXT = ( "/alerts unmute - unmute category\n" "/alerts list - show active mutes\n" "/alerts recent [hours] - show incidents log (default 24h)\n" - "Categories: load, disk, smart, ssl, docker, test\n" + "Categories: load, disk, smart, raid, ssl, docker, test\n" ) diff --git a/handlers/help.py b/handlers/help.py index 26ca79d..e4ed13d 100644 --- a/handlers/help.py +++ b/handlers/help.py @@ -24,7 +24,7 @@ HELP_PAGES = [ "• `/alerts mute ` / `/alerts unmute ` / `/alerts list`\n" "• `/alerts recent [hours]`\n" "Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n" - "Категории: load, disk, smart, ssl, docker, test.\n" + "Категории: load, disk, smart, raid, ssl, docker, test.\n" "Quiet hours: `alerts.quiet_hours` для не‑критичных.\n" "Авто-мьют: `alerts.auto_mute` со слотами времени.\n" "Только красные load: `alerts.load_only_critical: true`.\n" diff --git a/main.py b/main.py index d1974ed..8264de1 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ from datetime import datetime from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS from keyboards import menu_kb from services.docker import discover_containers, docker_watchdog -from services.alerts import monitor_resources, monitor_smart +from services.alerts import monitor_resources, monitor_smart, monitor_raid from services.metrics import MetricsStore, start_sampler from services.queue import worker as queue_worker, configure as queue_configure from services.notify import notify @@ -82,6 +82,8 @@ async def main(): asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID)) if cfg.get("alerts", {}).get("smart_enabled", True): asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID)) + if cfg.get("alerts", {}).get("raid_enabled", True): + asyncio.create_task(monitor_raid(cfg, notify, bot, ADMIN_ID)) if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True): asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID)) if cfg.get("external_checks", {}).get("enabled", True): diff --git a/services/alerts.py b/services/alerts.py index 1cd6057..0b6e76f 100644 --- a/services/alerts.py +++ b/services/alerts.py @@ -1,7 +1,7 @@ import asyncio import time import psutil -from system_checks import list_disks, smart_health, disk_temperature +from system_checks import list_disks, smart_health, disk_temperature, list_md_arrays, md_array_status from services.system import worst_disk_usage from services.disk_report import build_disk_report @@ -130,3 +130,54 @@ async def monitor_smart(cfg, notify, bot, chat_id): continue await asyncio.sleep(interval) + + +async def monitor_raid(cfg, notify, bot, chat_id): + alerts_cfg = cfg.get("alerts", {}) + interval = int(alerts_cfg.get("raid_interval_sec", 300)) + cooldown = int(alerts_cfg.get("raid_cooldown_sec", 1800)) + notify_recovery = bool(alerts_cfg.get("notify_recovery", True)) + + last_sent: dict[str, float] = {} + bad_state: dict[str, bool] = {} + + while True: + now = time.time() + for dev in list_md_arrays(): + status = md_array_status(dev) + lower = status.lower() + level = None + key_suffix = None + if "inactive" in lower: + level = "critical" + key_suffix = "inactive" + elif "degraded" in lower: + level = "warn" + key_suffix = "degraded" + + if level: + if not bad_state.get(dev) or (now - last_sent.get(dev, 0.0) >= cooldown): + icon = "🔴" if level == "critical" else "🟡" + await notify( + bot, + chat_id, + f"{icon} RAID {dev}: {status}", + level=level, + key=f"raid_{key_suffix}:{dev}", + category="raid", + ) + last_sent[dev] = now + bad_state[dev] = True + else: + if bad_state.get(dev) and notify_recovery: + await notify( + bot, + chat_id, + f"🟢 RAID {dev}: {status}", + level="info", + key=f"raid_ok:{dev}", + category="raid", + ) + bad_state[dev] = False + + await asyncio.sleep(interval)