Add dedicated RAID alert category and monitor
This commit is contained in:
@@ -33,7 +33,7 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `end` (string): End time `HH:MM` (e.g. `08:00`).
|
- `end` (string): End time `HH:MM` (e.g. `08:00`).
|
||||||
- `allow_critical` (bool): Allow critical alerts during quiet hours.
|
- `allow_critical` (bool): Allow critical alerts during quiet hours.
|
||||||
- `auto_mute` (list): Per-category auto mutes by time window.
|
- `auto_mute` (list): Per-category auto mutes by time window.
|
||||||
- `category` (string): load/disk/smart/ssl/docker/test.
|
- `category` (string): load/disk/smart/raid/ssl/docker/test.
|
||||||
- `start` (string): Start `HH:MM`.
|
- `start` (string): Start `HH:MM`.
|
||||||
- `end` (string): End `HH:MM` (can wrap over midnight).
|
- `end` (string): End `HH:MM` (can wrap over midnight).
|
||||||
- `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
|
- `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
|
||||||
@@ -42,6 +42,9 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `smart_interval_sec` (int): SMART poll interval.
|
- `smart_interval_sec` (int): SMART poll interval.
|
||||||
- `smart_cooldown_sec` (int): SMART alert cooldown.
|
- `smart_cooldown_sec` (int): SMART alert cooldown.
|
||||||
- `smart_temp_warn` (int): SMART temperature warning (C).
|
- `smart_temp_warn` (int): SMART temperature warning (C).
|
||||||
|
- `raid_enabled` (bool): Enable md RAID polling (`/proc/mdstat`).
|
||||||
|
- `raid_interval_sec` (int): RAID poll interval.
|
||||||
|
- `raid_cooldown_sec` (int): RAID alert cooldown.
|
||||||
|
|
||||||
## disk_report
|
## disk_report
|
||||||
|
|
||||||
|
|||||||
@@ -33,7 +33,7 @@
|
|||||||
- `end` (string): конец, формат `HH:MM` (например `08:00`).
|
- `end` (string): конец, формат `HH:MM` (например `08:00`).
|
||||||
- `allow_critical` (bool): слать критичные алерты в тишину.
|
- `allow_critical` (bool): слать критичные алерты в тишину.
|
||||||
- `auto_mute` (list): авто‑мьюты по категориям и времени.
|
- `auto_mute` (list): авто‑мьюты по категориям и времени.
|
||||||
- `category` (string): load/disk/smart/ssl/docker/test.
|
- `category` (string): load/disk/smart/raid/ssl/docker/test.
|
||||||
- `start` (string): начало `HH:MM`.
|
- `start` (string): начало `HH:MM`.
|
||||||
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
|
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
|
||||||
- `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
|
- `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
|
||||||
@@ -42,6 +42,9 @@
|
|||||||
- `smart_interval_sec` (int): интервал SMART.
|
- `smart_interval_sec` (int): интервал SMART.
|
||||||
- `smart_cooldown_sec` (int): кулдаун SMART.
|
- `smart_cooldown_sec` (int): кулдаун SMART.
|
||||||
- `smart_temp_warn` (int): порог температуры (C).
|
- `smart_temp_warn` (int): порог температуры (C).
|
||||||
|
- `raid_enabled` (bool): включить проверки md RAID (`/proc/mdstat`).
|
||||||
|
- `raid_interval_sec` (int): интервал RAID.
|
||||||
|
- `raid_cooldown_sec` (int): кулдаун RAID алертов.
|
||||||
|
|
||||||
## disk_report
|
## disk_report
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,9 @@ alerts:
|
|||||||
smart_interval_sec: 3600
|
smart_interval_sec: 3600
|
||||||
smart_cooldown_sec: 21600
|
smart_cooldown_sec: 21600
|
||||||
smart_temp_warn: 50
|
smart_temp_warn: 50
|
||||||
|
raid_enabled: true
|
||||||
|
raid_interval_sec: 300
|
||||||
|
raid_cooldown_sec: 1800
|
||||||
|
|
||||||
disk_report:
|
disk_report:
|
||||||
threshold: 90
|
threshold: 90
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ HELP_TEXT = (
|
|||||||
"/alerts unmute <category> - unmute category\n"
|
"/alerts unmute <category> - unmute category\n"
|
||||||
"/alerts list - show active mutes\n"
|
"/alerts list - show active mutes\n"
|
||||||
"/alerts recent [hours] - show incidents log (default 24h)\n"
|
"/alerts recent [hours] - show incidents log (default 24h)\n"
|
||||||
"Categories: load, disk, smart, ssl, docker, test\n"
|
"Categories: load, disk, smart, raid, ssl, docker, test\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ HELP_PAGES = [
|
|||||||
"• `/alerts mute <cat> <minutes>` / `/alerts unmute <cat>` / `/alerts list`\n"
|
"• `/alerts mute <cat> <minutes>` / `/alerts unmute <cat>` / `/alerts list`\n"
|
||||||
"• `/alerts recent [hours]`\n"
|
"• `/alerts recent [hours]`\n"
|
||||||
"Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n"
|
"Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n"
|
||||||
"Категории: load, disk, smart, ssl, docker, test.\n"
|
"Категории: load, disk, smart, raid, ssl, docker, test.\n"
|
||||||
"Quiet hours: `alerts.quiet_hours` для не‑критичных.\n"
|
"Quiet hours: `alerts.quiet_hours` для не‑критичных.\n"
|
||||||
"Авто-мьют: `alerts.auto_mute` со слотами времени.\n"
|
"Авто-мьют: `alerts.auto_mute` со слотами времени.\n"
|
||||||
"Только красные load: `alerts.load_only_critical: true`.\n"
|
"Только красные load: `alerts.load_only_critical: true`.\n"
|
||||||
|
|||||||
4
main.py
4
main.py
@@ -5,7 +5,7 @@ from datetime import datetime
|
|||||||
from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS
|
from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS
|
||||||
from keyboards import menu_kb
|
from keyboards import menu_kb
|
||||||
from services.docker import discover_containers, docker_watchdog
|
from services.docker import discover_containers, docker_watchdog
|
||||||
from services.alerts import monitor_resources, monitor_smart
|
from services.alerts import monitor_resources, monitor_smart, monitor_raid
|
||||||
from services.metrics import MetricsStore, start_sampler
|
from services.metrics import MetricsStore, start_sampler
|
||||||
from services.queue import worker as queue_worker, configure as queue_configure
|
from services.queue import worker as queue_worker, configure as queue_configure
|
||||||
from services.notify import notify
|
from services.notify import notify
|
||||||
@@ -82,6 +82,8 @@ async def main():
|
|||||||
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
||||||
if cfg.get("alerts", {}).get("smart_enabled", True):
|
if cfg.get("alerts", {}).get("smart_enabled", True):
|
||||||
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
||||||
|
if cfg.get("alerts", {}).get("raid_enabled", True):
|
||||||
|
asyncio.create_task(monitor_raid(cfg, notify, bot, ADMIN_ID))
|
||||||
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
||||||
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
||||||
if cfg.get("external_checks", {}).get("enabled", True):
|
if cfg.get("external_checks", {}).get("enabled", True):
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
from system_checks import list_disks, smart_health, disk_temperature
|
from system_checks import list_disks, smart_health, disk_temperature, list_md_arrays, md_array_status
|
||||||
from services.system import worst_disk_usage
|
from services.system import worst_disk_usage
|
||||||
from services.disk_report import build_disk_report
|
from services.disk_report import build_disk_report
|
||||||
|
|
||||||
@@ -130,3 +130,54 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
async def monitor_raid(cfg, notify, bot, chat_id):
    """Poll md RAID arrays forever and send alerts for unhealthy states.

    On every cycle the arrays returned by ``list_md_arrays()`` are inspected
    via ``md_array_status()``. A status text containing "inactive" produces a
    ``critical`` alert, one containing "degraded" produces a ``warn`` alert;
    any other status is treated as healthy.

    An alert for an array is sent when the array has just become unhealthy,
    when the kind of problem changed since the last alert (for example
    degraded -> inactive), or when ``raid_cooldown_sec`` has elapsed since the
    last alert for that array. When an array returns to a healthy status and
    ``alerts.notify_recovery`` is true, a single ``info`` message is sent.

    Args:
        cfg: Configuration mapping; the ``alerts`` section is read for
            ``raid_interval_sec`` (default 300), ``raid_cooldown_sec``
            (default 1800) and ``notify_recovery`` (default True).
        notify: Awaitable callable invoked as
            ``notify(bot, chat_id, text, level=..., key=..., category="raid")``.
        bot: Passed through to ``notify`` unchanged.
        chat_id: Passed through to ``notify`` unchanged.

    This coroutine never returns; it is meant to run as a background task.
    """
    alerts_cfg = cfg.get("alerts", {})
    interval = int(alerts_cfg.get("raid_interval_sec", 300))
    cooldown = int(alerts_cfg.get("raid_cooldown_sec", 1800))
    notify_recovery = bool(alerts_cfg.get("notify_recovery", True))

    # Per-array bookkeeping, keyed by the device name from list_md_arrays().
    last_sent: dict[str, float] = {}   # time of the last problem alert
    bad_state: dict[str, bool] = {}    # whether the array was unhealthy last cycle
    last_kind: dict[str, str] = {}     # problem kind of the last alert sent

    while True:
        now = time.time()
        for dev in list_md_arrays():
            status = md_array_status(dev)
            lower = status.lower()

            # "inactive" is checked first so it wins over "degraded" when a
            # status text mentions both.
            if "inactive" in lower:
                level, key_suffix = "critical", "inactive"
            elif "degraded" in lower:
                level, key_suffix = "warn", "degraded"
            else:
                level, key_suffix = None, None

            if level:
                newly_bad = not bad_state.get(dev)
                # A change of problem kind must not wait for the cooldown:
                # otherwise a degraded -> inactive escalation would stay
                # silent until the previous warn alert's cooldown expires.
                kind_changed = last_kind.get(dev) != key_suffix
                cooled_down = now - last_sent.get(dev, 0.0) >= cooldown
                if newly_bad or kind_changed or cooled_down:
                    icon = "🔴" if level == "critical" else "🟡"
                    await notify(
                        bot,
                        chat_id,
                        f"{icon} RAID {dev}: {status}",
                        level=level,
                        key=f"raid_{key_suffix}:{dev}",
                        category="raid",
                    )
                    last_sent[dev] = now
                    last_kind[dev] = key_suffix
                bad_state[dev] = True
            else:
                if bad_state.get(dev) and notify_recovery:
                    await notify(
                        bot,
                        chat_id,
                        f"🟢 RAID {dev}: {status}",
                        level="info",
                        key=f"raid_ok:{dev}",
                        category="raid",
                    )
                bad_state[dev] = False
                # Forget the last problem kind so the next failure is
                # treated as a fresh incident.
                last_kind.pop(dev, None)

        await asyncio.sleep(interval)
|
||||||
|
|||||||
Reference in New Issue
Block a user