Add dedicated RAID alert category and monitor

Detect md arrays via /proc/mdstat for RAID status
Fix md RAID detection for lsblk raid* types
2026-02-25 01:43:10 +03:00 · 2026-02-25 01:39:11 +03:00 · 2026-02-25 01:36:59 +03:00 · 2026-02-25 01:32:55 +03:00 · 2026-02-15 01:25:11 +03:00
14 changed files with 277 additions and 23 deletions
--- a/CONFIG.en.md
+++ b/CONFIG.en.md
@@ -33,7 +33,7 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
  - `end` (string): End time `HH:MM` (e.g. `08:00`).
  - `allow_critical` (bool): Allow critical alerts during quiet hours.
 - `auto_mute` (list): Per-category auto mutes by time window.
-  - `category` (string): load/disk/smart/ssl/docker/test.
+  - `category` (string): load/disk/smart/raid/ssl/docker/test.
  - `start` (string): Start `HH:MM`.
  - `end` (string): End `HH:MM` (can wrap over midnight).
 - `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
@@ -42,6 +42,9 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
 - `smart_interval_sec` (int): SMART poll interval.
 - `smart_cooldown_sec` (int): SMART alert cooldown.
 - `smart_temp_warn` (int): SMART temperature warning (C).
+- `raid_enabled` (bool): Enable md RAID polling via `/proc/mdstat` (default `true`).
+- `raid_interval_sec` (int): RAID poll interval in seconds (default 300).
+- `raid_cooldown_sec` (int): Minimum seconds between repeated alerts for the same array (default 1800).

 ## disk_report

--- a/CONFIG.md
+++ b/CONFIG.md
@@ -33,7 +33,7 @@
  - `end` (string): конец, формат `HH:MM` (например `08:00`).
  - `allow_critical` (bool): слать критичные алерты в тишину.
 - `auto_mute` (list): авто‑мьюты по категориям и времени.
-  - `category` (string): load/disk/smart/ssl/docker/test.
+  - `category` (string): load/disk/smart/raid/ssl/docker/test.
  - `start` (string): начало `HH:MM`.
  - `end` (string): конец `HH:MM` (интервал может пересекать ночь).
 - `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
@@ -42,6 +42,9 @@
 - `smart_interval_sec` (int): интервал SMART.
 - `smart_cooldown_sec` (int): кулдаун SMART.
 - `smart_temp_warn` (int): порог температуры (C).
+- `raid_enabled` (bool): RAID проверки (`/proc/mdstat`).
+- `raid_interval_sec` (int): интервал RAID.
+- `raid_cooldown_sec` (int): кулдаун RAID алертов.

 ## disk_report

--- a/config.example.yaml
+++ b/config.example.yaml
@@ -43,6 +43,9 @@ alerts:
  smart_interval_sec: 3600
  smart_cooldown_sec: 21600
  smart_temp_warn: 50
+  raid_enabled: true
+  raid_interval_sec: 300
+  raid_cooldown_sec: 1800

 disk_report:
  threshold: 90
--- a/handlers/alerts_admin.py
+++ b/handlers/alerts_admin.py
@@ -16,7 +16,7 @@ HELP_TEXT = (
    "/alerts unmute <category> - unmute category\n"
    "/alerts list - show active mutes\n"
    "/alerts recent [hours] - show incidents log (default 24h)\n"
-    "Categories: load, disk, smart, ssl, docker, test\n"
+    "Categories: load, disk, smart, raid, ssl, docker, test\n"
 )


--- a/handlers/callbacks.py
+++ b/handlers/callbacks.py
@@ -67,7 +67,7 @@ async def snapshot_details(cb: CallbackQuery):
    snap_id = cb.data.split(":", 1)[1]
    await cb.answer("Loading snapshot…")

-    # РїРѕР»СѓС‡Р°РµРј СЃС‚Р°С‚РёСЃС‚РёРєСѓ snapshot
+    # получаем статистику snapshot
    rc, raw = await run_cmd(
        ["restic", "stats", snap_id, "--json"],
        use_restic_env=True,
--- a/handlers/help.py
+++ b/handlers/help.py
@@ -24,7 +24,7 @@ HELP_PAGES = [
        "• `/alerts mute <cat> <minutes>` / `/alerts unmute <cat>` / `/alerts list`\n"
        "• `/alerts recent [hours]`\n"
        "Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n"
-        "Категории: load, disk, smart, ssl, docker, test.\n"
+        "Категории: load, disk, smart, raid, ssl, docker, test.\n"
        "Quiet hours: `alerts.quiet_hours` для не‑критичных.\n"
        "Авто-мьют: `alerts.auto_mute` со слотами времени.\n"
        "Только красные load: `alerts.load_only_critical: true`.\n"
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@ from datetime import datetime
 from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS
 from keyboards import menu_kb
 from services.docker import discover_containers, docker_watchdog
-from services.alerts import monitor_resources, monitor_smart
+from services.alerts import monitor_resources, monitor_smart, monitor_raid
 from services.metrics import MetricsStore, start_sampler
 from services.queue import worker as queue_worker, configure as queue_configure
 from services.notify import notify
@@ -82,6 +82,8 @@ async def main():
        asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
    if cfg.get("alerts", {}).get("smart_enabled", True):
        asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
+    if cfg.get("alerts", {}).get("raid_enabled", True):
+        asyncio.create_task(monitor_raid(cfg, notify, bot, ADMIN_ID))
    if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
        asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
    if cfg.get("external_checks", {}).get("enabled", True):
--- a/services/alerts.py
+++ b/services/alerts.py
@@ -1,7 +1,7 @@
 import asyncio
 import time
 import psutil
-from system_checks import list_disks, smart_health, disk_temperature
+from system_checks import list_disks, smart_health, disk_temperature, list_md_arrays, md_array_status
 from services.system import worst_disk_usage
 from services.disk_report import build_disk_report

@@ -130,3 +130,54 @@ async def monitor_smart(cfg, notify, bot, chat_id):
                    continue

        await asyncio.sleep(interval)
+
+
+async def monitor_raid(cfg, notify, bot, chat_id):
+    """Poll md RAID arrays forever and alert when an array's health changes.
+
+    Args:
+        cfg: Config mapping. Reads ``alerts.raid_interval_sec`` (default 300),
+            ``alerts.raid_cooldown_sec`` (default 1800) and
+            ``alerts.notify_recovery`` (default True).
+        notify: Awaitable invoked as ``notify(bot, chat_id, text, level=...,
+            key=..., category="raid")``.
+        bot: Passed through to ``notify`` unchanged.
+        chat_id: Passed through to ``notify`` unchanged.
+
+    A status containing "inactive" is reported as ``critical`` and one
+    containing "degraded" as ``warn``. A problem is re-announced once the
+    cooldown has elapsed, and immediately when the kind of problem changes
+    (e.g. degraded -> inactive). A status that is neither a problem nor
+    "active" means the state could not be read; it is ignored so that a
+    failed read is never announced as a recovery.
+    """
+    alerts_cfg = cfg.get("alerts", {})
+    interval = int(alerts_cfg.get("raid_interval_sec", 300))
+    cooldown = int(alerts_cfg.get("raid_cooldown_sec", 1800))
+    notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
+
+    last_sent: dict[str, float] = {}
+    # Last problem announced per array ("inactive"/"degraded"); absent = healthy.
+    bad_state: dict[str, str] = {}
+
+    while True:
+        now = time.time()
+        for dev in list_md_arrays():
+            status = md_array_status(dev)
+            lower = status.lower()
+            # "inactive" must be tested before "active" (it contains it).
+            if "inactive" in lower:
+                level, key_suffix = "critical", "inactive"
+            elif "degraded" in lower:
+                level, key_suffix = "warn", "degraded"
+            elif "active" in lower:
+                level, key_suffix = None, None
+            else:
+                # Unknown status (mdstat unreadable / array not listed):
+                # keep the previous state rather than faking a recovery.
+                continue
+
+            if level:
+                changed = bad_state.get(dev) != key_suffix
+                cooled_down = now - last_sent.get(dev, 0.0) >= cooldown
+                # A change of problem kind bypasses the cooldown so that an
+                # escalation to "inactive" is not delayed by an earlier warn.
+                if changed or cooled_down:
+                    icon = "🔴" if level == "critical" else "🟡"
+                    await notify(
+                        bot,
+                        chat_id,
+                        f"{icon} RAID {dev}: {status}",
+                        level=level,
+                        key=f"raid_{key_suffix}:{dev}",
+                        category="raid",
+                    )
+                    last_sent[dev] = now
+                bad_state[dev] = key_suffix
+            else:
+                was_bad = bad_state.pop(dev, None) is not None
+                if was_bad and notify_recovery:
+                    await notify(
+                        bot,
+                        chat_id,
+                        f"🟢 RAID {dev}: {status}",
+                        level="info",
+                        key=f"raid_ok:{dev}",
+                        category="raid",
+                    )
+
+        await asyncio.sleep(interval)
--- a/services/health.py
+++ b/services/health.py
@@ -37,7 +37,7 @@ def _npm_api_base(cfg) -> str | None:


 def health(cfg, container_map: dict | None = None) -> str:
-    lines = ["рџ©є Health check\n"]
+    lines = ["🩺 Health check\n"]
    thresholds = cfg.get("thresholds", {})
    disk_warn = int(thresholds.get("disk_warn", 80))
    load_warn = float(thresholds.get("load_warn", 2.0))
@@ -45,9 +45,9 @@ def health(cfg, container_map: dict | None = None) -> str:
        env = os.environ.copy()
        env.update(RESTIC_ENV)
        subprocess.check_output(["restic", "snapshots"], timeout=10, env=env)
-        lines.append("рџџў Backup repo reachable")
+        lines.append("🟢 Backup repo reachable")
    except Exception:
-        lines.append("рџ”ґ Backup repo unreachable")
+        lines.append("🔴 Backup repo unreachable")

    containers = container_map if container_map is not None else _containers_from_cfg(cfg)
    for alias, real in containers.items():
@@ -55,20 +55,20 @@ def health(cfg, container_map: dict | None = None) -> str:
            f"docker inspect -f '{{{{.State.Status}}}}' {real}"
        )
        if out.strip() != "running":
-            lines.append(f"рџ”ґ {alias} down")
+            lines.append(f"🔴 {alias} down")
        else:
-            lines.append(f"рџџў {alias} OK")
+            lines.append(f"🟢 {alias} OK")

    npm_cfg = cfg.get("npmplus", {})
    npm_base = _npm_api_base(cfg)
    if npm_base:
        npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
        if npm_status == 200:
-            lines.append("рџџў NPMplus API OK")
+            lines.append("🟢 NPMplus API OK")
        elif npm_status is None:
-            lines.append("рџ”ґ NPMplus API unreachable")
+            lines.append("🔴 NPMplus API unreachable")
        else:
-            lines.append(f"рџџЎ NPMplus API HTTP {npm_status}")
+            lines.append(f"🟡 NPMplus API HTTP {npm_status}")

    g_cfg = cfg.get("gitea", {})
    g_base = (g_cfg.get("base_url") or "").rstrip("/")
@@ -84,22 +84,22 @@ def health(cfg, container_map: dict | None = None) -> str:
                g_status = status
                break
        if g_status == 200:
-            lines.append("рџџў Gitea API OK")
+            lines.append("🟢 Gitea API OK")
        elif g_status is None:
-            lines.append("рџ”ґ Gitea API unreachable")
+            lines.append("🔴 Gitea API unreachable")
        else:
-            lines.append(f"рџџЎ Gitea API HTTP {g_status}")
+            lines.append(f"🟡 Gitea API HTTP {g_status}")

    usage, mount = worst_disk_usage()
    if usage is None:
-        lines.append("вљ пёЏ Disk n/a")
+        lines.append("⚠️ Disk n/a")
    elif usage > disk_warn:
-        lines.append(f"рџџЎ Disk {usage}% ({mount})")
+        lines.append(f"🟡 Disk {usage}% ({mount})")
    else:
-        lines.append(f"рџџў Disk {usage}% ({mount})")
+        lines.append(f"🟢 Disk {usage}% ({mount})")

    load = psutil.getloadavg()[0]
-    lines.append(f"{'рџџў' if load < load_warn else 'рџџЎ'} Load {load}")
+    lines.append(f"{'🟢' if load < load_warn else '🟡'} Load {load}")

    return "\n".join(lines)

--- a/system_checks.py
+++ b/system_checks.py
@@ -1,5 +1,6 @@
 import subprocess
 import os
+import re


 def _cmd(cmd: str) -> str:
@@ -82,6 +83,62 @@ def list_disks() -> list[str]:
    return disks


+def list_md_arrays() -> list[str]:
+    """Return the sorted ``/dev/...`` paths of md RAID arrays on this host.
+
+    Arrays are read from ``/proc/mdstat`` first; only when that yields
+    nothing is the ``/dev/md*`` listing used as a fallback. Returns an
+    empty list when no array is found either way.
+    """
+    # Prefer /proc/mdstat: it reliably lists active md arrays
+    # even when lsblk tree/filters differ across distros.
+    out = _cmd("cat /proc/mdstat")
+    arrays: set[str] = set()
+    for line in out.splitlines():
+        # Kernel array names are not always numeric: partitionable arrays
+        # show up as md_d0 and named arrays as md_<name>, so accept any
+        # word characters after the "md" prefix.
+        m = re.match(r"^\s*(md\w+)\s*:", line)
+        if m:
+            arrays.add(f"/dev/{m.group(1)}")
+
+    if arrays:
+        return sorted(arrays)
+
+    # Fallback for environments where mdstat parsing is unavailable.
+    # Kept numeric-only on purpose: /dev also holds partition nodes such as
+    # md0p1, which must not be mistaken for arrays.
+    out = _cmd("ls -1 /dev/md* 2>/dev/null")
+    for line in out.splitlines():
+        dev = line.strip()
+        if dev and re.match(r"^/dev/md\d+$", dev):
+            arrays.add(dev)
+    return sorted(arrays)
+
+
+def md_array_status(dev: str) -> str:
+    """Summarise the health of one md array as read from ``/proc/mdstat``.
+
+    Args:
+        dev: Array device path or bare name (``/dev/md0`` or ``md0``); only
+            the last path component is used for the lookup.
+
+    Returns:
+        One of ``"🟢 active"``, ``"🟡 degraded"``, ``"🔴 inactive"``,
+        ``"⚠️ not found in /proc/mdstat"`` or ``"⚠️ n/a"`` (mdstat output
+        empty or containing ``ERROR:`` - presumably the failure marker
+        produced by ``_cmd``; confirm against its implementation).
+    """
+    out = _cmd("cat /proc/mdstat")
+    if not out or "ERROR:" in out:
+        return "⚠️ n/a"
+
+    name = dev.rsplit("/", 1)[-1]
+    lines = out.splitlines()
+    header = None
+    idx = -1
+    for i, line in enumerate(lines):
+        s = line.strip()
+        # The trailing " :" keeps md1 from matching the md10 header.
+        if s.startswith(f"{name} :"):
+            header = s
+            idx = i
+            break
+
+    if not header:
+        return "⚠️ not found in /proc/mdstat"
+
+    if "inactive" in header:
+        return "🔴 inactive"
+
+    # An array's entry runs from its header to the next blank line.
+    block = [header]
+    for line in lines[idx + 1:]:
+        if not line.strip():
+            break
+        block.append(line.strip())
+    block_text = " ".join(block)
+    # Member map such as [UU] / [U_] / [UU_U]: "U" is an up member, "_" a
+    # missing one. Any "_" at any position means the array is degraded;
+    # checking only the first two slots would miss e.g. [UU_].
+    if re.search(r"\[[U_]*_[U_]*\]", block_text):
+        return "🟡 degraded"
+    return "🟢 active"
+
+
 def smart_health(dev: str) -> str:
    out = _cmd(f"smartctl -H {dev}")

@@ -138,8 +195,9 @@ def smart_last_test(dev: str) -> str:

 def disks() -> str:
    disks = list_disks()
+    md_arrays = list_md_arrays()

-    if not disks:
+    if not disks and not md_arrays:
        return "💽 Disks\n\n❌ No disks found"

    lines = ["💽 Disks (SMART)\n"]
@@ -158,6 +216,12 @@ def disks() -> str:

        lines.append(f"{icon} {d} — {health}, 🌡 {temp}")

+    if md_arrays:
+        lines.append("")
+        lines.append("🧱 RAID (md)")
+        for md in md_arrays:
+            lines.append(f"{md} — {md_array_status(md)}")
+
    return "\n".join(lines)


--- a/tests/test_config_check.py
+++ b/tests/test_config_check.py
@@ -0,0 +1,20 @@
+import unittest
+
+from services.config_check import validate_cfg
+
+
+class ConfigCheckTests(unittest.TestCase):
+    """Checks for ``validate_cfg``."""
+
+    def test_admin_ids_without_admin_id_is_valid(self):
+        """A telegram section with only ``admin_ids`` yields no errors."""
+        telegram_section = {"token": "x", "admin_ids": [1, 2]}
+        result = validate_cfg({"telegram": telegram_section})
+        errors, warnings = result
+        self.assertEqual(errors, [])
+        self.assertIsInstance(warnings, list)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_disk_report.py
+++ b/tests/test_disk_report.py
@@ -0,0 +1,21 @@
+import unittest
+import types
+import sys
+
+# Avoid runtime import of real app/aiogram in services.runner.
+sys.modules.setdefault("app", types.SimpleNamespace(RESTIC_ENV={}))
+
+from services.disk_report import _top_dirs_cmd
+
+
+class DiskReportTests(unittest.TestCase):
+    """Checks for the ``du`` command built by ``_top_dirs_cmd``."""
+
+    def test_top_dirs_cmd_uses_exec_args_without_shell(self):
+        """The command is an argv list for ``du`` with no shell wrapper."""
+        target = "/tmp/path with spaces"
+        argv = _top_dirs_cmd(target, 5)
+        self.assertEqual(argv[:4], ["du", "-x", "-h", "-d"])
+        # No shell wrapper tokens anywhere in the argv.
+        for shell_token in ("bash", "-lc"):
+            self.assertNotIn(shell_token, argv)
+        # The path stays one argument even though it contains spaces.
+        self.assertEqual(argv[-1], target)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_queue.py
+++ b/tests/test_queue.py
@@ -0,0 +1,59 @@
+import asyncio
+import tempfile
+import unittest
+
+from services import runtime_state
+from services import queue as queue_service
+
+
+class QueueTests(unittest.IsolatedAsyncioTestCase):
+    """Async tests for the queue worker."""
+
+    async def asyncSetUp(self):
+        """Point runtime state at a temp file and reset queue module state."""
+        self.tmp = tempfile.TemporaryDirectory()
+        runtime_state.configure(f"{self.tmp.name}/runtime.json")
+
+        queue_service._pending.clear()  # type: ignore[attr-defined]
+        queue_service._history.clear()  # type: ignore[attr-defined]
+        fresh_stats = dict(
+            processed=0,
+            avg_wait_sec=0.0,
+            avg_runtime_sec=0.0,
+            last_label="",
+            last_finished_at=0.0,
+        )
+        queue_service._stats = fresh_stats  # type: ignore[attr-defined]
+        queue_service._cfg = {"incidents": {"enabled": True}}  # type: ignore[attr-defined]
+
+    async def asyncTearDown(self):
+        """Remove the temporary directory created in ``asyncSetUp``."""
+        self.tmp.cleanup()
+
+    async def test_worker_logs_failed_job_to_incidents(self):
+        """A job that raises is counted as processed and logged as an incident."""
+        incidents = []
+
+        def record_incident(cfg, text, category=None):
+            incidents.append((text, category))
+
+        async def failing_job():
+            raise RuntimeError("boom")
+
+        # Swap in the recorder; the real logger is restored in ``finally``.
+        real_log_incident = queue_service.log_incident
+        queue_service.log_incident = record_incident
+
+        worker_task = asyncio.create_task(queue_service.worker())
+        try:
+            await queue_service.enqueue("broken-job", failing_job)
+            drained = queue_service._queue.join()  # type: ignore[attr-defined]
+            await asyncio.wait_for(drained, timeout=2.0)
+        finally:
+            worker_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await worker_task
+            queue_service.log_incident = real_log_incident
+
+        processed = queue_service._stats.get("processed")  # type: ignore[attr-defined]
+        texts = [text for text, _category in incidents]
+        categories = [category for _text, category in incidents]
+        self.assertEqual(processed, 1)
+        self.assertTrue(any("queue_job_failed label=broken-job" in text for text in texts))
+        self.assertTrue(any(category == "queue" for category in categories))
+
+
+import contextlib
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_runtime_state.py
+++ b/tests/test_runtime_state.py
@@ -0,0 +1,28 @@
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from services import runtime_state
+
+
+class RuntimeStateTests(unittest.TestCase):
+    """Persistence checks for ``services.runtime_state``."""
+
+    def test_set_and_get_persist_between_loads(self):
+        """A stored value survives an in-memory reset and is on disk as JSON."""
+        expected = {"bar": 1}
+        with tempfile.TemporaryDirectory() as workdir:
+            state_file = Path(workdir) / "runtime.json"
+            runtime_state.configure(str(state_file))
+
+            runtime_state.set_state("foo", {"bar": 1})
+            self.assertEqual(runtime_state.get("foo"), expected)
+
+            # Drop the in-memory copy so the next read has to reload from disk.
+            runtime_state._STATE = {}  # type: ignore[attr-defined]
+            runtime_state._LOADED = False  # type: ignore[attr-defined]
+            self.assertEqual(runtime_state.get("foo"), expected)
+
+            on_disk = json.loads(state_file.read_text(encoding="utf-8"))
+            self.assertEqual(on_disk.get("foo"), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
Author	SHA1	Message	Date
benya	b84107463c	Add dedicated RAID alert category and monitor	2026-02-25 01:43:10 +03:00
benya	ee361abb99	Detect md arrays via /proc/mdstat for RAID status	2026-02-25 01:39:11 +03:00
benya	2ad423fb6a	Fix md RAID detection for lsblk raid* types	2026-02-25 01:36:59 +03:00
benya	efa5dd9644	Fix mojibake text and add md RAID checks	2026-02-25 01:32:55 +03:00
benya	678332e6d0	Add lightweight unittest coverage for stability fixes	2026-02-15 01:25:11 +03:00