Compare commits
98 Commits
4e79c401a9
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| b84107463c | |||
| ee361abb99 | |||
| 2ad423fb6a | |||
| efa5dd9644 | |||
| 678332e6d0 | |||
| 7c56430f32 | |||
| b54a094185 | |||
| 6d5fb9c258 | |||
| 5099ae4fe2 | |||
| 568cd86844 | |||
| b138ee316d | |||
| fa98a96b34 | |||
| 1dba6d4a0f | |||
| b784deb02b | |||
| 5ae54618e8 | |||
| 3fc99bdcfc | |||
| c1d69adbc5 | |||
| a14fb8fccd | |||
| 4ba8f48228 | |||
| 10bf265c29 | |||
| fd179d24e8 | |||
| 2905528677 | |||
| 2b87ce04a3 | |||
| 02b8e2bb55 | |||
| f0fb2aad0e | |||
| 219776c642 | |||
| 28caa551bd | |||
| 783f4abd98 | |||
| f71c02835a | |||
| f7081b78e1 | |||
| 0fbd374823 | |||
| c3db70160c | |||
| 1b9d260530 | |||
| 040a6c96e4 | |||
| 4f6d6dd549 | |||
| 2e0bf0c6ea | |||
| 5a4234f59d | |||
| 1d24caa2a2 | |||
| c91c961134 | |||
| 75113b6182 | |||
| aa7bd85687 | |||
| ff65e15509 | |||
| 08fa95dffd | |||
| b0a4413671 | |||
| 9399be4168 | |||
| 2e35885a5e | |||
| 4d4e3767bc | |||
| b78dc3cd5c | |||
| 20cd56a8c0 | |||
| 7d251a7078 | |||
| 2ee9756d12 | |||
| 77571da4d9 | |||
| d4a19d309f | |||
| 972c8eb6a7 | |||
| ae2d085214 | |||
| 5da7125fbb | |||
| 65682ca162 | |||
| 8bcc3c6878 | |||
| ab58592523 | |||
| a98292604a | |||
| 97524b92a2 | |||
| 0a761e5799 | |||
| d242dafb9b | |||
| 7db336f2aa | |||
| b4a243e72f | |||
| 01c539fad9 | |||
| 8cec8ae53e | |||
| e36bf49f1c | |||
| a029bbfa7a | |||
| ad8a6bff69 | |||
| 64d899d971 | |||
| 8b08b5418f | |||
| 7a5e3d46cf | |||
| c31a194651 | |||
| 5e01a8d596 | |||
| fc061ece30 | |||
| 0f7f53cb27 | |||
| 857fa86e85 | |||
| ea6ad1d5b2 | |||
| e1b0f1153e | |||
| 054d1d0d50 | |||
| 200b8104a6 | |||
| e7a120657b | |||
| c34a142698 | |||
| 3df9db3bf7 | |||
| aab54d4108 | |||
| 45756636b9 | |||
| 51b24be0be | |||
| 1d7262eb78 | |||
| f7ebdfe325 | |||
| 9ced16cfbd | |||
| c8db1be2d8 | |||
| dbf9b1fd2f | |||
| 118d4bf7f2 | |||
| a7d5fb5459 | |||
| 48dc1f38ac | |||
| 4a00deadc3 | |||
| c51e2d4a59 |
85
CONFIG.en.md
85
CONFIG.en.md
@@ -6,10 +6,12 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
|
|
||||||
- `token` (string, required): Telegram bot token.
|
- `token` (string, required): Telegram bot token.
|
||||||
- `admin_id` (int, required): Telegram user id with admin access.
|
- `admin_id` (int, required): Telegram user id with admin access.
|
||||||
|
- `admin_ids` (list<int>): Optional list of admins (first is primary for alerts).
|
||||||
|
|
||||||
## paths
|
## paths
|
||||||
|
|
||||||
- `artifact_state` (string): JSON file for artifact state.
|
- `artifact_state` (string): JSON file for artifact state.
|
||||||
|
- `runtime_state` (string): File for runtime state (mutes, metrics, etc.).
|
||||||
- `restic_env` (string): Path to a file with RESTIC_* environment variables.
|
- `restic_env` (string): Path to a file with RESTIC_* environment variables.
|
||||||
|
|
||||||
## thresholds
|
## thresholds
|
||||||
@@ -23,11 +25,34 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `enabled` (bool): Enable resource alerts.
|
- `enabled` (bool): Enable resource alerts.
|
||||||
- `interval_sec` (int): Poll interval.
|
- `interval_sec` (int): Poll interval.
|
||||||
- `cooldown_sec` (int): Cooldown between alerts.
|
- `cooldown_sec` (int): Cooldown between alerts.
|
||||||
|
- `notify_cooldown_sec` (int): Global alert dedup cooldown (defaults to `cooldown_sec`).
|
||||||
|
- `load_only_critical` (bool): Only send critical load alerts (no warn/OK).
|
||||||
|
- `quiet_hours` (object): Quiet hours for non‑critical alerts.
|
||||||
|
- `enabled` (bool): Enable quiet hours.
|
||||||
|
- `start` (string): Start time `HH:MM` (e.g. `23:00`).
|
||||||
|
- `end` (string): End time `HH:MM` (e.g. `08:00`).
|
||||||
|
- `allow_critical` (bool): Allow critical alerts during quiet hours.
|
||||||
|
- `auto_mute` (list): Per-category auto mutes by time window.
|
||||||
|
- `category` (string): load/disk/smart/raid/ssl/docker/test.
|
||||||
|
- `start` (string): Start `HH:MM`.
|
||||||
|
- `end` (string): End `HH:MM` (can wrap over midnight).
|
||||||
|
- `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
|
||||||
- `notify_recovery` (bool): Send recovery notifications.
|
- `notify_recovery` (bool): Send recovery notifications.
|
||||||
- `smart_enabled` (bool): Enable SMART health polling.
|
- `smart_enabled` (bool): Enable SMART health polling.
|
||||||
- `smart_interval_sec` (int): SMART poll interval.
|
- `smart_interval_sec` (int): SMART poll interval.
|
||||||
- `smart_cooldown_sec` (int): SMART alert cooldown.
|
- `smart_cooldown_sec` (int): SMART alert cooldown.
|
||||||
- `smart_temp_warn` (int): SMART temperature warning (C).
|
- `smart_temp_warn` (int): SMART temperature warning (C).
|
||||||
|
- `raid_enabled` (bool): Enable md RAID polling (`/proc/mdstat`).
|
||||||
|
- `raid_interval_sec` (int): RAID poll interval.
|
||||||
|
- `raid_cooldown_sec` (int): RAID alert cooldown.
|
||||||
|
|
||||||
|
## disk_report
|
||||||
|
|
||||||
|
- `threshold` (int): Disk usage threshold for auto snapshot.
|
||||||
|
- `cooldown_sec` (int): Cooldown between snapshots.
|
||||||
|
- `top_dirs` (int): How many directories to show.
|
||||||
|
- `docker_dir` (string): Path to docker data.
|
||||||
|
- `logs_dir` (string): Path to logs.
|
||||||
|
|
||||||
## audit
|
## audit
|
||||||
|
|
||||||
@@ -43,6 +68,47 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||||
- `backup_count` (int): How many rotated files to keep.
|
- `backup_count` (int): How many rotated files to keep.
|
||||||
|
|
||||||
|
## logging
|
||||||
|
|
||||||
|
- `enabled` (bool): Enable bot logging.
|
||||||
|
- `path` (string): Log file path. Default `/var/server-bot/bot.log`.
|
||||||
|
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||||
|
- `backup_count` (int): How many rotated files to keep.
|
||||||
|
- `level` (string): Log level (`INFO`, `WARNING`, `ERROR`).
|
||||||
|
|
||||||
|
## safety
|
||||||
|
|
||||||
|
- `dry_run` (bool): If `true`, dangerous actions (upgrade/reboot/backup) are skipped.
|
||||||
|
|
||||||
|
## reports
|
||||||
|
|
||||||
|
- `weekly.enabled` (bool): Enable weekly report.
|
||||||
|
- `weekly.day` (string): Weekday `Mon`..`Sun` (default `Sun`).
|
||||||
|
- `weekly.time` (string): Local time `HH:MM` (default `08:00`).
|
||||||
|
|
||||||
|
## selftest
|
||||||
|
|
||||||
|
- `schedule.enabled` (bool): Enable auto self-test.
|
||||||
|
- `schedule.time` (string): Local time `HH:MM` (default `03:30`).
|
||||||
|
|
||||||
|
## queue
|
||||||
|
|
||||||
|
- `max_pending_alert` (int): Alert if pending tasks >= this value.
|
||||||
|
- `avg_wait_alert` (int): Alert if average wait exceeds N seconds.
|
||||||
|
- `cooldown_sec` (int): Cooldown between queue alerts (default 300s).
|
||||||
|
## external_checks
|
||||||
|
|
||||||
|
- `enabled` (bool): Enable background checks.
|
||||||
|
- `state_path` (string): State file for uptime, default `/var/server-bot/external_checks.json`.
|
||||||
|
- `timeout_sec` (int): Check timeout in seconds.
|
||||||
|
- `interval_sec` (int): Background check interval.
|
||||||
|
- `services` (list): List of checks.
|
||||||
|
- `name` (string): Service name.
|
||||||
|
- `type` (string): `http`, `tcp`, `ping`.
|
||||||
|
- `url` (string): URL for `http`.
|
||||||
|
- `host` (string): Host for `tcp`/`ping`.
|
||||||
|
- `port` (int): Port for `tcp`.
|
||||||
|
|
||||||
## arcane
|
## arcane
|
||||||
|
|
||||||
- `base_url` (string): Arcane API base url.
|
- `base_url` (string): Arcane API base url.
|
||||||
@@ -58,12 +124,31 @@ Used for SSL certificate status.
|
|||||||
- `secret` (string): Login password.
|
- `secret` (string): Login password.
|
||||||
- `token` (string): Optional static token (not recommended if it expires).
|
- `token` (string): Optional static token (not recommended if it expires).
|
||||||
- `verify_tls` (bool): Set to `false` for self-signed TLS.
|
- `verify_tls` (bool): Set to `false` for self-signed TLS.
|
||||||
|
- `alerts.enabled` (bool): Enable expiry notifications.
|
||||||
|
- `alerts.days` (list): Thresholds in days (e.g. 30/14/7/1).
|
||||||
|
- `alerts.cooldown_sec` (int): Cooldown between identical alerts.
|
||||||
|
- `alerts.interval_sec` (int): Check interval.
|
||||||
|
|
||||||
Token flow:
|
Token flow:
|
||||||
|
|
||||||
- First token: `POST /api/tokens` with `identity` and `secret`.
|
- First token: `POST /api/tokens` with `identity` and `secret`.
|
||||||
- Refresh: `GET /api/tokens` using the cached token.
|
- Refresh: `GET /api/tokens` using the cached token.
|
||||||
|
|
||||||
|
## gitea
|
||||||
|
|
||||||
|
- `base_url` (string): Gitea base url, for example `http://localhost:3000`.
|
||||||
|
- `token` (string): Optional API token.
|
||||||
|
- `verify_tls` (bool): Set to `false` for self-signed TLS.
|
||||||
|
|
||||||
|
## openwrt
|
||||||
|
|
||||||
|
- `host` (string): Router address, for example `10.10.10.1`.
|
||||||
|
- `user` (string): SSH user (usually `root`).
|
||||||
|
- `port` (int): SSH port (usually `22`).
|
||||||
|
- `identity_file` (string): Path to SSH key (optional).
|
||||||
|
- `strict_host_key_checking` (bool): Set to `false` to skip key confirmation.
|
||||||
|
- `timeout_sec` (int): SSH request timeout.
|
||||||
|
|
||||||
## security
|
## security
|
||||||
|
|
||||||
- `reboot_password` (string): Password required before reboot.
|
- `reboot_password` (string): Password required before reboot.
|
||||||
|
|||||||
86
CONFIG.md
86
CONFIG.md
@@ -6,10 +6,12 @@
|
|||||||
|
|
||||||
- `token` (string, обяз.): токен бота.
|
- `token` (string, обяз.): токен бота.
|
||||||
- `admin_id` (int, обяз.): Telegram user id администратора.
|
- `admin_id` (int, обяз.): Telegram user id администратора.
|
||||||
|
- `admin_ids` (list<int>): список админов (первый используется как основной для уведомлений).
|
||||||
|
|
||||||
## paths
|
## paths
|
||||||
|
|
||||||
- `artifact_state` (string): JSON файл состояния артефактов.
|
- `artifact_state` (string): JSON файл состояния артефактов.
|
||||||
|
- `runtime_state` (string): файл с runtime-состоянием (мьюты, метрики и т.п.).
|
||||||
- `restic_env` (string): путь к файлу с RESTIC_* переменными.
|
- `restic_env` (string): путь к файлу с RESTIC_* переменными.
|
||||||
|
|
||||||
## thresholds
|
## thresholds
|
||||||
@@ -23,11 +25,34 @@
|
|||||||
- `enabled` (bool): включить алерты.
|
- `enabled` (bool): включить алерты.
|
||||||
- `interval_sec` (int): интервал опроса.
|
- `interval_sec` (int): интервал опроса.
|
||||||
- `cooldown_sec` (int): кулдаун между алертами.
|
- `cooldown_sec` (int): кулдаун между алертами.
|
||||||
|
- `notify_cooldown_sec` (int): глобальный дедуп алертов (по умолчанию `cooldown_sec`).
|
||||||
|
- `load_only_critical` (bool): слать только критичные алерты по нагрузке (без warn/OK).
|
||||||
|
- `quiet_hours` (object): тихие часы для не‑критичных уведомлений.
|
||||||
|
- `enabled` (bool): включить тихие часы.
|
||||||
|
- `start` (string): начало, формат `HH:MM` (например `23:00`).
|
||||||
|
- `end` (string): конец, формат `HH:MM` (например `08:00`).
|
||||||
|
- `allow_critical` (bool): слать критичные алерты в тишину.
|
||||||
|
- `auto_mute` (list): авто‑мьюты по категориям и времени.
|
||||||
|
- `category` (string): load/disk/smart/raid/ssl/docker/test.
|
||||||
|
- `start` (string): начало `HH:MM`.
|
||||||
|
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
|
||||||
|
- `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
|
||||||
- `notify_recovery` (bool): уведомлять о восстановлении.
|
- `notify_recovery` (bool): уведомлять о восстановлении.
|
||||||
- `smart_enabled` (bool): SMART проверки.
|
- `smart_enabled` (bool): SMART проверки.
|
||||||
- `smart_interval_sec` (int): интервал SMART.
|
- `smart_interval_sec` (int): интервал SMART.
|
||||||
- `smart_cooldown_sec` (int): кулдаун SMART.
|
- `smart_cooldown_sec` (int): кулдаун SMART.
|
||||||
- `smart_temp_warn` (int): порог температуры (C).
|
- `smart_temp_warn` (int): порог температуры (C).
|
||||||
|
- `raid_enabled` (bool): RAID проверки (`/proc/mdstat`).
|
||||||
|
- `raid_interval_sec` (int): интервал RAID.
|
||||||
|
- `raid_cooldown_sec` (int): кулдаун RAID алертов.
|
||||||
|
|
||||||
|
## disk_report
|
||||||
|
|
||||||
|
- `threshold` (int): порог диска для авто‑снимка.
|
||||||
|
- `cooldown_sec` (int): кулдаун между снимками.
|
||||||
|
- `top_dirs` (int): сколько директорий показывать.
|
||||||
|
- `docker_dir` (string): путь к docker данным.
|
||||||
|
- `logs_dir` (string): путь к логам.
|
||||||
|
|
||||||
## audit
|
## audit
|
||||||
|
|
||||||
@@ -43,6 +68,48 @@
|
|||||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||||
- `backup_count` (int): сколько файлов хранить.
|
- `backup_count` (int): сколько файлов хранить.
|
||||||
|
|
||||||
|
## logging
|
||||||
|
|
||||||
|
- `enabled` (bool): включить лог бота.
|
||||||
|
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/bot.log`.
|
||||||
|
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||||
|
- `backup_count` (int): сколько файлов хранить.
|
||||||
|
- `level` (string): уровень логирования (`INFO`, `WARNING`, `ERROR`).
|
||||||
|
|
||||||
|
## safety
|
||||||
|
|
||||||
|
- `dry_run` (bool): если `true`, опасные действия (upgrade/reboot/backup) не выполняются.
|
||||||
|
|
||||||
|
## reports
|
||||||
|
|
||||||
|
- `weekly.enabled` (bool): включить еженедельный отчёт.
|
||||||
|
- `weekly.day` (string): день недели (`Mon`..`Sun`), по умолчанию `Sun`.
|
||||||
|
- `weekly.time` (string): локальное время `HH:MM`, по умолчанию `08:00`.
|
||||||
|
|
||||||
|
## selftest
|
||||||
|
|
||||||
|
- `schedule.enabled` (bool): включить авто self-test.
|
||||||
|
- `schedule.time` (string): локальное время `HH:MM`, по умолчанию `03:30`.
|
||||||
|
|
||||||
|
## queue
|
||||||
|
|
||||||
|
- `max_pending_alert` (int): алерт, если задач в очереди >= этому значению.
|
||||||
|
- `avg_wait_alert` (int): алерт, если среднее ожидание превышает N секунд.
|
||||||
|
- `cooldown_sec` (int): кулдаун между алертами очереди, по умолчанию 300с.
|
||||||
|
|
||||||
|
## external_checks
|
||||||
|
|
||||||
|
- `enabled` (bool): включить фоновые проверки.
|
||||||
|
- `state_path` (string): файл состояния для аптайма, по умолчанию `/var/server-bot/external_checks.json`.
|
||||||
|
- `timeout_sec` (int): таймаут проверки в секундах.
|
||||||
|
- `interval_sec` (int): интервал фоновых проверок.
|
||||||
|
- `services` (list): список проверок.
|
||||||
|
- `name` (string): название сервиса.
|
||||||
|
- `type` (string): `http`, `tcp`, `ping`.
|
||||||
|
- `url` (string): URL для `http`.
|
||||||
|
- `host` (string): хост для `tcp`/`ping`.
|
||||||
|
- `port` (int): порт для `tcp`.
|
||||||
|
|
||||||
## arcane
|
## arcane
|
||||||
|
|
||||||
- `base_url` (string): base url API Arcane.
|
- `base_url` (string): base url API Arcane.
|
||||||
@@ -58,12 +125,31 @@
|
|||||||
- `secret` (string): пароль.
|
- `secret` (string): пароль.
|
||||||
- `token` (string): опционально статический токен (не рекомендуется при истечении).
|
- `token` (string): опционально статический токен (не рекомендуется при истечении).
|
||||||
- `verify_tls` (bool): `false` для self-signed TLS.
|
- `verify_tls` (bool): `false` для self-signed TLS.
|
||||||
|
- `alerts.enabled` (bool): включить уведомления по истечению.
|
||||||
|
- `alerts.days` (list): пороги в днях (например 30/14/7/1).
|
||||||
|
- `alerts.cooldown_sec` (int): кулдаун между одинаковыми алертами.
|
||||||
|
- `alerts.interval_sec` (int): интервал проверки.
|
||||||
|
|
||||||
Логика токена:
|
Логика токена:
|
||||||
|
|
||||||
- первый токен: `POST /api/tokens` с `identity` и `secret`.
|
- первый токен: `POST /api/tokens` с `identity` и `secret`.
|
||||||
- refresh: `GET /api/tokens` с текущим токеном.
|
- refresh: `GET /api/tokens` с текущим токеном.
|
||||||
|
|
||||||
|
## gitea
|
||||||
|
|
||||||
|
- `base_url` (string): base url Gitea, например `http://localhost:3000`.
|
||||||
|
- `token` (string): опциональный API токен.
|
||||||
|
- `verify_tls` (bool): `false` для self-signed TLS.
|
||||||
|
|
||||||
|
## openwrt
|
||||||
|
|
||||||
|
- `host` (string): адрес роутера, например `10.10.10.1`.
|
||||||
|
- `user` (string): SSH пользователь (обычно `root`).
|
||||||
|
- `port` (int): SSH порт (обычно `22`).
|
||||||
|
- `identity_file` (string): путь к SSH ключу (опционально).
|
||||||
|
- `strict_host_key_checking` (bool): `false` чтобы не спрашивать подтверждение ключа.
|
||||||
|
- `timeout_sec` (int): таймаут SSH запроса.
|
||||||
|
|
||||||
## security
|
## security
|
||||||
|
|
||||||
- `reboot_password` (string): пароль для подтверждения reboot.
|
- `reboot_password` (string): пароль для подтверждения reboot.
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ Telegram admin bot for Linux servers. Provides quick status checks, backup contr
|
|||||||
- Arcane: list projects, refresh, up/down, restart.
|
- Arcane: list projects, refresh, up/down, restart.
|
||||||
- Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report.
|
- Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report.
|
||||||
- System: disks, security, URLs health, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus).
|
- System: disks, security, URLs health, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus).
|
||||||
- Alerts: disk/load and SMART monitoring with cooldown.
|
- Alerts: disk/load/SMART with cooldown and quiet hours.
|
||||||
- Audit log: all button presses and messages (weekly rotation).
|
- Audit log: all button presses and messages (weekly rotation).
|
||||||
|
- Logs: bot log rotation and incidents.
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
@@ -68,4 +69,5 @@ GNU GPL v3.0. Full text in `LICENSE`.
|
|||||||
|
|
||||||
- For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`.
|
- For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`.
|
||||||
- The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions.
|
- The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions.
|
||||||
|
- Enable `safety.dry_run` if you want a safe mode without actions.
|
||||||
- Audit log default path is `/var/server-bot/audit.log`.
|
- Audit log default path is `/var/server-bot/audit.log`.
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ Telegram-бот администратора для Linux-серверов. Да
|
|||||||
- Arcane: список проектов, refresh, up/down, restart.
|
- Arcane: список проектов, refresh, up/down, restart.
|
||||||
- Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report.
|
- Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report.
|
||||||
- Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus).
|
- Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus).
|
||||||
- Алерты: диск/нагрузка и SMART с cooldown.
|
- Алерты: диск/нагрузка/SMART с cooldown и quiet hours.
|
||||||
- Аудит: все нажатия и сообщения (ротация раз в неделю).
|
- Аудит: все нажатия и сообщения (ротация раз в неделю).
|
||||||
|
- Логи: ротация логов бота и инциденты.
|
||||||
|
|
||||||
## Требования
|
## Требования
|
||||||
|
|
||||||
@@ -68,4 +69,5 @@ GNU GPL v3.0. Полный текст в `LICENSE`.
|
|||||||
|
|
||||||
- Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`.
|
- Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`.
|
||||||
- Бот использует `sudo` для части операций — настрой права.
|
- Бот использует `sudo` для части операций — настрой права.
|
||||||
|
- Включи `safety.dry_run`, если хочешь безопасный режим без действий.
|
||||||
- Аудит по умолчанию пишется в `/var/server-bot/audit.log`.
|
- Аудит по умолчанию пишется в `/var/server-bot/audit.log`.
|
||||||
|
|||||||
15
app.py
15
app.py
@@ -1,13 +1,22 @@
|
|||||||
from aiogram import Bot, Dispatcher
|
from aiogram import Bot, Dispatcher
|
||||||
from config import load_cfg, load_env
|
from config import load_cfg, load_env
|
||||||
|
from services import runtime_state
|
||||||
|
|
||||||
cfg = load_cfg()
|
cfg = load_cfg()
|
||||||
|
|
||||||
TOKEN = cfg["telegram"]["token"]
|
TOKEN = cfg["telegram"]["token"]
|
||||||
ADMIN_ID = cfg["telegram"]["admin_id"]
|
admin_ids_cfg = cfg["telegram"].get("admin_ids")
|
||||||
|
if isinstance(admin_ids_cfg, list) and admin_ids_cfg:
|
||||||
|
ADMIN_IDS = [int(x) for x in admin_ids_cfg]
|
||||||
|
ADMIN_ID = ADMIN_IDS[0]
|
||||||
|
else:
|
||||||
|
ADMIN_ID = int(cfg["telegram"]["admin_id"])
|
||||||
|
ADMIN_IDS = [ADMIN_ID]
|
||||||
|
|
||||||
ARTIFACT_STATE = cfg["paths"]["artifact_state"]
|
paths_cfg = cfg.get("paths", {})
|
||||||
RESTIC_ENV = load_env(cfg["paths"].get("restic_env", "/etc/restic/restic.env"))
|
runtime_state.configure(paths_cfg.get("runtime_state", "/var/server-bot/runtime.json"))
|
||||||
|
ARTIFACT_STATE = paths_cfg.get("artifact_state", "/opt/tg-bot/state.json")
|
||||||
|
RESTIC_ENV = load_env(paths_cfg.get("restic_env", "/etc/restic/restic.env"))
|
||||||
|
|
||||||
DISK_WARN = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
DISK_WARN = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
||||||
LOAD_WARN = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
LOAD_WARN = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
||||||
|
|||||||
6
auth.py
6
auth.py
@@ -1,10 +1,10 @@
|
|||||||
from aiogram.types import Message, CallbackQuery
|
from aiogram.types import Message, CallbackQuery
|
||||||
from app import ADMIN_ID
|
from app import ADMIN_IDS
|
||||||
|
|
||||||
|
|
||||||
def is_admin_msg(msg: Message) -> bool:
|
def is_admin_msg(msg: Message) -> bool:
|
||||||
return msg.from_user and msg.from_user.id == ADMIN_ID
|
return msg.from_user and msg.from_user.id in ADMIN_IDS
|
||||||
|
|
||||||
|
|
||||||
def is_admin_cb(cb: CallbackQuery) -> bool:
|
def is_admin_cb(cb: CallbackQuery) -> bool:
|
||||||
return cb.from_user and cb.from_user.id == ADMIN_ID
|
return cb.from_user and cb.from_user.id in ADMIN_IDS
|
||||||
|
|||||||
@@ -1,10 +1,14 @@
|
|||||||
telegram:
|
telegram:
|
||||||
token: "YOUR_TELEGRAM_BOT_TOKEN"
|
token: "YOUR_TELEGRAM_BOT_TOKEN"
|
||||||
admin_id: 123456789
|
admin_id: 123456789
|
||||||
|
# Optional list of admins (first is primary for alerts)
|
||||||
|
admin_ids:
|
||||||
|
- 123456789
|
||||||
|
|
||||||
paths:
|
paths:
|
||||||
# JSON state file for artifacts
|
# JSON state file for artifacts
|
||||||
artifact_state: "/opt/tg-bot/state.json"
|
artifact_state: "/opt/tg-bot/state.json"
|
||||||
|
runtime_state: "/var/server-bot/runtime.json"
|
||||||
# Optional env file with RESTIC_* variables
|
# Optional env file with RESTIC_* variables
|
||||||
restic_env: "/etc/restic/restic.env"
|
restic_env: "/etc/restic/restic.env"
|
||||||
|
|
||||||
@@ -17,11 +21,38 @@ alerts:
|
|||||||
enabled: true
|
enabled: true
|
||||||
interval_sec: 60
|
interval_sec: 60
|
||||||
cooldown_sec: 900
|
cooldown_sec: 900
|
||||||
|
# Optional global dedup cooldown for notify() calls
|
||||||
|
notify_cooldown_sec: 900
|
||||||
|
# If true, only critical load alerts are sent (no warn/OK)
|
||||||
|
load_only_critical: false
|
||||||
|
# Optional auto-mute windows per category
|
||||||
|
auto_mute:
|
||||||
|
- category: "load"
|
||||||
|
start: "23:00"
|
||||||
|
end: "08:00"
|
||||||
|
# Auto-mute load when critical load fires (seconds)
|
||||||
|
auto_mute_on_high_load_sec: 600
|
||||||
|
quiet_hours:
|
||||||
|
enabled: false
|
||||||
|
start: "23:00"
|
||||||
|
end: "08:00"
|
||||||
|
# Allow critical alerts during quiet hours
|
||||||
|
allow_critical: true
|
||||||
notify_recovery: true
|
notify_recovery: true
|
||||||
smart_enabled: true
|
smart_enabled: true
|
||||||
smart_interval_sec: 3600
|
smart_interval_sec: 3600
|
||||||
smart_cooldown_sec: 21600
|
smart_cooldown_sec: 21600
|
||||||
smart_temp_warn: 50
|
smart_temp_warn: 50
|
||||||
|
raid_enabled: true
|
||||||
|
raid_interval_sec: 300
|
||||||
|
raid_cooldown_sec: 1800
|
||||||
|
|
||||||
|
disk_report:
|
||||||
|
threshold: 90
|
||||||
|
cooldown_sec: 21600
|
||||||
|
top_dirs: 8
|
||||||
|
docker_dir: "/var/lib/docker"
|
||||||
|
logs_dir: "/var/log"
|
||||||
|
|
||||||
audit:
|
audit:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -35,6 +66,47 @@ incidents:
|
|||||||
rotate_when: "W0"
|
rotate_when: "W0"
|
||||||
backup_count: 8
|
backup_count: 8
|
||||||
|
|
||||||
|
logging:
|
||||||
|
enabled: true
|
||||||
|
path: "/var/server-bot/bot.log"
|
||||||
|
rotate_when: "W0"
|
||||||
|
backup_count: 8
|
||||||
|
level: "INFO"
|
||||||
|
|
||||||
|
safety:
|
||||||
|
# If true, dangerous actions will be skipped
|
||||||
|
dry_run: false
|
||||||
|
|
||||||
|
reports:
|
||||||
|
weekly:
|
||||||
|
enabled: false
|
||||||
|
day: "Sun" # Mon/Tue/Wed/Thu/Fri/Sat/Sun
|
||||||
|
time: "08:00" # HH:MM server local time
|
||||||
|
|
||||||
|
selftest:
|
||||||
|
schedule:
|
||||||
|
enabled: false
|
||||||
|
time: "03:30"
|
||||||
|
|
||||||
|
queue:
|
||||||
|
max_pending_alert: 5
|
||||||
|
avg_wait_alert: 120
|
||||||
|
cooldown_sec: 300
|
||||||
|
|
||||||
|
external_checks:
|
||||||
|
enabled: true
|
||||||
|
state_path: "/var/server-bot/external_checks.json"
|
||||||
|
timeout_sec: 5
|
||||||
|
interval_sec: 300
|
||||||
|
services:
|
||||||
|
- name: "example-site"
|
||||||
|
type: "http"
|
||||||
|
url: "https://example.com"
|
||||||
|
- name: "example-ssh"
|
||||||
|
type: "tcp"
|
||||||
|
host: "example.com"
|
||||||
|
port: 22
|
||||||
|
|
||||||
arcane:
|
arcane:
|
||||||
base_url: "http://localhost:3552"
|
base_url: "http://localhost:3552"
|
||||||
api_key: "arc_..."
|
api_key: "arc_..."
|
||||||
@@ -47,6 +119,31 @@ npmplus:
|
|||||||
# Optional static token (not recommended if it expires)
|
# Optional static token (not recommended if it expires)
|
||||||
token: ""
|
token: ""
|
||||||
verify_tls: true
|
verify_tls: true
|
||||||
|
alerts:
|
||||||
|
enabled: true
|
||||||
|
days:
|
||||||
|
- 30
|
||||||
|
- 14
|
||||||
|
- 7
|
||||||
|
- 1
|
||||||
|
cooldown_sec: 86400
|
||||||
|
interval_sec: 3600
|
||||||
|
|
||||||
|
gitea:
|
||||||
|
base_url: "http://localhost:3000"
|
||||||
|
# Optional API token for private instances
|
||||||
|
token: ""
|
||||||
|
verify_tls: true
|
||||||
|
|
||||||
|
openwrt:
|
||||||
|
host: "10.10.10.1"
|
||||||
|
user: "root"
|
||||||
|
port: 22
|
||||||
|
# Optional identity file for SSH
|
||||||
|
identity_file: ""
|
||||||
|
# Disable strict host key checking for auto-accept
|
||||||
|
strict_host_key_checking: false
|
||||||
|
timeout_sec: 8
|
||||||
|
|
||||||
security:
|
security:
|
||||||
reboot_password: "CHANGE_ME"
|
reboot_password: "CHANGE_ME"
|
||||||
|
|||||||
9
deploy.sh
Normal file
9
deploy.sh
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SSH_HOST="root@10.10.10.10"
|
||||||
|
SSH_PORT="1090"
|
||||||
|
APP_DIR="/opt/tg-bot"
|
||||||
|
|
||||||
|
ssh -p "$SSH_PORT" "$SSH_HOST" \
|
||||||
|
"cd \"$APP_DIR\" && git pull --ff-only && systemctl restart tg-bot"
|
||||||
162
handlers/alerts_admin.py
Normal file
162
handlers/alerts_admin.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
import time
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from aiogram import F
|
||||||
|
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||||
|
from app import dp, bot, cfg, ADMIN_ID
|
||||||
|
from auth import is_admin_msg
|
||||||
|
from services.alert_mute import set_mute, clear_mute, list_mutes
|
||||||
|
from services.incidents import read_recent, log_incident
|
||||||
|
from services.notify import notify
|
||||||
|
|
||||||
|
|
||||||
|
HELP_TEXT = (
|
||||||
|
"Alerts:\n"
|
||||||
|
"/alerts test <critical|warn|info> - send test alert\n"
|
||||||
|
"/alerts mute <category> <minutes> - mute alerts for category\n"
|
||||||
|
"/alerts unmute <category> - unmute category\n"
|
||||||
|
"/alerts list - show active mutes\n"
|
||||||
|
"/alerts recent [hours] - show incidents log (default 24h)\n"
|
||||||
|
"Categories: load, disk, smart, raid, ssl, docker, test\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _dispatch(msg: Message, action: str, args: list[str]):
|
||||||
|
return {"action": action, "args": args}
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_alerts(msg: Message, action: str, args: list[str]):
|
||||||
|
if action == "test":
|
||||||
|
level = args[0].lower() if args else "info"
|
||||||
|
if level not in ("critical", "warn", "info"):
|
||||||
|
level = "info"
|
||||||
|
key = f"test:{level}:{int(time.time())}"
|
||||||
|
await notify(bot, msg.chat.id, f"[TEST] {level.upper()} alert", level=level, key=key, category="test")
|
||||||
|
await msg.answer(f"Sent test alert: {level}")
|
||||||
|
log_incident(cfg, f"alert_test level={level} by {msg.from_user.id}", category="test")
|
||||||
|
return
|
||||||
|
|
||||||
|
if action == "mute":
|
||||||
|
if len(args) < 1:
|
||||||
|
await msg.answer("Usage: /alerts mute <category> <minutes>")
|
||||||
|
return
|
||||||
|
category = args[0].lower()
|
||||||
|
minutes = 60
|
||||||
|
if len(args) >= 2:
|
||||||
|
try:
|
||||||
|
minutes = max(1, int(args[1]))
|
||||||
|
except ValueError:
|
||||||
|
minutes = 60
|
||||||
|
until = set_mute(category, minutes * 60)
|
||||||
|
dt = datetime.fromtimestamp(until, tz=timezone.utc).astimezone()
|
||||||
|
await msg.answer(f"🔕 Muted {category} for {minutes}m (until {dt:%Y-%m-%d %H:%M:%S})")
|
||||||
|
log_incident(cfg, f"alert_mute category={category} minutes={minutes} by {msg.from_user.id}", category=category)
|
||||||
|
return
|
||||||
|
|
||||||
|
if action == "unmute":
|
||||||
|
if len(args) < 1:
|
||||||
|
await msg.answer("Usage: /alerts unmute <category>")
|
||||||
|
return
|
||||||
|
category = args[0].lower()
|
||||||
|
clear_mute(category)
|
||||||
|
await msg.answer(f"🔔 Unmuted {category}")
|
||||||
|
log_incident(cfg, f"alert_unmute category={category} by {msg.from_user.id}", category=category)
|
||||||
|
return
|
||||||
|
|
||||||
|
if action in ("list", "mutes"):
|
||||||
|
mutes = list_mutes()
|
||||||
|
if not mutes:
|
||||||
|
await msg.answer("🔔 No active mutes")
|
||||||
|
return
|
||||||
|
lines = ["🔕 Active mutes:"]
|
||||||
|
for cat, secs in mutes.items():
|
||||||
|
mins = max(0, secs) // 60
|
||||||
|
lines.append(f"- {cat}: {mins}m left")
|
||||||
|
await msg.answer("\n".join(lines))
|
||||||
|
return
|
||||||
|
|
||||||
|
if action == "recent":
|
||||||
|
hours = 24
|
||||||
|
if args:
|
||||||
|
try:
|
||||||
|
hours = max(1, int(args[0]))
|
||||||
|
except ValueError:
|
||||||
|
hours = 24
|
||||||
|
rows = read_recent(cfg, hours, limit=50)
|
||||||
|
if not rows:
|
||||||
|
await msg.answer(f"No incidents in last {hours}h")
|
||||||
|
return
|
||||||
|
await msg.answer("🧾 Incidents:\n" + "\n".join(rows))
|
||||||
|
return
|
||||||
|
|
||||||
|
await msg.answer(HELP_TEXT)
|
||||||
|
|
||||||
|
|
||||||
|
ALERTS_KB = InlineKeyboardMarkup(
|
||||||
|
inline_keyboard=[
|
||||||
|
[
|
||||||
|
InlineKeyboardButton(text="List", callback_data="alerts:list"),
|
||||||
|
InlineKeyboardButton(text="Recent 24h", callback_data="alerts:recent:24"),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
InlineKeyboardButton(text="Mute load 60m", callback_data="alerts:mute:load:60"),
|
||||||
|
InlineKeyboardButton(text="Unmute load", callback_data="alerts:unmute:load"),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
InlineKeyboardButton(text="Test CRIT", callback_data="alerts:test:critical"),
|
||||||
|
InlineKeyboardButton(text="Test WARN", callback_data="alerts:test:warn"),
|
||||||
|
InlineKeyboardButton(text="Test INFO", callback_data="alerts:test:info"),
|
||||||
|
],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text.regexp(r"^/alerts(\\s|$)"))
|
||||||
|
async def alerts_cmd(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
|
||||||
|
parts = msg.text.split()
|
||||||
|
if len(parts) < 2:
|
||||||
|
await msg.answer(HELP_TEXT, reply_markup=ALERTS_KB)
|
||||||
|
return
|
||||||
|
|
||||||
|
action = parts[1].lower()
|
||||||
|
args = parts[2:]
|
||||||
|
await _handle_alerts(msg, action, args)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/alerts_list")
|
||||||
|
async def alerts_list(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
await _handle_alerts(msg, "list", [])
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/alerts_recent")
|
||||||
|
async def alerts_recent(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
await _handle_alerts(msg, "recent", ["24"])
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/alerts_mute_load")
|
||||||
|
async def alerts_mute_load(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
await _handle_alerts(msg, "mute", ["load", "60"])
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data.startswith("alerts:"))
|
||||||
|
async def alerts_cb(cb: CallbackQuery):
|
||||||
|
if cb.from_user.id != ADMIN_ID:
|
||||||
|
await cb.answer()
|
||||||
|
return
|
||||||
|
parts = cb.data.split(":")
|
||||||
|
# formats: alerts:action or alerts:action:arg1:arg2
|
||||||
|
if len(parts) < 2:
|
||||||
|
await cb.answer()
|
||||||
|
return
|
||||||
|
action = parts[1]
|
||||||
|
args = parts[2:] if len(parts) > 2 else []
|
||||||
|
await _handle_alerts(cb.message, action, args)
|
||||||
|
await cb.answer()
|
||||||
@@ -2,7 +2,7 @@ import asyncio
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from aiogram import F
|
from aiogram import F
|
||||||
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
||||||
from app import dp, cfg
|
from app import dp, cfg, ADMIN_IDS
|
||||||
from auth import is_admin_msg
|
from auth import is_admin_msg
|
||||||
from keyboards import docker_kb, arcane_kb
|
from keyboards import docker_kb, arcane_kb
|
||||||
from services.arcane import list_projects, restart_project, set_project_state, get_project_details
|
from services.arcane import list_projects, restart_project, set_project_state, get_project_details
|
||||||
@@ -27,6 +27,7 @@ def _arcane_kb(page: int, total_pages: int, items: list[dict]) -> InlineKeyboard
|
|||||||
rows.append([
|
rows.append([
|
||||||
InlineKeyboardButton(text=f"🔄 {name}", callback_data=f"arcane:restart:{pid}"),
|
InlineKeyboardButton(text=f"🔄 {name}", callback_data=f"arcane:restart:{pid}"),
|
||||||
InlineKeyboardButton(text="ℹ️", callback_data=f"arcane:details:{pid}"),
|
InlineKeyboardButton(text="ℹ️", callback_data=f"arcane:details:{pid}"),
|
||||||
|
InlineKeyboardButton(text="📦", callback_data=f"arcane:deploy:{pid}"),
|
||||||
InlineKeyboardButton(text=action_text, callback_data=f"arcane:{action}:{pid}"),
|
InlineKeyboardButton(text=action_text, callback_data=f"arcane:{action}:{pid}"),
|
||||||
])
|
])
|
||||||
|
|
||||||
@@ -114,7 +115,7 @@ async def arcane_refresh(msg: Message):
|
|||||||
|
|
||||||
@dp.callback_query(F.data == "arcane:refresh")
|
@dp.callback_query(F.data == "arcane:refresh")
|
||||||
async def arcane_refresh_inline(cb: CallbackQuery):
|
async def arcane_refresh_inline(cb: CallbackQuery):
|
||||||
if cb.from_user.id != cfg["telegram"]["admin_id"]:
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
return
|
return
|
||||||
await cb.answer()
|
await cb.answer()
|
||||||
await cmd_arcane_projects(cb.message, edit=True)
|
await cmd_arcane_projects(cb.message, edit=True)
|
||||||
@@ -122,7 +123,7 @@ async def arcane_refresh_inline(cb: CallbackQuery):
|
|||||||
|
|
||||||
@dp.callback_query(F.data.startswith("arcane:page:"))
|
@dp.callback_query(F.data.startswith("arcane:page:"))
|
||||||
async def arcane_page(cb: CallbackQuery):
|
async def arcane_page(cb: CallbackQuery):
|
||||||
if cb.from_user.id != cfg["telegram"]["admin_id"]:
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
page = int(cb.data.split(":", 2)[2])
|
page = int(cb.data.split(":", 2)[2])
|
||||||
@@ -140,7 +141,7 @@ async def arcane_page(cb: CallbackQuery):
|
|||||||
|
|
||||||
@dp.callback_query(F.data.startswith("arcane:restart:"))
|
@dp.callback_query(F.data.startswith("arcane:restart:"))
|
||||||
async def arcane_restart(cb: CallbackQuery):
|
async def arcane_restart(cb: CallbackQuery):
|
||||||
if cb.from_user.id != cfg["telegram"]["admin_id"]:
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
return
|
return
|
||||||
|
|
||||||
_, _, pid = cb.data.split(":", 2)
|
_, _, pid = cb.data.split(":", 2)
|
||||||
@@ -159,7 +160,7 @@ async def arcane_restart(cb: CallbackQuery):
|
|||||||
|
|
||||||
@dp.callback_query(F.data.startswith("arcane:details:"))
|
@dp.callback_query(F.data.startswith("arcane:details:"))
|
||||||
async def arcane_details(cb: CallbackQuery):
|
async def arcane_details(cb: CallbackQuery):
|
||||||
if cb.from_user.id != cfg["telegram"]["admin_id"]:
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
return
|
return
|
||||||
|
|
||||||
_, _, pid = cb.data.split(":", 2)
|
_, _, pid = cb.data.split(":", 2)
|
||||||
@@ -205,9 +206,55 @@ async def arcane_details(cb: CallbackQuery):
|
|||||||
await cb.message.answer("\n".join(lines), parse_mode="Markdown", reply_markup=arcane_kb)
|
await cb.message.answer("\n".join(lines), parse_mode="Markdown", reply_markup=arcane_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data.startswith("arcane:deploy:"))
|
||||||
|
async def arcane_deploy_status(cb: CallbackQuery):
|
||||||
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
|
return
|
||||||
|
|
||||||
|
_, _, pid = cb.data.split(":", 2)
|
||||||
|
base_url, api_key, env_id = _arcane_cfg()
|
||||||
|
if not base_url or not api_key:
|
||||||
|
await cb.answer("Arcane config missing")
|
||||||
|
return
|
||||||
|
|
||||||
|
await cb.answer("Loading…")
|
||||||
|
ok, info, data = await asyncio.to_thread(get_project_details, base_url, api_key, env_id, pid)
|
||||||
|
if not ok:
|
||||||
|
await cb.message.answer(f"❌ Arcane deploy status failed: {info}", reply_markup=arcane_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
name = data.get("name", "?")
|
||||||
|
status = data.get("status", "unknown")
|
||||||
|
status_reason = data.get("statusReason")
|
||||||
|
updated = data.get("updatedAt")
|
||||||
|
path = data.get("path")
|
||||||
|
repo = data.get("gitRepositoryURL")
|
||||||
|
commit = data.get("lastSyncCommit")
|
||||||
|
running = data.get("runningCount", 0)
|
||||||
|
total = data.get("serviceCount", 0)
|
||||||
|
icon = "🟢" if status == "running" else "🟡"
|
||||||
|
|
||||||
|
lines = [
|
||||||
|
f"📦 **Deploy status: {name}**",
|
||||||
|
f"{icon} Status: {status} ({running}/{total})",
|
||||||
|
]
|
||||||
|
if status_reason:
|
||||||
|
lines.append(f"⚠️ {status_reason}")
|
||||||
|
if updated:
|
||||||
|
lines.append(f"🕒 Updated: {updated}")
|
||||||
|
if path:
|
||||||
|
lines.append(f"📁 Path: {path}")
|
||||||
|
if repo:
|
||||||
|
lines.append(f"🔗 Repo: {repo}")
|
||||||
|
if commit:
|
||||||
|
lines.append(f"🧾 Commit: {commit}")
|
||||||
|
|
||||||
|
await cb.message.answer("\n".join(lines), parse_mode="Markdown", reply_markup=arcane_kb)
|
||||||
|
|
||||||
|
|
||||||
@dp.callback_query(F.data.startswith("arcane:up:"))
|
@dp.callback_query(F.data.startswith("arcane:up:"))
|
||||||
async def arcane_up(cb: CallbackQuery):
|
async def arcane_up(cb: CallbackQuery):
|
||||||
if cb.from_user.id != cfg["telegram"]["admin_id"]:
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
return
|
return
|
||||||
|
|
||||||
_, _, pid = cb.data.split(":", 2)
|
_, _, pid = cb.data.split(":", 2)
|
||||||
@@ -226,7 +273,7 @@ async def arcane_up(cb: CallbackQuery):
|
|||||||
|
|
||||||
@dp.callback_query(F.data.startswith("arcane:down:"))
|
@dp.callback_query(F.data.startswith("arcane:down:"))
|
||||||
async def arcane_down(cb: CallbackQuery):
|
async def arcane_down(cb: CallbackQuery):
|
||||||
if cb.from_user.id != cfg["telegram"]["admin_id"]:
|
if cb.from_user.id not in ADMIN_IDS:
|
||||||
return
|
return
|
||||||
|
|
||||||
_, _, pid = cb.data.split(":", 2)
|
_, _, pid = cb.data.split(":", 2)
|
||||||
|
|||||||
@@ -1,15 +1,17 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from aiogram import F
|
from aiogram import F
|
||||||
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton
|
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
||||||
from app import dp
|
from app import dp, cfg
|
||||||
from auth import is_admin_msg
|
from auth import is_admin_msg, is_admin_cb
|
||||||
from keyboards import backup_kb
|
from keyboards import backup_kb
|
||||||
from lock_utils import acquire_lock, release_lock
|
from lock_utils import acquire_lock, release_lock
|
||||||
from services.queue import enqueue, format_status
|
from services.queue import enqueue, format_status, format_details, format_history
|
||||||
from services.backup import backup_badge, restore_help
|
from services.backup import backup_badge, restore_help
|
||||||
from services.runner import run_cmd
|
from services.runner import run_cmd, run_cmd_full
|
||||||
|
from services.incidents import log_incident
|
||||||
|
|
||||||
|
|
||||||
def _parse_systemctl_kv(raw: str) -> dict[str, str]:
|
def _parse_systemctl_kv(raw: str) -> dict[str, str]:
|
||||||
@@ -30,6 +32,156 @@ async def _unit_status(unit: str, props: list[str]) -> dict[str, str]:
|
|||||||
return _parse_systemctl_kv(out)
|
return _parse_systemctl_kv(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _sudo_cmd(cmd: list[str]) -> list[str]:
|
||||||
|
if os.geteuid() == 0:
|
||||||
|
return cmd
|
||||||
|
return ["sudo", "-E"] + cmd
|
||||||
|
|
||||||
|
|
||||||
|
def _format_backup_result(rc: int, out: str) -> str:
|
||||||
|
log_path = "/var/log/backup-auto.log"
|
||||||
|
header = "✅ Backup finished" if rc == 0 else "❌ Backup failed"
|
||||||
|
lines = out.strip().splitlines()
|
||||||
|
body = "\n".join(lines[:20])
|
||||||
|
if len(lines) > 20:
|
||||||
|
body += f"\n… trimmed {len(lines) - 20} lines"
|
||||||
|
extra = ""
|
||||||
|
if rc != 0 and os.path.exists(log_path):
|
||||||
|
try:
|
||||||
|
tail = ""
|
||||||
|
with open(log_path, "r", encoding="utf-8", errors="replace") as f:
|
||||||
|
tail_lines = f.readlines()[-40:]
|
||||||
|
tail = "".join(tail_lines).strip()
|
||||||
|
if tail:
|
||||||
|
extra = "\n\nLog tail:\n" + tail
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
base = f"{header} (rc={rc})\nlog: {log_path}"
|
||||||
|
if body:
|
||||||
|
base += "\n\n" + body
|
||||||
|
if extra:
|
||||||
|
base += extra
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
def _tail(path: str, lines: int = 120) -> str:
|
||||||
|
if not os.path.exists(path):
|
||||||
|
return f"⚠️ Log not found: {path}"
|
||||||
|
try:
|
||||||
|
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
||||||
|
data = f.readlines()[-lines:]
|
||||||
|
except Exception as e:
|
||||||
|
return f"⚠️ Failed to read log: {e}"
|
||||||
|
return "".join(data).strip() or "(empty)"
|
||||||
|
|
||||||
|
|
||||||
|
def _beautify_restic_forget(raw: str) -> str | None:
|
||||||
|
"""
|
||||||
|
Parse restic forget output tables into a compact bullet list.
|
||||||
|
"""
|
||||||
|
if "Reasons" not in raw or "Paths" not in raw:
|
||||||
|
return None
|
||||||
|
import re
|
||||||
|
|
||||||
|
lines = raw.splitlines()
|
||||||
|
headers = []
|
||||||
|
for idx, line in enumerate(lines):
|
||||||
|
if line.startswith("ID") and "Reasons" in line and "Paths" in line:
|
||||||
|
headers.append(idx)
|
||||||
|
if not headers:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _valid_id(val: str) -> bool:
|
||||||
|
return bool(re.fullmatch(r"[0-9a-f]{7,64}", val.strip()))
|
||||||
|
|
||||||
|
def parse_block(start_idx: int, end_idx: int) -> list[dict]:
|
||||||
|
header = lines[start_idx]
|
||||||
|
cols = ["ID", "Time", "Host", "Tags", "Reasons", "Paths", "Size"]
|
||||||
|
positions = []
|
||||||
|
for name in cols:
|
||||||
|
pos = header.find(name)
|
||||||
|
if pos == -1:
|
||||||
|
return []
|
||||||
|
positions.append(pos)
|
||||||
|
positions.append(len(header))
|
||||||
|
|
||||||
|
entries: list[dict] = []
|
||||||
|
current: dict | None = None
|
||||||
|
for line in lines[start_idx + 2 : end_idx]:
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
|
segments = []
|
||||||
|
for i in range(len(cols)):
|
||||||
|
segments.append(line[positions[i] : positions[i + 1]].strip())
|
||||||
|
row = dict(zip(cols, segments))
|
||||||
|
if row["ID"] and _valid_id(row["ID"]):
|
||||||
|
current = {
|
||||||
|
"id": row["ID"],
|
||||||
|
"time": row["Time"],
|
||||||
|
"host": row["Host"],
|
||||||
|
"size": row["Size"],
|
||||||
|
"tags": row["Tags"],
|
||||||
|
"reasons": [],
|
||||||
|
"paths": [],
|
||||||
|
}
|
||||||
|
if row["Reasons"]:
|
||||||
|
current["reasons"].append(row["Reasons"])
|
||||||
|
if row["Paths"]:
|
||||||
|
current["paths"].append(row["Paths"])
|
||||||
|
entries.append(current)
|
||||||
|
elif current:
|
||||||
|
if row["Reasons"] and not row["Reasons"].startswith("-"):
|
||||||
|
current["reasons"].append(row["Reasons"])
|
||||||
|
if row["Paths"] and not row["Paths"].startswith("-"):
|
||||||
|
current["paths"].append(row["Paths"])
|
||||||
|
return entries
|
||||||
|
|
||||||
|
blocks = []
|
||||||
|
for i, start in enumerate(headers):
|
||||||
|
end = headers[i + 1] if i + 1 < len(headers) else len(lines)
|
||||||
|
entries = parse_block(start, end)
|
||||||
|
if not entries:
|
||||||
|
continue
|
||||||
|
label = "Plan"
|
||||||
|
prev_line = lines[start - 1].lower() if start - 1 >= 0 else ""
|
||||||
|
prev2 = lines[start - 2].lower() if start - 2 >= 0 else ""
|
||||||
|
if "keep" in prev_line:
|
||||||
|
label = prev_line.strip()
|
||||||
|
elif "keep" in prev2:
|
||||||
|
label = prev2.strip()
|
||||||
|
elif "snapshots" in prev_line:
|
||||||
|
label = prev_line.strip()
|
||||||
|
blocks.append((label, entries))
|
||||||
|
|
||||||
|
if not blocks:
|
||||||
|
return None
|
||||||
|
|
||||||
|
out_lines = []
|
||||||
|
for label, entries in blocks:
|
||||||
|
out_lines.append(f"📦 {label}")
|
||||||
|
for e in entries:
|
||||||
|
head = f"🧉 {e['id']} | {e['time']} | {e['host']} | {e['size'] or 'n/a'}"
|
||||||
|
out_lines.append(head)
|
||||||
|
if e["reasons"]:
|
||||||
|
out_lines.append(" 📌 " + "; ".join(e["reasons"]))
|
||||||
|
if e["paths"]:
|
||||||
|
for p in e["paths"]:
|
||||||
|
out_lines.append(f" • {p}")
|
||||||
|
out_lines.append("")
|
||||||
|
return "\n".join(out_lines).rstrip()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json(raw: str, label: str) -> tuple[bool, object | None, str]:
|
||||||
|
if not raw or not raw.strip():
|
||||||
|
return False, None, f"? {label} returned empty output"
|
||||||
|
try:
|
||||||
|
return True, json.loads(raw), ""
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
preview = raw.strip().splitlines()
|
||||||
|
head = preview[0] if preview else "invalid output"
|
||||||
|
return False, None, f"? {label} invalid JSON: {head}"
|
||||||
|
|
||||||
|
|
||||||
async def send_backup_jobs_status(msg: Message):
|
async def send_backup_jobs_status(msg: Message):
|
||||||
services = [
|
services = [
|
||||||
("backup-auto", "backup-auto.timer"),
|
("backup-auto", "backup-auto.timer"),
|
||||||
@@ -69,7 +221,7 @@ async def cmd_repo_stats(msg: Message):
|
|||||||
await msg.answer("⏳ Loading repo stats…", reply_markup=backup_kb)
|
await msg.answer("⏳ Loading repo stats…", reply_markup=backup_kb)
|
||||||
|
|
||||||
# --- restore-size stats ---
|
# --- restore-size stats ---
|
||||||
rc1, raw1 = await run_cmd(
|
rc1, raw1 = await run_cmd_full(
|
||||||
["restic", "stats", "--json"],
|
["restic", "stats", "--json"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
timeout=30
|
timeout=30
|
||||||
@@ -78,10 +230,14 @@ async def cmd_repo_stats(msg: Message):
|
|||||||
await msg.answer(raw1, reply_markup=backup_kb)
|
await msg.answer(raw1, reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
|
|
||||||
restore = json.loads(raw1)
|
ok, restore, err = _load_json(raw1, "restic stats")
|
||||||
|
if not ok:
|
||||||
|
await msg.answer(err, reply_markup=backup_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
# --- raw-data stats ---
|
# --- raw-data stats ---
|
||||||
rc2, raw2 = await run_cmd(
|
rc2, raw2 = await run_cmd_full(
|
||||||
["restic", "stats", "--json", "--mode", "raw-data"],
|
["restic", "stats", "--json", "--mode", "raw-data"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
timeout=30
|
timeout=30
|
||||||
@@ -90,15 +246,26 @@ async def cmd_repo_stats(msg: Message):
|
|||||||
await msg.answer(raw2, reply_markup=backup_kb)
|
await msg.answer(raw2, reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
|
|
||||||
raw = json.loads(raw2)
|
ok, raw, err = _load_json(raw2, "restic stats raw-data")
|
||||||
|
if not ok:
|
||||||
|
await msg.answer(err, reply_markup=backup_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
# --- snapshots count ---
|
# --- snapshots count ---
|
||||||
rc3, raw_snaps = await run_cmd(
|
rc3, raw_snaps = await run_cmd_full(
|
||||||
["restic", "snapshots", "--json"],
|
["restic", "snapshots", "--json"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
timeout=20
|
timeout=20
|
||||||
)
|
)
|
||||||
snaps = len(json.loads(raw_snaps)) if rc3 == 0 else "n/a"
|
if rc3 != 0:
|
||||||
|
snaps = "n/a"
|
||||||
|
else:
|
||||||
|
ok, snap_data, err = _load_json(raw_snaps, "restic snapshots")
|
||||||
|
if ok and isinstance(snap_data, list):
|
||||||
|
snaps = len(snap_data)
|
||||||
|
else:
|
||||||
|
snaps = "n/a"
|
||||||
|
|
||||||
msg_text = (
|
msg_text = (
|
||||||
"📦 **Repository stats**\n\n"
|
"📦 **Repository stats**\n\n"
|
||||||
@@ -115,7 +282,7 @@ async def cmd_backup_status(msg: Message):
|
|||||||
await msg.answer("⏳ Loading snapshots…", reply_markup=backup_kb)
|
await msg.answer("⏳ Loading snapshots…", reply_markup=backup_kb)
|
||||||
|
|
||||||
async def worker():
|
async def worker():
|
||||||
rc, raw = await run_cmd(
|
rc, raw = await run_cmd_full(
|
||||||
["restic", "snapshots", "--json"],
|
["restic", "snapshots", "--json"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
timeout=30
|
timeout=30
|
||||||
@@ -124,7 +291,10 @@ async def cmd_backup_status(msg: Message):
|
|||||||
await msg.answer(raw, reply_markup=backup_kb)
|
await msg.answer(raw, reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
|
|
||||||
snaps = json.loads(raw)
|
ok, snaps, err = _load_json(raw, "restic snapshots")
|
||||||
|
if not ok or not isinstance(snaps, list):
|
||||||
|
await msg.answer(err, reply_markup=backup_kb)
|
||||||
|
return
|
||||||
if not snaps:
|
if not snaps:
|
||||||
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
|
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
@@ -163,7 +333,14 @@ async def cmd_backup_status(msg: Message):
|
|||||||
|
|
||||||
|
|
||||||
async def cmd_backup_now(msg: Message):
|
async def cmd_backup_now(msg: Message):
|
||||||
|
await schedule_backup(msg)
|
||||||
|
|
||||||
|
|
||||||
|
async def schedule_backup(msg: Message):
|
||||||
async def job():
|
async def job():
|
||||||
|
if cfg.get("safety", {}).get("dry_run", False):
|
||||||
|
await msg.answer("🧪 Dry-run: backup skipped", reply_markup=backup_kb)
|
||||||
|
return
|
||||||
if not acquire_lock("backup"):
|
if not acquire_lock("backup"):
|
||||||
await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb)
|
await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
@@ -171,20 +348,36 @@ async def cmd_backup_now(msg: Message):
|
|||||||
await msg.answer("▶️ Backup запущен", reply_markup=backup_kb)
|
await msg.answer("▶️ Backup запущен", reply_markup=backup_kb)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rc, out = await run_cmd(["sudo", "/usr/local/bin/backup.py", "restic-backup"], timeout=6 * 3600)
|
rc, out = await run_cmd(
|
||||||
await msg.answer(("✅ OK\n" if rc == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
_sudo_cmd(["/usr/local/bin/backup.py", "restic-backup"]),
|
||||||
|
use_restic_env=True,
|
||||||
|
timeout=6 * 3600,
|
||||||
|
)
|
||||||
|
kb = backup_kb
|
||||||
|
if rc != 0:
|
||||||
|
kb = InlineKeyboardMarkup(
|
||||||
|
inline_keyboard=[
|
||||||
|
[InlineKeyboardButton(text="🔁 Retry backup", callback_data="backup:retry")]
|
||||||
|
]
|
||||||
|
)
|
||||||
|
await msg.answer(_format_backup_result(rc, out), reply_markup=kb)
|
||||||
finally:
|
finally:
|
||||||
release_lock("backup")
|
release_lock("backup")
|
||||||
|
|
||||||
pos = await enqueue("backup", job)
|
pos = await enqueue("backup", job)
|
||||||
await msg.answer(f"🕓 Backup queued (#{pos})", reply_markup=backup_kb)
|
await msg.answer(f"🕓 Backup queued (#{pos})", reply_markup=backup_kb)
|
||||||
|
try:
|
||||||
|
from services.incidents import log_incident
|
||||||
|
log_incident(cfg, f"backup_queued by {msg.from_user.id}", category="backup")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def cmd_last_snapshot(msg: Message):
|
async def cmd_last_snapshot(msg: Message):
|
||||||
await msg.answer("⏳ Loading last snapshot…", reply_markup=backup_kb)
|
await msg.answer("⏳ Loading last snapshot…", reply_markup=backup_kb)
|
||||||
|
|
||||||
async def worker():
|
async def worker():
|
||||||
rc, raw = await run_cmd(
|
rc, raw = await run_cmd_full(
|
||||||
["restic", "snapshots", "--json"],
|
["restic", "snapshots", "--json"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
timeout=20
|
timeout=20
|
||||||
@@ -193,7 +386,10 @@ async def cmd_last_snapshot(msg: Message):
|
|||||||
await msg.answer(raw, reply_markup=backup_kb)
|
await msg.answer(raw, reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
|
|
||||||
snaps = json.loads(raw)
|
ok, snaps, err = _load_json(raw, "restic snapshots")
|
||||||
|
if not ok or not isinstance(snaps, list):
|
||||||
|
await msg.answer(err, reply_markup=backup_kb)
|
||||||
|
return
|
||||||
if not snaps:
|
if not snaps:
|
||||||
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
|
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
@@ -203,7 +399,7 @@ async def cmd_last_snapshot(msg: Message):
|
|||||||
t = datetime.fromisoformat(s["time"].replace("Z", "+00:00"))
|
t = datetime.fromisoformat(s["time"].replace("Z", "+00:00"))
|
||||||
short_id = s["short_id"]
|
short_id = s["short_id"]
|
||||||
|
|
||||||
rc2, raw2 = await run_cmd(
|
rc2, raw2 = await run_cmd_full(
|
||||||
["restic", "stats", short_id, "--json"],
|
["restic", "stats", short_id, "--json"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
timeout=20
|
timeout=20
|
||||||
@@ -212,7 +408,10 @@ async def cmd_last_snapshot(msg: Message):
|
|||||||
await msg.answer(raw2, reply_markup=backup_kb)
|
await msg.answer(raw2, reply_markup=backup_kb)
|
||||||
return
|
return
|
||||||
|
|
||||||
stats = json.loads(raw2)
|
ok, stats, err = _load_json(raw2, f"restic stats {short_id}")
|
||||||
|
if not ok or not isinstance(stats, dict):
|
||||||
|
await msg.answer(err, reply_markup=backup_kb)
|
||||||
|
return
|
||||||
|
|
||||||
msg_text = (
|
msg_text = (
|
||||||
"📦 **Last snapshot**\n\n"
|
"📦 **Last snapshot**\n\n"
|
||||||
@@ -247,7 +446,20 @@ async def ls(msg: Message):
|
|||||||
@dp.message(F.text == "🧾 Queue")
|
@dp.message(F.text == "🧾 Queue")
|
||||||
async def qb(msg: Message):
|
async def qb(msg: Message):
|
||||||
if is_admin_msg(msg):
|
if is_admin_msg(msg):
|
||||||
await msg.answer(format_status(), reply_markup=backup_kb)
|
kb = InlineKeyboardMarkup(
|
||||||
|
inline_keyboard=[
|
||||||
|
[InlineKeyboardButton(text="Details", callback_data="queue:details")],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
await msg.answer(format_status(), reply_markup=kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data == "queue:details")
|
||||||
|
async def qd(cb: CallbackQuery):
|
||||||
|
if not is_admin_cb(cb):
|
||||||
|
return
|
||||||
|
await cb.answer()
|
||||||
|
await cb.message.answer(format_details(), reply_markup=backup_kb)
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "▶️ Run backup")
|
@dp.message(F.text == "▶️ Run backup")
|
||||||
@@ -256,6 +468,12 @@ async def br(msg: Message):
|
|||||||
await cmd_backup_now(msg)
|
await cmd_backup_now(msg)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/backup_run")
|
||||||
|
async def br_cmd(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await schedule_backup(msg)
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "🧪 Restic check")
|
@dp.message(F.text == "🧪 Restic check")
|
||||||
async def rc(msg: Message):
|
async def rc(msg: Message):
|
||||||
if not is_admin_msg(msg):
|
if not is_admin_msg(msg):
|
||||||
@@ -263,8 +481,19 @@ async def rc(msg: Message):
|
|||||||
|
|
||||||
async def job():
|
async def job():
|
||||||
await msg.answer("🧪 Restic check запущен", reply_markup=backup_kb)
|
await msg.answer("🧪 Restic check запущен", reply_markup=backup_kb)
|
||||||
rc2, out = await run_cmd(["sudo", "/usr/local/bin/restic-check.sh"], timeout=6 * 3600)
|
rc2, out = await run_cmd(
|
||||||
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
_sudo_cmd(["/usr/local/bin/restic-check.sh"]),
|
||||||
|
use_restic_env=True,
|
||||||
|
timeout=6 * 3600,
|
||||||
|
)
|
||||||
|
kb = backup_kb
|
||||||
|
if rc2 != 0:
|
||||||
|
kb = InlineKeyboardMarkup(
|
||||||
|
inline_keyboard=[
|
||||||
|
[InlineKeyboardButton(text="🔁 Retry restic check", callback_data="backup:retry_check")]
|
||||||
|
]
|
||||||
|
)
|
||||||
|
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=kb)
|
||||||
|
|
||||||
pos = await enqueue("restic-check", job)
|
pos = await enqueue("restic-check", job)
|
||||||
await msg.answer(f"🕓 Restic check queued (#{pos})", reply_markup=backup_kb)
|
await msg.answer(f"🕓 Restic check queued (#{pos})", reply_markup=backup_kb)
|
||||||
@@ -277,7 +506,11 @@ async def wr(msg: Message):
|
|||||||
|
|
||||||
async def job():
|
async def job():
|
||||||
await msg.answer("📬 Weekly report запущен", reply_markup=backup_kb)
|
await msg.answer("📬 Weekly report запущен", reply_markup=backup_kb)
|
||||||
rc2, out = await run_cmd(["sudo", "/usr/local/bin/weekly-report.sh"], timeout=3600)
|
rc2, out = await run_cmd(
|
||||||
|
_sudo_cmd(["/usr/local/bin/weekly-report.sh"]),
|
||||||
|
use_restic_env=True,
|
||||||
|
timeout=3600,
|
||||||
|
)
|
||||||
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
||||||
|
|
||||||
pos = await enqueue("weekly-report", job)
|
pos = await enqueue("weekly-report", job)
|
||||||
@@ -288,3 +521,55 @@ async def wr(msg: Message):
|
|||||||
async def rh(msg: Message):
|
async def rh(msg: Message):
|
||||||
if is_admin_msg(msg):
|
if is_admin_msg(msg):
|
||||||
await msg.answer(restore_help(), reply_markup=backup_kb)
|
await msg.answer(restore_help(), reply_markup=backup_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "📜 History")
|
||||||
|
@dp.message(F.text == "/backup_history")
|
||||||
|
async def backup_history(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
log_path = "/var/log/backup-auto.log"
|
||||||
|
content = _tail(log_path, lines=160)
|
||||||
|
if content.startswith("⚠️"):
|
||||||
|
await msg.answer(content, reply_markup=backup_kb)
|
||||||
|
return
|
||||||
|
pretty = _beautify_restic_forget(content)
|
||||||
|
trimmed = False
|
||||||
|
max_len = 3500
|
||||||
|
if len(content) > max_len:
|
||||||
|
content = content[-max_len:]
|
||||||
|
trimmed = True
|
||||||
|
header = "📜 Backup history (tail)"
|
||||||
|
if trimmed:
|
||||||
|
header += " (trimmed)"
|
||||||
|
if pretty:
|
||||||
|
await msg.answer(f"{header}\n`{log_path}`\n\n{pretty}", reply_markup=backup_kb)
|
||||||
|
else:
|
||||||
|
await msg.answer(
|
||||||
|
f"{header}\n`{log_path}`\n```\n{content}\n```",
|
||||||
|
reply_markup=backup_kb,
|
||||||
|
parse_mode="Markdown",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/queue_history")
|
||||||
|
async def queue_history(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
await msg.answer(format_history(), reply_markup=backup_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data == "backup:retry")
|
||||||
|
async def backup_retry(cb: CallbackQuery):
|
||||||
|
if not is_admin_cb(cb):
|
||||||
|
return
|
||||||
|
await cb.answer("Queuing backup…")
|
||||||
|
await schedule_backup(cb.message)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data == "backup:retry_check")
|
||||||
|
async def backup_retry_check(cb: CallbackQuery):
|
||||||
|
if not is_admin_cb(cb):
|
||||||
|
return
|
||||||
|
await cb.answer("Queuing restic check…")
|
||||||
|
await rc(cb.message)
|
||||||
|
|||||||
@@ -2,8 +2,10 @@ import json
|
|||||||
import time
|
import time
|
||||||
from aiogram import F
|
from aiogram import F
|
||||||
from aiogram.types import CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
from aiogram.types import CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||||
from app import dp, ADMIN_ID
|
from app import dp, ADMIN_ID, cfg
|
||||||
from services.docker import docker_cmd
|
from services.docker import docker_cmd
|
||||||
|
from services.incidents import log_incident
|
||||||
|
from services.runner import run_cmd
|
||||||
from state import DOCKER_MAP, LOG_FILTER_PENDING
|
from state import DOCKER_MAP, LOG_FILTER_PENDING
|
||||||
from handlers.backup import cmd_backup_status
|
from handlers.backup import cmd_backup_status
|
||||||
|
|
||||||
@@ -13,8 +15,15 @@ async def docker_callback(cb: CallbackQuery):
|
|||||||
if cb.from_user.id != ADMIN_ID:
|
if cb.from_user.id != ADMIN_ID:
|
||||||
return
|
return
|
||||||
|
|
||||||
_, action, alias = cb.data.split(":", 2)
|
try:
|
||||||
real = DOCKER_MAP[alias]
|
_, action, alias = cb.data.split(":", 2)
|
||||||
|
except ValueError:
|
||||||
|
await cb.answer("Bad request")
|
||||||
|
return
|
||||||
|
real = DOCKER_MAP.get(alias)
|
||||||
|
if not real:
|
||||||
|
await cb.answer("Container not found")
|
||||||
|
return
|
||||||
|
|
||||||
if action == "restart":
|
if action == "restart":
|
||||||
await cb.answer("Restarting…")
|
await cb.answer("Restarting…")
|
||||||
@@ -24,6 +33,10 @@ async def docker_callback(cb: CallbackQuery):
|
|||||||
f"🔄 **{alias} restarted**\n```{out}```",
|
f"🔄 **{alias} restarted**\n```{out}```",
|
||||||
parse_mode="Markdown"
|
parse_mode="Markdown"
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
log_incident(cfg, f"docker_restart {alias}", category="docker")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
elif action == "logs":
|
elif action == "logs":
|
||||||
await cb.answer()
|
await cb.answer()
|
||||||
@@ -54,7 +67,7 @@ async def snapshot_details(cb: CallbackQuery):
|
|||||||
snap_id = cb.data.split(":", 1)[1]
|
snap_id = cb.data.split(":", 1)[1]
|
||||||
await cb.answer("Loading snapshot…")
|
await cb.answer("Loading snapshot…")
|
||||||
|
|
||||||
# получаем статистику snapshot
|
# получаем статистику snapshot
|
||||||
rc, raw = await run_cmd(
|
rc, raw = await run_cmd(
|
||||||
["restic", "stats", snap_id, "--json"],
|
["restic", "stats", snap_id, "--json"],
|
||||||
use_restic_env=True,
|
use_restic_env=True,
|
||||||
|
|||||||
24
handlers/config_check.py
Normal file
24
handlers/config_check.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from aiogram import F
|
||||||
|
from aiogram.types import Message
|
||||||
|
from app import dp, cfg
|
||||||
|
from auth import is_admin_msg
|
||||||
|
from services.config_check import validate_cfg
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/config_check")
|
||||||
|
async def config_check(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
errors, warnings = validate_cfg(cfg)
|
||||||
|
lines = []
|
||||||
|
if errors:
|
||||||
|
lines.append("❌ Config errors:")
|
||||||
|
lines += [f"- {e}" for e in errors]
|
||||||
|
if warnings:
|
||||||
|
if lines:
|
||||||
|
lines.append("")
|
||||||
|
lines.append("⚠️ Warnings:")
|
||||||
|
lines += [f"- {w}" for w in warnings]
|
||||||
|
if not lines:
|
||||||
|
lines.append("✅ Config looks OK")
|
||||||
|
await msg.answer("\n".join(lines))
|
||||||
@@ -1,11 +1,13 @@
|
|||||||
from aiogram import F
|
from aiogram import F
|
||||||
from aiogram.types import Message
|
from aiogram.types import Message
|
||||||
from app import dp
|
from app import dp, cfg
|
||||||
from auth import is_admin_msg
|
from auth import is_admin_msg
|
||||||
from keyboards import docker_kb, docker_inline_kb
|
from keyboards import docker_kb, docker_inline_kb
|
||||||
from services.docker import container_uptime, docker_cmd
|
from services.docker import container_uptime, docker_cmd
|
||||||
|
from services.incidents import log_incident
|
||||||
from state import DOCKER_MAP, LOG_FILTER_PENDING
|
from state import DOCKER_MAP, LOG_FILTER_PENDING
|
||||||
import time
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
async def cmd_docker_status(msg: Message):
|
async def cmd_docker_status(msg: Message):
|
||||||
@@ -42,7 +44,7 @@ async def cmd_docker_status(msg: Message):
|
|||||||
lines.append(f"{icon} {alias}: {status} ({up})")
|
lines.append(f"{icon} {alias}: {status} ({up})")
|
||||||
|
|
||||||
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
||||||
|
log_incident(cfg, f"docker_status by {msg.from_user.id}", category="docker")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# ⬅️ КРИТИЧЕСКИ ВАЖНО
|
# ⬅️ КРИТИЧЕСКИ ВАЖНО
|
||||||
await msg.answer(
|
await msg.answer(
|
||||||
@@ -77,6 +79,137 @@ async def ds(msg: Message):
|
|||||||
await cmd_docker_status(msg)
|
await cmd_docker_status(msg)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/docker_status")
|
||||||
|
async def ds_cmd(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await cmd_docker_status(msg)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text, F.func(lambda m: (m.text or "").split()[0] == "/docker_health"))
|
||||||
|
async def docker_health(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
parts = msg.text.split()
|
||||||
|
if len(parts) < 2:
|
||||||
|
await msg.answer("Usage: /docker_health <alias>")
|
||||||
|
return
|
||||||
|
alias = parts[1]
|
||||||
|
real = DOCKER_MAP.get(alias)
|
||||||
|
if not real:
|
||||||
|
await msg.answer(f"⚠️ Unknown container: {alias}", reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
rc, out = await docker_cmd(["inspect", "-f", "{{json .State.Health}}", real], timeout=10)
|
||||||
|
if rc != 0 or not out.strip():
|
||||||
|
await msg.answer(f"⚠️ Failed to get health for {alias}", reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
data = json.loads(out)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
await msg.answer(f"⚠️ Invalid health JSON for {alias}", reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
status = data.get("Status", "n/a")
|
||||||
|
fail = data.get("FailingStreak", "n/a")
|
||||||
|
logs = data.get("Log") or []
|
||||||
|
lines = [f"🐳 {alias} health", f"Status: {status}", f"Failing streak: {fail}"]
|
||||||
|
if logs:
|
||||||
|
lines.append("Recent logs:")
|
||||||
|
for entry in logs[-5:]:
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
ts = entry.get("Start") or entry.get("End") or ""
|
||||||
|
exitc = entry.get("ExitCode", "")
|
||||||
|
out_line = entry.get("Output", "").strip()
|
||||||
|
lines.append(f"- {ts} rc={exitc} {out_line}")
|
||||||
|
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
||||||
|
log_incident(cfg, f"docker_health alias={alias} by {msg.from_user.id}", category="docker")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/docker_health_summary")
|
||||||
|
async def docker_health_summary(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
if not DOCKER_MAP:
|
||||||
|
await msg.answer("⚠️ DOCKER_MAP пуст", reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
problems = []
|
||||||
|
total = len(DOCKER_MAP)
|
||||||
|
for alias, real in DOCKER_MAP.items():
|
||||||
|
rc, out = await docker_cmd(["inspect", "-f", "{{json .State}}", real], timeout=10)
|
||||||
|
if rc != 0:
|
||||||
|
problems.append(f"{alias}: inspect error")
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
state = json.loads(out)
|
||||||
|
except Exception:
|
||||||
|
problems.append(f"{alias}: bad JSON")
|
||||||
|
continue
|
||||||
|
status = state.get("Status", "n/a")
|
||||||
|
health = (state.get("Health") or {}).get("Status", "n/a")
|
||||||
|
if status != "running" or health not in ("healthy", "none"):
|
||||||
|
problems.append(f"{alias}: {status}/{health}")
|
||||||
|
ok = total - len(problems)
|
||||||
|
lines = [f"🐳 Docker health: 🟢 {ok}/{total} healthy, 🔴 {len(problems)} issues"]
|
||||||
|
if problems:
|
||||||
|
lines.append("Problems:")
|
||||||
|
lines.extend([f"- {p}" for p in problems])
|
||||||
|
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "📈 Stats")
|
||||||
|
async def dstats(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
if not DOCKER_MAP:
|
||||||
|
await msg.answer(
|
||||||
|
"⚠️ DOCKER_MAP пуст.\n"
|
||||||
|
"Контейнеры не обнаружены.",
|
||||||
|
reply_markup=docker_kb,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
names = list(DOCKER_MAP.values())
|
||||||
|
fmt = "{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.NetIO}}|{{.BlockIO}}"
|
||||||
|
rc, out = await docker_cmd(["stats", "--no-stream", "--format", fmt] + names)
|
||||||
|
if rc != 0:
|
||||||
|
await msg.answer(out, reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
lines = [line.strip() for line in out.splitlines() if line.strip()]
|
||||||
|
if not lines:
|
||||||
|
await msg.answer("📈 Stats\n\n(no data)", reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
alias_by_name = {v: k for k, v in DOCKER_MAP.items()}
|
||||||
|
rows = []
|
||||||
|
for line in lines:
|
||||||
|
parts = line.split("|")
|
||||||
|
if len(parts) != 5:
|
||||||
|
continue
|
||||||
|
name, cpu, mem, net, blk = [p.strip() for p in parts]
|
||||||
|
display = alias_by_name.get(name, name)
|
||||||
|
try:
|
||||||
|
cpu_val = float(cpu.strip("%"))
|
||||||
|
except ValueError:
|
||||||
|
cpu_val = 0.0
|
||||||
|
rows.append((cpu_val, display, cpu, mem, net, blk))
|
||||||
|
|
||||||
|
if not rows:
|
||||||
|
await msg.answer("📈 Stats\n\n(no data)", reply_markup=docker_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
rows.sort(key=lambda r: r[0], reverse=True)
|
||||||
|
header = f"{'NAME':<18} {'CPU':>6} {'MEM':>18} {'NET':>16} {'IO':>16}"
|
||||||
|
formatted = [header]
|
||||||
|
for _cpu_val, name, cpu, mem, net, blk in rows:
|
||||||
|
formatted.append(f"{name[:18]:<18} {cpu:>6} {mem:>18} {net:>16} {blk:>16}")
|
||||||
|
|
||||||
|
body = "\n".join(formatted)
|
||||||
|
await msg.answer(
|
||||||
|
f"📈 **Docker stats**\n```\n{body}\n```",
|
||||||
|
reply_markup=docker_kb,
|
||||||
|
parse_mode="Markdown",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in LOG_FILTER_PENDING))
|
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in LOG_FILTER_PENDING))
|
||||||
async def log_filter_input(msg: Message):
|
async def log_filter_input(msg: Message):
|
||||||
if not is_admin_msg(msg):
|
if not is_admin_msg(msg):
|
||||||
|
|||||||
166
handlers/help.py
166
handlers/help.py
@@ -1,24 +1,164 @@
|
|||||||
from aiogram import F
|
from aiogram import F
|
||||||
from aiogram.types import Message
|
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||||
from app import dp
|
from app import dp, ADMIN_ID
|
||||||
from auth import is_admin_msg
|
from auth import is_admin_msg
|
||||||
from keyboards import menu_kb
|
from keyboards import menu_kb
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text.in_({"ℹ️ Help", "ℹ Help", "Help"}))
|
HELP_PAGES = [
|
||||||
|
(
|
||||||
|
"Overview",
|
||||||
|
"ℹ️ **Help — Overview**\n\n"
|
||||||
|
"🩺 *Health* — быстрый health-check.\n"
|
||||||
|
"📊 *Статус* — общая загрузка.\n"
|
||||||
|
"📋 */status_short* — кратко (load/RAM/диски).\n"
|
||||||
|
"🩺 */health_short* — краткий health.\n"
|
||||||
|
"🧪 */selftest* — health + restic snapshot probe.\n"
|
||||||
|
"🔧 Разделы: Docker, Backup, Artifacts, System, OpenWrt.",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Alerts",
|
||||||
|
"🚨 **Alerts & Mute**\n\n"
|
||||||
|
"Команды:\n"
|
||||||
|
"• `/alerts test <critical|warn|info>`\n"
|
||||||
|
"• `/alerts mute <cat> <minutes>` / `/alerts unmute <cat>` / `/alerts list`\n"
|
||||||
|
"• `/alerts recent [hours]`\n"
|
||||||
|
"Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n"
|
||||||
|
"Категории: load, disk, smart, raid, ssl, docker, test.\n"
|
||||||
|
"Quiet hours: `alerts.quiet_hours` для не‑критичных.\n"
|
||||||
|
"Авто-мьют: `alerts.auto_mute` со слотами времени.\n"
|
||||||
|
"Только красные load: `alerts.load_only_critical: true`.\n"
|
||||||
|
"Валидатор конфига: `/config_check`.",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Backup",
|
||||||
|
"💾 **Backup (restic)**\n\n"
|
||||||
|
"Кнопки: Status, Last snapshot, Repo stats, Run backup, Queue, Restic check, Weekly report, History.\n"
|
||||||
|
"History — хвост `/var/log/backup-auto.log`.\n"
|
||||||
|
"Fail → кнопка Retry (backup/check).\n"
|
||||||
|
"Run backup/Check учитывают `safety.dry_run`.\n"
|
||||||
|
"После бэкапа приходит TL;DR + путь к логу `/var/log/backup-auto.log`.\n"
|
||||||
|
"Queue → Details показывает отложенные задачи.",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Docker & System",
|
||||||
|
"🐳 **Docker**\n"
|
||||||
|
"Status/Restart/Logs/Stats — клавиатура Docker.\n"
|
||||||
|
"Команды: `/docker_status`, `/docker_health <alias>`.\n\n"
|
||||||
|
"🖥 **System**\n"
|
||||||
|
"Info: Disks/Security/Metrics/Hardware/SMART/OpenWrt.\n"
|
||||||
|
"Ops: Updates/Upgrade/Reboot.\n"
|
||||||
|
"Logs: Audit/Incidents/Security/Integrations/Processes.\n"
|
||||||
|
"OpenWrt: `/openwrt`, `/openwrt_wan`, `/openwrt_clients`, `/openwrt_leases`.",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Admin",
|
||||||
|
"🛠 **Admin & Deploy**\n\n"
|
||||||
|
"Config: `/config_check`, файл `config.yaml` (см. config.example.yaml).\n"
|
||||||
|
"Deploy: `deploy.sh` (ssh 10.10.10.10:1090 → git pull → systemctl restart tg-bot).\n"
|
||||||
|
"Incidents: `/incidents_summary`, `/incidents_diff [hours]`.\n"
|
||||||
|
"Export: `/incidents_export [hours] [csv|json]`, `/export_all [hours]` (zip).\n"
|
||||||
|
"Alerts log/heatmap: `/alerts_log [hours]`, `/alerts_heatmap [hours] [cat]`.\n"
|
||||||
|
"Backup SLA: `/backup_sla`; Docker restarts: `/docker_restarts [hours]`.\n"
|
||||||
|
"Disk snapshot: `/disk_snapshot`.\n"
|
||||||
|
"Queue: `/queue_history`, `/queue_sla`.\n"
|
||||||
|
"Self-test history: `/selftest_history`.\n"
|
||||||
|
"OpenWrt leases diff: `/openwrt_leases_diff`.\n"
|
||||||
|
"BotFather list: `/botfather_list`.\n"
|
||||||
|
"Безопасность: `safety.dry_run: true` блокирует опасные действия.\n"
|
||||||
|
"OpenWrt: кнопка в System → Info.",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _help_kb(idx: int) -> InlineKeyboardMarkup:
|
||||||
|
buttons = []
|
||||||
|
if idx > 0:
|
||||||
|
buttons.append(InlineKeyboardButton(text="◀️ Prev", callback_data=f"help:{idx-1}"))
|
||||||
|
buttons.append(InlineKeyboardButton(text=f"{idx+1}/{len(HELP_PAGES)}", callback_data="help:noop"))
|
||||||
|
if idx < len(HELP_PAGES) - 1:
|
||||||
|
buttons.append(InlineKeyboardButton(text="Next ▶️", callback_data=f"help:{idx+1}"))
|
||||||
|
return InlineKeyboardMarkup(inline_keyboard=[buttons])
|
||||||
|
|
||||||
|
|
||||||
|
def _help_text(idx: int) -> str:
|
||||||
|
_title, body = HELP_PAGES[idx]
|
||||||
|
return body
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text.in_({"ℹ️ Help", "ℹ Help", "Help", "/help"}))
|
||||||
async def help_cmd(msg: Message):
|
async def help_cmd(msg: Message):
|
||||||
if not is_admin_msg(msg):
|
if not is_admin_msg(msg):
|
||||||
return
|
return
|
||||||
|
idx = 0
|
||||||
await msg.answer(
|
await msg.answer(
|
||||||
"ℹ️ **Help / Справка**\n\n"
|
_help_text(idx),
|
||||||
"🩺 Health — быстрый health-check сервера\n"
|
reply_markup=_help_kb(idx),
|
||||||
"📊 Статус — общая загрузка сервера\n"
|
|
||||||
"🐳 Docker — управление контейнерами\n"
|
|
||||||
"📦 Backup — restic бэкапы\n"
|
|
||||||
"🧉 Artifacts — критичные образы (Clonezilla, NAND)\n"
|
|
||||||
"⚙️ System — диски, безопасность, URL, metrics, reboot\n\n"
|
|
||||||
"Inline-кнопки используются для выбора контейнеров.",
|
|
||||||
reply_markup=menu_kb,
|
|
||||||
parse_mode="Markdown",
|
parse_mode="Markdown",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data.startswith("help:"))
|
||||||
|
async def help_cb(cb: CallbackQuery):
|
||||||
|
if cb.from_user.id != ADMIN_ID:
|
||||||
|
await cb.answer()
|
||||||
|
return
|
||||||
|
payload = cb.data.split(":", 1)[1]
|
||||||
|
if payload == "noop":
|
||||||
|
await cb.answer()
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
idx = int(payload)
|
||||||
|
except ValueError:
|
||||||
|
await cb.answer()
|
||||||
|
return
|
||||||
|
idx = max(0, min(idx, len(HELP_PAGES) - 1))
|
||||||
|
await cb.message.edit_text(
|
||||||
|
_help_text(idx),
|
||||||
|
reply_markup=_help_kb(idx),
|
||||||
|
parse_mode="Markdown",
|
||||||
|
)
|
||||||
|
await cb.answer()
|
||||||
|
|
||||||
|
|
||||||
|
BOTFATHER_LIST = """\
|
||||||
|
help - Show help pages
|
||||||
|
status_short - Compact host status
|
||||||
|
health_short - Compact health report
|
||||||
|
selftest - Health + restic snapshot probe
|
||||||
|
alerts - Manage alerts
|
||||||
|
alerts_list - List active mutes
|
||||||
|
alerts_recent - Show recent incidents (24h)
|
||||||
|
alerts_mute_load - Mute load alerts for 60m
|
||||||
|
alerts_log - Show suppressed alerts
|
||||||
|
alerts_heatmap - Hourly incidents heatmap
|
||||||
|
backup_run - Run backup (queued)
|
||||||
|
backup_history - Show backup log tail
|
||||||
|
queue_history - Show queue recent jobs
|
||||||
|
queue_sla - Queue SLA stats
|
||||||
|
docker_status - Docker summary
|
||||||
|
docker_health - Docker inspect/health by alias
|
||||||
|
docker_health_summary - Docker health summary (problems only)
|
||||||
|
openwrt - Full OpenWrt status
|
||||||
|
openwrt_wan - OpenWrt WAN only
|
||||||
|
openwrt_clients - OpenWrt wifi clients
|
||||||
|
openwrt_leases - OpenWrt DHCP leases
|
||||||
|
openwrt_fast - OpenWrt quick WAN view
|
||||||
|
openwrt_leases_diff - OpenWrt DHCP diff
|
||||||
|
incidents_summary - Incidents counters (24h/7d)
|
||||||
|
incidents_export - Export incidents (hours fmt)
|
||||||
|
incidents_diff - Show incidents since last check
|
||||||
|
export_all - Zip with incidents/queue/selftest
|
||||||
|
backup_sla - Backup SLA check
|
||||||
|
docker_restarts - Docker restart history
|
||||||
|
selftest_history - Self-test history
|
||||||
|
disk_snapshot - Disk usage snapshot
|
||||||
|
config_check - Validate config
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/botfather_list")
|
||||||
|
async def botfather_list(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
await msg.answer(f"Commands for BotFather:\n```\n{BOTFATHER_LIST}\n```", parse_mode="Markdown")
|
||||||
|
|||||||
@@ -2,7 +2,19 @@ from aiogram import F
|
|||||||
from aiogram.types import Message
|
from aiogram.types import Message
|
||||||
from app import dp
|
from app import dp
|
||||||
from auth import is_admin_msg
|
from auth import is_admin_msg
|
||||||
from keyboards import menu_kb, docker_kb, backup_kb, artifacts_kb, system_kb
|
from keyboards import (
|
||||||
|
menu_kb,
|
||||||
|
docker_kb,
|
||||||
|
backup_kb,
|
||||||
|
artifacts_kb,
|
||||||
|
system_menu_kb,
|
||||||
|
system_info_kb,
|
||||||
|
system_ops_kb,
|
||||||
|
system_logs_kb,
|
||||||
|
system_logs_audit_kb,
|
||||||
|
system_logs_security_kb,
|
||||||
|
system_logs_integrations_kb,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "/start")
|
@dp.message(F.text == "/start")
|
||||||
@@ -38,4 +50,53 @@ async def am(msg: Message):
|
|||||||
@dp.message(F.text == "⚙️ System")
|
@dp.message(F.text == "⚙️ System")
|
||||||
async def sm(msg: Message):
|
async def sm(msg: Message):
|
||||||
if is_admin_msg(msg):
|
if is_admin_msg(msg):
|
||||||
await msg.answer("⚙️ System", reply_markup=system_kb)
|
await msg.answer("⚙️ System", reply_markup=system_menu_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "⬅️ System")
|
||||||
|
async def back_system(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("⚙️ System", reply_markup=system_menu_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "ℹ️ Info")
|
||||||
|
async def sys_info(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("ℹ️ System info", reply_markup=system_info_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "🛠 Ops")
|
||||||
|
async def sys_ops(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("🛠 System ops", reply_markup=system_ops_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "📄 Logs")
|
||||||
|
async def sys_logs(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("📄 System logs", reply_markup=system_logs_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "⬅️ Logs")
|
||||||
|
async def back_logs(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("📄 System logs", reply_markup=system_logs_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "🧾 Audit/Incidents")
|
||||||
|
async def logs_audit_menu(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("🧾 Logs: Audit/Incidents", reply_markup=system_logs_audit_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "🔒 Security")
|
||||||
|
async def logs_security_menu(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("🔒 Logs: Security", reply_markup=system_logs_security_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "🧩 Integrations")
|
||||||
|
async def logs_integrations_menu(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await msg.answer("🧩 Logs: Integrations", reply_markup=system_logs_integrations_kb)
|
||||||
|
|
||||||
|
|||||||
141
handlers/processes.py
Normal file
141
handlers/processes.py
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
import asyncio
|
||||||
|
from aiogram import F
|
||||||
|
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||||
|
from app import dp, ADMIN_ID
|
||||||
|
from auth import is_admin_msg
|
||||||
|
from keyboards import system_logs_tools_kb
|
||||||
|
from services.processes import get_top_processes, search_processes, terminate_process
|
||||||
|
from state import PROC_SEARCH_PENDING, PROC_KILL_PENDING
|
||||||
|
|
||||||
|
|
||||||
|
def _proc_kb() -> InlineKeyboardMarkup:
|
||||||
|
return InlineKeyboardMarkup(
|
||||||
|
inline_keyboard=[[
|
||||||
|
InlineKeyboardButton(text="🔄 Refresh", callback_data="proc:refresh"),
|
||||||
|
InlineKeyboardButton(text="🔍 Search", callback_data="proc:search"),
|
||||||
|
InlineKeyboardButton(text="🛑 Kill", callback_data="proc:kill"),
|
||||||
|
]]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _format_top(title: str, rows: list[dict]) -> str:
|
||||||
|
if not rows:
|
||||||
|
return f"{title}\n(no data)"
|
||||||
|
lines = ["PID CPU% MEM% NAME"]
|
||||||
|
for row in rows:
|
||||||
|
lines.append(
|
||||||
|
f"{row['pid']:<5} {row['cpu']:<5.1f} {row['mem']:<5.1f} {row['name']}"
|
||||||
|
)
|
||||||
|
return f"{title}\n" + "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
async def send_processes(msg: Message, edit: bool = False):
|
||||||
|
top_cpu, top_mem = await asyncio.to_thread(get_top_processes)
|
||||||
|
body = (
|
||||||
|
"🧰 **Processes**\n\n"
|
||||||
|
"```\n"
|
||||||
|
f"{_format_top('Top CPU', top_cpu)}\n\n"
|
||||||
|
f"{_format_top('Top RAM', top_mem)}\n"
|
||||||
|
"```"
|
||||||
|
)
|
||||||
|
if edit:
|
||||||
|
await msg.edit_text(body, reply_markup=_proc_kb(), parse_mode="Markdown")
|
||||||
|
else:
|
||||||
|
await msg.answer(body, reply_markup=_proc_kb(), parse_mode="Markdown")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "🧰 Processes")
|
||||||
|
async def proc_menu(msg: Message):
|
||||||
|
if is_admin_msg(msg):
|
||||||
|
await send_processes(msg, edit=False)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data.startswith("proc:"))
|
||||||
|
async def proc_actions(cb: CallbackQuery):
|
||||||
|
if cb.from_user.id != ADMIN_ID:
|
||||||
|
return
|
||||||
|
await cb.answer()
|
||||||
|
action = cb.data.split(":", 1)[1]
|
||||||
|
if action == "refresh":
|
||||||
|
await send_processes(cb.message, edit=True)
|
||||||
|
return
|
||||||
|
if action == "search":
|
||||||
|
PROC_SEARCH_PENDING[cb.from_user.id] = {}
|
||||||
|
await cb.message.answer("🔍 Send search text", reply_markup=system_logs_tools_kb)
|
||||||
|
return
|
||||||
|
if action == "kill":
|
||||||
|
PROC_KILL_PENDING[cb.from_user.id] = {}
|
||||||
|
await cb.message.answer("🛑 Send PID to terminate", reply_markup=system_logs_tools_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in PROC_SEARCH_PENDING))
|
||||||
|
async def proc_search(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
PROC_SEARCH_PENDING.pop(msg.from_user.id, None)
|
||||||
|
query = (msg.text or "").strip()
|
||||||
|
if not query:
|
||||||
|
await msg.answer("⚠️ Empty search", reply_markup=system_logs_tools_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
rows = await asyncio.to_thread(search_processes, query)
|
||||||
|
if not rows:
|
||||||
|
await msg.answer("🔍 No matches", reply_markup=system_logs_tools_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
lines = ["PID NAME CMD"]
|
||||||
|
for row in rows:
|
||||||
|
cmd = row["cmdline"] or "-"
|
||||||
|
if len(cmd) > 80:
|
||||||
|
cmd = cmd[:80] + "…"
|
||||||
|
lines.append(f"{row['pid']:<5} {row['name']:<6} {cmd}")
|
||||||
|
|
||||||
|
text = "🔍 **Search results**\n```\n" + "\n".join(lines) + "\n```"
|
||||||
|
await msg.answer(text, reply_markup=system_logs_tools_kb, parse_mode="Markdown")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in PROC_KILL_PENDING))
|
||||||
|
async def proc_kill_pid(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
PROC_KILL_PENDING.pop(msg.from_user.id, None)
|
||||||
|
raw = (msg.text or "").strip()
|
||||||
|
try:
|
||||||
|
pid = int(raw)
|
||||||
|
except ValueError:
|
||||||
|
await msg.answer("⚠️ Invalid PID", reply_markup=system_logs_tools_kb)
|
||||||
|
return
|
||||||
|
|
||||||
|
kb = InlineKeyboardMarkup(
|
||||||
|
inline_keyboard=[[
|
||||||
|
InlineKeyboardButton(text="✅ Confirm", callback_data=f"prockill:{pid}:confirm"),
|
||||||
|
InlineKeyboardButton(text="✖ Cancel", callback_data="prockill:cancel"),
|
||||||
|
]]
|
||||||
|
)
|
||||||
|
await msg.answer(f"⚠️ Terminate PID `{pid}`?", reply_markup=kb, parse_mode="Markdown")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.callback_query(F.data.startswith("prockill:"))
|
||||||
|
async def proc_kill_confirm(cb: CallbackQuery):
|
||||||
|
if cb.from_user.id != ADMIN_ID:
|
||||||
|
return
|
||||||
|
parts = cb.data.split(":")
|
||||||
|
if len(parts) < 2:
|
||||||
|
await cb.answer("Bad request")
|
||||||
|
return
|
||||||
|
if parts[1] == "cancel":
|
||||||
|
await cb.answer("Cancelled")
|
||||||
|
await cb.message.delete()
|
||||||
|
return
|
||||||
|
if len(parts) != 3 or parts[2] != "confirm":
|
||||||
|
await cb.answer("Bad request")
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
pid = int(parts[1])
|
||||||
|
except ValueError:
|
||||||
|
await cb.answer("Bad PID")
|
||||||
|
return
|
||||||
|
await cb.answer()
|
||||||
|
result = await asyncio.to_thread(terminate_process, pid)
|
||||||
|
await cb.message.answer(result, reply_markup=system_logs_tools_kb)
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import json
|
||||||
import socket
|
import socket
|
||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
@@ -10,6 +11,8 @@ from keyboards import menu_kb
|
|||||||
from services.system import format_disks
|
from services.system import format_disks
|
||||||
from services.health import health
|
from services.health import health
|
||||||
from state import DOCKER_MAP
|
from state import DOCKER_MAP
|
||||||
|
from services.runner import run_cmd_full
|
||||||
|
from services.selftest import run_selftest
|
||||||
|
|
||||||
|
|
||||||
async def cmd_status(msg: Message):
|
async def cmd_status(msg: Message):
|
||||||
@@ -34,6 +37,7 @@ async def cmd_status(msg: Message):
|
|||||||
cpu_percent = psutil.cpu_percent(interval=None)
|
cpu_percent = psutil.cpu_percent(interval=None)
|
||||||
|
|
||||||
disks = format_disks()
|
disks = format_disks()
|
||||||
|
net_lines = await _network_snapshot()
|
||||||
|
|
||||||
await msg.answer(
|
await msg.answer(
|
||||||
"📊 **Server status**\n\n"
|
"📊 **Server status**\n\n"
|
||||||
@@ -42,7 +46,8 @@ async def cmd_status(msg: Message):
|
|||||||
f"{cpu_icon} **Load (1m):** {load1:.2f}\n"
|
f"{cpu_icon} **Load (1m):** {load1:.2f}\n"
|
||||||
f"🧮 **CPU:** {cpu_percent:.0f}%\n"
|
f"🧮 **CPU:** {cpu_percent:.0f}%\n"
|
||||||
f"🧠 **RAM:** {mem.used // (1024**3)} / {mem.total // (1024**3)} GiB ({mem.percent}%)\n\n"
|
f"🧠 **RAM:** {mem.used // (1024**3)} / {mem.total // (1024**3)} GiB ({mem.percent}%)\n\n"
|
||||||
f"{disks}",
|
f"{disks}\n\n"
|
||||||
|
f"{net_lines}",
|
||||||
reply_markup=menu_kb,
|
reply_markup=menu_kb,
|
||||||
parse_mode="Markdown",
|
parse_mode="Markdown",
|
||||||
)
|
)
|
||||||
@@ -72,3 +77,96 @@ async def h(msg: Message):
|
|||||||
async def st(msg: Message):
|
async def st(msg: Message):
|
||||||
if is_admin_msg(msg):
|
if is_admin_msg(msg):
|
||||||
await cmd_status(msg)
|
await cmd_status(msg)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/status_short")
|
||||||
|
async def st_short(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
now = time.time()
|
||||||
|
uptime_sec = int(now - psutil.boot_time())
|
||||||
|
days, rem = divmod(uptime_sec, 86400)
|
||||||
|
hours, rem = divmod(rem, 3600)
|
||||||
|
minutes, _ = divmod(rem, 60)
|
||||||
|
load1, load5, load15 = psutil.getloadavg()
|
||||||
|
mem = psutil.virtual_memory()
|
||||||
|
disks = format_disks().splitlines()
|
||||||
|
disk_line = disks[1] if len(disks) > 1 else "Disks: n/a"
|
||||||
|
await msg.answer(
|
||||||
|
"📋 **Status (short)**\n"
|
||||||
|
f"🖥 `{socket.gethostname()}`\n"
|
||||||
|
f"⏱ Uptime: {days}d {hours}h {minutes}m\n"
|
||||||
|
f"⚙️ Load: {load1:.2f} {load5:.2f} {load15:.2f}\n"
|
||||||
|
f"🧠 RAM: {mem.percent}% ({mem.used // (1024**3)} / {mem.total // (1024**3)} GiB)\n"
|
||||||
|
f"💾 {disk_line}",
|
||||||
|
reply_markup=menu_kb,
|
||||||
|
parse_mode="Markdown",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/health_short")
|
||||||
|
async def health_short(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
text = await asyncio.to_thread(health, cfg, DOCKER_MAP)
|
||||||
|
except Exception as e:
|
||||||
|
await msg.answer(f"❌ Health failed: {type(e).__name__}: {e}", reply_markup=menu_kb)
|
||||||
|
return
|
||||||
|
lines = [ln for ln in text.splitlines() if ln.strip()]
|
||||||
|
brief = " | ".join(lines[1:5]) if len(lines) > 1 else text
|
||||||
|
await msg.answer(f"🩺 Health (short)\n{brief}", reply_markup=menu_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text.in_({"🧪 Self-test", "/selftest"}))
|
||||||
|
async def selftest(msg: Message):
|
||||||
|
if not is_admin_msg(msg):
|
||||||
|
return
|
||||||
|
|
||||||
|
await msg.answer("⏳ Self-test…", reply_markup=menu_kb)
|
||||||
|
|
||||||
|
async def worker():
|
||||||
|
text, _ok = await run_selftest(cfg, DOCKER_MAP)
|
||||||
|
await msg.answer(text, reply_markup=menu_kb)
|
||||||
|
|
||||||
|
asyncio.create_task(worker())
|
||||||
|
|
||||||
|
|
||||||
|
def _rate_str(value: float) -> str:
|
||||||
|
if value >= 1024 * 1024:
|
||||||
|
return f"{value / (1024 * 1024):.2f} MiB/s"
|
||||||
|
if value >= 1024:
|
||||||
|
return f"{value / 1024:.1f} KiB/s"
|
||||||
|
return f"{value:.0f} B/s"
|
||||||
|
|
||||||
|
|
||||||
|
async def _network_snapshot(interval: float = 1.0) -> str:
|
||||||
|
start = psutil.net_io_counters(pernic=True)
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
end = psutil.net_io_counters(pernic=True)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for nic, s in end.items():
|
||||||
|
if nic.startswith("lo"):
|
||||||
|
continue
|
||||||
|
if not nic.startswith("enp"):
|
||||||
|
continue
|
||||||
|
e = start.get(nic)
|
||||||
|
if not e:
|
||||||
|
continue
|
||||||
|
rx = max(0, s.bytes_recv - e.bytes_recv)
|
||||||
|
tx = max(0, s.bytes_sent - e.bytes_sent)
|
||||||
|
err = max(0, (s.errin - e.errin) + (s.errout - e.errout))
|
||||||
|
score = rx + tx + (err * 1024)
|
||||||
|
rows.append((score, nic, rx, tx, err))
|
||||||
|
|
||||||
|
rows.sort(reverse=True)
|
||||||
|
top = rows[:3]
|
||||||
|
if not top:
|
||||||
|
return "📡 **Network (1s):** no data"
|
||||||
|
|
||||||
|
lines = ["📡 **Network (1s):**"]
|
||||||
|
for _score, nic, rx, tx, err in top:
|
||||||
|
err_part = f", err {err}" if err else ""
|
||||||
|
lines.append(f"- {nic}: RX {_rate_str(rx / interval)}, TX {_rate_str(tx / interval)}{err_part}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
90
keyboards.py
90
keyboards.py
@@ -10,7 +10,7 @@ menu_kb = ReplyKeyboardMarkup(
|
|||||||
keyboard=[
|
keyboard=[
|
||||||
[KeyboardButton(text="🩺 Health"), KeyboardButton(text="📊 Статус")],
|
[KeyboardButton(text="🩺 Health"), KeyboardButton(text="📊 Статус")],
|
||||||
[KeyboardButton(text="🐳 Docker"), KeyboardButton(text="📦 Backup")],
|
[KeyboardButton(text="🐳 Docker"), KeyboardButton(text="📦 Backup")],
|
||||||
[KeyboardButton(text="🧉 Artifacts"), KeyboardButton(text="⚙️ System")],
|
[KeyboardButton(text="⚙️ System")],
|
||||||
[KeyboardButton(text="ℹ️ Help")],
|
[KeyboardButton(text="ℹ️ Help")],
|
||||||
],
|
],
|
||||||
resize_keyboard=True,
|
resize_keyboard=True,
|
||||||
@@ -20,6 +20,7 @@ docker_kb = ReplyKeyboardMarkup(
|
|||||||
keyboard=[
|
keyboard=[
|
||||||
[KeyboardButton(text="🐳 Status"), KeyboardButton(text="🧰 Arcane")],
|
[KeyboardButton(text="🐳 Status"), KeyboardButton(text="🧰 Arcane")],
|
||||||
[KeyboardButton(text="🔄 Restart"), KeyboardButton(text="📜 Logs")],
|
[KeyboardButton(text="🔄 Restart"), KeyboardButton(text="📜 Logs")],
|
||||||
|
[KeyboardButton(text="📈 Stats"), KeyboardButton(text="♻️ Restarts")],
|
||||||
[KeyboardButton(text="⬅️ Назад")],
|
[KeyboardButton(text="⬅️ Назад")],
|
||||||
],
|
],
|
||||||
resize_keyboard=True,
|
resize_keyboard=True,
|
||||||
@@ -37,8 +38,8 @@ backup_kb = ReplyKeyboardMarkup(
|
|||||||
keyboard=[
|
keyboard=[
|
||||||
[KeyboardButton(text="📦 Status"), KeyboardButton(text="📦 Last snapshot")],
|
[KeyboardButton(text="📦 Status"), KeyboardButton(text="📦 Last snapshot")],
|
||||||
[KeyboardButton(text="📊 Repo stats"), KeyboardButton(text="🧯 Restore help")],
|
[KeyboardButton(text="📊 Repo stats"), KeyboardButton(text="🧯 Restore help")],
|
||||||
[KeyboardButton(text="▶️ Run backup"), KeyboardButton(text="🧾 Queue")],
|
[KeyboardButton(text="▶️ Run backup"), KeyboardButton(text="🧾 Queue"), KeyboardButton(text="📊 Queue SLA")],
|
||||||
[KeyboardButton(text="🧪 Restic check"), KeyboardButton(text="📬 Weekly report"), KeyboardButton(text="⬅️ Назад")],
|
[KeyboardButton(text="📉 Backup SLA"), KeyboardButton(text="📜 History"), KeyboardButton(text="⬅️ Назад")],
|
||||||
],
|
],
|
||||||
resize_keyboard=True,
|
resize_keyboard=True,
|
||||||
)
|
)
|
||||||
@@ -52,12 +53,85 @@ artifacts_kb = ReplyKeyboardMarkup(
|
|||||||
resize_keyboard=True,
|
resize_keyboard=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
system_kb = ReplyKeyboardMarkup(
|
system_menu_kb = ReplyKeyboardMarkup(
|
||||||
keyboard=[
|
keyboard=[
|
||||||
[KeyboardButton(text="💽 Disks"), KeyboardButton(text="🔐 Security"), KeyboardButton(text="🧾 Audit")],
|
[KeyboardButton(text="ℹ️ Info"), KeyboardButton(text="🛠 Ops")],
|
||||||
[KeyboardButton(text="🌐 URLs"), KeyboardButton(text="📈 Metrics"), KeyboardButton(text="🔒 SSL")],
|
[KeyboardButton(text="📄 Logs"), KeyboardButton(text="⬅️ Назад")],
|
||||||
[KeyboardButton(text="📦 Updates"), KeyboardButton(text="⬆️ Upgrade"), KeyboardButton(text="📣 Incidents")],
|
],
|
||||||
[KeyboardButton(text="🧱 Hardware"), KeyboardButton(text="🔄 Reboot"), KeyboardButton(text="⬅️ Назад")],
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_info_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="💽 Disks"), KeyboardButton(text="🔐 Security")],
|
||||||
|
[KeyboardButton(text="📈 Metrics"), KeyboardButton(text="🧱 Hardware")],
|
||||||
|
[KeyboardButton(text="🧪 SMART test"), KeyboardButton(text="🧪 SMART status")],
|
||||||
|
[KeyboardButton(text="📡 OpenWrt"), KeyboardButton(text="⬅️ System")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_ops_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="📦 Updates"), KeyboardButton(text="⬆️ Upgrade")],
|
||||||
|
[KeyboardButton(text="🔄 Reboot")],
|
||||||
|
[KeyboardButton(text="⬅️ System")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_logs_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="🧾 Audit/Incidents"), KeyboardButton(text="🔒 Security")],
|
||||||
|
[KeyboardButton(text="🧩 Integrations"), KeyboardButton(text="🧰 Processes")],
|
||||||
|
[KeyboardButton(text="📣 Summary"), KeyboardButton(text="🔥 Heatmap")],
|
||||||
|
[KeyboardButton(text="⬅️ System")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_logs_audit_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="🧾 Audit"), KeyboardButton(text="📣 Incidents")],
|
||||||
|
[KeyboardButton(text="🆕 Diff"), KeyboardButton(text="📤 Export")],
|
||||||
|
[KeyboardButton(text="📦 Export all"), KeyboardButton(text="🧰 Alerts log")],
|
||||||
|
[KeyboardButton(text="⬅️ Logs")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_logs_security_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="🔑 SSH log"), KeyboardButton(text="🔒 SSL")],
|
||||||
|
[KeyboardButton(text="🌍 External"), KeyboardButton(text="🌐 URLs")],
|
||||||
|
[KeyboardButton(text="⬅️ Logs")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_logs_integrations_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="🧩 NPMplus"), KeyboardButton(text="🍵 Gitea")],
|
||||||
|
[KeyboardButton(text="⬅️ Logs")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
system_logs_tools_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="🧰 Processes")],
|
||||||
|
[KeyboardButton(text="⬅️ Logs")],
|
||||||
|
],
|
||||||
|
resize_keyboard=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# OpenWrt submenu (4 ряда)
|
||||||
|
openwrt_kb = ReplyKeyboardMarkup(
|
||||||
|
keyboard=[
|
||||||
|
[KeyboardButton(text="🌐 WAN fast"), KeyboardButton(text="📡 Full status")],
|
||||||
|
[KeyboardButton(text="📶 Wi-Fi clients"), KeyboardButton(text="🧾 Leases")],
|
||||||
|
[KeyboardButton(text="🔀 Leases diff")],
|
||||||
|
[KeyboardButton(text="⬅️ System")],
|
||||||
],
|
],
|
||||||
resize_keyboard=True,
|
resize_keyboard=True,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
LOCK_DIR = Path("/var/run/tg-bot")
|
LOCK_DIR = Path("/var/run/tg-bot")
|
||||||
@@ -11,9 +12,14 @@ def lock_path(name: str) -> Path:
|
|||||||
|
|
||||||
def acquire_lock(name: str) -> bool:
|
def acquire_lock(name: str) -> bool:
|
||||||
p = lock_path(name)
|
p = lock_path(name)
|
||||||
if p.exists():
|
try:
|
||||||
|
fd = os.open(str(p), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
|
||||||
|
except FileExistsError:
|
||||||
return False
|
return False
|
||||||
p.write_text(str(time.time()))
|
try:
|
||||||
|
os.write(fd, str(time.time()).encode("ascii", errors="ignore"))
|
||||||
|
finally:
|
||||||
|
os.close(fd)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
58
main.py
58
main.py
@@ -1,14 +1,20 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
import socket
|
import socket
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from app import bot, dp, cfg, ADMIN_ID
|
from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS
|
||||||
from keyboards import menu_kb
|
from keyboards import menu_kb
|
||||||
from services.docker import discover_containers, docker_watchdog
|
from services.docker import discover_containers, docker_watchdog
|
||||||
from services.alerts import monitor_resources, monitor_smart
|
from services.alerts import monitor_resources, monitor_smart, monitor_raid
|
||||||
from services.metrics import MetricsStore, start_sampler
|
from services.metrics import MetricsStore, start_sampler
|
||||||
from services.queue import worker as queue_worker
|
from services.queue import worker as queue_worker, configure as queue_configure
|
||||||
from services.notify import notify
|
from services.notify import notify
|
||||||
from services.audit import AuditMiddleware, audit_start
|
from services.audit import AuditMiddleware, audit_start
|
||||||
|
from services.ssl_alerts import monitor_ssl
|
||||||
|
from services.external_checks import monitor_external
|
||||||
|
from services.incidents import log_incident
|
||||||
|
from services.logging_setup import setup_logging
|
||||||
|
from services.selftest import schedule_selftest
|
||||||
import state
|
import state
|
||||||
import handlers.menu
|
import handlers.menu
|
||||||
import handlers.status
|
import handlers.status
|
||||||
@@ -19,6 +25,40 @@ import handlers.system
|
|||||||
import handlers.help
|
import handlers.help
|
||||||
import handlers.callbacks
|
import handlers.callbacks
|
||||||
import handlers.arcane
|
import handlers.arcane
|
||||||
|
import handlers.processes
|
||||||
|
from services.weekly_report import weekly_reporter
|
||||||
|
import handlers.alerts_admin
|
||||||
|
import handlers.config_check
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_async_exception(_loop, context):
|
||||||
|
msg = context.get("message") or "Unhandled exception"
|
||||||
|
exc = context.get("exception")
|
||||||
|
if exc:
|
||||||
|
text = f"❌ {msg}: {type(exc).__name__}: {exc}"
|
||||||
|
else:
|
||||||
|
text = f"❌ {msg}"
|
||||||
|
now = datetime.now()
|
||||||
|
if not hasattr(_handle_async_exception, "_recent"):
|
||||||
|
_handle_async_exception._recent = []
|
||||||
|
_handle_async_exception._last_alert = None
|
||||||
|
recent = _handle_async_exception._recent
|
||||||
|
recent.append(now)
|
||||||
|
# keep last hour
|
||||||
|
_handle_async_exception._recent = [t for t in recent if (now - t).total_seconds() < 3600]
|
||||||
|
if len(_handle_async_exception._recent) >= 3:
|
||||||
|
last_alert = getattr(_handle_async_exception, "_last_alert", None)
|
||||||
|
if not last_alert or (now - last_alert).total_seconds() > 3600:
|
||||||
|
try:
|
||||||
|
log_incident(cfg, "exception_flood", category="system")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
_handle_async_exception._last_alert = now
|
||||||
|
try:
|
||||||
|
log_incident(cfg, text, category="system")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
logging.getLogger("asyncio").error(text)
|
||||||
|
|
||||||
|
|
||||||
async def notify_start():
|
async def notify_start():
|
||||||
@@ -30,6 +70,7 @@ async def notify_start():
|
|||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
setup_logging(cfg)
|
||||||
dp.message.middleware(AuditMiddleware(cfg))
|
dp.message.middleware(AuditMiddleware(cfg))
|
||||||
dp.callback_query.middleware(AuditMiddleware(cfg))
|
dp.callback_query.middleware(AuditMiddleware(cfg))
|
||||||
audit_start(cfg)
|
audit_start(cfg)
|
||||||
@@ -41,9 +82,20 @@ async def main():
|
|||||||
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
||||||
if cfg.get("alerts", {}).get("smart_enabled", True):
|
if cfg.get("alerts", {}).get("smart_enabled", True):
|
||||||
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
||||||
|
if cfg.get("alerts", {}).get("raid_enabled", True):
|
||||||
|
asyncio.create_task(monitor_raid(cfg, notify, bot, ADMIN_ID))
|
||||||
|
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
||||||
|
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
||||||
|
if cfg.get("external_checks", {}).get("enabled", True):
|
||||||
|
asyncio.create_task(monitor_external(cfg))
|
||||||
state.METRICS_STORE = MetricsStore()
|
state.METRICS_STORE = MetricsStore()
|
||||||
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
||||||
|
queue_configure(cfg.get("queue", {}), cfg)
|
||||||
asyncio.create_task(queue_worker())
|
asyncio.create_task(queue_worker())
|
||||||
|
asyncio.create_task(weekly_reporter(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
||||||
|
asyncio.create_task(schedule_selftest(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
loop.set_exception_handler(_handle_async_exception)
|
||||||
await notify_start()
|
await notify_start()
|
||||||
await dp.start_polling(bot)
|
await dp.start_polling(bot)
|
||||||
|
|
||||||
|
|||||||
93
services/alert_mute.py
Normal file
93
services/alert_mute.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
import time
|
||||||
|
from typing import Dict
|
||||||
|
from services.runtime_state import get_state, set_state
|
||||||
|
|
||||||
|
# category -> unix timestamp until muted
|
||||||
|
|
||||||
|
|
||||||
|
def _mutes() -> Dict[str, float]:
|
||||||
|
return get_state().get("mutes", {})
|
||||||
|
|
||||||
|
|
||||||
|
def _save(mutes: Dict[str, float]):
|
||||||
|
set_state("mutes", mutes)
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup() -> None:
|
||||||
|
mutes = _mutes()
|
||||||
|
now = time.time()
|
||||||
|
expired = [k for k, until in mutes.items() if until <= now]
|
||||||
|
for k in expired:
|
||||||
|
mutes.pop(k, None)
|
||||||
|
_save(mutes)
|
||||||
|
|
||||||
|
|
||||||
|
def set_mute(category: str, seconds: int) -> float:
|
||||||
|
_cleanup()
|
||||||
|
mutes = _mutes()
|
||||||
|
until = time.time() + max(0, seconds)
|
||||||
|
mutes[category] = until
|
||||||
|
_save(mutes)
|
||||||
|
return until
|
||||||
|
|
||||||
|
|
||||||
|
def clear_mute(category: str) -> None:
|
||||||
|
mutes = _mutes()
|
||||||
|
mutes.pop(category, None)
|
||||||
|
_save(mutes)
|
||||||
|
|
||||||
|
|
||||||
|
def is_muted(category: str | None) -> bool:
|
||||||
|
if not category:
|
||||||
|
return False
|
||||||
|
_cleanup()
|
||||||
|
mutes = _mutes()
|
||||||
|
until = mutes.get(category)
|
||||||
|
if until is None:
|
||||||
|
return False
|
||||||
|
if until <= time.time():
|
||||||
|
mutes.pop(category, None)
|
||||||
|
_save(mutes)
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def list_mutes() -> dict[str, int]:
|
||||||
|
_cleanup()
|
||||||
|
now = time.time()
|
||||||
|
mutes = _mutes()
|
||||||
|
return {k: int(until - now) for k, until in mutes.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def is_auto_muted(cfg: dict, category: str | None) -> bool:
|
||||||
|
if not category:
|
||||||
|
return False
|
||||||
|
auto_list = cfg.get("alerts", {}).get("auto_mute", [])
|
||||||
|
if not isinstance(auto_list, list):
|
||||||
|
return False
|
||||||
|
now = time.localtime()
|
||||||
|
now_minutes = now.tm_hour * 60 + now.tm_min
|
||||||
|
for item in auto_list:
|
||||||
|
if not isinstance(item, dict):
|
||||||
|
continue
|
||||||
|
cat = item.get("category")
|
||||||
|
if cat != category:
|
||||||
|
continue
|
||||||
|
start = item.get("start", "00:00")
|
||||||
|
end = item.get("end", "00:00")
|
||||||
|
try:
|
||||||
|
sh, sm = [int(x) for x in start.split(":")]
|
||||||
|
eh, em = [int(x) for x in end.split(":")]
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
start_min = sh * 60 + sm
|
||||||
|
end_min = eh * 60 + em
|
||||||
|
if start_min == end_min:
|
||||||
|
continue
|
||||||
|
if start_min < end_min:
|
||||||
|
if start_min <= now_minutes < end_min:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
if now_minutes >= start_min or now_minutes < end_min:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
@@ -1,8 +1,9 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
from system_checks import list_disks, smart_health, disk_temperature
|
from system_checks import list_disks, smart_health, disk_temperature, list_md_arrays, md_array_status
|
||||||
from services.system import worst_disk_usage
|
from services.system import worst_disk_usage
|
||||||
|
from services.disk_report import build_disk_report
|
||||||
|
|
||||||
|
|
||||||
async def monitor_resources(cfg, notify, bot, chat_id):
|
async def monitor_resources(cfg, notify, bot, chat_id):
|
||||||
@@ -10,12 +11,16 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
interval = int(alerts_cfg.get("interval_sec", 60))
|
interval = int(alerts_cfg.get("interval_sec", 60))
|
||||||
cooldown = int(alerts_cfg.get("cooldown_sec", 900))
|
cooldown = int(alerts_cfg.get("cooldown_sec", 900))
|
||||||
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
|
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
|
||||||
|
load_only_critical = bool(alerts_cfg.get("load_only_critical", False))
|
||||||
|
auto_mute_high_load_sec = int(alerts_cfg.get("auto_mute_on_high_load_sec", 0))
|
||||||
|
|
||||||
disk_warn = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
disk_warn = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
||||||
|
snapshot_warn = int(cfg.get("disk_report", {}).get("threshold", disk_warn))
|
||||||
|
snapshot_cooldown = int(cfg.get("disk_report", {}).get("cooldown_sec", 21600))
|
||||||
load_warn = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
load_warn = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
||||||
high_warn = float(cfg.get("thresholds", {}).get("high_load_warn", load_warn * 1.5))
|
high_warn = float(cfg.get("thresholds", {}).get("high_load_warn", load_warn * 1.5))
|
||||||
|
|
||||||
last_sent = {"disk": 0.0, "load": 0.0, "disk_na": 0.0}
|
last_sent = {"disk": 0.0, "load": 0.0, "disk_na": 0.0, "disk_report": 0.0}
|
||||||
state = {"disk_high": False, "disk_na": False, "load_level": 0}
|
state = {"disk_high": False, "disk_na": False, "load_level": 0}
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@@ -24,24 +29,29 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
usage, mount = worst_disk_usage()
|
usage, mount = worst_disk_usage()
|
||||||
if usage is None:
|
if usage is None:
|
||||||
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
|
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
|
||||||
await notify(bot, chat_id, "⚠️ Disk usage n/a")
|
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na", category="disk")
|
||||||
state["disk_na"] = True
|
state["disk_na"] = True
|
||||||
last_sent["disk_na"] = now
|
last_sent["disk_na"] = now
|
||||||
else:
|
else:
|
||||||
if state["disk_na"] and notify_recovery:
|
if state["disk_na"] and notify_recovery and not load_only_critical:
|
||||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
|
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
|
||||||
state["disk_na"] = False
|
state["disk_na"] = False
|
||||||
|
|
||||||
if usage >= disk_warn:
|
if usage >= disk_warn:
|
||||||
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
|
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
|
||||||
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})")
|
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high", category="disk")
|
||||||
state["disk_high"] = True
|
state["disk_high"] = True
|
||||||
last_sent["disk"] = now
|
last_sent["disk"] = now
|
||||||
else:
|
else:
|
||||||
if state["disk_high"] and notify_recovery:
|
if state["disk_high"] and notify_recovery and not load_only_critical:
|
||||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
|
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
|
||||||
state["disk_high"] = False
|
state["disk_high"] = False
|
||||||
|
|
||||||
|
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
|
||||||
|
report = await build_disk_report(cfg, mount or "/", usage)
|
||||||
|
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot", category="disk")
|
||||||
|
last_sent["disk_report"] = now
|
||||||
|
|
||||||
load = psutil.getloadavg()[0]
|
load = psutil.getloadavg()[0]
|
||||||
if load >= high_warn:
|
if load >= high_warn:
|
||||||
level = 2
|
level = 2
|
||||||
@@ -49,16 +59,24 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
level = 1
|
level = 1
|
||||||
else:
|
else:
|
||||||
level = 0
|
level = 0
|
||||||
|
if load_only_critical and level == 1:
|
||||||
|
level = 0
|
||||||
|
|
||||||
if level == 0:
|
if level == 0:
|
||||||
if state["load_level"] > 0 and notify_recovery:
|
if state["load_level"] > 0 and notify_recovery and not load_only_critical:
|
||||||
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}")
|
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok", category="load")
|
||||||
state["load_level"] = 0
|
state["load_level"] = 0
|
||||||
else:
|
else:
|
||||||
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
|
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
|
||||||
icon = "🔴" if level == 2 else "🟡"
|
icon = "🔴" if level == 2 else "🟡"
|
||||||
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}")
|
level_name = "critical" if level == 2 else "warn"
|
||||||
|
key = "load_high_crit" if level == 2 else "load_high_warn"
|
||||||
|
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
|
||||||
last_sent["load"] = now
|
last_sent["load"] = now
|
||||||
|
if level == 2 and auto_mute_high_load_sec > 0:
|
||||||
|
from services.alert_mute import set_mute
|
||||||
|
|
||||||
|
set_mute("load", auto_mute_high_load_sec)
|
||||||
state["load_level"] = level
|
state["load_level"] = level
|
||||||
|
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
@@ -83,7 +101,14 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if "FAILED" in health:
|
if "FAILED" in health:
|
||||||
await notify(bot, chat_id, f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
|
||||||
|
level="critical",
|
||||||
|
key=f"smart_fail:{dev}",
|
||||||
|
category="smart",
|
||||||
|
)
|
||||||
last_sent[key] = now
|
last_sent[key] = now
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -93,8 +118,66 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
t = None
|
t = None
|
||||||
if t is not None and t >= temp_warn:
|
if t is not None and t >= temp_warn:
|
||||||
await notify(bot, chat_id, f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
|
||||||
|
level="warn",
|
||||||
|
key=f"smart_hot:{dev}",
|
||||||
|
category="smart",
|
||||||
|
)
|
||||||
last_sent[key] = now
|
last_sent[key] = now
|
||||||
continue
|
continue
|
||||||
|
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
async def monitor_raid(cfg, notify, bot, chat_id):
|
||||||
|
alerts_cfg = cfg.get("alerts", {})
|
||||||
|
interval = int(alerts_cfg.get("raid_interval_sec", 300))
|
||||||
|
cooldown = int(alerts_cfg.get("raid_cooldown_sec", 1800))
|
||||||
|
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
|
||||||
|
|
||||||
|
last_sent: dict[str, float] = {}
|
||||||
|
bad_state: dict[str, bool] = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
now = time.time()
|
||||||
|
for dev in list_md_arrays():
|
||||||
|
status = md_array_status(dev)
|
||||||
|
lower = status.lower()
|
||||||
|
level = None
|
||||||
|
key_suffix = None
|
||||||
|
if "inactive" in lower:
|
||||||
|
level = "critical"
|
||||||
|
key_suffix = "inactive"
|
||||||
|
elif "degraded" in lower:
|
||||||
|
level = "warn"
|
||||||
|
key_suffix = "degraded"
|
||||||
|
|
||||||
|
if level:
|
||||||
|
if not bad_state.get(dev) or (now - last_sent.get(dev, 0.0) >= cooldown):
|
||||||
|
icon = "🔴" if level == "critical" else "🟡"
|
||||||
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"{icon} RAID {dev}: {status}",
|
||||||
|
level=level,
|
||||||
|
key=f"raid_{key_suffix}:{dev}",
|
||||||
|
category="raid",
|
||||||
|
)
|
||||||
|
last_sent[dev] = now
|
||||||
|
bad_state[dev] = True
|
||||||
|
else:
|
||||||
|
if bad_state.get(dev) and notify_recovery:
|
||||||
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🟢 RAID {dev}: {status}",
|
||||||
|
level="info",
|
||||||
|
key=f"raid_ok:{dev}",
|
||||||
|
category="raid",
|
||||||
|
)
|
||||||
|
bad_state[dev] = False
|
||||||
|
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
|||||||
35
services/config_check.py
Normal file
35
services/config_check.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import os
|
||||||
|
from typing import Any, Tuple, List
|
||||||
|
|
||||||
|
|
||||||
|
def validate_cfg(cfg: dict[str, Any]) -> Tuple[List[str], List[str]]:
|
||||||
|
errors: List[str] = []
|
||||||
|
warnings: List[str] = []
|
||||||
|
|
||||||
|
tg = cfg.get("telegram", {})
|
||||||
|
if not tg.get("token"):
|
||||||
|
errors.append("telegram.token is missing")
|
||||||
|
admin_ids = tg.get("admin_ids")
|
||||||
|
has_admin_ids = isinstance(admin_ids, list) and len(admin_ids) > 0
|
||||||
|
if not tg.get("admin_id") and not has_admin_ids:
|
||||||
|
errors.append("telegram.admin_id is missing")
|
||||||
|
|
||||||
|
thresholds = cfg.get("thresholds", {})
|
||||||
|
for key in ("disk_warn", "load_warn", "high_load_warn"):
|
||||||
|
if key not in thresholds:
|
||||||
|
warnings.append(f"thresholds.{key} not set")
|
||||||
|
|
||||||
|
paths = cfg.get("paths", {})
|
||||||
|
env_path = paths.get("restic_env")
|
||||||
|
if env_path and not os.path.exists(env_path):
|
||||||
|
warnings.append(f"paths.restic_env not found: {env_path}")
|
||||||
|
|
||||||
|
npm = cfg.get("npmplus", {})
|
||||||
|
if npm and not npm.get("token") and (not npm.get("identity") or not npm.get("secret")):
|
||||||
|
warnings.append("npmplus: token missing and identity/secret missing")
|
||||||
|
|
||||||
|
ow = cfg.get("openwrt", {})
|
||||||
|
if ow and not ow.get("host"):
|
||||||
|
warnings.append("openwrt.host is missing")
|
||||||
|
|
||||||
|
return errors, warnings
|
||||||
78
services/disk_report.py
Normal file
78
services/disk_report.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.runner import run_cmd
|
||||||
|
|
||||||
|
|
||||||
|
def _top_dirs_cmd(path: str, limit: int) -> list[str]:
|
||||||
|
_ = limit
|
||||||
|
return ["du", "-x", "-h", "-d", "1", path]
|
||||||
|
|
||||||
|
|
||||||
|
_SIZE_RE = re.compile(r"^\s*([0-9]+(?:\.[0-9]+)?)([KMGTP]?)(i?B?)?$", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _size_to_bytes(value: str) -> float:
|
||||||
|
m = _SIZE_RE.match(value.strip())
|
||||||
|
if not m:
|
||||||
|
return -1.0
|
||||||
|
num = float(m.group(1))
|
||||||
|
unit = (m.group(2) or "").upper()
|
||||||
|
mul = {
|
||||||
|
"": 1,
|
||||||
|
"K": 1024,
|
||||||
|
"M": 1024**2,
|
||||||
|
"G": 1024**3,
|
||||||
|
"T": 1024**4,
|
||||||
|
"P": 1024**5,
|
||||||
|
}.get(unit, 1)
|
||||||
|
return num * mul
|
||||||
|
|
||||||
|
|
||||||
|
def _format_top_dirs(raw: str, limit: int) -> str:
|
||||||
|
rows: list[tuple[float, str]] = []
|
||||||
|
for line in raw.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
parts = line.split(maxsplit=1)
|
||||||
|
if len(parts) != 2:
|
||||||
|
continue
|
||||||
|
size, name = parts
|
||||||
|
rows.append((_size_to_bytes(size), f"{size}\t{name}"))
|
||||||
|
rows.sort(key=lambda x: x[0])
|
||||||
|
return "\n".join(line for _sz, line in rows[-max(1, limit):])
|
||||||
|
|
||||||
|
|
||||||
|
async def build_disk_report(cfg: dict[str, Any], mount: str, usage: int) -> str:
|
||||||
|
limit = int(cfg.get("disk_report", {}).get("top_dirs", 8))
|
||||||
|
|
||||||
|
lines = ["🧱 Disk report", f"💽 {mount}: {usage}%"]
|
||||||
|
|
||||||
|
rc, out = await run_cmd(_top_dirs_cmd(mount, limit), timeout=30)
|
||||||
|
if rc == 0 and out.strip():
|
||||||
|
top_out = _format_top_dirs(out, limit)
|
||||||
|
lines.append("")
|
||||||
|
lines.append("Top directories:")
|
||||||
|
lines.append(top_out)
|
||||||
|
|
||||||
|
docker_dir = cfg.get("disk_report", {}).get("docker_dir", "/var/lib/docker")
|
||||||
|
if docker_dir and os.path.exists(docker_dir):
|
||||||
|
rc2, out2 = await run_cmd(_top_dirs_cmd(docker_dir, limit), timeout=30)
|
||||||
|
if rc2 == 0 and out2.strip():
|
||||||
|
top_out2 = _format_top_dirs(out2, limit)
|
||||||
|
lines.append("")
|
||||||
|
lines.append(f"Docker dir: {docker_dir}")
|
||||||
|
lines.append(top_out2)
|
||||||
|
|
||||||
|
logs_dir = cfg.get("disk_report", {}).get("logs_dir", "/var/log")
|
||||||
|
if logs_dir and os.path.exists(logs_dir):
|
||||||
|
rc3, out3 = await run_cmd(_top_dirs_cmd(logs_dir, limit), timeout=30)
|
||||||
|
if rc3 == 0 and out3.strip():
|
||||||
|
top_out3 = _format_top_dirs(out3, limit)
|
||||||
|
lines.append("")
|
||||||
|
lines.append(f"Logs dir: {logs_dir}")
|
||||||
|
lines.append(top_out3)
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
@@ -144,8 +144,22 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
|
|||||||
reply_markup=kb,
|
reply_markup=kb,
|
||||||
)
|
)
|
||||||
elif health not in ("healthy", "n/a"):
|
elif health not in ("healthy", "n/a"):
|
||||||
await notify(bot, chat_id, f"⚠️ {alias} health: {health}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"⚠️ {alias} health: {health}",
|
||||||
|
level="warn",
|
||||||
|
key=f"docker_health:{alias}",
|
||||||
|
category="docker",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
await notify(bot, chat_id, f"🐳 {alias}: {status}")
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"🐳 {alias}: {status}",
|
||||||
|
level="info",
|
||||||
|
key=f"docker_status:{alias}:{status}",
|
||||||
|
category="docker",
|
||||||
|
)
|
||||||
last[alias] = (status, health)
|
last[alias] = (status, health)
|
||||||
await asyncio.sleep(120)
|
await asyncio.sleep(120)
|
||||||
|
|||||||
143
services/external_checks.py
Normal file
143
services/external_checks.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
from urllib.error import HTTPError, URLError
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
|
def _state_path(cfg: dict[str, Any]) -> str:
|
||||||
|
return cfg.get("external_checks", {}).get("state_path", "/var/server-bot/external_checks.json")
|
||||||
|
|
||||||
|
|
||||||
|
def _load_state(cfg: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
path = _state_path(cfg)
|
||||||
|
if not os.path.exists(path):
|
||||||
|
return {"services": {}, "total_checks": 0, "ok_checks": 0}
|
||||||
|
try:
|
||||||
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
return json.load(f)
|
||||||
|
except Exception:
|
||||||
|
return {"services": {}, "total_checks": 0, "ok_checks": 0}
|
||||||
|
|
||||||
|
|
||||||
|
def _save_state(cfg: dict[str, Any], state: dict[str, Any]) -> None:
|
||||||
|
path = _state_path(cfg)
|
||||||
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||||
|
with open(path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(state, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_http(url: str, timeout: int) -> tuple[bool, str]:
|
||||||
|
req = Request(url, headers={"User-Agent": "tg-admin-bot"})
|
||||||
|
try:
|
||||||
|
with urlopen(req, timeout=timeout) as resp:
|
||||||
|
status = int(resp.status)
|
||||||
|
return status < 400, f"HTTP {status}"
|
||||||
|
except HTTPError as e:
|
||||||
|
return False, f"HTTP {int(e.code)}"
|
||||||
|
except URLError as e:
|
||||||
|
return False, str(e.reason)
|
||||||
|
except Exception as e:
|
||||||
|
return False, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_tcp(host: str, port: int, timeout: int) -> tuple[bool, str]:
|
||||||
|
try:
|
||||||
|
with socket.create_connection((host, port), timeout=timeout):
|
||||||
|
return True, "TCP ok"
|
||||||
|
except Exception as e:
|
||||||
|
return False, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_ping(host: str, timeout: int) -> tuple[bool, str]:
|
||||||
|
try:
|
||||||
|
socket.gethostbyname(host)
|
||||||
|
return True, "DNS ok"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return _check_tcp(host, 80, timeout)
|
||||||
|
|
||||||
|
|
||||||
|
def run_checks(cfg: dict[str, Any]) -> dict[str, Any]:
    """Run all configured external service checks once.

    Each configured service entry counts as one global "check"; per-service
    and global ok/total counters are updated in the persisted state.
    Returns {"results": [per-service dicts], "state": updated state}.
    """
    ext_cfg = cfg.get("external_checks", {})
    entries = ext_cfg.get("services", [])
    timeout = int(ext_cfg.get("timeout_sec", 5))

    state = _load_state(cfg)
    per_service = state.setdefault("services", {})
    results = []

    for entry in entries:
        name = entry.get("name") or "unknown"
        kind = entry.get("type", "http")
        ok, detail = False, "n/a"  # stays "n/a" when entry is misconfigured

        if kind == "http":
            url = entry.get("url")
            if url:
                ok, detail = _check_http(url, timeout)
        elif kind == "tcp":
            host = entry.get("host")
            port = int(entry.get("port", 0))
            if host and port:
                ok, detail = _check_tcp(host, port, timeout)
        elif kind == "ping":
            host = entry.get("host")
            if host:
                ok, detail = _check_ping(host, timeout)

        stats = per_service.setdefault(name, {"ok": 0, "total": 0})
        stats["total"] += 1
        if ok:
            stats["ok"] += 1

        state["total_checks"] = state.get("total_checks", 0) + 1
        if ok:
            state["ok_checks"] = state.get("ok_checks", 0) + 1

        results.append({"name": name, "ok": ok, "detail": detail})

    _save_state(cfg, state)
    return {"results": results, "state": state}
|
||||||
|
|
||||||
|
|
||||||
|
def format_report(cfg: dict[str, Any]) -> str:
    """Run the external checks and render a human-readable report."""
    services = cfg.get("external_checks", {}).get("services", [])
    if not services:
        return "🌍 External checks\n\nℹ️ No services configured"

    data = run_checks(cfg)
    state = data["state"]

    # `or 1` guards against division by zero on a brand-new state file.
    total = state.get("total_checks", 0) or 1
    uptime_pct = 100.0 * state.get("ok_checks", 0) / total

    report = ["🌍 External checks", ""]
    for item in data["results"]:
        marker = "🟢" if item["ok"] else "🔴"
        report.append(f"{marker} {item['name']}: {item['detail']}")

    report.append("")
    report.append(f"📈 Uptime (global): {uptime_pct:.2f}%")
    report.append(f"🕒 {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}")
    return "\n".join(report)
|
||||||
|
|
||||||
|
|
||||||
|
async def monitor_external(cfg: dict[str, Any]):
    """Background task: periodically run the external checks.

    Runs forever (until the task is cancelled); disabled entirely via
    external_checks.enabled. Interval defaults to 300 seconds.
    """
    ext_cfg = cfg.get("external_checks", {})
    if not ext_cfg.get("enabled", True):
        return
    interval = int(ext_cfg.get("interval_sec", 300))

    while True:
        # run_checks performs blocking urllib/socket I/O; run it in a
        # worker thread so the event loop (and the bot) is not stalled
        # for up to timeout_sec per configured service.
        await asyncio.to_thread(run_checks, cfg)
        await asyncio.sleep(interval)
|
||||||
88
services/gitea.py
Normal file
88
services/gitea.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
import json
|
||||||
|
import ssl
|
||||||
|
from typing import Any
|
||||||
|
from urllib.error import HTTPError, URLError
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
|
def _request(url: str, headers: dict[str, str], verify_tls: bool) -> tuple[int, str]:
|
||||||
|
context = None
|
||||||
|
if not verify_tls:
|
||||||
|
context = ssl._create_unverified_context() # nosec - config-controlled
|
||||||
|
|
||||||
|
req = Request(url, headers=headers)
|
||||||
|
try:
|
||||||
|
with urlopen(req, timeout=10, context=context) as resp:
|
||||||
|
body = resp.read().decode("utf-8")
|
||||||
|
return int(resp.status), body
|
||||||
|
except HTTPError as e:
|
||||||
|
try:
|
||||||
|
body = e.read().decode("utf-8")
|
||||||
|
except Exception:
|
||||||
|
body = ""
|
||||||
|
return int(e.code), body
|
||||||
|
except URLError as e:
|
||||||
|
raise RuntimeError(str(e.reason)) from e
|
||||||
|
|
||||||
|
|
||||||
|
def _api_base(cfg: dict[str, Any]) -> str:
|
||||||
|
g_cfg = cfg.get("gitea", {})
|
||||||
|
base = (g_cfg.get("base_url") or "").rstrip("/")
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
def get_gitea_health(cfg: dict[str, Any]) -> str:
    """Build a short Gitea status report (health endpoint + version).

    Never raises on network failure: _request wraps URLError into
    RuntimeError, which previously propagated out of the status command —
    now it is rendered as a 🔴 line instead.
    """
    g_cfg = cfg.get("gitea", {})
    base = _api_base(cfg)
    verify_tls = g_cfg.get("verify_tls", True)
    if not base:
        return "⚠️ Gitea base_url not configured"

    token = (g_cfg.get("token") or "").strip()
    headers = {"User-Agent": "tg-admin-bot"}
    if token:
        headers["Authorization"] = f"token {token}"

    lines = ["🍵 Gitea\n"]

    # The health endpoint moved between Gitea versions; probe both.
    health_paths = ["/api/healthz", "/api/v1/healthz"]
    health_status = None
    health_payload = None
    try:
        for path in health_paths:
            status, body = _request(f"{base}{path}", headers, verify_tls)
            if status == 200:
                health_status = (status, path)
                try:
                    health_payload = json.loads(body)
                except json.JSONDecodeError:
                    health_payload = None
                break
            if status not in (404, 405):
                # A real answer (401, 500, ...) — stop probing alternates.
                health_status = (status, path)
                break
    except RuntimeError as e:
        lines.append(f"🔴 API unreachable: {e}")
        return "\n".join(lines)

    if health_status:
        status, path = health_status
        icon = "🟢" if status == 200 else "🔴"
        if status == 200 and isinstance(health_payload, dict):
            state = health_payload.get("status") or "ok"
            checks = health_payload.get("checks") or {}
            checks_total = len(checks) if isinstance(checks, dict) else 0
            lines.append(f"{icon} API health: {state} ({checks_total} checks)")
        else:
            lines.append(f"{icon} API health: {status} ({path})")
    else:
        lines.append("🟡 API health: endpoint not found")

    try:
        ver_status, ver_body = _request(f"{base}/api/v1/version", headers, verify_tls)
    except RuntimeError as e:
        lines.append(f"🔴 Version: {e}")
        return "\n".join(lines)
    if ver_status == 200:
        try:
            payload = json.loads(ver_body)
        except json.JSONDecodeError:
            payload = {}
        version = payload.get("version") or "unknown"
        lines.append(f"ℹ️ Version: {version}")
    else:
        lines.append(f"🟡 Version: HTTP {ver_status}")

    return "\n".join(lines)
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
|
import ssl
|
||||||
import subprocess
|
import subprocess
|
||||||
import psutil
|
import psutil
|
||||||
|
from urllib.error import HTTPError, URLError
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
from app import RESTIC_ENV
|
from app import RESTIC_ENV
|
||||||
from services.system import worst_disk_usage
|
from services.system import worst_disk_usage
|
||||||
|
|
||||||
@@ -9,9 +12,35 @@ def _containers_from_cfg(cfg) -> dict:
|
|||||||
return cfg.get("docker", {}).get("containers", {})
|
return cfg.get("docker", {}).get("containers", {})
|
||||||
|
|
||||||
|
|
||||||
|
def _request_status(url: str, verify_tls: bool) -> int | None:
    """GET *url* and return the HTTP status code, or None if unreachable."""
    context = None
    if not verify_tls:
        context = ssl._create_unverified_context()  # nosec - config-controlled
    request = Request(url, headers={"User-Agent": "tg-admin-bot"})
    try:
        with urlopen(request, timeout=8, context=context) as response:
            return int(response.status)
    except HTTPError as err:
        # An HTTP error response still carries a meaningful status code.
        return int(err.code)
    except URLError:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _npm_api_base(cfg) -> str | None:
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
base = (npm_cfg.get("base_url") or "").rstrip("/")
|
||||||
|
if not base:
|
||||||
|
return None
|
||||||
|
if not base.endswith("/api"):
|
||||||
|
base = f"{base}/api"
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
def health(cfg, container_map: dict | None = None) -> str:
|
def health(cfg, container_map: dict | None = None) -> str:
|
||||||
lines = ["🩺 Health check\n"]
|
lines = ["🩺 Health check\n"]
|
||||||
|
thresholds = cfg.get("thresholds", {})
|
||||||
|
disk_warn = int(thresholds.get("disk_warn", 80))
|
||||||
|
load_warn = float(thresholds.get("load_warn", 2.0))
|
||||||
try:
|
try:
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env.update(RESTIC_ENV)
|
env.update(RESTIC_ENV)
|
||||||
@@ -30,15 +59,47 @@ def health(cfg, container_map: dict | None = None) -> str:
|
|||||||
else:
|
else:
|
||||||
lines.append(f"🟢 {alias} OK")
|
lines.append(f"🟢 {alias} OK")
|
||||||
|
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
npm_base = _npm_api_base(cfg)
|
||||||
|
if npm_base:
|
||||||
|
npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
|
||||||
|
if npm_status == 200:
|
||||||
|
lines.append("🟢 NPMplus API OK")
|
||||||
|
elif npm_status is None:
|
||||||
|
lines.append("🔴 NPMplus API unreachable")
|
||||||
|
else:
|
||||||
|
lines.append(f"🟡 NPMplus API HTTP {npm_status}")
|
||||||
|
|
||||||
|
g_cfg = cfg.get("gitea", {})
|
||||||
|
g_base = (g_cfg.get("base_url") or "").rstrip("/")
|
||||||
|
if g_base:
|
||||||
|
health_paths = ["/api/healthz", "/api/v1/healthz"]
|
||||||
|
g_status = None
|
||||||
|
for path in health_paths:
|
||||||
|
status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True))
|
||||||
|
if status == 200:
|
||||||
|
g_status = status
|
||||||
|
break
|
||||||
|
if status not in (404, 405):
|
||||||
|
g_status = status
|
||||||
|
break
|
||||||
|
if g_status == 200:
|
||||||
|
lines.append("🟢 Gitea API OK")
|
||||||
|
elif g_status is None:
|
||||||
|
lines.append("🔴 Gitea API unreachable")
|
||||||
|
else:
|
||||||
|
lines.append(f"🟡 Gitea API HTTP {g_status}")
|
||||||
|
|
||||||
usage, mount = worst_disk_usage()
|
usage, mount = worst_disk_usage()
|
||||||
if usage is None:
|
if usage is None:
|
||||||
lines.append("⚠️ Disk n/a")
|
lines.append("⚠️ Disk n/a")
|
||||||
elif usage > cfg["thresholds"]["disk_warn"]:
|
elif usage > disk_warn:
|
||||||
lines.append(f"🟡 Disk {usage}% ({mount})")
|
lines.append(f"🟡 Disk {usage}% ({mount})")
|
||||||
else:
|
else:
|
||||||
lines.append(f"🟢 Disk {usage}% ({mount})")
|
lines.append(f"🟢 Disk {usage}% ({mount})")
|
||||||
|
|
||||||
load = psutil.getloadavg()[0]
|
load = psutil.getloadavg()[0]
|
||||||
lines.append(f"{'🟢' if load < cfg['thresholds']['load_warn'] else '🟡'} Load {load}")
|
lines.append(f"{'🟢' if load < load_warn else '🟡'} Load {load}")
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from collections import deque
|
|||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta, timezone
|
||||||
from logging.handlers import TimedRotatingFileHandler
|
from logging.handlers import TimedRotatingFileHandler
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from services import runtime_state
|
||||||
|
|
||||||
|
|
||||||
def _get_path(cfg: dict[str, Any]) -> str:
|
def _get_path(cfg: dict[str, Any]) -> str:
|
||||||
@@ -44,9 +45,11 @@ def _get_logger(cfg: dict[str, Any]) -> logging.Logger:
|
|||||||
return logger
|
return logger
|
||||||
|
|
||||||
|
|
||||||
def log_incident(cfg: dict[str, Any], text: str) -> None:
|
def log_incident(cfg: dict[str, Any], text: str, category: str | None = None) -> None:
|
||||||
if not cfg.get("incidents", {}).get("enabled", True):
|
if not cfg.get("incidents", {}).get("enabled", True):
|
||||||
return
|
return
|
||||||
|
if category and "category=" not in text:
|
||||||
|
text = f"category={category} {text}"
|
||||||
logger = _get_logger(cfg)
|
logger = _get_logger(cfg)
|
||||||
logger.info(text)
|
logger.info(text)
|
||||||
|
|
||||||
@@ -63,6 +66,10 @@ def _parse_line(line: str) -> tuple[datetime | None, str]:
|
|||||||
|
|
||||||
|
|
||||||
def read_recent(cfg: dict[str, Any], hours: int, limit: int = 200) -> list[str]:
|
def read_recent(cfg: dict[str, Any], hours: int, limit: int = 200) -> list[str]:
|
||||||
|
return [f"{dt:%Y-%m-%d %H:%M} {msg}" for dt, msg in read_raw(cfg, hours, limit=limit)]
|
||||||
|
|
||||||
|
|
||||||
|
def read_raw(cfg: dict[str, Any], hours: int, limit: int = 200, *, include_old: bool = False) -> list[tuple[datetime, str]]:
|
||||||
path = _get_path(cfg)
|
path = _get_path(cfg)
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
return []
|
return []
|
||||||
@@ -72,7 +79,40 @@ def read_recent(cfg: dict[str, Any], hours: int, limit: int = 200) -> list[str]:
|
|||||||
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
dt, msg = _parse_line(line.rstrip())
|
dt, msg = _parse_line(line.rstrip())
|
||||||
if dt is None or dt < since:
|
if dt is None:
|
||||||
continue
|
continue
|
||||||
lines.append(f"{dt:%Y-%m-%d %H:%M} {msg}")
|
if not include_old and dt < since:
|
||||||
|
continue
|
||||||
|
lines.append((dt, msg))
|
||||||
return list(lines)
|
return list(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def infer_category(text: str) -> str | None:
|
||||||
|
lower = text.lower()
|
||||||
|
if "category=" in lower:
|
||||||
|
import re
|
||||||
|
|
||||||
|
m = re.search(r"category=([a-z0-9_-]+)", lower)
|
||||||
|
if m:
|
||||||
|
return m.group(1)
|
||||||
|
if "load" in lower:
|
||||||
|
return "load"
|
||||||
|
if "docker" in lower:
|
||||||
|
return "docker"
|
||||||
|
if "restic" in lower or "backup" in lower:
|
||||||
|
return "backup"
|
||||||
|
if "smart" in lower:
|
||||||
|
return "smart"
|
||||||
|
if "ssl" in lower or "cert" in lower:
|
||||||
|
return "ssl"
|
||||||
|
if "npmplus" in lower:
|
||||||
|
return "npmplus"
|
||||||
|
if "gitea" in lower:
|
||||||
|
return "gitea"
|
||||||
|
if "openwrt" in lower:
|
||||||
|
return "openwrt"
|
||||||
|
if "queue" in lower:
|
||||||
|
return "queue"
|
||||||
|
if "selftest" in lower:
|
||||||
|
return "selftest"
|
||||||
|
return None
|
||||||
|
|||||||
35
services/logging_setup.py
Normal file
35
services/logging_setup.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from logging.handlers import TimedRotatingFileHandler
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(cfg: dict) -> None:
    """Attach a rotating file handler to the root logger (idempotent).

    Controlled by the "logging" config section. Calling this twice with
    the same path must not stack duplicate handlers.
    """
    log_cfg = cfg.get("logging", {})
    if not log_cfg.get("enabled", True):
        return

    path = log_cfg.get("path", "/var/server-bot/bot.log")
    rotate_when = log_cfg.get("rotate_when", "W0")
    backup_count = int(log_cfg.get("backup_count", 8))
    level = str(log_cfg.get("level", "INFO")).upper()

    directory = os.path.dirname(path)
    if directory:
        # Guard: os.makedirs("") raises FileNotFoundError for bare filenames.
        os.makedirs(directory, exist_ok=True)

    # TimedRotatingFileHandler stores an *absolute* baseFilename, so the
    # idempotency check must compare absolute paths; comparing against a
    # relative configured path would always miss and stack handlers.
    abs_path = os.path.abspath(path)
    root = logging.getLogger()
    for handler in root.handlers:
        if isinstance(handler, TimedRotatingFileHandler) and handler.baseFilename == abs_path:
            return

    handler = TimedRotatingFileHandler(
        path,
        when=rotate_when,
        interval=1,
        backupCount=backup_count,
        encoding="utf-8",
        utc=True,
    )
    formatter = logging.Formatter("%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s")
    handler.setFormatter(formatter)

    root.setLevel(level)
    root.addHandler(handler)
||||||
@@ -1,14 +1,83 @@
|
|||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
from aiogram import Bot
|
from aiogram import Bot
|
||||||
from app import cfg
|
from app import cfg
|
||||||
|
from services.alert_mute import is_muted, is_auto_muted
|
||||||
from services.incidents import log_incident
|
from services.incidents import log_incident
|
||||||
|
|
||||||
|
|
||||||
async def notify(bot: Bot, chat_id: int, text: str):
|
_LAST_SENT: dict[str, float] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hhmm(value: str) -> int | None:
|
||||||
|
try:
|
||||||
|
hours, minutes = value.strip().split(":", 1)
|
||||||
|
h = int(hours)
|
||||||
|
m = int(minutes)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
if not (0 <= h <= 23 and 0 <= m <= 59):
|
||||||
|
return None
|
||||||
|
return h * 60 + m
|
||||||
|
|
||||||
|
|
||||||
|
def _in_quiet_hours(alerts_cfg: dict) -> bool:
|
||||||
|
quiet = alerts_cfg.get("quiet_hours", {})
|
||||||
|
if not quiet.get("enabled", False):
|
||||||
|
return False
|
||||||
|
start_min = _parse_hhmm(quiet.get("start", "23:00"))
|
||||||
|
end_min = _parse_hhmm(quiet.get("end", "08:00"))
|
||||||
|
if start_min is None or end_min is None:
|
||||||
|
return False
|
||||||
|
if start_min == end_min:
|
||||||
|
return False
|
||||||
|
now = datetime.now()
|
||||||
|
now_min = now.hour * 60 + now.minute
|
||||||
|
if start_min < end_min:
|
||||||
|
return start_min <= now_min < end_min
|
||||||
|
return now_min >= start_min or now_min < end_min
|
||||||
|
|
||||||
|
|
||||||
|
async def notify(
|
||||||
|
bot: Bot,
|
||||||
|
chat_id: int,
|
||||||
|
text: str,
|
||||||
|
level: str = "info",
|
||||||
|
key: str | None = None,
|
||||||
|
category: str | None = None,
|
||||||
|
):
|
||||||
|
alerts_cfg = cfg.get("alerts", {})
|
||||||
|
suppressed_reason = None
|
||||||
|
if category and is_muted(category):
|
||||||
|
suppressed_reason = "muted"
|
||||||
|
elif category and is_auto_muted(cfg, category):
|
||||||
|
suppressed_reason = "auto_mute"
|
||||||
|
elif _in_quiet_hours(alerts_cfg):
|
||||||
|
allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
|
||||||
|
if not (allow_critical and level == "critical"):
|
||||||
|
suppressed_reason = "quiet_hours"
|
||||||
|
|
||||||
|
if suppressed_reason:
|
||||||
|
try:
|
||||||
|
log_incident(cfg, f"[suppressed:{suppressed_reason}] {text}", category=category)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
|
||||||
|
dedup_sec = int(alerts_cfg.get("notify_cooldown_sec", alerts_cfg.get("cooldown_sec", 900)))
|
||||||
|
if dedup_sec > 0:
|
||||||
|
dedup_key = key or text
|
||||||
|
now = time.time()
|
||||||
|
last_time = _LAST_SENT.get(dedup_key, 0)
|
||||||
|
if now - last_time < dedup_sec:
|
||||||
|
return
|
||||||
|
_LAST_SENT[dedup_key] = now
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await bot.send_message(chat_id, text)
|
await bot.send_message(chat_id, text)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
log_incident(cfg, text)
|
log_incident(cfg, text, category=category)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -34,11 +34,12 @@ def _request_json(
|
|||||||
headers: dict[str, str],
|
headers: dict[str, str],
|
||||||
data: dict[str, Any] | None,
|
data: dict[str, Any] | None,
|
||||||
verify_tls: bool,
|
verify_tls: bool,
|
||||||
|
method: str | None = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
body = None
|
body = None
|
||||||
if data is not None:
|
if data is not None:
|
||||||
body = json.dumps(data).encode("utf-8")
|
body = json.dumps(data).encode("utf-8")
|
||||||
req = Request(url, headers=headers, data=body)
|
req = Request(url, headers=headers, data=body, method=method)
|
||||||
|
|
||||||
context = None
|
context = None
|
||||||
if not verify_tls:
|
if not verify_tls:
|
||||||
@@ -48,16 +49,36 @@ def _request_json(
|
|||||||
with urlopen(req, timeout=10, context=context) as resp:
|
with urlopen(req, timeout=10, context=context) as resp:
|
||||||
payload = resp.read().decode("utf-8")
|
payload = resp.read().decode("utf-8")
|
||||||
except HTTPError as e:
|
except HTTPError as e:
|
||||||
raise RuntimeError(f"HTTP {e.code}") from e
|
detail = f"HTTP {e.code}"
|
||||||
|
try:
|
||||||
|
payload = e.read().decode("utf-8").strip()
|
||||||
|
except Exception:
|
||||||
|
payload = ""
|
||||||
|
if payload:
|
||||||
|
payload = " ".join(payload.split())
|
||||||
|
if len(payload) > 300:
|
||||||
|
payload = payload[:300] + "..."
|
||||||
|
detail = f"{detail}: {payload}"
|
||||||
|
raise RuntimeError(f"{detail} ({url})") from e
|
||||||
except URLError as e:
|
except URLError as e:
|
||||||
raise RuntimeError(str(e.reason)) from e
|
raise RuntimeError(str(e.reason)) from e
|
||||||
|
|
||||||
return json.loads(payload)
|
return json.loads(payload)
|
||||||
|
|
||||||
|
|
||||||
|
def _api_base(cfg: dict[str, Any]) -> str:
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
base = (npm_cfg.get("base_url") or "").rstrip("/")
|
||||||
|
if not base:
|
||||||
|
return ""
|
||||||
|
if not base.endswith("/api"):
|
||||||
|
base = f"{base}/api"
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
def _get_token(cfg: dict[str, Any]) -> str:
|
def _get_token(cfg: dict[str, Any]) -> str:
|
||||||
npm_cfg = cfg.get("npmplus", {})
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
base_url = (npm_cfg.get("base_url") or "").rstrip("/")
|
base_url = _api_base(cfg)
|
||||||
identity = npm_cfg.get("identity")
|
identity = npm_cfg.get("identity")
|
||||||
secret = npm_cfg.get("secret")
|
secret = npm_cfg.get("secret")
|
||||||
static_token = npm_cfg.get("token")
|
static_token = npm_cfg.get("token")
|
||||||
@@ -113,7 +134,7 @@ def _get_token(cfg: dict[str, Any]) -> str:
|
|||||||
|
|
||||||
def fetch_certificates(cfg: dict[str, Any]) -> list[dict[str, Any]]:
|
def fetch_certificates(cfg: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
npm_cfg = cfg.get("npmplus", {})
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
base_url = (npm_cfg.get("base_url") or "").rstrip("/")
|
base_url = _api_base(cfg)
|
||||||
verify_tls = npm_cfg.get("verify_tls", True)
|
verify_tls = npm_cfg.get("verify_tls", True)
|
||||||
|
|
||||||
if not base_url:
|
if not base_url:
|
||||||
@@ -132,6 +153,48 @@ def fetch_certificates(cfg: dict[str, Any]) -> list[dict[str, Any]]:
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def list_proxy_hosts(cfg: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
base_url = _api_base(cfg)
|
||||||
|
verify_tls = npm_cfg.get("verify_tls", True)
|
||||||
|
if not base_url:
|
||||||
|
raise ValueError("NPMplus base_url not configured")
|
||||||
|
|
||||||
|
token = _get_token(cfg)
|
||||||
|
url = f"{base_url}/nginx/proxy-hosts"
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {token}",
|
||||||
|
"User-Agent": "tg-admin-bot",
|
||||||
|
}
|
||||||
|
data = _request_json(url, headers, None, verify_tls)
|
||||||
|
if not isinstance(data, list):
|
||||||
|
raise RuntimeError("Unexpected API response")
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def set_proxy_host(cfg: dict[str, Any], host_id: int, enabled: bool) -> tuple[bool, str]:
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
base_url = _api_base(cfg)
|
||||||
|
verify_tls = npm_cfg.get("verify_tls", True)
|
||||||
|
if not base_url:
|
||||||
|
return False, "NPMplus base_url not configured"
|
||||||
|
|
||||||
|
token = _get_token(cfg)
|
||||||
|
action = "enable" if enabled else "disable"
|
||||||
|
url = f"{base_url}/nginx/proxy-hosts/{host_id}/{action}"
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {token}",
|
||||||
|
"User-Agent": "tg-admin-bot",
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
payload = _request_json(url, headers, None, verify_tls, method="POST")
|
||||||
|
except Exception as e:
|
||||||
|
return False, str(e)
|
||||||
|
if payload is True or (isinstance(payload, dict) and payload.get("success", True)):
|
||||||
|
return True, "OK"
|
||||||
|
return False, "API returned error"
|
||||||
|
|
||||||
|
|
||||||
def format_certificates(certs: list[dict[str, Any]]) -> str:
|
def format_certificates(certs: list[dict[str, Any]]) -> str:
|
||||||
if not certs:
|
if not certs:
|
||||||
return "🔒 SSL certificates\n\nℹ️ No certificates found"
|
return "🔒 SSL certificates\n\nℹ️ No certificates found"
|
||||||
|
|||||||
504
services/openwrt.py
Normal file
504
services/openwrt.py
Normal file
@@ -0,0 +1,504 @@
|
|||||||
|
import json
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.runner import run_cmd, run_cmd_full
|
||||||
|
|
||||||
|
|
||||||
|
def _format_uptime(seconds: int | float | None) -> str:
|
||||||
|
if seconds is None:
|
||||||
|
return "unknown"
|
||||||
|
total = int(seconds)
|
||||||
|
days, rem = divmod(total, 86400)
|
||||||
|
hours, rem = divmod(rem, 3600)
|
||||||
|
minutes, _ = divmod(rem, 60)
|
||||||
|
if days > 0:
|
||||||
|
return f"{days}d {hours:02d}:{minutes:02d}"
|
||||||
|
return f"{hours:02d}:{minutes:02d}"
|
||||||
|
|
||||||
|
|
||||||
|
def _format_load(load: list[Any] | None) -> str:
|
||||||
|
if not load or not isinstance(load, list):
|
||||||
|
return "unknown"
|
||||||
|
values = []
|
||||||
|
for raw in load[:3]:
|
||||||
|
try:
|
||||||
|
values.append(float(raw))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
values.append(0.0)
|
||||||
|
scale = 1.0
|
||||||
|
if values and max(values) > 1000:
|
||||||
|
scale = 1 / 65536.0
|
||||||
|
return " ".join(f"{val * scale:.2f}" for val in values)
|
||||||
|
|
||||||
|
|
||||||
|
def _format_rate(rate: Any) -> str:
|
||||||
|
try:
|
||||||
|
val = float(rate)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return "?"
|
||||||
|
if val <= 0:
|
||||||
|
return "?"
|
||||||
|
if val >= 1_000_000:
|
||||||
|
return f"{val / 1_000_000:.1f}M"
|
||||||
|
if val >= 1_000:
|
||||||
|
return f"{val / 1_000:.1f}K"
|
||||||
|
return f"{val:.0f}b"
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_wan_ip(wan: dict[str, Any]) -> str | None:
|
||||||
|
if not isinstance(wan, dict):
|
||||||
|
return None
|
||||||
|
addrs = wan.get("ipv4-address") or []
|
||||||
|
if isinstance(addrs, list):
|
||||||
|
for item in addrs:
|
||||||
|
if isinstance(item, dict):
|
||||||
|
ip = item.get("address")
|
||||||
|
if ip:
|
||||||
|
return str(ip)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_wifi_clients(wireless: dict[str, Any]) -> list[str]:
|
||||||
|
clients: list[str] = []
|
||||||
|
if not isinstance(wireless, dict):
|
||||||
|
return clients
|
||||||
|
for radio in wireless.values():
|
||||||
|
if not isinstance(radio, dict):
|
||||||
|
continue
|
||||||
|
for iface in radio.get("interfaces", []) or []:
|
||||||
|
if not isinstance(iface, dict):
|
||||||
|
continue
|
||||||
|
ifname = iface.get("ifname") or "wifi"
|
||||||
|
assoclist = iface.get("assoclist")
|
||||||
|
stations = iface.get("stations")
|
||||||
|
if isinstance(assoclist, dict):
|
||||||
|
for mac, meta in assoclist.items():
|
||||||
|
if not isinstance(meta, dict):
|
||||||
|
continue
|
||||||
|
signal = meta.get("signal")
|
||||||
|
rx = _format_rate((meta.get("rx") or {}).get("rate"))
|
||||||
|
tx = _format_rate((meta.get("tx") or {}).get("rate"))
|
||||||
|
sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
|
||||||
|
clients.append(f"{ifname} {mac} {sig} rx:{rx} tx:{tx}")
|
||||||
|
elif isinstance(stations, list):
|
||||||
|
for meta in stations:
|
||||||
|
if not isinstance(meta, dict):
|
||||||
|
continue
|
||||||
|
mac = meta.get("mac") or "?"
|
||||||
|
signal = meta.get("signal")
|
||||||
|
rx = _format_rate((meta.get("rx") or {}).get("rate"))
|
||||||
|
tx = _format_rate((meta.get("tx") or {}).get("rate"))
|
||||||
|
sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
|
||||||
|
clients.append(f"{ifname} {mac} {sig} rx:{rx} tx:{tx}")
|
||||||
|
return clients
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_leases(leases: dict[str, Any]) -> list[str]:
|
||||||
|
items = None
|
||||||
|
if isinstance(leases, dict):
|
||||||
|
items = leases.get("leases") or leases.get("dhcp_leases") or leases.get("ipv4_leases")
|
||||||
|
elif isinstance(leases, list):
|
||||||
|
items = leases
|
||||||
|
if not isinstance(items, list):
|
||||||
|
return []
|
||||||
|
out = []
|
||||||
|
for lease in items:
|
||||||
|
if not isinstance(lease, dict):
|
||||||
|
continue
|
||||||
|
ipaddr = lease.get("ipaddr") or "?"
|
||||||
|
host = lease.get("hostname") or "unknown"
|
||||||
|
mac = lease.get("macaddr") or "?"
|
||||||
|
out.append(f"{ipaddr} {host} ({mac})")
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_lease_name_map(leases: Any) -> dict[str, str]:
|
||||||
|
items = None
|
||||||
|
if isinstance(leases, dict):
|
||||||
|
items = leases.get("leases") or leases.get("dhcp_leases") or leases.get("ipv4_leases")
|
||||||
|
elif isinstance(leases, list):
|
||||||
|
items = leases
|
||||||
|
if not isinstance(items, list):
|
||||||
|
return {}
|
||||||
|
out: dict[str, str] = {}
|
||||||
|
for lease in items:
|
||||||
|
if not isinstance(lease, dict):
|
||||||
|
continue
|
||||||
|
mac = lease.get("macaddr")
|
||||||
|
if not mac:
|
||||||
|
continue
|
||||||
|
host = lease.get("hostname") or "unknown"
|
||||||
|
out[str(mac).lower()] = str(host)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_lease_name_map_fallback(raw: str) -> dict[str, str]:
|
||||||
|
out: dict[str, str] = {}
|
||||||
|
for line in raw.splitlines():
|
||||||
|
parts = line.strip().split()
|
||||||
|
if len(parts) < 4:
|
||||||
|
continue
|
||||||
|
_expiry, mac, _ipaddr, host = parts[:4]
|
||||||
|
host = host if host != "*" else "unknown"
|
||||||
|
out[str(mac).lower()] = str(host)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_ifnames(wireless: dict[str, Any]) -> list[str]:
|
||||||
|
ifnames: list[str] = []
|
||||||
|
if not isinstance(wireless, dict):
|
||||||
|
return ifnames
|
||||||
|
for radio in wireless.values():
|
||||||
|
if not isinstance(radio, dict):
|
||||||
|
continue
|
||||||
|
for iface in radio.get("interfaces", []) or []:
|
||||||
|
if not isinstance(iface, dict):
|
||||||
|
continue
|
||||||
|
ifname = iface.get("ifname")
|
||||||
|
if ifname:
|
||||||
|
ifnames.append(str(ifname))
|
||||||
|
return ifnames
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_ifname_meta(wireless: dict[str, Any]) -> dict[str, dict[str, str]]:
|
||||||
|
meta: dict[str, dict[str, str]] = {}
|
||||||
|
if not isinstance(wireless, dict):
|
||||||
|
return meta
|
||||||
|
for radio in wireless.values():
|
||||||
|
if not isinstance(radio, dict):
|
||||||
|
continue
|
||||||
|
band = None
|
||||||
|
cfg = radio.get("config") or {}
|
||||||
|
if isinstance(cfg, dict):
|
||||||
|
band = cfg.get("band")
|
||||||
|
band_label = None
|
||||||
|
if band == "2g":
|
||||||
|
band_label = "2.4GHz"
|
||||||
|
elif band == "5g":
|
||||||
|
band_label = "5GHz"
|
||||||
|
elif band:
|
||||||
|
band_label = str(band)
|
||||||
|
for iface in radio.get("interfaces", []) or []:
|
||||||
|
if not isinstance(iface, dict):
|
||||||
|
continue
|
||||||
|
ifname = iface.get("ifname")
|
||||||
|
if not ifname:
|
||||||
|
continue
|
||||||
|
iface_cfg = iface.get("config") or {}
|
||||||
|
ssid = None
|
||||||
|
if isinstance(iface_cfg, dict):
|
||||||
|
ssid = iface_cfg.get("ssid")
|
||||||
|
meta[str(ifname)] = {
|
||||||
|
"ssid": str(ssid) if ssid else "",
|
||||||
|
"band": band_label or "",
|
||||||
|
}
|
||||||
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_hostapd_ifnames(raw: str) -> list[str]:
|
||||||
|
ifnames: list[str] = []
|
||||||
|
for line in raw.splitlines():
|
||||||
|
name = line.strip()
|
||||||
|
if not name or name == "hostapd":
|
||||||
|
continue
|
||||||
|
ifnames.append(name)
|
||||||
|
return ifnames
|
||||||
|
|
||||||
|
|
||||||
|
def _net_label_for_ifname(ifname: str, ifname_meta: dict[str, dict[str, str]]) -> str:
|
||||||
|
meta = ifname_meta.get(ifname, {})
|
||||||
|
ssid = meta.get("ssid") or ""
|
||||||
|
band = meta.get("band") or ""
|
||||||
|
if ssid and band:
|
||||||
|
return f"{ssid} ({band})"
|
||||||
|
if ssid:
|
||||||
|
return ssid
|
||||||
|
if band:
|
||||||
|
return band
|
||||||
|
return ifname
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_json_load(raw: str) -> Any | None:
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return json.loads(raw)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
start = raw.find("{")
|
||||||
|
end = raw.rfind("}")
|
||||||
|
if start == -1 or end == -1 or end <= start:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return json.loads(raw[start : end + 1])
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hostapd_clients(
|
||||||
|
payload: Any,
|
||||||
|
ifname: str,
|
||||||
|
*,
|
||||||
|
name_map: dict[str, str] | None = None,
|
||||||
|
ifname_meta: dict[str, dict[str, str]] | None = None,
|
||||||
|
) -> list[tuple[str, int | None, str]]:
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
return []
|
||||||
|
data = payload.get("clients")
|
||||||
|
if isinstance(data, dict):
|
||||||
|
items = data.items()
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
clients: list[tuple[str, int | None, str]] = []
|
||||||
|
name_map = name_map or {}
|
||||||
|
meta = (ifname_meta or {}).get(ifname, {})
|
||||||
|
ssid = meta.get("ssid") or ""
|
||||||
|
band = meta.get("band") or ""
|
||||||
|
if ssid and band:
|
||||||
|
net_label = f"{ssid} ({band})"
|
||||||
|
elif ssid:
|
||||||
|
net_label = ssid
|
||||||
|
elif band:
|
||||||
|
net_label = band
|
||||||
|
else:
|
||||||
|
net_label = ifname
|
||||||
|
for mac, meta in items:
|
||||||
|
if not isinstance(meta, dict):
|
||||||
|
continue
|
||||||
|
signal = meta.get("signal")
|
||||||
|
rate = meta.get("rate") or {}
|
||||||
|
rx = _format_rate((rate or {}).get("rx"))
|
||||||
|
tx = _format_rate((rate or {}).get("tx"))
|
||||||
|
sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
|
||||||
|
host = name_map.get(str(mac).lower())
|
||||||
|
if host and host != "unknown":
|
||||||
|
client_label = host
|
||||||
|
else:
|
||||||
|
client_label = str(mac)
|
||||||
|
line = f"{net_label} {client_label} {sig} rx:{rx} tx:{tx}"
|
||||||
|
clients.append((line, signal if isinstance(signal, (int, float)) else None, net_label))
|
||||||
|
return clients
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_proc_fallback(raw: str) -> tuple[int | None, list[float] | None]:
|
||||||
|
uptime = None
|
||||||
|
load = None
|
||||||
|
for line in raw.splitlines():
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 2 and uptime is None:
|
||||||
|
try:
|
||||||
|
uptime = int(float(parts[0]))
|
||||||
|
except ValueError:
|
||||||
|
uptime = None
|
||||||
|
if len(parts) >= 3 and load is None:
|
||||||
|
try:
|
||||||
|
load = [float(parts[0]), float(parts[1]), float(parts[2])]
|
||||||
|
except ValueError:
|
||||||
|
load = None
|
||||||
|
return uptime, load
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_leases_fallback(raw: str) -> list[str]:
|
||||||
|
out = []
|
||||||
|
for line in raw.splitlines():
|
||||||
|
parts = line.strip().split()
|
||||||
|
if len(parts) < 4:
|
||||||
|
continue
|
||||||
|
_expiry, mac, ipaddr, host = parts[:4]
|
||||||
|
host = host if host != "*" else "unknown"
|
||||||
|
out.append(f"{ipaddr} {host} ({mac})")
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
async def get_openwrt_status(cfg: dict[str, Any], mode: str = "full") -> str:
|
||||||
|
ow_cfg = cfg.get("openwrt", {})
|
||||||
|
host = ow_cfg.get("host")
|
||||||
|
user = ow_cfg.get("user", "root")
|
||||||
|
port = ow_cfg.get("port", 22)
|
||||||
|
identity_file = ow_cfg.get("identity_file")
|
||||||
|
timeout_sec = ow_cfg.get("timeout_sec", 8)
|
||||||
|
strict = ow_cfg.get("strict_host_key_checking", True)
|
||||||
|
|
||||||
|
if not host:
|
||||||
|
return "⚠️ OpenWrt host not configured"
|
||||||
|
|
||||||
|
ssh_cmd = [
|
||||||
|
"ssh",
|
||||||
|
"-o",
|
||||||
|
"BatchMode=yes",
|
||||||
|
"-o",
|
||||||
|
f"ConnectTimeout={timeout_sec}",
|
||||||
|
"-o",
|
||||||
|
"LogLevel=ERROR",
|
||||||
|
]
|
||||||
|
if not strict:
|
||||||
|
ssh_cmd += ["-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
|
||||||
|
if identity_file:
|
||||||
|
ssh_cmd += ["-i", str(identity_file)]
|
||||||
|
ssh_cmd += ["-p", str(port), f"{user}@{host}"]
|
||||||
|
|
||||||
|
remote = (
|
||||||
|
"ubus call system info 2>/dev/null || (cat /proc/uptime; echo; cat /proc/loadavg); "
|
||||||
|
"echo __SEP__;"
|
||||||
|
"ubus call network.interface.wan status 2>/dev/null; echo __SEP__;"
|
||||||
|
"ubus call network.wireless status 2>/dev/null; echo __SEP__;"
|
||||||
|
"ubus call luci-rpc getDHCPLeases '{\"family\":4}' 2>/dev/null || cat /tmp/dhcp.leases"
|
||||||
|
)
|
||||||
|
cmd = ssh_cmd + ["sh", "-c", remote]
|
||||||
|
rc, out = await run_cmd_full(cmd, timeout=timeout_sec + 15)
|
||||||
|
if rc == 124:
|
||||||
|
return "⚠️ OpenWrt SSH error: timeout"
|
||||||
|
if rc != 0:
|
||||||
|
return f"⚠️ OpenWrt SSH error: {out.strip() or 'unknown error'}"
|
||||||
|
|
||||||
|
parts = [p.strip() for p in out.split("__SEP__")]
|
||||||
|
if len(parts) < 4:
|
||||||
|
return "⚠️ OpenWrt response incomplete"
|
||||||
|
|
||||||
|
sys_info = _safe_json_load(parts[0])
|
||||||
|
wan_status = _safe_json_load(parts[1]) or {}
|
||||||
|
wireless = _safe_json_load(parts[2]) or {}
|
||||||
|
leases = _safe_json_load(parts[3])
|
||||||
|
leases_fallback = "" if leases is not None else parts[3]
|
||||||
|
|
||||||
|
if isinstance(sys_info, dict):
|
||||||
|
uptime_raw = sys_info.get("uptime")
|
||||||
|
load_raw = sys_info.get("load")
|
||||||
|
else:
|
||||||
|
uptime_raw, load_raw = _parse_proc_fallback(parts[0])
|
||||||
|
uptime = _format_uptime(uptime_raw)
|
||||||
|
load = _format_load(load_raw)
|
||||||
|
wan_ip = _extract_wan_ip(wan_status) or "unknown"
|
||||||
|
wan_up = wan_status.get("up") if isinstance(wan_status, dict) else None
|
||||||
|
wan_state = "up" if wan_up else "down"
|
||||||
|
|
||||||
|
wifi_clients = _extract_wifi_clients(wireless)
|
||||||
|
ifnames = _extract_ifnames(wireless)
|
||||||
|
ifname_meta = _extract_ifname_meta(wireless)
|
||||||
|
rc_l, out_l = await run_cmd_full(
|
||||||
|
ssh_cmd + ["sh", "-c", r"ubus -S list | awk -F. '/^hostapd\.phy/{print $2}'"],
|
||||||
|
timeout=timeout_sec + 15,
|
||||||
|
)
|
||||||
|
if rc_l == 0 and out_l.strip():
|
||||||
|
ifnames.extend(_extract_hostapd_ifnames(out_l))
|
||||||
|
ifnames = sorted({name for name in ifnames if name})
|
||||||
|
lease_name_map = _extract_lease_name_map(leases or {})
|
||||||
|
if leases_fallback:
|
||||||
|
lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
|
||||||
|
wifi_net_counts: dict[str, int] = {}
|
||||||
|
wifi_signals: dict[str, list[int]] = {}
|
||||||
|
if ifnames:
|
||||||
|
for ifname in ifnames:
|
||||||
|
cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
|
||||||
|
rc2, out2 = await run_cmd_full(cmd_clients, timeout=timeout_sec + 15)
|
||||||
|
if rc2 == 124:
|
||||||
|
return f"⚠️ OpenWrt SSH error (wifi clients {ifname}): timeout"
|
||||||
|
if rc2 == 0:
|
||||||
|
payload = _safe_json_load(out2)
|
||||||
|
if payload:
|
||||||
|
clients_payload = payload.get("clients") if isinstance(payload, dict) else None
|
||||||
|
if isinstance(clients_payload, dict):
|
||||||
|
label = _net_label_for_ifname(ifname, ifname_meta)
|
||||||
|
wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
|
||||||
|
parsed = _parse_hostapd_clients(
|
||||||
|
payload,
|
||||||
|
ifname,
|
||||||
|
name_map=lease_name_map,
|
||||||
|
ifname_meta=ifname_meta,
|
||||||
|
)
|
||||||
|
wifi_clients.extend([p[0] for p in parsed])
|
||||||
|
for _line, sig, net_label in parsed:
|
||||||
|
if sig is not None and net_label:
|
||||||
|
wifi_signals.setdefault(net_label, []).append(sig)
|
||||||
|
|
||||||
|
if leases:
|
||||||
|
leases_list = _extract_leases(leases)
|
||||||
|
else:
|
||||||
|
leases_list = _parse_leases_fallback(leases_fallback)
|
||||||
|
|
||||||
|
header = [
|
||||||
|
"📡 OpenWrt",
|
||||||
|
f"🕒 Uptime: {uptime}",
|
||||||
|
f"⚙️ Load: {load}",
|
||||||
|
f"🌐 WAN: {wan_ip} ({wan_state})",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
wifi_section: list[str] = []
|
||||||
|
if wifi_net_counts:
|
||||||
|
wifi_section.append("📶 Wi-Fi networks:")
|
||||||
|
for label, count in sorted(wifi_net_counts.items()):
|
||||||
|
sigs = wifi_signals.get(label) or []
|
||||||
|
if sigs:
|
||||||
|
avg_sig = sum(sigs) / len(sigs)
|
||||||
|
min_sig = min(sigs)
|
||||||
|
wifi_section.append(f" - {label}: {count} (avg {avg_sig:.0f}dBm, min {min_sig}dBm)")
|
||||||
|
else:
|
||||||
|
wifi_section.append(f" - {label}: {count}")
|
||||||
|
wifi_section.append("")
|
||||||
|
|
||||||
|
wifi_section.append(f"📶 Wi-Fi clients: {len(wifi_clients)}")
|
||||||
|
if wifi_clients:
|
||||||
|
for line in wifi_clients[:20]:
|
||||||
|
wifi_section.append(f" - {line}")
|
||||||
|
if len(wifi_clients) > 20:
|
||||||
|
wifi_section.append(f" … and {len(wifi_clients) - 20} more")
|
||||||
|
else:
|
||||||
|
wifi_section.append(" (none)")
|
||||||
|
|
||||||
|
lease_section: list[str] = ["", f"🧾 DHCP leases: {len(leases_list)}"]
|
||||||
|
if leases_list:
|
||||||
|
for line in leases_list[:20]:
|
||||||
|
lease_section.append(f" - {line}")
|
||||||
|
if len(leases_list) > 20:
|
||||||
|
lease_section.append(f" … and {len(leases_list) - 20} more")
|
||||||
|
else:
|
||||||
|
lease_section.append(" (none)")
|
||||||
|
|
||||||
|
if mode == "wan":
|
||||||
|
return "\n".join(header)
|
||||||
|
if mode == "clients":
|
||||||
|
return "\n".join(header + wifi_section)
|
||||||
|
if mode == "leases":
|
||||||
|
return "\n".join(header + lease_section)
|
||||||
|
return "\n".join(header + wifi_section + lease_section)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_openwrt_leases(cfg: dict[str, Any]) -> list[str]:
|
||||||
|
"""
|
||||||
|
Fetch DHCP leases as list of strings "IP hostname (MAC)".
|
||||||
|
"""
|
||||||
|
ow_cfg = cfg.get("openwrt", {})
|
||||||
|
host = ow_cfg.get("host")
|
||||||
|
user = ow_cfg.get("user", "root")
|
||||||
|
port = ow_cfg.get("port", 22)
|
||||||
|
identity_file = ow_cfg.get("identity_file")
|
||||||
|
timeout_sec = ow_cfg.get("timeout_sec", 8)
|
||||||
|
strict = ow_cfg.get("strict_host_key_checking", True)
|
||||||
|
|
||||||
|
if not host:
|
||||||
|
raise RuntimeError("OpenWrt host not configured")
|
||||||
|
|
||||||
|
ssh_cmd = [
|
||||||
|
"ssh",
|
||||||
|
"-o",
|
||||||
|
"BatchMode=yes",
|
||||||
|
"-o",
|
||||||
|
f"ConnectTimeout={timeout_sec}",
|
||||||
|
"-o",
|
||||||
|
"LogLevel=ERROR",
|
||||||
|
]
|
||||||
|
if not strict:
|
||||||
|
ssh_cmd += ["-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
|
||||||
|
if identity_file:
|
||||||
|
ssh_cmd += ["-i", str(identity_file)]
|
||||||
|
ssh_cmd += ["-p", str(port), f"{user}@{host}"]
|
||||||
|
|
||||||
|
remote = "ubus call luci-rpc getDHCPLeases '{\"family\":4}' 2>/dev/null || cat /tmp/dhcp.leases"
|
||||||
|
rc, out = await run_cmd_full(ssh_cmd + ["sh", "-c", remote], timeout=timeout_sec + 10)
|
||||||
|
if rc == 124:
|
||||||
|
raise RuntimeError("timeout")
|
||||||
|
if rc != 0:
|
||||||
|
raise RuntimeError(out.strip() or f"ssh rc={rc}")
|
||||||
|
leases = _safe_json_load(out)
|
||||||
|
if leases:
|
||||||
|
return _extract_leases(leases)
|
||||||
|
return _parse_leases_fallback(out)
|
||||||
88
services/processes.py
Normal file
88
services/processes.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_name(info: dict[str, Any]) -> str:
|
||||||
|
name = info.get("name") or "unknown"
|
||||||
|
return str(name)
|
||||||
|
|
||||||
|
|
||||||
|
def get_top_processes(limit: int = 5, interval: float = 0.2) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
|
||||||
|
procs = []
|
||||||
|
for p in psutil.process_iter(attrs=["pid", "name"]):
|
||||||
|
procs.append(p)
|
||||||
|
|
||||||
|
for p in procs:
|
||||||
|
try:
|
||||||
|
p.cpu_percent(None)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
time.sleep(interval)
|
||||||
|
|
||||||
|
items = []
|
||||||
|
for p in procs:
|
||||||
|
try:
|
||||||
|
cpu = p.cpu_percent(None)
|
||||||
|
mem = p.memory_percent()
|
||||||
|
info = p.info
|
||||||
|
items.append({
|
||||||
|
"pid": info.get("pid"),
|
||||||
|
"name": _safe_name(info),
|
||||||
|
"cpu": cpu,
|
||||||
|
"mem": mem,
|
||||||
|
})
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
top_cpu = sorted(items, key=lambda x: x["cpu"], reverse=True)[:limit]
|
||||||
|
top_mem = sorted(items, key=lambda x: x["mem"], reverse=True)[:limit]
|
||||||
|
return top_cpu, top_mem
|
||||||
|
|
||||||
|
|
||||||
|
def search_processes(query: str, limit: int = 10) -> list[dict[str, Any]]:
|
||||||
|
needle = query.lower().strip()
|
||||||
|
if not needle:
|
||||||
|
return []
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for p in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
|
||||||
|
try:
|
||||||
|
info = p.info
|
||||||
|
name = _safe_name(info)
|
||||||
|
cmdline = " ".join(info.get("cmdline") or [])
|
||||||
|
hay = f"{name} {cmdline}".lower()
|
||||||
|
if needle not in hay:
|
||||||
|
continue
|
||||||
|
results.append({
|
||||||
|
"pid": info.get("pid"),
|
||||||
|
"name": name,
|
||||||
|
"cmdline": cmdline,
|
||||||
|
})
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
return results[:limit]
|
||||||
|
|
||||||
|
|
||||||
|
def terminate_process(pid: int, timeout: float = 5.0) -> str:
|
||||||
|
try:
|
||||||
|
proc = psutil.Process(pid)
|
||||||
|
except Exception:
|
||||||
|
return f"Process {pid} not found"
|
||||||
|
|
||||||
|
try:
|
||||||
|
proc.terminate()
|
||||||
|
proc.wait(timeout=timeout)
|
||||||
|
return f"Process {pid} terminated"
|
||||||
|
except psutil.TimeoutExpired:
|
||||||
|
try:
|
||||||
|
proc.kill()
|
||||||
|
proc.wait(timeout=timeout)
|
||||||
|
return f"Process {pid} killed"
|
||||||
|
except Exception as e:
|
||||||
|
return f"Kill failed for {pid}: {e}"
|
||||||
|
except Exception as e:
|
||||||
|
return f"Terminate failed for {pid}: {e}"
|
||||||
@@ -1,34 +1,209 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from typing import Awaitable, Callable
|
import logging
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from typing import Awaitable, Callable, Any
|
||||||
|
from services import runtime_state
|
||||||
|
from services.incidents import log_incident
|
||||||
|
|
||||||
|
|
||||||
_queue: asyncio.Queue = asyncio.Queue()
|
_queue: asyncio.Queue = asyncio.Queue()
|
||||||
_current_label: str | None = None
|
_current_label: str | None = None
|
||||||
|
_current_meta: dict[str, Any] | None = None
|
||||||
|
_pending: deque[tuple[str, float]] = deque()
|
||||||
|
_stats: dict[str, Any] = runtime_state.get("queue_stats", {}) or {
|
||||||
|
"processed": 0,
|
||||||
|
"avg_wait_sec": 0.0,
|
||||||
|
"avg_runtime_sec": 0.0,
|
||||||
|
"last_label": "",
|
||||||
|
"last_finished_at": 0.0,
|
||||||
|
}
|
||||||
|
_history: deque[dict[str, Any]] = deque(runtime_state.get("queue_history", []) or [], maxlen=50)
|
||||||
|
_alert_cfg: dict[str, Any] = {
|
||||||
|
"max_pending": None,
|
||||||
|
"avg_wait": None,
|
||||||
|
"cooldown": 300,
|
||||||
|
"last_sent": 0.0,
|
||||||
|
}
|
||||||
|
_cfg: dict[str, Any] | None = None
|
||||||
|
_logger = logging.getLogger("queue")
|
||||||
|
|
||||||
|
|
||||||
|
def _save_stats():
|
||||||
|
runtime_state.set_state("queue_stats", _stats)
|
||||||
|
runtime_state.set_state("queue_history", list(_history))
|
||||||
|
|
||||||
|
|
||||||
|
def configure(queue_cfg: dict[str, Any], cfg: dict[str, Any]):
|
||||||
|
global _cfg
|
||||||
|
_cfg = cfg
|
||||||
|
_alert_cfg["max_pending"] = queue_cfg.get("max_pending_alert")
|
||||||
|
_alert_cfg["avg_wait"] = queue_cfg.get("avg_wait_alert")
|
||||||
|
_alert_cfg["cooldown"] = queue_cfg.get("cooldown_sec", 300)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_congestion(pending_count: int, avg_wait: float | None):
|
||||||
|
max_pending = _alert_cfg.get("max_pending")
|
||||||
|
avg_wait_thr = _alert_cfg.get("avg_wait")
|
||||||
|
cooldown = _alert_cfg.get("cooldown", 300)
|
||||||
|
now = time.time()
|
||||||
|
if now - _alert_cfg.get("last_sent", 0) < cooldown:
|
||||||
|
return
|
||||||
|
reason = None
|
||||||
|
if max_pending and pending_count >= max_pending:
|
||||||
|
reason = f"pending={pending_count} >= {max_pending}"
|
||||||
|
if avg_wait_thr and avg_wait is not None and avg_wait >= avg_wait_thr:
|
||||||
|
reason = reason or f"avg_wait={avg_wait:.1f}s >= {avg_wait_thr}s"
|
||||||
|
if reason and _cfg:
|
||||||
|
try:
|
||||||
|
log_incident(_cfg, f"queue_congested {reason}", category="queue")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
_alert_cfg["last_sent"] = now
|
||||||
|
|
||||||
|
|
||||||
async def enqueue(label: str, job: Callable[[], Awaitable[None]]) -> int:
|
async def enqueue(label: str, job: Callable[[], Awaitable[None]]) -> int:
|
||||||
await _queue.put((label, job))
|
enqueued_at = time.time()
|
||||||
return _queue.qsize()
|
await _queue.put((label, job, enqueued_at))
|
||||||
|
_pending.append((label, enqueued_at))
|
||||||
|
_check_congestion(len(_pending), None)
|
||||||
|
return len(_pending)
|
||||||
|
|
||||||
|
|
||||||
async def worker():
|
async def worker():
|
||||||
global _current_label
|
global _current_label, _current_meta
|
||||||
while True:
|
while True:
|
||||||
label, job = await _queue.get()
|
label, job, enqueued_at = await _queue.get()
|
||||||
|
if _pending:
|
||||||
|
if _pending[0] == (label, enqueued_at):
|
||||||
|
_pending.popleft()
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
_pending.remove((label, enqueued_at))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
_current_label = label
|
_current_label = label
|
||||||
|
_current_meta = {"enqueued_at": enqueued_at, "started_at": time.time()}
|
||||||
|
status = "ok"
|
||||||
try:
|
try:
|
||||||
await job()
|
await job()
|
||||||
|
except Exception as e:
|
||||||
|
status = "err"
|
||||||
|
_logger.exception("Queue job failed: label=%s", label)
|
||||||
|
if _cfg:
|
||||||
|
try:
|
||||||
|
log_incident(
|
||||||
|
_cfg,
|
||||||
|
f"queue_job_failed label={label} error={type(e).__name__}: {e}",
|
||||||
|
category="queue",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
finally:
|
finally:
|
||||||
|
finished_at = time.time()
|
||||||
|
if _current_meta:
|
||||||
|
wait_sec = max(0.0, _current_meta["started_at"] - _current_meta["enqueued_at"])
|
||||||
|
runtime_sec = max(0.0, finished_at - _current_meta["started_at"])
|
||||||
|
n_prev = int(_stats.get("processed", 0))
|
||||||
|
_stats["processed"] = n_prev + 1
|
||||||
|
_stats["avg_wait_sec"] = (
|
||||||
|
(_stats.get("avg_wait_sec", 0.0) * n_prev) + wait_sec
|
||||||
|
) / _stats["processed"]
|
||||||
|
_stats["avg_runtime_sec"] = (
|
||||||
|
(_stats.get("avg_runtime_sec", 0.0) * n_prev) + runtime_sec
|
||||||
|
) / _stats["processed"]
|
||||||
|
_stats["last_label"] = label
|
||||||
|
_stats["last_finished_at"] = finished_at
|
||||||
|
_history.appendleft(
|
||||||
|
{
|
||||||
|
"label": label,
|
||||||
|
"wait_sec": int(wait_sec),
|
||||||
|
"runtime_sec": int(runtime_sec),
|
||||||
|
"finished_at": int(finished_at),
|
||||||
|
"status": status,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
_save_stats()
|
||||||
|
_check_congestion(len(_pending), _stats.get("avg_wait_sec"))
|
||||||
_current_label = None
|
_current_label = None
|
||||||
|
_current_meta = None
|
||||||
_queue.task_done()
|
_queue.task_done()
|
||||||
|
|
||||||
|
|
||||||
def format_status() -> str:
|
def format_status() -> str:
|
||||||
pending = [label for label, _ in list(_queue._queue)]
|
pending = list(_pending)
|
||||||
lines = ["🧾 Queue"]
|
lines = ["🧾 Queue"]
|
||||||
lines.append(f"🔄 Running: {_current_label or 'idle'}")
|
lines.append(f"🔄 Running: {_current_label or 'idle'}")
|
||||||
lines.append(f"⏳ Pending: {len(pending)}")
|
lines.append(f"⏳ Pending: {len(pending)}")
|
||||||
if pending:
|
if pending:
|
||||||
preview = ", ".join(pending[:5])
|
preview = ", ".join([p[0] for p in pending[:5]])
|
||||||
lines.append(f"➡️ Next: {preview}")
|
lines.append(f"➡️ Next: {preview}")
|
||||||
|
if _stats.get("processed"):
|
||||||
|
lines.append(
|
||||||
|
f"📈 Done: {_stats.get('processed')} | "
|
||||||
|
f"avg wait {int(_stats.get('avg_wait_sec', 0))}s | "
|
||||||
|
f"avg run {int(_stats.get('avg_runtime_sec', 0))}s"
|
||||||
|
)
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def format_details(limit: int = 10) -> str:
|
||||||
|
now = time.time()
|
||||||
|
lines = ["🧾 Queue details"]
|
||||||
|
if _current_label:
|
||||||
|
started_at = _current_meta.get("started_at") if _current_meta else None
|
||||||
|
runtime = f"{int(now - started_at)}s" if started_at else "n/a"
|
||||||
|
lines.append(f"🔄 Running: {_current_label} ({runtime})")
|
||||||
|
else:
|
||||||
|
lines.append("🔄 Running: idle")
|
||||||
|
|
||||||
|
pending = list(_pending)
|
||||||
|
lines.append(f"⏳ Pending: {len(pending)}")
|
||||||
|
if pending:
|
||||||
|
lines.append("🔢 Position | Label | Wait")
|
||||||
|
for i, (label, enqueued_at) in enumerate(pending[:limit], start=1):
|
||||||
|
wait = int(now - enqueued_at)
|
||||||
|
lines.append(f"{i:>3} | {label} | {wait}s")
|
||||||
|
if _stats.get("processed"):
|
||||||
|
lines.append("")
|
||||||
|
lines.append(
|
||||||
|
"📈 Stats: "
|
||||||
|
f"{_stats.get('processed')} done, "
|
||||||
|
f"avg wait {int(_stats.get('avg_wait_sec', 0))}s, "
|
||||||
|
f"avg run {int(_stats.get('avg_runtime_sec', 0))}s"
|
||||||
|
)
|
||||||
|
last_label = _stats.get("last_label")
|
||||||
|
if last_label:
|
||||||
|
lines.append(f"Last: {last_label}")
|
||||||
|
if _history:
|
||||||
|
lines.append("")
|
||||||
|
lines.append("🗂 Last jobs:")
|
||||||
|
for item in list(_history)[:5]:
|
||||||
|
t = time.strftime("%H:%M:%S", time.localtime(item["finished_at"]))
|
||||||
|
lines.append(
|
||||||
|
f"- {t} {item['label']} {item['status']} "
|
||||||
|
f"(wait {item['wait_sec']}s, run {item['runtime_sec']}s)"
|
||||||
|
)
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def format_history(limit: int = 20) -> str:
|
||||||
|
lines = ["🗂 Queue history"]
|
||||||
|
if not _history:
|
||||||
|
lines.append("(empty)")
|
||||||
|
return "\n".join(lines)
|
||||||
|
for item in list(_history)[:limit]:
|
||||||
|
t = time.strftime("%m-%d %H:%M:%S", time.localtime(item["finished_at"]))
|
||||||
|
lines.append(
|
||||||
|
f"{t} {item['label']} {item['status']} "
|
||||||
|
f"(wait {item['wait_sec']}s, run {item['runtime_sec']}s)"
|
||||||
|
)
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def get_history_raw() -> list[dict[str, Any]]:
|
||||||
|
return list(_history)
|
||||||
|
|
||||||
|
|
||||||
|
def get_stats() -> dict[str, Any]:
|
||||||
|
return dict(_stats)
|
||||||
|
|||||||
@@ -22,3 +22,24 @@ async def run_cmd(cmd: list[str], *, use_restic_env: bool = False, timeout: int
|
|||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
proc.kill()
|
proc.kill()
|
||||||
return 124, "❌ timeout"
|
return 124, "❌ timeout"
|
||||||
|
|
||||||
|
|
||||||
|
async def run_cmd_full(cmd: list[str], *, use_restic_env: bool = False, timeout: int = 60):
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||||
|
if use_restic_env:
|
||||||
|
env.update(RESTIC_ENV)
|
||||||
|
|
||||||
|
proc = await asyncio.create_subprocess_exec(
|
||||||
|
*cmd,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.STDOUT,
|
||||||
|
env=env,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
out, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
||||||
|
return proc.returncode, out.decode(errors="ignore")
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
proc.kill()
|
||||||
|
return 124, "❌ timeout"
|
||||||
|
|||||||
73
services/runtime_state.py
Normal file
73
services/runtime_state.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import tempfile
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
_PATH = "/var/server-bot/runtime.json"
|
||||||
|
_STATE: Dict[str, Any] = {}
|
||||||
|
_LOCK = threading.RLock()
|
||||||
|
_LOADED = False
|
||||||
|
|
||||||
|
|
||||||
|
def configure(path: str | None):
|
||||||
|
global _PATH
|
||||||
|
if path:
|
||||||
|
_PATH = path
|
||||||
|
|
||||||
|
|
||||||
|
def _load_from_disk():
|
||||||
|
global _STATE, _LOADED
|
||||||
|
if not os.path.exists(_PATH):
|
||||||
|
_STATE = {}
|
||||||
|
_LOADED = True
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
with open(_PATH, "r", encoding="utf-8") as f:
|
||||||
|
_STATE = json.load(f)
|
||||||
|
except Exception:
|
||||||
|
_STATE = {}
|
||||||
|
_LOADED = True
|
||||||
|
|
||||||
|
|
||||||
|
def _save():
|
||||||
|
directory = os.path.dirname(_PATH) or "."
|
||||||
|
os.makedirs(directory, exist_ok=True)
|
||||||
|
try:
|
||||||
|
fd, tmp_path = tempfile.mkstemp(prefix=".runtime.", suffix=".json", dir=directory)
|
||||||
|
try:
|
||||||
|
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(_STATE, f, ensure_ascii=False)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
|
os.replace(tmp_path, _PATH)
|
||||||
|
finally:
|
||||||
|
if os.path.exists(tmp_path):
|
||||||
|
try:
|
||||||
|
os.unlink(tmp_path)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def get_state() -> Dict[str, Any]:
|
||||||
|
with _LOCK:
|
||||||
|
if not _LOADED:
|
||||||
|
_load_from_disk()
|
||||||
|
return _STATE
|
||||||
|
|
||||||
|
|
||||||
|
def set_state(key: str, value: Any):
|
||||||
|
with _LOCK:
|
||||||
|
if not _LOADED:
|
||||||
|
_load_from_disk()
|
||||||
|
_STATE[key] = value
|
||||||
|
_save()
|
||||||
|
|
||||||
|
|
||||||
|
def get(key: str, default: Any = None) -> Any:
|
||||||
|
with _LOCK:
|
||||||
|
if not _LOADED:
|
||||||
|
_load_from_disk()
|
||||||
|
return _STATE.get(key, default)
|
||||||
95
services/selftest.py
Normal file
95
services/selftest.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
import json
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import asyncio
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.health import health
|
||||||
|
from services.runner import run_cmd_full
|
||||||
|
from services.incidents import log_incident
|
||||||
|
from services import runtime_state
|
||||||
|
|
||||||
|
|
||||||
|
def _save_history(entry: dict[str, Any]) -> None:
|
||||||
|
hist = runtime_state.get("selftest_history", [])
|
||||||
|
hist = hist[:50] if isinstance(hist, list) else []
|
||||||
|
hist.insert(0, entry)
|
||||||
|
runtime_state.set_state("selftest_history", hist[:20])
|
||||||
|
|
||||||
|
|
||||||
|
async def run_selftest(cfg: dict[str, Any], docker_map: dict[str, str]) -> tuple[str, bool]:
|
||||||
|
lines = ["🧪 Self-test"]
|
||||||
|
ok = True
|
||||||
|
|
||||||
|
# health
|
||||||
|
try:
|
||||||
|
htext = await asyncio.to_thread(health, cfg, docker_map)
|
||||||
|
h_lines = [ln for ln in htext.splitlines() if ln.strip()]
|
||||||
|
brief = " | ".join(h_lines[1:5]) if len(h_lines) > 1 else h_lines[0] if h_lines else "n/a"
|
||||||
|
lines.append(f"🟢 Health: {brief}")
|
||||||
|
except Exception as e:
|
||||||
|
lines.append(f"🔴 Health failed: {e}")
|
||||||
|
ok = False
|
||||||
|
|
||||||
|
# restic snapshots check
|
||||||
|
rc, out = await run_cmd_full(["restic", "snapshots", "--json"], use_restic_env=True, timeout=40)
|
||||||
|
if rc == 0:
|
||||||
|
try:
|
||||||
|
snaps = json.loads(out)
|
||||||
|
if isinstance(snaps, list) and snaps:
|
||||||
|
snaps.sort(key=lambda s: s.get("time", ""), reverse=True)
|
||||||
|
last = snaps[0]
|
||||||
|
t = last.get("time", "?").replace("Z", "").replace("T", " ")[:16]
|
||||||
|
lines.append(f"🟢 Restic snapshots: {len(snaps)}, last {t}")
|
||||||
|
else:
|
||||||
|
lines.append("🟡 Restic snapshots: empty")
|
||||||
|
except Exception:
|
||||||
|
lines.append("🟡 Restic snapshots: invalid JSON")
|
||||||
|
else:
|
||||||
|
lines.append(f"🔴 Restic snapshots error: {out.strip() or rc}")
|
||||||
|
ok = False
|
||||||
|
|
||||||
|
result_text = "\n".join(lines)
|
||||||
|
try:
|
||||||
|
_save_history(
|
||||||
|
{
|
||||||
|
"ts": datetime.now().isoformat(),
|
||||||
|
"ok": ok,
|
||||||
|
"summary": result_text.splitlines()[1] if len(lines) > 1 else "",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return result_text, ok
|
||||||
|
|
||||||
|
|
||||||
|
async def schedule_selftest(cfg: dict[str, Any], bot, admin_ids: list[int], docker_map: dict[str, str]):
|
||||||
|
"""
|
||||||
|
Run selftest daily at configured time.
|
||||||
|
"""
|
||||||
|
sched_cfg = cfg.get("selftest", {}).get("schedule", {})
|
||||||
|
if not sched_cfg.get("enabled", False):
|
||||||
|
return
|
||||||
|
time_str = sched_cfg.get("time", "03:30")
|
||||||
|
try:
|
||||||
|
hh, mm = [int(x) for x in time_str.split(":")]
|
||||||
|
except Exception:
|
||||||
|
hh, mm = 3, 30
|
||||||
|
|
||||||
|
while True:
|
||||||
|
now = datetime.now()
|
||||||
|
run_at = now.replace(hour=hh, minute=mm, second=0, microsecond=0)
|
||||||
|
if run_at <= now:
|
||||||
|
run_at += timedelta(days=1)
|
||||||
|
await asyncio.sleep((run_at - now).total_seconds())
|
||||||
|
text, ok = await run_selftest(cfg, docker_map)
|
||||||
|
for chat_id in admin_ids:
|
||||||
|
try:
|
||||||
|
await bot.send_message(chat_id, text)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if not ok:
|
||||||
|
try:
|
||||||
|
log_incident(cfg, "selftest failed", category="selftest")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
61
services/ssl_alerts.py
Normal file
61
services/ssl_alerts.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.npmplus import fetch_certificates, _parse_expiry
|
||||||
|
|
||||||
|
|
||||||
|
async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
|
||||||
|
npm_cfg = cfg.get("npmplus", {})
|
||||||
|
alert_cfg = npm_cfg.get("alerts", {})
|
||||||
|
if not alert_cfg.get("enabled", True):
|
||||||
|
return
|
||||||
|
|
||||||
|
days_list = alert_cfg.get("days", [30, 14, 7, 1])
|
||||||
|
days_list = sorted({int(x) for x in days_list if int(x) >= 0}, reverse=True)
|
||||||
|
cooldown = int(alert_cfg.get("cooldown_sec", 86400))
|
||||||
|
interval = int(alert_cfg.get("interval_sec", 3600))
|
||||||
|
|
||||||
|
last_sent: dict[str, float] = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
try:
|
||||||
|
certs = fetch_certificates(cfg)
|
||||||
|
except Exception:
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
continue
|
||||||
|
|
||||||
|
for cert in certs:
|
||||||
|
name = cert.get("nice_name")
|
||||||
|
if not name:
|
||||||
|
domains = cert.get("domain_names") or []
|
||||||
|
if isinstance(domains, list):
|
||||||
|
name = ", ".join(domains)
|
||||||
|
if not name:
|
||||||
|
name = "unknown"
|
||||||
|
|
||||||
|
expiry = _parse_expiry(cert.get("expires_on"))
|
||||||
|
if expiry is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
days_left = (expiry - now).days
|
||||||
|
for threshold in days_list:
|
||||||
|
if days_left <= threshold:
|
||||||
|
key = f"{name}:{threshold}"
|
||||||
|
last_time = last_sent.get(key, 0)
|
||||||
|
if time.time() - last_time >= cooldown:
|
||||||
|
level = "critical" if days_left <= 1 else "warn"
|
||||||
|
await notify(
|
||||||
|
bot,
|
||||||
|
chat_id,
|
||||||
|
f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
|
||||||
|
level=level,
|
||||||
|
key=f"ssl:{name}:{threshold}",
|
||||||
|
category="ssl",
|
||||||
|
)
|
||||||
|
last_sent[key] = time.time()
|
||||||
|
break
|
||||||
|
|
||||||
|
await asyncio.sleep(interval)
|
||||||
107
services/weekly_report.py
Normal file
107
services/weekly_report.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
import asyncio
|
||||||
|
import socket
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import psutil
|
||||||
|
from services.system import worst_disk_usage
|
||||||
|
from services.alert_mute import list_mutes
|
||||||
|
from services.incidents import read_recent
|
||||||
|
from services.docker import docker_cmd
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hhmm(value: str) -> tuple[int, int]:
|
||||||
|
try:
|
||||||
|
h, m = value.split(":", 1)
|
||||||
|
h = int(h)
|
||||||
|
m = int(m)
|
||||||
|
if 0 <= h <= 23 and 0 <= m <= 59:
|
||||||
|
return h, m
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return 8, 0
|
||||||
|
|
||||||
|
|
||||||
|
def _next_run(day: str, time_str: str) -> datetime:
    """Return the next local datetime matching the configured weekday/time.

    ``day`` is matched by its first three letters (case-insensitive);
    unknown values default to Sunday.  Uses naive local time.
    """
    weekday_by_name = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
    wanted = weekday_by_name.get((day or "Sun").lower()[:3], 6)
    hour, minute = _parse_hhmm(time_str or "08:00")
    now = datetime.now()
    candidate = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
    # Walk forward a day at a time until we land on the wanted weekday
    # at a moment strictly in the future.
    while candidate <= now or candidate.weekday() != wanted:
        candidate = (candidate + timedelta(days=1)).replace(
            hour=hour, minute=minute, second=0, microsecond=0
        )
    return candidate
|
||||||
|
|
||||||
|
|
||||||
|
async def _docker_running_counts(docker_map: dict) -> tuple[int, int]:
    """Count containers in the ``running`` state; returns (running, total)."""
    total = len(docker_map)
    running = 0
    for real_name in docker_map.values():
        # One inspect per container; a non-zero rc counts as not running.
        rc, raw = await docker_cmd(["inspect", "-f", "{{.State.Status}}", real_name], timeout=10)
        if rc == 0 and raw.strip() == "running":
            running += 1
    return running, total
|
||||||
|
|
||||||
|
|
||||||
|
def _format_uptime(seconds: int) -> str:
|
||||||
|
days, rem = divmod(seconds, 86400)
|
||||||
|
hours, rem = divmod(rem, 3600)
|
||||||
|
minutes, _ = divmod(rem, 60)
|
||||||
|
return f"{days}d {hours:02d}:{minutes:02d}"
|
||||||
|
|
||||||
|
|
||||||
|
async def build_weekly_report(cfg, docker_map: dict) -> str:
    """Assemble the weekly status report text for this host.

    Gathers uptime/load/RAM/disk stats, Docker container counts, incident
    counts for the last 24h/7d, and any active alert mutes.
    """
    host = socket.gethostname()
    uptime = int(datetime.now().timestamp() - psutil.boot_time())
    load1, load5, load15 = psutil.getloadavg()
    mem = psutil.virtual_memory()
    disk_usage, disk_mount = worst_disk_usage()
    running, total = await _docker_running_counts(docker_map)
    mutes = list_mutes()
    incidents_24 = len(read_recent(cfg, 24, limit=1000))
    incidents_7d = len(read_recent(cfg, 24 * 7, limit=2000))

    lines = [
        f"🧾 Weekly report — {host}",
        f"⏱ Uptime: {_format_uptime(uptime)}",
        f"⚙️ Load: {load1:.2f} {load5:.2f} {load15:.2f}",
        f"🧠 RAM: {mem.percent}%",
    ]
    # worst_disk_usage() may report no usable mount at all.
    lines.append(
        "💾 Disk: n/a" if disk_usage is None else f"💾 Disk: {disk_usage}% ({disk_mount})"
    )
    lines.append(f"🐳 Docker: {running}/{total} running")
    lines.append(f"📓 Incidents: 24h={incidents_24}, 7d={incidents_7d}")

    if mutes:
        lines.append("🔕 Active mutes:")
        for category, seconds_left in mutes.items():
            lines.append(f"- {category}: {max(0, seconds_left) // 60}m left")
    else:
        lines.append("🔔 Mutes: none")

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
async def weekly_reporter(cfg, bot, admin_ids: list[int], docker_map: dict):
    """Background task: send the weekly report to all admins at the
    configured weekday/time.  Returns immediately when disabled."""
    weekly_cfg = cfg.get("reports", {}).get("weekly", {})
    if not weekly_cfg.get("enabled", False):
        return
    day = weekly_cfg.get("day", "Sun")
    time_str = weekly_cfg.get("time", "08:00")
    while True:
        delay = (_next_run(day, time_str) - datetime.now()).total_seconds()
        if delay > 0:
            await asyncio.sleep(delay)
        try:
            report = await build_weekly_report(cfg, docker_map)
            for admin_id in admin_ids:
                await bot.send_message(admin_id, report)
        except Exception:
            # Best-effort delivery: never let the reporter task die.
            pass
        await asyncio.sleep(60)  # small delay to avoid tight loop if time skew
|
||||||
2
state.py
2
state.py
@@ -7,3 +7,5 @@ ARCANE_CACHE: Dict[int, dict] = {}
|
|||||||
REBOOT_PENDING: Dict[int, dict] = {}
|
REBOOT_PENDING: Dict[int, dict] = {}
|
||||||
METRICS_STORE = None
|
METRICS_STORE = None
|
||||||
NPMPLUS_TOKEN: Dict[str, object] = {}
|
NPMPLUS_TOKEN: Dict[str, object] = {}
|
||||||
|
PROC_SEARCH_PENDING: Dict[int, dict] = {}
|
||||||
|
PROC_KILL_PENDING: Dict[int, dict] = {}
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
def _cmd(cmd: str) -> str:
|
def _cmd(cmd: str) -> str:
|
||||||
@@ -82,6 +83,62 @@ def list_disks() -> list[str]:
|
|||||||
return disks
|
return disks
|
||||||
|
|
||||||
|
|
||||||
|
def list_md_arrays() -> list[str]:
    """Return sorted /dev paths of Linux software-RAID (md) arrays."""
    # Prefer /proc/mdstat: it reliably lists active md arrays
    # even when lsblk tree/filters differ across distros.
    found: set[str] = set()
    for line in _cmd("cat /proc/mdstat").splitlines():
        match = re.match(r"^\s*(md\d+)\s*:", line)
        if match:
            found.add(f"/dev/{match.group(1)}")
    if found:
        return sorted(found)

    # Fallback for environments where mdstat parsing is unavailable.
    for line in _cmd("ls -1 /dev/md* 2>/dev/null").splitlines():
        candidate = line.strip()
        if candidate and re.match(r"^/dev/md\d+$", candidate):
            found.add(candidate)
    return sorted(found)
|
||||||
|
|
||||||
|
|
||||||
|
def md_array_status(dev: str) -> str:
    """Summarize the health of one md array from /proc/mdstat.

    Args:
        dev: device path such as ``/dev/md0``.

    Returns one of: "🟢 active", "🟡 degraded", "🔴 inactive", or an
    "⚠️ ..." note when mdstat is unreadable or the array is not listed.
    """
    out = _cmd("cat /proc/mdstat")
    if not out or "ERROR:" in out:
        return "⚠️ n/a"

    name = dev.rsplit("/", 1)[-1]
    lines = out.splitlines()
    header = None
    idx = -1
    for i, line in enumerate(lines):
        s = line.strip()
        if s.startswith(f"{name} :"):
            header = s
            idx = i
            break

    if not header:
        return "⚠️ not found in /proc/mdstat"

    if "inactive" in header:
        return "🔴 inactive"

    # mdstat prints member health as e.g. [UU] or [UUU_]: 'U' = up,
    # '_' = down.  Collect the array's whole stanza (header plus the
    # indented lines up to the next blank line) and inspect the full
    # bracketed marker — the previous "[U_"/"[_U" substring test missed
    # a failed member in any non-edge position, e.g. [UU_].
    block = [header]
    for line in lines[idx + 1:]:
        if not line.strip():
            break
        block.append(line.strip())
    marker = re.search(r"\[([U_]+)\]", " ".join(block))
    if marker and "_" in marker.group(1):
        return "🟡 degraded"
    return "🟢 active"
|
||||||
|
|
||||||
|
|
||||||
def smart_health(dev: str) -> str:
|
def smart_health(dev: str) -> str:
|
||||||
out = _cmd(f"smartctl -H {dev}")
|
out = _cmd(f"smartctl -H {dev}")
|
||||||
|
|
||||||
@@ -122,10 +179,25 @@ def disk_temperature(dev: str) -> str:
|
|||||||
return "n/a"
|
return "n/a"
|
||||||
|
|
||||||
|
|
||||||
|
def smart_last_test(dev: str) -> str:
    """Return the most recent SMART self-test log entry for *dev*, or "n/a"."""
    raw = _cmd(f"smartctl -l selftest {dev}")
    if not raw or "ERROR:" in raw:
        return "n/a"

    for entry in raw.splitlines():
        if "No self-tests have been logged" in entry:
            return "no tests"
        if entry.lstrip().startswith("#"):
            # smartctl numbers entries "# 1 ...", newest first.
            return entry.strip()

    return "n/a"
|
||||||
|
|
||||||
|
|
||||||
def disks() -> str:
|
def disks() -> str:
|
||||||
disks = list_disks()
|
disks = list_disks()
|
||||||
|
md_arrays = list_md_arrays()
|
||||||
|
|
||||||
if not disks:
|
if not disks and not md_arrays:
|
||||||
return "💽 Disks\n\n❌ No disks found"
|
return "💽 Disks\n\n❌ No disks found"
|
||||||
|
|
||||||
lines = ["💽 Disks (SMART)\n"]
|
lines = ["💽 Disks (SMART)\n"]
|
||||||
@@ -144,6 +216,12 @@ def disks() -> str:
|
|||||||
|
|
||||||
lines.append(f"{icon} {d} — {health}, 🌡 {temp}")
|
lines.append(f"{icon} {d} — {health}, 🌡 {temp}")
|
||||||
|
|
||||||
|
if md_arrays:
|
||||||
|
lines.append("")
|
||||||
|
lines.append("🧱 RAID (md)")
|
||||||
|
for md in md_arrays:
|
||||||
|
lines.append(f"{md} — {md_array_status(md)}")
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
20
tests/test_config_check.py
Normal file
20
tests/test_config_check.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
import unittest
|
||||||
|
|
||||||
|
from services.config_check import validate_cfg
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigCheckTests(unittest.TestCase):
    """Validation rules for the telegram section of the config."""

    def test_admin_ids_without_admin_id_is_valid(self):
        # A config listing admin_ids but no single admin_id must validate.
        cfg = {"telegram": {"token": "x", "admin_ids": [1, 2]}}
        errors, warnings = validate_cfg(cfg)
        self.assertEqual(errors, [])
        self.assertIsInstance(warnings, list)


if __name__ == "__main__":
    unittest.main()
|
||||||
21
tests/test_disk_report.py
Normal file
21
tests/test_disk_report.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import unittest
|
||||||
|
import types
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Avoid runtime import of real app/aiogram in services.runner.
|
||||||
|
sys.modules.setdefault("app", types.SimpleNamespace(RESTIC_ENV={}))
|
||||||
|
|
||||||
|
from services.disk_report import _top_dirs_cmd
|
||||||
|
|
||||||
|
|
||||||
|
class DiskReportTests(unittest.TestCase):
    """_top_dirs_cmd must build argv directly, never via a shell."""

    def test_top_dirs_cmd_uses_exec_args_without_shell(self):
        path = "/tmp/path with spaces"
        cmd = _top_dirs_cmd(path, 5)
        self.assertEqual(cmd[:4], ["du", "-x", "-h", "-d"])
        # No shell wrapper anywhere in the command line.
        self.assertNotIn("bash", cmd)
        self.assertNotIn("-lc", cmd)
        # The path is one argv element, spaces intact — no quoting games.
        self.assertEqual(cmd[-1], path)


if __name__ == "__main__":
    unittest.main()
|
||||||
59
tests/test_queue.py
Normal file
59
tests/test_queue.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
import asyncio
import contextlib
import tempfile
import unittest

from services import runtime_state
from services import queue as queue_service


class QueueTests(unittest.IsolatedAsyncioTestCase):
    """Behavior of the background job-queue worker."""

    async def asyncSetUp(self):
        self.tmp = tempfile.TemporaryDirectory()
        runtime_state.configure(f"{self.tmp.name}/runtime.json")

        # Reset the queue module's globals so tests are isolated.
        queue_service._pending.clear()  # type: ignore[attr-defined]
        queue_service._history.clear()  # type: ignore[attr-defined]
        queue_service._stats = {  # type: ignore[attr-defined]
            "processed": 0,
            "avg_wait_sec": 0.0,
            "avg_runtime_sec": 0.0,
            "last_label": "",
            "last_finished_at": 0.0,
        }
        queue_service._cfg = {"incidents": {"enabled": True}}  # type: ignore[attr-defined]

    async def asyncTearDown(self):
        self.tmp.cleanup()

    async def test_worker_logs_failed_job_to_incidents(self):
        logged = []

        def fake_log_incident(cfg, text, category=None):
            logged.append((text, category))

        async def boom():
            raise RuntimeError("boom")

        orig = queue_service.log_incident
        queue_service.log_incident = fake_log_incident
        worker_task = asyncio.create_task(queue_service.worker())
        try:
            await queue_service.enqueue("broken-job", boom)
            await asyncio.wait_for(queue_service._queue.join(), timeout=2.0)  # type: ignore[attr-defined]
        finally:
            # Restore the monkeypatch and stop the worker even when the
            # wait times out — the original restored log_incident AFTER
            # the try/finally, so a timeout leaked the patch into other
            # tests.  (It also imported contextlib at the bottom of the
            # file; the import now lives at the top per PEP 8.)
            queue_service.log_incident = orig
            worker_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await worker_task

        self.assertEqual(queue_service._stats.get("processed"), 1)  # type: ignore[attr-defined]
        self.assertTrue(any("queue_job_failed label=broken-job" in t for t, _c in logged))
        self.assertTrue(any(c == "queue" for _t, c in logged))


if __name__ == "__main__":
    unittest.main()
|
||||||
28
tests/test_runtime_state.py
Normal file
28
tests/test_runtime_state.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from services import runtime_state
|
||||||
|
|
||||||
|
|
||||||
|
class RuntimeStateTests(unittest.TestCase):
    """Persistence behavior of the runtime_state key/value store."""

    def test_set_and_get_persist_between_loads(self):
        with tempfile.TemporaryDirectory() as tmp:
            state_file = Path(tmp) / "runtime.json"
            runtime_state.configure(str(state_file))

            runtime_state.set_state("foo", {"bar": 1})
            self.assertEqual(runtime_state.get("foo"), {"bar": 1})

            # Force a fresh in-memory state and load from disk again.
            runtime_state._STATE = {}  # type: ignore[attr-defined]
            runtime_state._LOADED = False  # type: ignore[attr-defined]
            self.assertEqual(runtime_state.get("foo"), {"bar": 1})

            # The on-disk JSON must hold the same value.
            on_disk = json.loads(state_file.read_text(encoding="utf-8"))
            self.assertEqual(on_disk.get("foo"), {"bar": 1})


if __name__ == "__main__":
    unittest.main()
|
||||||
Reference in New Issue
Block a user