Compare commits
142 Commits
9a031a8584
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| b84107463c | |||
| ee361abb99 | |||
| 2ad423fb6a | |||
| efa5dd9644 | |||
| 678332e6d0 | |||
| 7c56430f32 | |||
| b54a094185 | |||
| 6d5fb9c258 | |||
| 5099ae4fe2 | |||
| 568cd86844 | |||
| b138ee316d | |||
| fa98a96b34 | |||
| 1dba6d4a0f | |||
| b784deb02b | |||
| 5ae54618e8 | |||
| 3fc99bdcfc | |||
| c1d69adbc5 | |||
| a14fb8fccd | |||
| 4ba8f48228 | |||
| 10bf265c29 | |||
| fd179d24e8 | |||
| 2905528677 | |||
| 2b87ce04a3 | |||
| 02b8e2bb55 | |||
| f0fb2aad0e | |||
| 219776c642 | |||
| 28caa551bd | |||
| 783f4abd98 | |||
| f71c02835a | |||
| f7081b78e1 | |||
| 0fbd374823 | |||
| c3db70160c | |||
| 1b9d260530 | |||
| 040a6c96e4 | |||
| 4f6d6dd549 | |||
| 2e0bf0c6ea | |||
| 5a4234f59d | |||
| 1d24caa2a2 | |||
| c91c961134 | |||
| 75113b6182 | |||
| aa7bd85687 | |||
| ff65e15509 | |||
| 08fa95dffd | |||
| b0a4413671 | |||
| 9399be4168 | |||
| 2e35885a5e | |||
| 4d4e3767bc | |||
| b78dc3cd5c | |||
| 20cd56a8c0 | |||
| 7d251a7078 | |||
| 2ee9756d12 | |||
| 77571da4d9 | |||
| d4a19d309f | |||
| 972c8eb6a7 | |||
| ae2d085214 | |||
| 5da7125fbb | |||
| 65682ca162 | |||
| 8bcc3c6878 | |||
| ab58592523 | |||
| a98292604a | |||
| 97524b92a2 | |||
| 0a761e5799 | |||
| d242dafb9b | |||
| 7db336f2aa | |||
| b4a243e72f | |||
| 01c539fad9 | |||
| 8cec8ae53e | |||
| e36bf49f1c | |||
| a029bbfa7a | |||
| ad8a6bff69 | |||
| 64d899d971 | |||
| 8b08b5418f | |||
| 7a5e3d46cf | |||
| c31a194651 | |||
| 5e01a8d596 | |||
| fc061ece30 | |||
| 0f7f53cb27 | |||
| 857fa86e85 | |||
| ea6ad1d5b2 | |||
| e1b0f1153e | |||
| 054d1d0d50 | |||
| 200b8104a6 | |||
| e7a120657b | |||
| c34a142698 | |||
| 3df9db3bf7 | |||
| aab54d4108 | |||
| 45756636b9 | |||
| 51b24be0be | |||
| 1d7262eb78 | |||
| f7ebdfe325 | |||
| 9ced16cfbd | |||
| c8db1be2d8 | |||
| dbf9b1fd2f | |||
| 118d4bf7f2 | |||
| a7d5fb5459 | |||
| 48dc1f38ac | |||
| 4a00deadc3 | |||
| c51e2d4a59 | |||
| 4e79c401a9 | |||
| 4eb202c2ed | |||
| 4989314e2b | |||
| 4b895748d1 | |||
| ab2bc5de33 | |||
| d0c40b9e50 | |||
| ec30d09c52 | |||
| 80bd01a766 | |||
| a221393858 | |||
| 61236b9d60 | |||
| be65398b86 | |||
| c89e7259f6 | |||
| d007f6064b | |||
| 3bce99255f | |||
| 3fa7821d08 | |||
| e10d5f10ab | |||
| 515d0ebe45 | |||
| d3572c6005 | |||
| 5f3c9184b1 | |||
| 4c5c085832 | |||
| 26eb756ec8 | |||
| cd242edaee | |||
| bfcd8b4833 | |||
| 5cfb74c2e8 | |||
| 7d744f5cf4 | |||
| f66d4a482d | |||
| a222e2329a | |||
| 9d4b9620ee | |||
| 08d936ac3e | |||
| 0e6459ea40 | |||
| 83d98b5cb8 | |||
| dcd8de5f48 | |||
| 2bd6e228ac | |||
| 5d169c0b81 | |||
| dddf6bc014 | |||
| fc9ef08525 | |||
| e590b7e38f | |||
| e3a977a2ec | |||
| 6f8838b5f8 | |||
| 3da20f7df8 | |||
| 2db59b6c00 | |||
| 099387da93 | |||
| a3a942ecb7 | |||
| 6a4f29bd4b |
189
CONFIG.en.md
Normal file
189
CONFIG.en.md
Normal file
@@ -0,0 +1,189 @@
|
||||
# Configuration
|
||||
|
||||
This project uses `config.yaml`. Start from `config.example.yaml`.
|
||||
|
||||
## telegram
|
||||
|
||||
- `token` (string, required): Telegram bot token.
|
||||
- `admin_id` (int, required): Telegram user id with admin access.
|
||||
- `admin_ids` (list<int>): Optional list of admins (first is primary for alerts).
|
||||
|
||||
## paths
|
||||
|
||||
- `artifact_state` (string): JSON file for artifact state.
|
||||
- `runtime_state` (string): File for runtime state (mutes, metrics, etc.).
|
||||
- `restic_env` (string): Path to a file with RESTIC_* environment variables.
|
||||
|
||||
## thresholds
|
||||
|
||||
- `disk_warn` (int, percent): Disk usage warning threshold.
|
||||
- `load_warn` (float): Load warning threshold.
|
||||
- `high_load_warn` (float): Critical load threshold.
|
||||
|
||||
## alerts
|
||||
|
||||
- `enabled` (bool): Enable resource alerts.
|
||||
- `interval_sec` (int): Poll interval.
|
||||
- `cooldown_sec` (int): Cooldown between alerts.
|
||||
- `notify_cooldown_sec` (int): Global alert dedup cooldown (defaults to `cooldown_sec`).
|
||||
- `load_only_critical` (bool): Only send critical load alerts (no warn/OK).
|
||||
- `quiet_hours` (object): Quiet hours for non‑critical alerts.
|
||||
- `enabled` (bool): Enable quiet hours.
|
||||
- `start` (string): Start time `HH:MM` (e.g. `23:00`).
|
||||
- `end` (string): End time `HH:MM` (e.g. `08:00`).
|
||||
- `allow_critical` (bool): Allow critical alerts during quiet hours.
|
||||
- `auto_mute` (list): Per-category auto mutes by time window.
|
||||
- `category` (string): load/disk/smart/raid/ssl/docker/test.
|
||||
- `start` (string): Start `HH:MM`.
|
||||
- `end` (string): End `HH:MM` (can wrap over midnight).
|
||||
- `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
|
||||
- `notify_recovery` (bool): Send recovery notifications.
|
||||
- `smart_enabled` (bool): Enable SMART health polling.
|
||||
- `smart_interval_sec` (int): SMART poll interval.
|
||||
- `smart_cooldown_sec` (int): SMART alert cooldown.
|
||||
- `smart_temp_warn` (int): SMART temperature warning (C).
|
||||
- `raid_enabled` (bool): Enable md RAID polling (`/proc/mdstat`).
|
||||
- `raid_interval_sec` (int): RAID poll interval.
|
||||
- `raid_cooldown_sec` (int): RAID alert cooldown.
|
||||
|
||||
## disk_report
|
||||
|
||||
- `threshold` (int): Disk usage threshold for auto snapshot.
|
||||
- `cooldown_sec` (int): Cooldown between snapshots.
|
||||
- `top_dirs` (int): How many directories to show.
|
||||
- `docker_dir` (string): Path to docker data.
|
||||
- `logs_dir` (string): Path to logs.
|
||||
|
||||
## audit
|
||||
|
||||
- `enabled` (bool): Enable audit logging.
|
||||
- `path` (string): Log file path. Default `/var/server-bot/audit.log`.
|
||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||
- `backup_count` (int): How many rotated files to keep.
|
||||
|
||||
## incidents
|
||||
|
||||
- `enabled` (bool): Enable incidents logging.
|
||||
- `path` (string): Log file path. Default `/var/server-bot/incidents.log`.
|
||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||
- `backup_count` (int): How many rotated files to keep.
|
||||
|
||||
## logging
|
||||
|
||||
- `enabled` (bool): Enable bot logging.
|
||||
- `path` (string): Log file path. Default `/var/server-bot/bot.log`.
|
||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||
- `backup_count` (int): How many rotated files to keep.
|
||||
- `level` (string): Log level (`INFO`, `WARNING`, `ERROR`).
|
||||
|
||||
## safety
|
||||
|
||||
- `dry_run` (bool): If `true`, dangerous actions (upgrade/reboot/backup) are skipped.
|
||||
|
||||
## reports
|
||||
|
||||
- `weekly.enabled` (bool): Enable weekly report.
|
||||
- `weekly.day` (string): Weekday `Mon`..`Sun` (default `Sun`).
|
||||
- `weekly.time` (string): Local time `HH:MM` (default `08:00`).
|
||||
|
||||
## selftest
|
||||
|
||||
- `schedule.enabled` (bool): Enable auto self-test.
|
||||
- `schedule.time` (string): Local time `HH:MM` (default `03:30`).
|
||||
|
||||
## queue
|
||||
|
||||
- `max_pending_alert` (int): Alert if pending tasks >= this value.
|
||||
- `avg_wait_alert` (int): Alert if average wait exceeds N seconds.
|
||||
- `cooldown_sec` (int): Cooldown between queue alerts (default 300s).
|
||||
## external_checks
|
||||
|
||||
- `enabled` (bool): Enable background checks.
|
||||
- `state_path` (string): State file for uptime, default `/var/server-bot/external_checks.json`.
|
||||
- `timeout_sec` (int): Check timeout in seconds.
|
||||
- `interval_sec` (int): Background check interval.
|
||||
- `services` (list): List of checks.
|
||||
- `name` (string): Service name.
|
||||
- `type` (string): `http`, `tcp`, `ping`.
|
||||
- `url` (string): URL for `http`.
|
||||
- `host` (string): Host for `tcp`/`ping`.
|
||||
- `port` (int): Port for `tcp`.
|
||||
|
||||
## arcane
|
||||
|
||||
- `base_url` (string): Arcane API base url.
|
||||
- `api_key` (string): Arcane API key.
|
||||
- `env_id` (int): Arcane environment id.
|
||||
|
||||
## npmplus
|
||||
|
||||
Used for SSL certificate status.
|
||||
|
||||
- `base_url` (string): NPMplus API base url, for example `https://10.10.10.10:81/api`.
|
||||
- `identity` (string): Login email.
|
||||
- `secret` (string): Login password.
|
||||
- `token` (string): Optional static token (not recommended if it expires).
|
||||
- `verify_tls` (bool): Set to `false` for self-signed TLS.
|
||||
- `alerts.enabled` (bool): Enable expiry notifications.
|
||||
- `alerts.days` (list): Thresholds in days (e.g. 30/14/7/1).
|
||||
- `alerts.cooldown_sec` (int): Cooldown between identical alerts.
|
||||
- `alerts.interval_sec` (int): Check interval.
|
||||
|
||||
Token flow:
|
||||
|
||||
- First token: `POST /api/tokens` with `identity` and `secret`.
|
||||
- Refresh: `GET /api/tokens` using the cached token.
|
||||
|
||||
## gitea
|
||||
|
||||
- `base_url` (string): Gitea base url, for example `http://localhost:3000`.
|
||||
- `token` (string): Optional API token.
|
||||
- `verify_tls` (bool): Set to `false` for self-signed TLS.
|
||||
|
||||
## openwrt
|
||||
|
||||
- `host` (string): Router address, for example `10.10.10.1`.
|
||||
- `user` (string): SSH user (usually `root`).
|
||||
- `port` (int): SSH port (usually `22`).
|
||||
- `identity_file` (string): Path to SSH key (optional).
|
||||
- `strict_host_key_checking` (bool): Set to `false` to skip key confirmation.
|
||||
- `timeout_sec` (int): SSH request timeout.
|
||||
|
||||
## security
|
||||
|
||||
- `reboot_password` (string): Password required before reboot.
|
||||
|
||||
## docker
|
||||
|
||||
- `autodiscovery` (bool): Discover containers by name/label.
|
||||
- `watchdog` (bool): Enable container watchdog notifications.
|
||||
- `label` (string): Optional label filter `key=value`.
|
||||
- `match` (list): Name substrings used for discovery.
|
||||
- `aliases` (map): Alias -> real container name.
|
||||
- `containers` (map): Explicit container list (legacy modules). Each item can define:
|
||||
- `name` (string): Container name.
|
||||
- `url` (string): Health URL for the URLs check.
|
||||
|
||||
Example:
|
||||
|
||||
```yaml
|
||||
telegram:
|
||||
token: "YOUR_TELEGRAM_BOT_TOKEN"
|
||||
admin_id: 123456789
|
||||
|
||||
paths:
|
||||
artifact_state: "/opt/tg-bot/state.json"
|
||||
restic_env: "/etc/restic/restic.env"
|
||||
|
||||
audit:
|
||||
enabled: true
|
||||
path: "/var/server-bot/audit.log"
|
||||
rotate_when: "W0"
|
||||
backup_count: 8
|
||||
|
||||
npmplus:
|
||||
base_url: "https://10.10.10.10:81/api"
|
||||
identity: "your@email.com"
|
||||
secret: "yourPassword"
|
||||
verify_tls: false
|
||||
```
|
||||
190
CONFIG.md
Normal file
190
CONFIG.md
Normal file
@@ -0,0 +1,190 @@
|
||||
# Конфигурация
|
||||
|
||||
Проект использует `config.yaml`. Начни с `config.example.yaml`.
|
||||
|
||||
## telegram
|
||||
|
||||
- `token` (string, обяз.): токен бота.
|
||||
- `admin_id` (int, обяз.): Telegram user id администратора.
|
||||
- `admin_ids` (list<int>): список админов (первый используется как основной для уведомлений).
|
||||
|
||||
## paths
|
||||
|
||||
- `artifact_state` (string): JSON файл состояния артефактов.
|
||||
- `runtime_state` (string): файл с runtime-состоянием (мьюты, метрики и т.п.).
|
||||
- `restic_env` (string): путь к файлу с RESTIC_* переменными.
|
||||
|
||||
## thresholds
|
||||
|
||||
- `disk_warn` (int, %): порог предупреждения по диску.
|
||||
- `load_warn` (float): порог предупреждения по нагрузке.
|
||||
- `high_load_warn` (float): порог для критической нагрузки.
|
||||
|
||||
## alerts
|
||||
|
||||
- `enabled` (bool): включить алерты.
|
||||
- `interval_sec` (int): интервал опроса.
|
||||
- `cooldown_sec` (int): кулдаун между алертами.
|
||||
- `notify_cooldown_sec` (int): глобальный дедуп алертов (по умолчанию `cooldown_sec`).
|
||||
- `load_only_critical` (bool): слать только критичные алерты по нагрузке (без warn/OK).
|
||||
- `quiet_hours` (object): тихие часы для не‑критичных уведомлений.
|
||||
- `enabled` (bool): включить тихие часы.
|
||||
- `start` (string): начало, формат `HH:MM` (например `23:00`).
|
||||
- `end` (string): конец, формат `HH:MM` (например `08:00`).
|
||||
- `allow_critical` (bool): слать критичные алерты в тишину.
|
||||
- `auto_mute` (list): авто‑мьюты по категориям и времени.
|
||||
- `category` (string): load/disk/smart/raid/ssl/docker/test.
|
||||
- `start` (string): начало `HH:MM`.
|
||||
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
|
||||
- `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
|
||||
- `notify_recovery` (bool): уведомлять о восстановлении.
|
||||
- `smart_enabled` (bool): SMART проверки.
|
||||
- `smart_interval_sec` (int): интервал SMART.
|
||||
- `smart_cooldown_sec` (int): кулдаун SMART.
|
||||
- `smart_temp_warn` (int): порог температуры (C).
|
||||
- `raid_enabled` (bool): RAID проверки (`/proc/mdstat`).
|
||||
- `raid_interval_sec` (int): интервал RAID.
|
||||
- `raid_cooldown_sec` (int): кулдаун RAID алертов.
|
||||
|
||||
## disk_report
|
||||
|
||||
- `threshold` (int): порог диска для авто‑снимка.
|
||||
- `cooldown_sec` (int): кулдаун между снимками.
|
||||
- `top_dirs` (int): сколько директорий показывать.
|
||||
- `docker_dir` (string): путь к docker данным.
|
||||
- `logs_dir` (string): путь к логам.
|
||||
|
||||
## audit
|
||||
|
||||
- `enabled` (bool): включить аудит.
|
||||
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/audit.log`.
|
||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||
- `backup_count` (int): сколько файлов хранить.
|
||||
|
||||
## incidents
|
||||
|
||||
- `enabled` (bool): включить лог инцидентов.
|
||||
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/incidents.log`.
|
||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||
- `backup_count` (int): сколько файлов хранить.
|
||||
|
||||
## logging
|
||||
|
||||
- `enabled` (bool): включить лог бота.
|
||||
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/bot.log`.
|
||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||
- `backup_count` (int): сколько файлов хранить.
|
||||
- `level` (string): уровень логирования (`INFO`, `WARNING`, `ERROR`).
|
||||
|
||||
## safety
|
||||
|
||||
- `dry_run` (bool): если `true`, опасные действия (upgrade/reboot/backup) не выполняются.
|
||||
|
||||
## reports
|
||||
|
||||
- `weekly.enabled` (bool): включить еженедельный отчёт.
|
||||
- `weekly.day` (string): день недели (`Mon`..`Sun`), по умолчанию `Sun`.
|
||||
- `weekly.time` (string): локальное время `HH:MM`, по умолчанию `08:00`.
|
||||
|
||||
## selftest
|
||||
|
||||
- `schedule.enabled` (bool): включить авто self-test.
|
||||
- `schedule.time` (string): локальное время `HH:MM`, по умолчанию `03:30`.
|
||||
|
||||
## queue
|
||||
|
||||
- `max_pending_alert` (int): алерт, если задач в очереди >= этому значению.
|
||||
- `avg_wait_alert` (int): алерт, если среднее ожидание превышает N секунд.
|
||||
- `cooldown_sec` (int): кулдаун между алертами очереди, по умолчанию 300с.
|
||||
|
||||
## external_checks
|
||||
|
||||
- `enabled` (bool): включить фоновые проверки.
|
||||
- `state_path` (string): файл состояния для аптайма, по умолчанию `/var/server-bot/external_checks.json`.
|
||||
- `timeout_sec` (int): таймаут проверки в секундах.
|
||||
- `interval_sec` (int): интервал фоновых проверок.
|
||||
- `services` (list): список проверок.
|
||||
- `name` (string): название сервиса.
|
||||
- `type` (string): `http`, `tcp`, `ping`.
|
||||
- `url` (string): URL для `http`.
|
||||
- `host` (string): хост для `tcp`/`ping`.
|
||||
- `port` (int): порт для `tcp`.
|
||||
|
||||
## arcane
|
||||
|
||||
- `base_url` (string): base url API Arcane.
|
||||
- `api_key` (string): ключ API Arcane.
|
||||
- `env_id` (int): ID окружения Arcane.
|
||||
|
||||
## npmplus
|
||||
|
||||
Используется для статуса SSL сертификатов.
|
||||
|
||||
- `base_url` (string): base url API, например `https://10.10.10.10:81/api`.
|
||||
- `identity` (string): email логин.
|
||||
- `secret` (string): пароль.
|
||||
- `token` (string): опциональный статический токен (не рекомендуется, если токен истекает).
|
||||
- `verify_tls` (bool): `false` для self-signed TLS.
|
||||
- `alerts.enabled` (bool): включить уведомления по истечению.
|
||||
- `alerts.days` (list): пороги в днях (например 30/14/7/1).
|
||||
- `alerts.cooldown_sec` (int): кулдаун между одинаковыми алертами.
|
||||
- `alerts.interval_sec` (int): интервал проверки.
|
||||
|
||||
Логика токена:
|
||||
|
||||
- первый токен: `POST /api/tokens` с `identity` и `secret`.
|
||||
- refresh: `GET /api/tokens` с текущим токеном.
|
||||
|
||||
## gitea
|
||||
|
||||
- `base_url` (string): base url Gitea, например `http://localhost:3000`.
|
||||
- `token` (string): опциональный API токен.
|
||||
- `verify_tls` (bool): `false` для self-signed TLS.
|
||||
|
||||
## openwrt
|
||||
|
||||
- `host` (string): адрес роутера, например `10.10.10.1`.
|
||||
- `user` (string): SSH пользователь (обычно `root`).
|
||||
- `port` (int): SSH порт (обычно `22`).
|
||||
- `identity_file` (string): путь к SSH ключу (опционально).
|
||||
- `strict_host_key_checking` (bool): `false` чтобы не спрашивать подтверждение ключа.
|
||||
- `timeout_sec` (int): таймаут SSH запроса.
|
||||
|
||||
## security
|
||||
|
||||
- `reboot_password` (string): пароль для подтверждения reboot.
|
||||
|
||||
## docker
|
||||
|
||||
- `autodiscovery` (bool): автодискавери по имени/label.
|
||||
- `watchdog` (bool): уведомления о контейнерах.
|
||||
- `label` (string): фильтр label `key=value`.
|
||||
- `match` (list): подстроки для поиска контейнеров.
|
||||
- `aliases` (map): alias -> реальное имя.
|
||||
- `containers` (map): явный список (legacy). Можно задавать:
|
||||
  - `name` (string): имя контейнера.
|
||||
  - `url` (string): health URL для проверки URLs.
|
||||
|
||||
Пример:
|
||||
|
||||
```yaml
|
||||
telegram:
|
||||
token: "YOUR_TELEGRAM_BOT_TOKEN"
|
||||
admin_id: 123456789
|
||||
|
||||
paths:
|
||||
artifact_state: "/opt/tg-bot/state.json"
|
||||
restic_env: "/etc/restic/restic.env"
|
||||
|
||||
audit:
|
||||
enabled: true
|
||||
path: "/var/server-bot/audit.log"
|
||||
rotate_when: "W0"
|
||||
backup_count: 8
|
||||
|
||||
npmplus:
|
||||
base_url: "https://10.10.10.10:81/api"
|
||||
identity: "your@email.com"
|
||||
secret: "yourPassword"
|
||||
verify_tls: false
|
||||
```
|
||||
674
LICENSE
Normal file
674
LICENSE
Normal file
@@ -0,0 +1,674 @@
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
||||
73
README.en.md
Normal file
73
README.en.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# tg-admin-bot
|
||||
|
||||
Telegram admin bot for Linux servers. Provides quick status checks, backup controls, and ops actions from a chat.
|
||||
|
||||
## Features
|
||||
|
||||
- Docker: status, restart, logs (tail, since, filter).
|
||||
- Arcane: list projects, refresh, up/down, restart.
|
||||
- Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report.
|
||||
- System: disks, security, URL health checks, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus).
|
||||
- Alerts: disk/load/SMART with cooldown and quiet hours.
|
||||
- Audit log: all button presses and messages (weekly rotation).
|
||||
- Logs: bot log rotation and incidents.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Linux host.
|
||||
- Python 3.11+ (tested with 3.13).
|
||||
- System tools as needed:
|
||||
- docker
|
||||
- restic
|
||||
- smartctl (smartmontools)
|
||||
- sudo access for reboot/upgrade/backup scripts
|
||||
- systemd (for timers/status, optional but recommended)
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r req.txt
|
||||
```
|
||||
|
||||
## Configure
|
||||
|
||||
1. Copy the example config:
|
||||
|
||||
```bash
|
||||
cp config.example.yaml config.yaml
|
||||
```
|
||||
|
||||
2. Edit `config.yaml` and set at least:
|
||||
|
||||
- `telegram.token`
|
||||
- `telegram.admin_id`
|
||||
|
||||
3. Optional:
|
||||
|
||||
- Restic env file path (`paths.restic_env`).
|
||||
- Docker autodiscovery or explicit `docker.containers`.
|
||||
- Arcane and NPMplus API settings.
|
||||
- Audit log path and rotation.
|
||||
|
||||
See `CONFIG.en.md` for full details.
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
python bot.py
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
Copyright (C) 2026 benya
|
||||
|
||||
GNU GPL v3.0. Full text in `LICENSE`.
|
||||
|
||||
## Notes
|
||||
|
||||
- For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`.
|
||||
- The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions.
|
||||
- Enable `safety.dry_run` if you want a safe mode without actions.
|
||||
- Audit log default path is `/var/server-bot/audit.log`.
|
||||
73
README.md
Normal file
73
README.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# tg-admin-bot
|
||||
|
||||
Telegram-бот администратора для Linux-серверов. Даёт быстрый доступ к статусам, бэкапам и операциям через чат.
|
||||
|
||||
## Возможности
|
||||
|
||||
- Docker: статус, рестарт, логи (tail, since, фильтр).
|
||||
- Arcane: список проектов, refresh, up/down, restart.
|
||||
- Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report.
|
||||
- Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus).
|
||||
- Алерты: диск/нагрузка/SMART с cooldown и quiet hours.
|
||||
- Аудит: все нажатия и сообщения (ротация раз в неделю).
|
||||
- Логи: ротация логов бота и инциденты.
|
||||
|
||||
## Требования
|
||||
|
||||
- Linux-хост.
|
||||
- Python 3.11+ (проверено на 3.13).
|
||||
- Системные утилиты по необходимости:
|
||||
- docker
|
||||
- restic
|
||||
- smartctl (smartmontools)
|
||||
- sudo для reboot/upgrade/backup скриптов
|
||||
- systemd (для таймеров/статуса, желательно)
|
||||
|
||||
## Установка
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r req.txt
|
||||
```
|
||||
|
||||
## Настройка
|
||||
|
||||
1. Скопировать пример:
|
||||
|
||||
```bash
|
||||
cp config.example.yaml config.yaml
|
||||
```
|
||||
|
||||
2. Заполнить минимум:
|
||||
|
||||
- `telegram.token`
|
||||
- `telegram.admin_id`
|
||||
|
||||
3. Опционально:
|
||||
|
||||
- путь к restic env (`paths.restic_env`)
|
||||
- docker autodiscovery или явный список `docker.containers`
|
||||
- Arcane и NPMplus API
|
||||
- аудит логов
|
||||
|
||||
Подробности в `CONFIG.md`.
|
||||
|
||||
## Запуск
|
||||
|
||||
```bash
|
||||
python bot.py
|
||||
```
|
||||
|
||||
## Лицензия
|
||||
|
||||
Copyright (C) 2026 benya
|
||||
|
||||
GNU GPL v3.0. Полный текст в `LICENSE`.
|
||||
|
||||
## Примечания
|
||||
|
||||
- Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`.
|
||||
- Бот использует `sudo` для части операций — настрой права.
|
||||
- Включи `safety.dry_run`, если хочешь безопасный режим без действий.
|
||||
- Аудит по умолчанию пишется в `/var/server-bot/audit.log`.
|
||||
15
app.py
15
app.py
@@ -1,13 +1,22 @@
|
||||
from aiogram import Bot, Dispatcher
|
||||
from config import load_cfg, load_env
|
||||
from services import runtime_state
|
||||
|
||||
cfg = load_cfg()
|
||||
|
||||
TOKEN = cfg["telegram"]["token"]
|
||||
ADMIN_ID = cfg["telegram"]["admin_id"]
|
||||
admin_ids_cfg = cfg["telegram"].get("admin_ids")
|
||||
if isinstance(admin_ids_cfg, list) and admin_ids_cfg:
|
||||
ADMIN_IDS = [int(x) for x in admin_ids_cfg]
|
||||
ADMIN_ID = ADMIN_IDS[0]
|
||||
else:
|
||||
ADMIN_ID = int(cfg["telegram"]["admin_id"])
|
||||
ADMIN_IDS = [ADMIN_ID]
|
||||
|
||||
ARTIFACT_STATE = cfg["paths"]["artifact_state"]
|
||||
RESTIC_ENV = load_env(cfg["paths"].get("restic_env", "/etc/restic/restic.env"))
|
||||
paths_cfg = cfg.get("paths", {})
|
||||
runtime_state.configure(paths_cfg.get("runtime_state", "/var/server-bot/runtime.json"))
|
||||
ARTIFACT_STATE = paths_cfg.get("artifact_state", "/opt/tg-bot/state.json")
|
||||
RESTIC_ENV = load_env(paths_cfg.get("restic_env", "/etc/restic/restic.env"))
|
||||
|
||||
DISK_WARN = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
||||
LOAD_WARN = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
||||
|
||||
6
auth.py
6
auth.py
@@ -1,10 +1,10 @@
|
||||
from aiogram.types import Message, CallbackQuery
|
||||
from app import ADMIN_ID
|
||||
from app import ADMIN_IDS
|
||||
|
||||
|
||||
def is_admin_msg(msg: Message) -> bool:
|
||||
return msg.from_user and msg.from_user.id == ADMIN_ID
|
||||
return msg.from_user and msg.from_user.id in ADMIN_IDS
|
||||
|
||||
|
||||
def is_admin_cb(cb: CallbackQuery) -> bool:
|
||||
return cb.from_user and cb.from_user.id == ADMIN_ID
|
||||
return cb.from_user and cb.from_user.id in ADMIN_IDS
|
||||
|
||||
@@ -1,26 +1,152 @@
|
||||
telegram:
|
||||
token: "YOUR_TELEGRAM_BOT_TOKEN"
|
||||
admin_id: 123456789
|
||||
# Optional list of admins (first is primary for alerts)
|
||||
admin_ids:
|
||||
- 123456789
|
||||
|
||||
paths:
|
||||
# JSON state file for artifacts
|
||||
artifact_state: "/opt/tg-bot/state.json"
|
||||
runtime_state: "/var/server-bot/runtime.json"
|
||||
# Optional env file with RESTIC_* variables
|
||||
restic_env: "/etc/restic/restic.env"
|
||||
|
||||
thresholds:
|
||||
disk_warn: 80
|
||||
load_warn: 2.0
|
||||
high_load_warn: 3.0
|
||||
|
||||
alerts:
|
||||
enabled: true
|
||||
interval_sec: 60
|
||||
cooldown_sec: 900
|
||||
# Optional global dedup cooldown for notify() calls
|
||||
notify_cooldown_sec: 900
|
||||
# If true, only critical load alerts are sent (no warn/OK)
|
||||
load_only_critical: false
|
||||
# Optional auto-mute windows per category
|
||||
auto_mute:
|
||||
- category: "load"
|
||||
start: "23:00"
|
||||
end: "08:00"
|
||||
# Auto-mute load when critical load fires (seconds)
|
||||
auto_mute_on_high_load_sec: 600
|
||||
quiet_hours:
|
||||
enabled: false
|
||||
start: "23:00"
|
||||
end: "08:00"
|
||||
# Allow critical alerts during quiet hours
|
||||
allow_critical: true
|
||||
notify_recovery: true
|
||||
smart_enabled: true
|
||||
smart_interval_sec: 3600
|
||||
smart_cooldown_sec: 21600
|
||||
smart_temp_warn: 50
|
||||
raid_enabled: true
|
||||
raid_interval_sec: 300
|
||||
raid_cooldown_sec: 1800
|
||||
|
||||
disk_report:
|
||||
threshold: 90
|
||||
cooldown_sec: 21600
|
||||
top_dirs: 8
|
||||
docker_dir: "/var/lib/docker"
|
||||
logs_dir: "/var/log"
|
||||
|
||||
audit:
|
||||
enabled: true
|
||||
path: "/var/server-bot/audit.log"
|
||||
rotate_when: "W0"
|
||||
backup_count: 8
|
||||
|
||||
incidents:
|
||||
enabled: true
|
||||
path: "/var/server-bot/incidents.log"
|
||||
rotate_when: "W0"
|
||||
backup_count: 8
|
||||
|
||||
logging:
|
||||
enabled: true
|
||||
path: "/var/server-bot/bot.log"
|
||||
rotate_when: "W0"
|
||||
backup_count: 8
|
||||
level: "INFO"
|
||||
|
||||
safety:
|
||||
# If true, dangerous actions will be skipped
|
||||
dry_run: false
|
||||
|
||||
reports:
|
||||
weekly:
|
||||
enabled: false
|
||||
day: "Sun" # Mon/Tue/Wed/Thu/Fri/Sat/Sun
|
||||
time: "08:00" # HH:MM server local time
|
||||
|
||||
selftest:
|
||||
schedule:
|
||||
enabled: false
|
||||
time: "03:30"
|
||||
|
||||
queue:
|
||||
max_pending_alert: 5
|
||||
avg_wait_alert: 120
|
||||
cooldown_sec: 300
|
||||
|
||||
external_checks:
|
||||
enabled: true
|
||||
state_path: "/var/server-bot/external_checks.json"
|
||||
timeout_sec: 5
|
||||
interval_sec: 300
|
||||
services:
|
||||
- name: "example-site"
|
||||
type: "http"
|
||||
url: "https://example.com"
|
||||
- name: "example-ssh"
|
||||
type: "tcp"
|
||||
host: "example.com"
|
||||
port: 22
|
||||
|
||||
arcane:
|
||||
base_url: "http://localhost:3552"
|
||||
api_key: "arc_..."
|
||||
env_id: 0
|
||||
|
||||
npmplus:
|
||||
base_url: "https://10.10.10.10:81/api"
|
||||
identity: "your@email.com"
|
||||
secret: "yourPassword"
|
||||
# Optional static token (not recommended if it expires)
|
||||
token: ""
|
||||
verify_tls: true
|
||||
alerts:
|
||||
enabled: true
|
||||
days:
|
||||
- 30
|
||||
- 14
|
||||
- 7
|
||||
- 1
|
||||
cooldown_sec: 86400
|
||||
interval_sec: 3600
|
||||
|
||||
gitea:
|
||||
base_url: "http://localhost:3000"
|
||||
# Optional API token for private instances
|
||||
token: ""
|
||||
verify_tls: true
|
||||
|
||||
openwrt:
|
||||
host: "10.10.10.1"
|
||||
user: "root"
|
||||
port: 22
|
||||
# Optional identity file for SSH
|
||||
identity_file: ""
|
||||
# Disable strict host key checking for auto-accept
|
||||
strict_host_key_checking: false
|
||||
timeout_sec: 8
|
||||
|
||||
security:
|
||||
reboot_password: "CHANGE_ME"
|
||||
|
||||
docker:
|
||||
# If true, discover containers by name/label
|
||||
|
||||
9
deploy.sh
Normal file
9
deploy.sh
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/usr/bin/env bash
# Deploy the Telegram bot: pull the latest commit on the server and restart
# the systemd service. Runs entirely over SSH from the operator's machine.
set -euo pipefail

SSH_HOST="root@10.10.10.10"
SSH_PORT="1090"
APP_DIR="/opt/tg-bot"

# $SSH_PORT/$SSH_HOST/$APP_DIR expand locally; the escaped quotes protect
# APP_DIR on the remote side. --ff-only: fail rather than merge on the server.
ssh -p "$SSH_PORT" "$SSH_HOST" \
    "cd \"$APP_DIR\" && git pull --ff-only && systemctl restart tg-bot"
|
||||
162
handlers/alerts_admin.py
Normal file
162
handlers/alerts_admin.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from aiogram import F
|
||||
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||
from app import dp, bot, cfg, ADMIN_ID
|
||||
from auth import is_admin_msg
|
||||
from services.alert_mute import set_mute, clear_mute, list_mutes
|
||||
from services.incidents import read_recent, log_incident
|
||||
from services.notify import notify
|
||||
|
||||
|
||||
# Usage text for the /alerts command; sent verbatim when the command is
# invoked without arguments or with an unknown action.
HELP_TEXT = (
    "Alerts:\n"
    "/alerts test <critical|warn|info> - send test alert\n"
    "/alerts mute <category> <minutes> - mute alerts for category\n"
    "/alerts unmute <category> - unmute category\n"
    "/alerts list - show active mutes\n"
    "/alerts recent [hours] - show incidents log (default 24h)\n"
    "Categories: load, disk, smart, raid, ssl, docker, test\n"
)
|
||||
|
||||
|
||||
def _dispatch(msg: Message, action: str, args: list[str]):
    """Pack an action and its argument list into a plain dict (msg is unused)."""
    return dict(action=action, args=args)
|
||||
|
||||
|
||||
async def _handle_alerts(msg: Message, action: str, args: list[str]):
    """Execute one /alerts sub-command.

    action: one of "test", "mute", "unmute", "list"/"mutes", "recent";
            anything else falls through to the help text.
    args:   the remaining whitespace-split tokens after the action.
    """
    if action == "test":
        # Default to "info" for a missing or unrecognized level.
        level = args[0].lower() if args else "info"
        if level not in ("critical", "warn", "info"):
            level = "info"
        # Timestamped key so repeated test alerts get distinct identities.
        key = f"test:{level}:{int(time.time())}"
        await notify(bot, msg.chat.id, f"[TEST] {level.upper()} alert", level=level, key=key, category="test")
        await msg.answer(f"Sent test alert: {level}")
        log_incident(cfg, f"alert_test level={level} by {msg.from_user.id}", category="test")
        return

    if action == "mute":
        if len(args) < 1:
            await msg.answer("Usage: /alerts mute <category> <minutes>")
            return
        category = args[0].lower()
        # Duration defaults to 60 minutes when absent or non-numeric,
        # and is clamped to at least 1 minute.
        minutes = 60
        if len(args) >= 2:
            try:
                minutes = max(1, int(args[1]))
            except ValueError:
                minutes = 60
        until = set_mute(category, minutes * 60)
        # `until` is treated as a UTC epoch timestamp and shown in local time.
        dt = datetime.fromtimestamp(until, tz=timezone.utc).astimezone()
        await msg.answer(f"🔕 Muted {category} for {minutes}m (until {dt:%Y-%m-%d %H:%M:%S})")
        log_incident(cfg, f"alert_mute category={category} minutes={minutes} by {msg.from_user.id}", category=category)
        return

    if action == "unmute":
        if len(args) < 1:
            await msg.answer("Usage: /alerts unmute <category>")
            return
        category = args[0].lower()
        clear_mute(category)
        await msg.answer(f"🔔 Unmuted {category}")
        log_incident(cfg, f"alert_unmute category={category} by {msg.from_user.id}", category=category)
        return

    if action in ("list", "mutes"):
        mutes = list_mutes()
        if not mutes:
            await msg.answer("🔔 No active mutes")
            return
        lines = ["🔕 Active mutes:"]
        for cat, secs in mutes.items():
            # secs is remaining time in seconds; clamp negatives to zero.
            mins = max(0, secs) // 60
            lines.append(f"- {cat}: {mins}m left")
        await msg.answer("\n".join(lines))
        return

    if action == "recent":
        # Look-back window in hours; defaults to 24 on bad/missing input.
        hours = 24
        if args:
            try:
                hours = max(1, int(args[0]))
            except ValueError:
                hours = 24
        rows = read_recent(cfg, hours, limit=50)
        if not rows:
            await msg.answer(f"No incidents in last {hours}h")
            return
        await msg.answer("🧾 Incidents:\n" + "\n".join(rows))
        return

    # Unknown action: show usage help.
    await msg.answer(HELP_TEXT)
|
||||
|
||||
|
||||
# Inline keyboard offered with the /alerts help text: one-tap shortcuts that
# emit "alerts:<action>[:<arg>...]" callback data (routed by alerts_cb).
ALERTS_KB = InlineKeyboardMarkup(
    inline_keyboard=[
        [
            InlineKeyboardButton(text="List", callback_data="alerts:list"),
            InlineKeyboardButton(text="Recent 24h", callback_data="alerts:recent:24"),
        ],
        [
            InlineKeyboardButton(text="Mute load 60m", callback_data="alerts:mute:load:60"),
            InlineKeyboardButton(text="Unmute load", callback_data="alerts:unmute:load"),
        ],
        [
            InlineKeyboardButton(text="Test CRIT", callback_data="alerts:test:critical"),
            InlineKeyboardButton(text="Test WARN", callback_data="alerts:test:warn"),
            InlineKeyboardButton(text="Test INFO", callback_data="alerts:test:info"),
        ],
    ]
)
|
||||
|
||||
|
||||
# FIX: the pattern previously used "\\s" inside a *raw* string, i.e. the
# regex `\\s` (a literal backslash followed by "s"), so "/alerts <action>"
# never matched and only the bare "/alerts" got through.
@dp.message(F.text.regexp(r"^/alerts(\s|$)"))
async def alerts_cmd(msg: Message):
    """Entry point for /alerts.

    Without arguments shows the help text plus the inline shortcut keyboard;
    otherwise dispatches "/alerts <action> [args...]" to _handle_alerts.
    Silently ignores non-admin senders.
    """
    if not is_admin_msg(msg):
        return

    parts = msg.text.split()
    if len(parts) < 2:
        await msg.answer(HELP_TEXT, reply_markup=ALERTS_KB)
        return

    action = parts[1].lower()
    args = parts[2:]
    await _handle_alerts(msg, action, args)
|
||||
|
||||
|
||||
@dp.message(F.text == "/alerts_list")
async def alerts_list(msg: Message):
    """Shortcut command: show the active alert mutes."""
    if is_admin_msg(msg):
        await _handle_alerts(msg, "list", [])
|
||||
|
||||
|
||||
@dp.message(F.text == "/alerts_recent")
async def alerts_recent(msg: Message):
    """Shortcut command: show incidents from the last 24 hours."""
    if is_admin_msg(msg):
        await _handle_alerts(msg, "recent", ["24"])
|
||||
|
||||
|
||||
@dp.message(F.text == "/alerts_mute_load")
async def alerts_mute_load(msg: Message):
    """Shortcut command: mute the "load" category for 60 minutes."""
    if is_admin_msg(msg):
        await _handle_alerts(msg, "mute", ["load", "60"])
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("alerts:"))
async def alerts_cb(cb: CallbackQuery):
    """Route inline alert-menu buttons (data format: alerts:action[:arg...])."""
    if cb.from_user.id != ADMIN_ID:
        # Non-admin press: just acknowledge so the spinner stops.
        await cb.answer()
        return
    _, *rest = cb.data.split(":")
    if not rest:
        await cb.answer()
        return
    await _handle_alerts(cb.message, rest[0], rest[1:])
    await cb.answer()
|
||||
290
handlers/arcane.py
Normal file
290
handlers/arcane.py
Normal file
@@ -0,0 +1,290 @@
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from aiogram import F
|
||||
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
||||
from app import dp, cfg, ADMIN_IDS
|
||||
from auth import is_admin_msg
|
||||
from keyboards import docker_kb, arcane_kb
|
||||
from services.arcane import list_projects, restart_project, set_project_state, get_project_details
|
||||
from state import ARCANE_CACHE
|
||||
|
||||
|
||||
def _arcane_cfg():
    """Return (base_url, api_key, env_id) from the `arcane` config section."""
    section = cfg.get("arcane", {})
    return (
        section.get("base_url"),
        section.get("api_key"),
        int(section.get("env_id", 0)),
    )
|
||||
|
||||
|
||||
def _arcane_kb(page: int, total_pages: int, items: list[dict]) -> InlineKeyboardMarkup:
    """Build the inline keyboard for one page of Arcane projects.

    One row per project (restart / details / deploy-status / start-or-stop),
    plus a navigation row with Prev / Refresh / Next as applicable.
    """
    keyboard_rows = []
    for project in items:
        project_id = project.get("id", "")
        if not project_id:
            # Cannot build callback data without an id.
            continue
        title = project.get("name", "?")
        is_running = project.get("status", "unknown") == "running"
        # Offer the opposite of the current state as the toggle action.
        toggle_action = "down" if is_running else "up"
        toggle_icon = "⏹" if is_running else "▶️"
        keyboard_rows.append([
            InlineKeyboardButton(text=f"🔄 {title}", callback_data=f"arcane:restart:{project_id}"),
            InlineKeyboardButton(text="ℹ️", callback_data=f"arcane:details:{project_id}"),
            InlineKeyboardButton(text="📦", callback_data=f"arcane:deploy:{project_id}"),
            InlineKeyboardButton(text=toggle_icon, callback_data=f"arcane:{toggle_action}:{project_id}"),
        ])

    nav_row = []
    if page > 0:
        nav_row.append(InlineKeyboardButton(text="⬅️ Prev", callback_data=f"arcane:page:{page - 1}"))
    nav_row.append(InlineKeyboardButton(text="🔄 Refresh", callback_data="arcane:refresh"))
    if page < total_pages - 1:
        nav_row.append(InlineKeyboardButton(text="Next ➡️", callback_data=f"arcane:page:{page + 1}"))
    keyboard_rows.append(nav_row)

    return InlineKeyboardMarkup(inline_keyboard=keyboard_rows)
|
||||
|
||||
|
||||
def _render_arcane_page(items: list[dict], page: int, page_size: int, ts: str) -> tuple[str, InlineKeyboardMarkup]:
    """Render one page of the Arcane project list.

    Clamps *page* into range, slices the visible window, and returns the
    message text plus the matching inline keyboard.
    """
    total_pages = max(1, -(-len(items) // page_size))  # ceiling division
    page = min(max(page, 0), total_pages - 1)
    offset = page * page_size
    visible = items[offset:offset + page_size]

    body = [f"🧰 Arcane projects на {ts} (page {page+1}/{total_pages})\n"]
    for project in visible:
        state = project.get("status", "unknown")
        bullet = "🟢" if state == "running" else "🟡"
        title = project.get("name", "?")
        up = project.get("runningCount", 0)
        total = project.get("serviceCount", 0)
        body.append(f"{bullet} {title}: {state} ({up}/{total})")

    return "\n".join(body), _arcane_kb(page, total_pages, visible)
|
||||
|
||||
|
||||
async def cmd_arcane_projects(msg: Message, *, edit: bool, page: int = 0):
    """Show the Arcane project list.

    edit: True to edit `msg` in place (inline refresh), False to send a new
          message. Editing falls back to answering when edit_text raises.
    page: zero-based page index into the project list.
    """
    base_url, api_key, env_id = _arcane_cfg()
    if not base_url or not api_key:
        await msg.answer("⚠️ Arcane config missing", reply_markup=docker_kb)
        return

    # Immediate placeholder while the (slow) listing runs in the background.
    if edit:
        try:
            await msg.edit_text("⏳ Arcane projects…")
        except Exception:
            await msg.answer("⏳ Arcane projects…", reply_markup=arcane_kb)
    else:
        await msg.answer("⏳ Arcane projects…", reply_markup=arcane_kb)

    async def worker():
        # list_projects is blocking (HTTP) — run it off the event loop.
        ok, info, items = await asyncio.to_thread(list_projects, base_url, api_key, env_id)
        if not ok:
            await msg.answer(f"❌ Arcane error: {info}", reply_markup=arcane_kb)
            return

        ts = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
        # Cache per chat so the inline pager can re-render without refetching.
        ARCANE_CACHE[msg.chat.id] = {
            "items": items,
            "page_size": 4,
            "ts": ts,
        }
        text, kb = _render_arcane_page(items, page, 4, ts)
        if edit:
            try:
                await msg.edit_text(text, reply_markup=kb)
            except Exception:
                await msg.answer(text, reply_markup=kb)
        else:
            await msg.answer(text, reply_markup=kb)

    # NOTE(review): the task reference is not retained; relies on the loop
    # keeping the task alive until completion — confirm this is intended.
    asyncio.create_task(worker())
|
||||
|
||||
|
||||
@dp.message(F.text == "🧰 Arcane")
async def arcane_menu(msg: Message):
    """Reply-keyboard entry point: show the Arcane project list."""
    if not is_admin_msg(msg):
        return
    await cmd_arcane_projects(msg, edit=False)
|
||||
|
||||
|
||||
@dp.message(F.text == "🔄 Refresh")
async def arcane_refresh(msg: Message):
    """Reply-keyboard refresh: re-send the Arcane project list."""
    if not is_admin_msg(msg):
        return
    await cmd_arcane_projects(msg, edit=False)
|
||||
|
||||
|
||||
@dp.callback_query(F.data == "arcane:refresh")
async def arcane_refresh_inline(cb: CallbackQuery):
    """Inline Refresh button: re-render the project list in place."""
    if cb.from_user.id in ADMIN_IDS:
        await cb.answer()
        await cmd_arcane_projects(cb.message, edit=True)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("arcane:page:"))
async def arcane_page(cb: CallbackQuery):
    """Inline pager: re-render the requested page from the per-chat cache."""
    if cb.from_user.id not in ADMIN_IDS:
        return
    raw_page = cb.data.split(":", 2)[2]
    try:
        target = int(raw_page)
    except ValueError:
        await cb.answer("Bad page")
        return
    cached = ARCANE_CACHE.get(cb.message.chat.id)
    if not cached:
        # Nothing to page through until the list was fetched at least once.
        await cb.answer("No cache")
        return
    text, kb = _render_arcane_page(cached["items"], target, cached["page_size"], cached["ts"])
    await cb.answer()
    await cb.message.edit_text(text, reply_markup=kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("arcane:restart:"))
async def arcane_restart(cb: CallbackQuery):
    """Inline 🔄 button: trigger a restart of the selected Arcane project."""
    if cb.from_user.id not in ADMIN_IDS:
        return

    project_id = cb.data.split(":", 2)[2]
    base_url, api_key, env_id = _arcane_cfg()
    if not (base_url and api_key):
        await cb.answer("Arcane config missing")
        return

    await cb.answer("Restarting…")
    # restart_project is blocking (HTTP) — run it in a worker thread.
    ok, info = await asyncio.to_thread(restart_project, base_url, api_key, env_id, project_id)
    outcome = "✅ Arcane restart triggered" if ok else f"❌ Arcane restart failed: {info}"
    await cb.message.answer(outcome, reply_markup=arcane_kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("arcane:details:"))
async def arcane_details(cb: CallbackQuery):
    """Inline ℹ️ button: show project status plus a per-service breakdown."""
    if cb.from_user.id not in ADMIN_IDS:
        return

    _, _, pid = cb.data.split(":", 2)
    base_url, api_key, env_id = _arcane_cfg()
    if not base_url or not api_key:
        await cb.answer("Arcane config missing")
        return

    await cb.answer("Loading…")
    # get_project_details is blocking (HTTP) — run it off the event loop.
    ok, info, data = await asyncio.to_thread(get_project_details, base_url, api_key, env_id, pid)
    if not ok:
        await cb.message.answer(f"❌ Arcane details failed: {info}", reply_markup=arcane_kb)
        return

    name = data.get("name", "?")
    status = data.get("status", "unknown")
    running = data.get("runningCount", 0)
    total = data.get("serviceCount", 0)
    status_reason = data.get("statusReason")
    icon = "🟢" if status == "running" else "🟡"

    lines = [
        f"🧰 **{name}**",
        f"{icon} Status: {status} ({running}/{total})",
    ]
    if status_reason:
        lines.append(f"⚠️ {status_reason}")

    # Optional per-service status list from the API response.
    services = data.get("runtimeServices", [])
    if services:
        lines.append("")
        lines.append("🧩 Services:")
        for s in services:
            s_name = s.get("name", "?")
            s_status = s.get("status", "unknown")
            s_health = s.get("health")
            s_icon = "🟢" if s_status == "running" else "🟡"
            line = f"{s_icon} {s_name}: {s_status}"
            if s_health:
                line += f" ({s_health})"
            lines.append(line)

    await cb.message.answer("\n".join(lines), parse_mode="Markdown", reply_markup=arcane_kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("arcane:deploy:"))
async def arcane_deploy_status(cb: CallbackQuery):
    """Inline 📦 button: show deployment metadata for the selected project."""
    if cb.from_user.id not in ADMIN_IDS:
        return

    _, _, pid = cb.data.split(":", 2)
    base_url, api_key, env_id = _arcane_cfg()
    if not base_url or not api_key:
        await cb.answer("Arcane config missing")
        return

    await cb.answer("Loading…")
    # get_project_details is blocking (HTTP) — run it off the event loop.
    ok, info, data = await asyncio.to_thread(get_project_details, base_url, api_key, env_id, pid)
    if not ok:
        await cb.message.answer(f"❌ Arcane deploy status failed: {info}", reply_markup=arcane_kb)
        return

    name = data.get("name", "?")
    status = data.get("status", "unknown")
    status_reason = data.get("statusReason")
    updated = data.get("updatedAt")
    path = data.get("path")
    repo = data.get("gitRepositoryURL")
    commit = data.get("lastSyncCommit")
    running = data.get("runningCount", 0)
    total = data.get("serviceCount", 0)
    icon = "🟢" if status == "running" else "🟡"

    lines = [
        f"📦 **Deploy status: {name}**",
        f"{icon} Status: {status} ({running}/{total})",
    ]
    # Optional fields are appended only when the API response provides them.
    if status_reason:
        lines.append(f"⚠️ {status_reason}")
    if updated:
        lines.append(f"🕒 Updated: {updated}")
    if path:
        lines.append(f"📁 Path: {path}")
    if repo:
        lines.append(f"🔗 Repo: {repo}")
    if commit:
        lines.append(f"🧾 Commit: {commit}")

    await cb.message.answer("\n".join(lines), parse_mode="Markdown", reply_markup=arcane_kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("arcane:up:"))
async def arcane_up(cb: CallbackQuery):
    """Inline ▶️ button: bring the selected Arcane project up."""
    if cb.from_user.id not in ADMIN_IDS:
        return

    project_id = cb.data.split(":", 2)[2]
    base_url, api_key, env_id = _arcane_cfg()
    if not (base_url and api_key):
        await cb.answer("Arcane config missing")
        return

    await cb.answer("Starting…")
    # set_project_state is blocking (HTTP) — run it in a worker thread.
    ok, info = await asyncio.to_thread(set_project_state, base_url, api_key, env_id, project_id, "up")
    outcome = "✅ Arcane up triggered" if ok else f"❌ Arcane up failed: {info}"
    await cb.message.answer(outcome, reply_markup=arcane_kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("arcane:down:"))
async def arcane_down(cb: CallbackQuery):
    """Inline ⏹ button: stop the selected Arcane project."""
    if cb.from_user.id not in ADMIN_IDS:
        return

    project_id = cb.data.split(":", 2)[2]
    base_url, api_key, env_id = _arcane_cfg()
    if not (base_url and api_key):
        await cb.answer("Arcane config missing")
        return

    await cb.answer("Stopping…")
    # set_project_state is blocking (HTTP) — run it in a worker thread.
    ok, info = await asyncio.to_thread(set_project_state, base_url, api_key, env_id, project_id, "down")
    outcome = "✅ Arcane down triggered" if ok else f"❌ Arcane down failed: {info}"
    await cb.message.answer(outcome, reply_markup=arcane_kb)
|
||||
@@ -1,22 +1,227 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from aiogram import F
|
||||
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton
|
||||
from app import dp
|
||||
from auth import is_admin_msg
|
||||
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
|
||||
from app import dp, cfg
|
||||
from auth import is_admin_msg, is_admin_cb
|
||||
from keyboards import backup_kb
|
||||
from lock_utils import acquire_lock, release_lock
|
||||
from services.queue import enqueue, format_status
|
||||
from services.queue import enqueue, format_status, format_details, format_history
|
||||
from services.backup import backup_badge, restore_help
|
||||
from services.runner import run_cmd
|
||||
from services.runner import run_cmd, run_cmd_full
|
||||
from services.incidents import log_incident
|
||||
|
||||
|
||||
def _parse_systemctl_kv(raw: str) -> dict[str, str]:
|
||||
data: dict[str, str] = {}
|
||||
for line in raw.splitlines():
|
||||
if "=" not in line:
|
||||
continue
|
||||
key, value = line.split("=", 1)
|
||||
data[key.strip()] = value.strip()
|
||||
return data
|
||||
|
||||
|
||||
async def _unit_status(unit: str, props: list[str]) -> dict[str, str]:
    """Query selected properties of a systemd unit via `systemctl show`.

    Returns the parsed KEY=VALUE map, or {"error": ...} when the command
    fails (falling back to a generic message if stderr was empty).
    """
    command = ["systemctl", "show", unit]
    command.extend(f"-p{prop_name}" for prop_name in props)
    rc, out = await run_cmd(command, timeout=10)
    if rc == 0:
        return _parse_systemctl_kv(out)
    return {"error": out.strip() or f"systemctl {unit} failed"}
|
||||
|
||||
|
||||
def _sudo_cmd(cmd: list[str]) -> list[str]:
|
||||
if os.geteuid() == 0:
|
||||
return cmd
|
||||
return ["sudo", "-E"] + cmd
|
||||
|
||||
|
||||
def _format_backup_result(rc: int, out: str) -> str:
|
||||
log_path = "/var/log/backup-auto.log"
|
||||
header = "✅ Backup finished" if rc == 0 else "❌ Backup failed"
|
||||
lines = out.strip().splitlines()
|
||||
body = "\n".join(lines[:20])
|
||||
if len(lines) > 20:
|
||||
body += f"\n… trimmed {len(lines) - 20} lines"
|
||||
extra = ""
|
||||
if rc != 0 and os.path.exists(log_path):
|
||||
try:
|
||||
tail = ""
|
||||
with open(log_path, "r", encoding="utf-8", errors="replace") as f:
|
||||
tail_lines = f.readlines()[-40:]
|
||||
tail = "".join(tail_lines).strip()
|
||||
if tail:
|
||||
extra = "\n\nLog tail:\n" + tail
|
||||
except Exception:
|
||||
pass
|
||||
base = f"{header} (rc={rc})\nlog: {log_path}"
|
||||
if body:
|
||||
base += "\n\n" + body
|
||||
if extra:
|
||||
base += extra
|
||||
return base
|
||||
|
||||
|
||||
def _tail(path: str, lines: int = 120) -> str:
|
||||
if not os.path.exists(path):
|
||||
return f"⚠️ Log not found: {path}"
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
||||
data = f.readlines()[-lines:]
|
||||
except Exception as e:
|
||||
return f"⚠️ Failed to read log: {e}"
|
||||
return "".join(data).strip() or "(empty)"
|
||||
|
||||
|
||||
def _beautify_restic_forget(raw: str) -> str | None:
    """
    Parse restic forget output tables into a compact bullet list.

    Returns None when *raw* does not look like restic forget table output
    (no "Reasons"/"Paths" columns, no header rows, or no parseable rows),
    so callers can fall back to the raw text.
    """
    if "Reasons" not in raw or "Paths" not in raw:
        return None
    import re

    lines = raw.splitlines()
    # Indices of table header rows ("ID ... Reasons ... Paths ...").
    headers = []
    for idx, line in enumerate(lines):
        if line.startswith("ID") and "Reasons" in line and "Paths" in line:
            headers.append(idx)
    if not headers:
        return None

    def _valid_id(val: str) -> bool:
        # restic snapshot ids are 7-64 lowercase hex characters.
        return bool(re.fullmatch(r"[0-9a-f]{7,64}", val.strip()))

    def parse_block(start_idx: int, end_idx: int) -> list[dict]:
        # Column boundaries are derived from where each column title starts
        # in the header row; rows are then sliced at those positions.
        header = lines[start_idx]
        cols = ["ID", "Time", "Host", "Tags", "Reasons", "Paths", "Size"]
        positions = []
        for name in cols:
            pos = header.find(name)
            if pos == -1:
                # Unexpected header layout: give up on this block.
                return []
            positions.append(pos)
        positions.append(len(header))

        entries: list[dict] = []
        current: dict | None = None
        # Skip the header row and the separator row beneath it (+2).
        for line in lines[start_idx + 2 : end_idx]:
            if not line.strip():
                continue
            segments = []
            for i in range(len(cols)):
                segments.append(line[positions[i] : positions[i + 1]].strip())
            row = dict(zip(cols, segments))
            if row["ID"] and _valid_id(row["ID"]):
                # Row with a valid ID starts a new snapshot entry.
                current = {
                    "id": row["ID"],
                    "time": row["Time"],
                    "host": row["Host"],
                    "size": row["Size"],
                    "tags": row["Tags"],
                    "reasons": [],
                    "paths": [],
                }
                if row["Reasons"]:
                    current["reasons"].append(row["Reasons"])
                if row["Paths"]:
                    current["paths"].append(row["Paths"])
                entries.append(current)
            elif current:
                # Continuation row: extra reasons/paths for the previous
                # entry; "-"-prefixed cells are table rules, not data.
                if row["Reasons"] and not row["Reasons"].startswith("-"):
                    current["reasons"].append(row["Reasons"])
                if row["Paths"] and not row["Paths"].startswith("-"):
                    current["paths"].append(row["Paths"])
        return entries

    blocks = []
    for i, start in enumerate(headers):
        # Each table runs until the next header (or end of output).
        end = headers[i + 1] if i + 1 < len(headers) else len(lines)
        entries = parse_block(start, end)
        if not entries:
            continue
        # Label the block from the caption line(s) restic prints above the
        # table (e.g. "keep N snapshots:" / "remove N snapshots:").
        label = "Plan"
        prev_line = lines[start - 1].lower() if start - 1 >= 0 else ""
        prev2 = lines[start - 2].lower() if start - 2 >= 0 else ""
        if "keep" in prev_line:
            label = prev_line.strip()
        elif "keep" in prev2:
            label = prev2.strip()
        elif "snapshots" in prev_line:
            label = prev_line.strip()
        blocks.append((label, entries))

    if not blocks:
        return None

    out_lines = []
    for label, entries in blocks:
        out_lines.append(f"📦 {label}")
        for e in entries:
            head = f"🧉 {e['id']} | {e['time']} | {e['host']} | {e['size'] or 'n/a'}"
            out_lines.append(head)
            if e["reasons"]:
                out_lines.append(" 📌 " + "; ".join(e["reasons"]))
            if e["paths"]:
                for p in e["paths"]:
                    out_lines.append(f" • {p}")
        # Blank line between blocks; the final one is rstrip()-ed away.
        out_lines.append("")
    return "\n".join(out_lines).rstrip()
|
||||
|
||||
|
||||
def _load_json(raw: str, label: str) -> tuple[bool, object | None, str]:
|
||||
if not raw or not raw.strip():
|
||||
return False, None, f"? {label} returned empty output"
|
||||
try:
|
||||
return True, json.loads(raw), ""
|
||||
except json.JSONDecodeError:
|
||||
preview = raw.strip().splitlines()
|
||||
head = preview[0] if preview else "invalid output"
|
||||
return False, None, f"? {label} invalid JSON: {head}"
|
||||
|
||||
|
||||
async def send_backup_jobs_status(msg: Message):
    """Send a summary of the backup-related systemd services and timers.

    For each (service, timer) pair, queries `systemctl show` and reports
    the service state/result/exit code plus the timer's last and next
    trigger times. A failed query renders as a 🔴 line for that service.
    """
    # (service basename, matching timer unit) pairs to report on.
    services = [
        ("backup-auto", "backup-auto.timer"),
        ("restic-check", "restic-check.timer"),
        ("weekly-report", "weekly-report.timer"),
    ]
    service_props = ["ActiveState", "SubState", "Result", "ExecMainStatus", "ExecMainExitTimestamp"]
    timer_props = ["LastTriggerUSecRealtime", "NextElapseUSecRealtime"]

    lines = ["🕒 Backup jobs\n"]
    for service, timer in services:
        svc = await _unit_status(f"{service}.service", service_props)
        tmr = await _unit_status(timer, timer_props)
        if "error" in svc:
            lines.append(f"🔴 {service}: {svc['error']}")
            continue

        active = svc.get("ActiveState", "n/a")
        result = svc.get("Result", "n/a")
        exit_status = svc.get("ExecMainStatus", "n/a")
        last = svc.get("ExecMainExitTimestamp", "n/a")
        next_run = tmr.get("NextElapseUSecRealtime", "n/a")
        last_trigger = tmr.get("LastTriggerUSecRealtime", "n/a")

        lines.append(
            f"🧊 {service}: {active} ({result}, rc={exit_status})"
        )
        lines.append(f" Last run: {last}")
        lines.append(f" Last trigger: {last_trigger}")
        lines.append(f" Next: {next_run}")
        lines.append("")

    await msg.answer("\n".join(lines).rstrip(), reply_markup=backup_kb)
|
||||
|
||||
|
||||
async def cmd_repo_stats(msg: Message):
|
||||
await msg.answer("⏳ Loading repo stats…", reply_markup=backup_kb)
|
||||
|
||||
# --- restore-size stats ---
|
||||
rc1, raw1 = await run_cmd(
|
||||
rc1, raw1 = await run_cmd_full(
|
||||
["restic", "stats", "--json"],
|
||||
use_restic_env=True,
|
||||
timeout=30
|
||||
@@ -25,10 +230,14 @@ async def cmd_repo_stats(msg: Message):
|
||||
await msg.answer(raw1, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
restore = json.loads(raw1)
|
||||
ok, restore, err = _load_json(raw1, "restic stats")
|
||||
if not ok:
|
||||
await msg.answer(err, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
|
||||
# --- raw-data stats ---
|
||||
rc2, raw2 = await run_cmd(
|
||||
rc2, raw2 = await run_cmd_full(
|
||||
["restic", "stats", "--json", "--mode", "raw-data"],
|
||||
use_restic_env=True,
|
||||
timeout=30
|
||||
@@ -37,15 +246,26 @@ async def cmd_repo_stats(msg: Message):
|
||||
await msg.answer(raw2, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
raw = json.loads(raw2)
|
||||
ok, raw, err = _load_json(raw2, "restic stats raw-data")
|
||||
if not ok:
|
||||
await msg.answer(err, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
|
||||
# --- snapshots count ---
|
||||
rc3, raw_snaps = await run_cmd(
|
||||
rc3, raw_snaps = await run_cmd_full(
|
||||
["restic", "snapshots", "--json"],
|
||||
use_restic_env=True,
|
||||
timeout=20
|
||||
)
|
||||
snaps = len(json.loads(raw_snaps)) if rc3 == 0 else "n/a"
|
||||
if rc3 != 0:
|
||||
snaps = "n/a"
|
||||
else:
|
||||
ok, snap_data, err = _load_json(raw_snaps, "restic snapshots")
|
||||
if ok and isinstance(snap_data, list):
|
||||
snaps = len(snap_data)
|
||||
else:
|
||||
snaps = "n/a"
|
||||
|
||||
msg_text = (
|
||||
"📦 **Repository stats**\n\n"
|
||||
@@ -62,7 +282,7 @@ async def cmd_backup_status(msg: Message):
|
||||
await msg.answer("⏳ Loading snapshots…", reply_markup=backup_kb)
|
||||
|
||||
async def worker():
|
||||
rc, raw = await run_cmd(
|
||||
rc, raw = await run_cmd_full(
|
||||
["restic", "snapshots", "--json"],
|
||||
use_restic_env=True,
|
||||
timeout=30
|
||||
@@ -71,7 +291,10 @@ async def cmd_backup_status(msg: Message):
|
||||
await msg.answer(raw, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
snaps = json.loads(raw)
|
||||
ok, snaps, err = _load_json(raw, "restic snapshots")
|
||||
if not ok or not isinstance(snaps, list):
|
||||
await msg.answer(err, reply_markup=backup_kb)
|
||||
return
|
||||
if not snaps:
|
||||
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
|
||||
return
|
||||
@@ -104,12 +327,20 @@ async def cmd_backup_status(msg: Message):
|
||||
f"📦 Snapshots ({len(snaps)})\n{badge}",
|
||||
reply_markup=kb
|
||||
)
|
||||
await send_backup_jobs_status(msg)
|
||||
|
||||
asyncio.create_task(worker())
|
||||
|
||||
|
||||
async def cmd_backup_now(msg: Message):
|
||||
await schedule_backup(msg)
|
||||
|
||||
|
||||
async def schedule_backup(msg: Message):
|
||||
async def job():
|
||||
if cfg.get("safety", {}).get("dry_run", False):
|
||||
await msg.answer("🧪 Dry-run: backup skipped", reply_markup=backup_kb)
|
||||
return
|
||||
if not acquire_lock("backup"):
|
||||
await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb)
|
||||
return
|
||||
@@ -117,20 +348,36 @@ async def cmd_backup_now(msg: Message):
|
||||
await msg.answer("▶️ Backup запущен", reply_markup=backup_kb)
|
||||
|
||||
try:
|
||||
rc, out = await run_cmd(["sudo", "/usr/local/bin/backup.py", "restic-backup"], timeout=6 * 3600)
|
||||
await msg.answer(("✅ OK\n" if rc == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
||||
rc, out = await run_cmd(
|
||||
_sudo_cmd(["/usr/local/bin/backup.py", "restic-backup"]),
|
||||
use_restic_env=True,
|
||||
timeout=6 * 3600,
|
||||
)
|
||||
kb = backup_kb
|
||||
if rc != 0:
|
||||
kb = InlineKeyboardMarkup(
|
||||
inline_keyboard=[
|
||||
[InlineKeyboardButton(text="🔁 Retry backup", callback_data="backup:retry")]
|
||||
]
|
||||
)
|
||||
await msg.answer(_format_backup_result(rc, out), reply_markup=kb)
|
||||
finally:
|
||||
release_lock("backup")
|
||||
|
||||
pos = await enqueue("backup", job)
|
||||
await msg.answer(f"🕓 Backup queued (#{pos})", reply_markup=backup_kb)
|
||||
try:
|
||||
from services.incidents import log_incident
|
||||
log_incident(cfg, f"backup_queued by {msg.from_user.id}", category="backup")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
async def cmd_last_snapshot(msg: Message):
|
||||
await msg.answer("⏳ Loading last snapshot…", reply_markup=backup_kb)
|
||||
|
||||
async def worker():
|
||||
rc, raw = await run_cmd(
|
||||
rc, raw = await run_cmd_full(
|
||||
["restic", "snapshots", "--json"],
|
||||
use_restic_env=True,
|
||||
timeout=20
|
||||
@@ -139,7 +386,10 @@ async def cmd_last_snapshot(msg: Message):
|
||||
await msg.answer(raw, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
snaps = json.loads(raw)
|
||||
ok, snaps, err = _load_json(raw, "restic snapshots")
|
||||
if not ok or not isinstance(snaps, list):
|
||||
await msg.answer(err, reply_markup=backup_kb)
|
||||
return
|
||||
if not snaps:
|
||||
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
|
||||
return
|
||||
@@ -149,7 +399,7 @@ async def cmd_last_snapshot(msg: Message):
|
||||
t = datetime.fromisoformat(s["time"].replace("Z", "+00:00"))
|
||||
short_id = s["short_id"]
|
||||
|
||||
rc2, raw2 = await run_cmd(
|
||||
rc2, raw2 = await run_cmd_full(
|
||||
["restic", "stats", short_id, "--json"],
|
||||
use_restic_env=True,
|
||||
timeout=20
|
||||
@@ -158,7 +408,10 @@ async def cmd_last_snapshot(msg: Message):
|
||||
await msg.answer(raw2, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
stats = json.loads(raw2)
|
||||
ok, stats, err = _load_json(raw2, f"restic stats {short_id}")
|
||||
if not ok or not isinstance(stats, dict):
|
||||
await msg.answer(err, reply_markup=backup_kb)
|
||||
return
|
||||
|
||||
msg_text = (
|
||||
"📦 **Last snapshot**\n\n"
|
||||
@@ -193,7 +446,20 @@ async def ls(msg: Message):
|
||||
@dp.message(F.text == "🧾 Queue")
|
||||
async def qb(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer(format_status(), reply_markup=backup_kb)
|
||||
kb = InlineKeyboardMarkup(
|
||||
inline_keyboard=[
|
||||
[InlineKeyboardButton(text="Details", callback_data="queue:details")],
|
||||
]
|
||||
)
|
||||
await msg.answer(format_status(), reply_markup=kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data == "queue:details")
|
||||
async def qd(cb: CallbackQuery):
|
||||
if not is_admin_cb(cb):
|
||||
return
|
||||
await cb.answer()
|
||||
await cb.message.answer(format_details(), reply_markup=backup_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "▶️ Run backup")
|
||||
@@ -202,7 +468,108 @@ async def br(msg: Message):
|
||||
await cmd_backup_now(msg)
|
||||
|
||||
|
||||
@dp.message(F.text == "/backup_run")
|
||||
async def br_cmd(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await schedule_backup(msg)
|
||||
|
||||
|
||||
@dp.message(F.text == "🧪 Restic check")
|
||||
async def rc(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
|
||||
async def job():
|
||||
await msg.answer("🧪 Restic check запущен", reply_markup=backup_kb)
|
||||
rc2, out = await run_cmd(
|
||||
_sudo_cmd(["/usr/local/bin/restic-check.sh"]),
|
||||
use_restic_env=True,
|
||||
timeout=6 * 3600,
|
||||
)
|
||||
kb = backup_kb
|
||||
if rc2 != 0:
|
||||
kb = InlineKeyboardMarkup(
|
||||
inline_keyboard=[
|
||||
[InlineKeyboardButton(text="🔁 Retry restic check", callback_data="backup:retry_check")]
|
||||
]
|
||||
)
|
||||
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=kb)
|
||||
|
||||
pos = await enqueue("restic-check", job)
|
||||
await msg.answer(f"🕓 Restic check queued (#{pos})", reply_markup=backup_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "📬 Weekly report")
|
||||
async def wr(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
|
||||
async def job():
|
||||
await msg.answer("📬 Weekly report запущен", reply_markup=backup_kb)
|
||||
rc2, out = await run_cmd(
|
||||
_sudo_cmd(["/usr/local/bin/weekly-report.sh"]),
|
||||
use_restic_env=True,
|
||||
timeout=3600,
|
||||
)
|
||||
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
|
||||
|
||||
pos = await enqueue("weekly-report", job)
|
||||
await msg.answer(f"🕓 Weekly report queued (#{pos})", reply_markup=backup_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "🧯 Restore help")
|
||||
async def rh(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer(restore_help(), reply_markup=backup_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "📜 History")
|
||||
@dp.message(F.text == "/backup_history")
|
||||
async def backup_history(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
log_path = "/var/log/backup-auto.log"
|
||||
content = _tail(log_path, lines=160)
|
||||
if content.startswith("⚠️"):
|
||||
await msg.answer(content, reply_markup=backup_kb)
|
||||
return
|
||||
pretty = _beautify_restic_forget(content)
|
||||
trimmed = False
|
||||
max_len = 3500
|
||||
if len(content) > max_len:
|
||||
content = content[-max_len:]
|
||||
trimmed = True
|
||||
header = "📜 Backup history (tail)"
|
||||
if trimmed:
|
||||
header += " (trimmed)"
|
||||
if pretty:
|
||||
await msg.answer(f"{header}\n`{log_path}`\n\n{pretty}", reply_markup=backup_kb)
|
||||
else:
|
||||
await msg.answer(
|
||||
f"{header}\n`{log_path}`\n```\n{content}\n```",
|
||||
reply_markup=backup_kb,
|
||||
parse_mode="Markdown",
|
||||
)
|
||||
|
||||
|
||||
@dp.message(F.text == "/queue_history")
|
||||
async def queue_history(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
await msg.answer(format_history(), reply_markup=backup_kb)
|
||||
|
||||
|
||||
@dp.callback_query(F.data == "backup:retry")
|
||||
async def backup_retry(cb: CallbackQuery):
|
||||
if not is_admin_cb(cb):
|
||||
return
|
||||
await cb.answer("Queuing backup…")
|
||||
await schedule_backup(cb.message)
|
||||
|
||||
|
||||
@dp.callback_query(F.data == "backup:retry_check")
|
||||
async def backup_retry_check(cb: CallbackQuery):
|
||||
if not is_admin_cb(cb):
|
||||
return
|
||||
await cb.answer("Queuing restic check…")
|
||||
await rc(cb.message)
|
||||
|
||||
@@ -2,8 +2,10 @@ import json
|
||||
import time
|
||||
from aiogram import F
|
||||
from aiogram.types import CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||
from app import dp, ADMIN_ID
|
||||
from app import dp, ADMIN_ID, cfg
|
||||
from services.docker import docker_cmd
|
||||
from services.incidents import log_incident
|
||||
from services.runner import run_cmd
|
||||
from state import DOCKER_MAP, LOG_FILTER_PENDING
|
||||
from handlers.backup import cmd_backup_status
|
||||
|
||||
@@ -13,8 +15,15 @@ async def docker_callback(cb: CallbackQuery):
|
||||
if cb.from_user.id != ADMIN_ID:
|
||||
return
|
||||
|
||||
_, action, alias = cb.data.split(":", 2)
|
||||
real = DOCKER_MAP[alias]
|
||||
try:
|
||||
_, action, alias = cb.data.split(":", 2)
|
||||
except ValueError:
|
||||
await cb.answer("Bad request")
|
||||
return
|
||||
real = DOCKER_MAP.get(alias)
|
||||
if not real:
|
||||
await cb.answer("Container not found")
|
||||
return
|
||||
|
||||
if action == "restart":
|
||||
await cb.answer("Restarting…")
|
||||
@@ -24,6 +33,10 @@ async def docker_callback(cb: CallbackQuery):
|
||||
f"🔄 **{alias} restarted**\n```{out}```",
|
||||
parse_mode="Markdown"
|
||||
)
|
||||
try:
|
||||
log_incident(cfg, f"docker_restart {alias}", category="docker")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif action == "logs":
|
||||
await cb.answer()
|
||||
@@ -54,7 +67,7 @@ async def snapshot_details(cb: CallbackQuery):
|
||||
snap_id = cb.data.split(":", 1)[1]
|
||||
await cb.answer("Loading snapshot…")
|
||||
|
||||
# получаем статистику snapshot
|
||||
# получаем статистику snapshot
|
||||
rc, raw = await run_cmd(
|
||||
["restic", "stats", snap_id, "--json"],
|
||||
use_restic_env=True,
|
||||
@@ -117,8 +130,13 @@ async def logs_options(cb: CallbackQuery):
|
||||
if action == "tail":
|
||||
await cb.answer("Loading logs…")
|
||||
rc, out = await docker_cmd(["logs", "--tail", "80", real])
|
||||
if rc != 0:
|
||||
await cb.message.answer(out)
|
||||
return
|
||||
if not out.strip():
|
||||
out = "(no logs)"
|
||||
await cb.message.answer(
|
||||
f"📜 **Logs: {alias}**\n```{out}```",
|
||||
f"📜 **Logs: {alias}**\n```\n{out}\n```",
|
||||
parse_mode="Markdown"
|
||||
)
|
||||
return
|
||||
@@ -132,8 +150,13 @@ async def logs_options(cb: CallbackQuery):
|
||||
since_ts = str(int(time.time() - seconds))
|
||||
await cb.answer("Loading logs…")
|
||||
rc, out = await docker_cmd(["logs", "--since", since_ts, "--tail", "200", real])
|
||||
if rc != 0:
|
||||
await cb.message.answer(out)
|
||||
return
|
||||
if not out.strip():
|
||||
out = "(no logs for period)"
|
||||
await cb.message.answer(
|
||||
f"📜 **Logs: {alias}**\n```{out}```",
|
||||
f"📜 **Logs: {alias}**\n```\n{out}\n```",
|
||||
parse_mode="Markdown"
|
||||
)
|
||||
return
|
||||
|
||||
24
handlers/config_check.py
Normal file
24
handlers/config_check.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from aiogram import F
|
||||
from aiogram.types import Message
|
||||
from app import dp, cfg
|
||||
from auth import is_admin_msg
|
||||
from services.config_check import validate_cfg
|
||||
|
||||
|
||||
@dp.message(F.text == "/config_check")
|
||||
async def config_check(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
errors, warnings = validate_cfg(cfg)
|
||||
lines = []
|
||||
if errors:
|
||||
lines.append("❌ Config errors:")
|
||||
lines += [f"- {e}" for e in errors]
|
||||
if warnings:
|
||||
if lines:
|
||||
lines.append("")
|
||||
lines.append("⚠️ Warnings:")
|
||||
lines += [f"- {w}" for w in warnings]
|
||||
if not lines:
|
||||
lines.append("✅ Config looks OK")
|
||||
await msg.answer("\n".join(lines))
|
||||
@@ -1,11 +1,13 @@
|
||||
from aiogram import F
|
||||
from aiogram.types import Message
|
||||
from app import dp
|
||||
from app import dp, cfg
|
||||
from auth import is_admin_msg
|
||||
from keyboards import docker_kb, docker_inline_kb
|
||||
from services.docker import container_uptime, docker_cmd
|
||||
from services.incidents import log_incident
|
||||
from state import DOCKER_MAP, LOG_FILTER_PENDING
|
||||
import time
|
||||
import json
|
||||
|
||||
|
||||
async def cmd_docker_status(msg: Message):
|
||||
@@ -42,7 +44,7 @@ async def cmd_docker_status(msg: Message):
|
||||
lines.append(f"{icon} {alias}: {status} ({up})")
|
||||
|
||||
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
||||
|
||||
log_incident(cfg, f"docker_status by {msg.from_user.id}", category="docker")
|
||||
except Exception as e:
|
||||
# ⬅️ КРИТИЧЕСКИ ВАЖНО
|
||||
await msg.answer(
|
||||
@@ -77,6 +79,137 @@ async def ds(msg: Message):
|
||||
await cmd_docker_status(msg)
|
||||
|
||||
|
||||
@dp.message(F.text == "/docker_status")
|
||||
async def ds_cmd(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await cmd_docker_status(msg)
|
||||
|
||||
|
||||
@dp.message(F.text, F.func(lambda m: (m.text or "").split()[0] == "/docker_health"))
|
||||
async def docker_health(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
parts = msg.text.split()
|
||||
if len(parts) < 2:
|
||||
await msg.answer("Usage: /docker_health <alias>")
|
||||
return
|
||||
alias = parts[1]
|
||||
real = DOCKER_MAP.get(alias)
|
||||
if not real:
|
||||
await msg.answer(f"⚠️ Unknown container: {alias}", reply_markup=docker_kb)
|
||||
return
|
||||
rc, out = await docker_cmd(["inspect", "-f", "{{json .State.Health}}", real], timeout=10)
|
||||
if rc != 0 or not out.strip():
|
||||
await msg.answer(f"⚠️ Failed to get health for {alias}", reply_markup=docker_kb)
|
||||
return
|
||||
try:
|
||||
data = json.loads(out)
|
||||
except json.JSONDecodeError:
|
||||
await msg.answer(f"⚠️ Invalid health JSON for {alias}", reply_markup=docker_kb)
|
||||
return
|
||||
status = data.get("Status", "n/a")
|
||||
fail = data.get("FailingStreak", "n/a")
|
||||
logs = data.get("Log") or []
|
||||
lines = [f"🐳 {alias} health", f"Status: {status}", f"Failing streak: {fail}"]
|
||||
if logs:
|
||||
lines.append("Recent logs:")
|
||||
for entry in logs[-5:]:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
ts = entry.get("Start") or entry.get("End") or ""
|
||||
exitc = entry.get("ExitCode", "")
|
||||
out_line = entry.get("Output", "").strip()
|
||||
lines.append(f"- {ts} rc={exitc} {out_line}")
|
||||
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
||||
log_incident(cfg, f"docker_health alias={alias} by {msg.from_user.id}", category="docker")
|
||||
|
||||
|
||||
@dp.message(F.text == "/docker_health_summary")
|
||||
async def docker_health_summary(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
if not DOCKER_MAP:
|
||||
await msg.answer("⚠️ DOCKER_MAP пуст", reply_markup=docker_kb)
|
||||
return
|
||||
problems = []
|
||||
total = len(DOCKER_MAP)
|
||||
for alias, real in DOCKER_MAP.items():
|
||||
rc, out = await docker_cmd(["inspect", "-f", "{{json .State}}", real], timeout=10)
|
||||
if rc != 0:
|
||||
problems.append(f"{alias}: inspect error")
|
||||
continue
|
||||
try:
|
||||
state = json.loads(out)
|
||||
except Exception:
|
||||
problems.append(f"{alias}: bad JSON")
|
||||
continue
|
||||
status = state.get("Status", "n/a")
|
||||
health = (state.get("Health") or {}).get("Status", "n/a")
|
||||
if status != "running" or health not in ("healthy", "none"):
|
||||
problems.append(f"{alias}: {status}/{health}")
|
||||
ok = total - len(problems)
|
||||
lines = [f"🐳 Docker health: 🟢 {ok}/{total} healthy, 🔴 {len(problems)} issues"]
|
||||
if problems:
|
||||
lines.append("Problems:")
|
||||
lines.extend([f"- {p}" for p in problems])
|
||||
await msg.answer("\n".join(lines), reply_markup=docker_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "📈 Stats")
|
||||
async def dstats(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
if not DOCKER_MAP:
|
||||
await msg.answer(
|
||||
"⚠️ DOCKER_MAP пуст.\n"
|
||||
"Контейнеры не обнаружены.",
|
||||
reply_markup=docker_kb,
|
||||
)
|
||||
return
|
||||
|
||||
names = list(DOCKER_MAP.values())
|
||||
fmt = "{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.NetIO}}|{{.BlockIO}}"
|
||||
rc, out = await docker_cmd(["stats", "--no-stream", "--format", fmt] + names)
|
||||
if rc != 0:
|
||||
await msg.answer(out, reply_markup=docker_kb)
|
||||
return
|
||||
lines = [line.strip() for line in out.splitlines() if line.strip()]
|
||||
if not lines:
|
||||
await msg.answer("📈 Stats\n\n(no data)", reply_markup=docker_kb)
|
||||
return
|
||||
|
||||
alias_by_name = {v: k for k, v in DOCKER_MAP.items()}
|
||||
rows = []
|
||||
for line in lines:
|
||||
parts = line.split("|")
|
||||
if len(parts) != 5:
|
||||
continue
|
||||
name, cpu, mem, net, blk = [p.strip() for p in parts]
|
||||
display = alias_by_name.get(name, name)
|
||||
try:
|
||||
cpu_val = float(cpu.strip("%"))
|
||||
except ValueError:
|
||||
cpu_val = 0.0
|
||||
rows.append((cpu_val, display, cpu, mem, net, blk))
|
||||
|
||||
if not rows:
|
||||
await msg.answer("📈 Stats\n\n(no data)", reply_markup=docker_kb)
|
||||
return
|
||||
|
||||
rows.sort(key=lambda r: r[0], reverse=True)
|
||||
header = f"{'NAME':<18} {'CPU':>6} {'MEM':>18} {'NET':>16} {'IO':>16}"
|
||||
formatted = [header]
|
||||
for _cpu_val, name, cpu, mem, net, blk in rows:
|
||||
formatted.append(f"{name[:18]:<18} {cpu:>6} {mem:>18} {net:>16} {blk:>16}")
|
||||
|
||||
body = "\n".join(formatted)
|
||||
await msg.answer(
|
||||
f"📈 **Docker stats**\n```\n{body}\n```",
|
||||
reply_markup=docker_kb,
|
||||
parse_mode="Markdown",
|
||||
)
|
||||
|
||||
|
||||
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in LOG_FILTER_PENDING))
|
||||
async def log_filter_input(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
@@ -102,6 +235,9 @@ async def log_filter_input(msg: Message):
|
||||
if rc != 0:
|
||||
await msg.answer(out, reply_markup=docker_kb)
|
||||
return
|
||||
if not out.strip():
|
||||
await msg.answer("⚠️ Нет логов за выбранный период", reply_markup=docker_kb)
|
||||
return
|
||||
|
||||
lines = [line for line in out.splitlines() if needle.lower() in line.lower()]
|
||||
filtered = "\n".join(lines) if lines else "(no matches)"
|
||||
|
||||
166
handlers/help.py
166
handlers/help.py
@@ -1,24 +1,164 @@
|
||||
from aiogram import F
|
||||
from aiogram.types import Message
|
||||
from app import dp
|
||||
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||
from app import dp, ADMIN_ID
|
||||
from auth import is_admin_msg
|
||||
from keyboards import menu_kb
|
||||
|
||||
|
||||
@dp.message(F.text.in_({"ℹ️ Help", "ℹ Help", "Help"}))
|
||||
HELP_PAGES = [
|
||||
(
|
||||
"Overview",
|
||||
"ℹ️ **Help — Overview**\n\n"
|
||||
"🩺 *Health* — быстрый health-check.\n"
|
||||
"📊 *Статус* — общая загрузка.\n"
|
||||
"📋 */status_short* — кратко (load/RAM/диски).\n"
|
||||
"🩺 */health_short* — краткий health.\n"
|
||||
"🧪 */selftest* — health + restic snapshot probe.\n"
|
||||
"🔧 Разделы: Docker, Backup, Artifacts, System, OpenWrt.",
|
||||
),
|
||||
(
|
||||
"Alerts",
|
||||
"🚨 **Alerts & Mute**\n\n"
|
||||
"Команды:\n"
|
||||
"• `/alerts test <critical|warn|info>`\n"
|
||||
"• `/alerts mute <cat> <minutes>` / `/alerts unmute <cat>` / `/alerts list`\n"
|
||||
"• `/alerts recent [hours]`\n"
|
||||
"Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n"
|
||||
"Категории: load, disk, smart, raid, ssl, docker, test.\n"
|
||||
"Quiet hours: `alerts.quiet_hours` для не‑критичных.\n"
|
||||
"Авто-мьют: `alerts.auto_mute` со слотами времени.\n"
|
||||
"Только красные load: `alerts.load_only_critical: true`.\n"
|
||||
"Валидатор конфига: `/config_check`.",
|
||||
),
|
||||
(
|
||||
"Backup",
|
||||
"💾 **Backup (restic)**\n\n"
|
||||
"Кнопки: Status, Last snapshot, Repo stats, Run backup, Queue, Restic check, Weekly report, History.\n"
|
||||
"History — хвост `/var/log/backup-auto.log`.\n"
|
||||
"Fail → кнопка Retry (backup/check).\n"
|
||||
"Run backup/Check учитывают `safety.dry_run`.\n"
|
||||
"После бэкапа приходит TL;DR + путь к логу `/var/log/backup-auto.log`.\n"
|
||||
"Queue → Details показывает отложенные задачи.",
|
||||
),
|
||||
(
|
||||
"Docker & System",
|
||||
"🐳 **Docker**\n"
|
||||
"Status/Restart/Logs/Stats — клавиатура Docker.\n"
|
||||
"Команды: `/docker_status`, `/docker_health <alias>`.\n\n"
|
||||
"🖥 **System**\n"
|
||||
"Info: Disks/Security/Metrics/Hardware/SMART/OpenWrt.\n"
|
||||
"Ops: Updates/Upgrade/Reboot.\n"
|
||||
"Logs: Audit/Incidents/Security/Integrations/Processes.\n"
|
||||
"OpenWrt: `/openwrt`, `/openwrt_wan`, `/openwrt_clients`, `/openwrt_leases`.",
|
||||
),
|
||||
(
|
||||
"Admin",
|
||||
"🛠 **Admin & Deploy**\n\n"
|
||||
"Config: `/config_check`, файл `config.yaml` (см. config.example.yaml).\n"
|
||||
"Deploy: `deploy.sh` (ssh 10.10.10.10:1090 → git pull → systemctl restart tg-bot).\n"
|
||||
"Incidents: `/incidents_summary`, `/incidents_diff [hours]`.\n"
|
||||
"Export: `/incidents_export [hours] [csv|json]`, `/export_all [hours]` (zip).\n"
|
||||
"Alerts log/heatmap: `/alerts_log [hours]`, `/alerts_heatmap [hours] [cat]`.\n"
|
||||
"Backup SLA: `/backup_sla`; Docker restarts: `/docker_restarts [hours]`.\n"
|
||||
"Disk snapshot: `/disk_snapshot`.\n"
|
||||
"Queue: `/queue_history`, `/queue_sla`.\n"
|
||||
"Self-test history: `/selftest_history`.\n"
|
||||
"OpenWrt leases diff: `/openwrt_leases_diff`.\n"
|
||||
"BotFather list: `/botfather_list`.\n"
|
||||
"Безопасность: `safety.dry_run: true` блокирует опасные действия.\n"
|
||||
"OpenWrt: кнопка в System → Info.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _help_kb(idx: int) -> InlineKeyboardMarkup:
|
||||
buttons = []
|
||||
if idx > 0:
|
||||
buttons.append(InlineKeyboardButton(text="◀️ Prev", callback_data=f"help:{idx-1}"))
|
||||
buttons.append(InlineKeyboardButton(text=f"{idx+1}/{len(HELP_PAGES)}", callback_data="help:noop"))
|
||||
if idx < len(HELP_PAGES) - 1:
|
||||
buttons.append(InlineKeyboardButton(text="Next ▶️", callback_data=f"help:{idx+1}"))
|
||||
return InlineKeyboardMarkup(inline_keyboard=[buttons])
|
||||
|
||||
|
||||
def _help_text(idx: int) -> str:
|
||||
_title, body = HELP_PAGES[idx]
|
||||
return body
|
||||
|
||||
|
||||
@dp.message(F.text.in_({"ℹ️ Help", "ℹ Help", "Help", "/help"}))
|
||||
async def help_cmd(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
|
||||
idx = 0
|
||||
await msg.answer(
|
||||
"ℹ️ **Help / Справка**\n\n"
|
||||
"🩺 Health — быстрый health-check сервера\n"
|
||||
"📊 Статус — общая загрузка сервера\n"
|
||||
"🐳 Docker — управление контейнерами\n"
|
||||
"📦 Backup — restic бэкапы\n"
|
||||
"🧉 Artifacts — критичные образы (Clonezilla, NAND)\n"
|
||||
"⚙️ System — диски, безопасность, URL, reboot\n\n"
|
||||
"Inline-кнопки используются для выбора контейнеров.",
|
||||
reply_markup=menu_kb,
|
||||
_help_text(idx),
|
||||
reply_markup=_help_kb(idx),
|
||||
parse_mode="Markdown",
|
||||
)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("help:"))
|
||||
async def help_cb(cb: CallbackQuery):
|
||||
if cb.from_user.id != ADMIN_ID:
|
||||
await cb.answer()
|
||||
return
|
||||
payload = cb.data.split(":", 1)[1]
|
||||
if payload == "noop":
|
||||
await cb.answer()
|
||||
return
|
||||
try:
|
||||
idx = int(payload)
|
||||
except ValueError:
|
||||
await cb.answer()
|
||||
return
|
||||
idx = max(0, min(idx, len(HELP_PAGES) - 1))
|
||||
await cb.message.edit_text(
|
||||
_help_text(idx),
|
||||
reply_markup=_help_kb(idx),
|
||||
parse_mode="Markdown",
|
||||
)
|
||||
await cb.answer()
|
||||
|
||||
|
||||
BOTFATHER_LIST = """\
|
||||
help - Show help pages
|
||||
status_short - Compact host status
|
||||
health_short - Compact health report
|
||||
selftest - Health + restic snapshot probe
|
||||
alerts - Manage alerts
|
||||
alerts_list - List active mutes
|
||||
alerts_recent - Show recent incidents (24h)
|
||||
alerts_mute_load - Mute load alerts for 60m
|
||||
alerts_log - Show suppressed alerts
|
||||
alerts_heatmap - Hourly incidents heatmap
|
||||
backup_run - Run backup (queued)
|
||||
backup_history - Show backup log tail
|
||||
queue_history - Show queue recent jobs
|
||||
queue_sla - Queue SLA stats
|
||||
docker_status - Docker summary
|
||||
docker_health - Docker inspect/health by alias
|
||||
docker_health_summary - Docker health summary (problems only)
|
||||
openwrt - Full OpenWrt status
|
||||
openwrt_wan - OpenWrt WAN only
|
||||
openwrt_clients - OpenWrt wifi clients
|
||||
openwrt_leases - OpenWrt DHCP leases
|
||||
openwrt_fast - OpenWrt quick WAN view
|
||||
openwrt_leases_diff - OpenWrt DHCP diff
|
||||
incidents_summary - Incidents counters (24h/7d)
|
||||
incidents_export - Export incidents (hours fmt)
|
||||
incidents_diff - Show incidents since last check
|
||||
export_all - Zip with incidents/queue/selftest
|
||||
backup_sla - Backup SLA check
|
||||
docker_restarts - Docker restart history
|
||||
selftest_history - Self-test history
|
||||
disk_snapshot - Disk usage snapshot
|
||||
config_check - Validate config
|
||||
"""
|
||||
|
||||
|
||||
@dp.message(F.text == "/botfather_list")
|
||||
async def botfather_list(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
await msg.answer(f"Commands for BotFather:\n```\n{BOTFATHER_LIST}\n```", parse_mode="Markdown")
|
||||
|
||||
@@ -2,7 +2,19 @@ from aiogram import F
|
||||
from aiogram.types import Message
|
||||
from app import dp
|
||||
from auth import is_admin_msg
|
||||
from keyboards import menu_kb, docker_kb, backup_kb, artifacts_kb, system_kb
|
||||
from keyboards import (
|
||||
menu_kb,
|
||||
docker_kb,
|
||||
backup_kb,
|
||||
artifacts_kb,
|
||||
system_menu_kb,
|
||||
system_info_kb,
|
||||
system_ops_kb,
|
||||
system_logs_kb,
|
||||
system_logs_audit_kb,
|
||||
system_logs_security_kb,
|
||||
system_logs_integrations_kb,
|
||||
)
|
||||
|
||||
|
||||
@dp.message(F.text == "/start")
|
||||
@@ -38,4 +50,53 @@ async def am(msg: Message):
|
||||
@dp.message(F.text == "⚙️ System")
|
||||
async def sm(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("⚙️ System", reply_markup=system_kb)
|
||||
await msg.answer("⚙️ System", reply_markup=system_menu_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "⬅️ System")
|
||||
async def back_system(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("⚙️ System", reply_markup=system_menu_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "ℹ️ Info")
|
||||
async def sys_info(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("ℹ️ System info", reply_markup=system_info_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "🛠 Ops")
|
||||
async def sys_ops(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("🛠 System ops", reply_markup=system_ops_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "📄 Logs")
|
||||
async def sys_logs(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("📄 System logs", reply_markup=system_logs_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "⬅️ Logs")
|
||||
async def back_logs(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("📄 System logs", reply_markup=system_logs_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "🧾 Audit/Incidents")
|
||||
async def logs_audit_menu(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("🧾 Logs: Audit/Incidents", reply_markup=system_logs_audit_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "🔒 Security")
|
||||
async def logs_security_menu(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("🔒 Logs: Security", reply_markup=system_logs_security_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "🧩 Integrations")
|
||||
async def logs_integrations_menu(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await msg.answer("🧩 Logs: Integrations", reply_markup=system_logs_integrations_kb)
|
||||
|
||||
|
||||
141
handlers/processes.py
Normal file
141
handlers/processes.py
Normal file
@@ -0,0 +1,141 @@
|
||||
import asyncio
|
||||
from aiogram import F
|
||||
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
|
||||
from app import dp, ADMIN_ID
|
||||
from auth import is_admin_msg
|
||||
from keyboards import system_logs_tools_kb
|
||||
from services.processes import get_top_processes, search_processes, terminate_process
|
||||
from state import PROC_SEARCH_PENDING, PROC_KILL_PENDING
|
||||
|
||||
|
||||
def _proc_kb() -> InlineKeyboardMarkup:
|
||||
return InlineKeyboardMarkup(
|
||||
inline_keyboard=[[
|
||||
InlineKeyboardButton(text="🔄 Refresh", callback_data="proc:refresh"),
|
||||
InlineKeyboardButton(text="🔍 Search", callback_data="proc:search"),
|
||||
InlineKeyboardButton(text="🛑 Kill", callback_data="proc:kill"),
|
||||
]]
|
||||
)
|
||||
|
||||
|
||||
def _format_top(title: str, rows: list[dict]) -> str:
|
||||
if not rows:
|
||||
return f"{title}\n(no data)"
|
||||
lines = ["PID CPU% MEM% NAME"]
|
||||
for row in rows:
|
||||
lines.append(
|
||||
f"{row['pid']:<5} {row['cpu']:<5.1f} {row['mem']:<5.1f} {row['name']}"
|
||||
)
|
||||
return f"{title}\n" + "\n".join(lines)
|
||||
|
||||
|
||||
async def send_processes(msg: Message, edit: bool = False):
|
||||
top_cpu, top_mem = await asyncio.to_thread(get_top_processes)
|
||||
body = (
|
||||
"🧰 **Processes**\n\n"
|
||||
"```\n"
|
||||
f"{_format_top('Top CPU', top_cpu)}\n\n"
|
||||
f"{_format_top('Top RAM', top_mem)}\n"
|
||||
"```"
|
||||
)
|
||||
if edit:
|
||||
await msg.edit_text(body, reply_markup=_proc_kb(), parse_mode="Markdown")
|
||||
else:
|
||||
await msg.answer(body, reply_markup=_proc_kb(), parse_mode="Markdown")
|
||||
|
||||
|
||||
@dp.message(F.text == "🧰 Processes")
|
||||
async def proc_menu(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await send_processes(msg, edit=False)
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("proc:"))
|
||||
async def proc_actions(cb: CallbackQuery):
|
||||
if cb.from_user.id != ADMIN_ID:
|
||||
return
|
||||
await cb.answer()
|
||||
action = cb.data.split(":", 1)[1]
|
||||
if action == "refresh":
|
||||
await send_processes(cb.message, edit=True)
|
||||
return
|
||||
if action == "search":
|
||||
PROC_SEARCH_PENDING[cb.from_user.id] = {}
|
||||
await cb.message.answer("🔍 Send search text", reply_markup=system_logs_tools_kb)
|
||||
return
|
||||
if action == "kill":
|
||||
PROC_KILL_PENDING[cb.from_user.id] = {}
|
||||
await cb.message.answer("🛑 Send PID to terminate", reply_markup=system_logs_tools_kb)
|
||||
return
|
||||
|
||||
|
||||
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in PROC_SEARCH_PENDING))
|
||||
async def proc_search(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
PROC_SEARCH_PENDING.pop(msg.from_user.id, None)
|
||||
query = (msg.text or "").strip()
|
||||
if not query:
|
||||
await msg.answer("⚠️ Empty search", reply_markup=system_logs_tools_kb)
|
||||
return
|
||||
|
||||
rows = await asyncio.to_thread(search_processes, query)
|
||||
if not rows:
|
||||
await msg.answer("🔍 No matches", reply_markup=system_logs_tools_kb)
|
||||
return
|
||||
|
||||
lines = ["PID NAME CMD"]
|
||||
for row in rows:
|
||||
cmd = row["cmdline"] or "-"
|
||||
if len(cmd) > 80:
|
||||
cmd = cmd[:80] + "…"
|
||||
lines.append(f"{row['pid']:<5} {row['name']:<6} {cmd}")
|
||||
|
||||
text = "🔍 **Search results**\n```\n" + "\n".join(lines) + "\n```"
|
||||
await msg.answer(text, reply_markup=system_logs_tools_kb, parse_mode="Markdown")
|
||||
|
||||
|
||||
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in PROC_KILL_PENDING))
|
||||
async def proc_kill_pid(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
PROC_KILL_PENDING.pop(msg.from_user.id, None)
|
||||
raw = (msg.text or "").strip()
|
||||
try:
|
||||
pid = int(raw)
|
||||
except ValueError:
|
||||
await msg.answer("⚠️ Invalid PID", reply_markup=system_logs_tools_kb)
|
||||
return
|
||||
|
||||
kb = InlineKeyboardMarkup(
|
||||
inline_keyboard=[[
|
||||
InlineKeyboardButton(text="✅ Confirm", callback_data=f"prockill:{pid}:confirm"),
|
||||
InlineKeyboardButton(text="✖ Cancel", callback_data="prockill:cancel"),
|
||||
]]
|
||||
)
|
||||
await msg.answer(f"⚠️ Terminate PID `{pid}`?", reply_markup=kb, parse_mode="Markdown")
|
||||
|
||||
|
||||
@dp.callback_query(F.data.startswith("prockill:"))
|
||||
async def proc_kill_confirm(cb: CallbackQuery):
|
||||
if cb.from_user.id != ADMIN_ID:
|
||||
return
|
||||
parts = cb.data.split(":")
|
||||
if len(parts) < 2:
|
||||
await cb.answer("Bad request")
|
||||
return
|
||||
if parts[1] == "cancel":
|
||||
await cb.answer("Cancelled")
|
||||
await cb.message.delete()
|
||||
return
|
||||
if len(parts) != 3 or parts[2] != "confirm":
|
||||
await cb.answer("Bad request")
|
||||
return
|
||||
try:
|
||||
pid = int(parts[1])
|
||||
except ValueError:
|
||||
await cb.answer("Bad PID")
|
||||
return
|
||||
await cb.answer()
|
||||
result = await asyncio.to_thread(terminate_process, pid)
|
||||
await cb.message.answer(result, reply_markup=system_logs_tools_kb)
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import json
|
||||
import socket
|
||||
import time
|
||||
import psutil
|
||||
@@ -10,6 +11,8 @@ from keyboards import menu_kb
|
||||
from services.system import format_disks
|
||||
from services.health import health
|
||||
from state import DOCKER_MAP
|
||||
from services.runner import run_cmd_full
|
||||
from services.selftest import run_selftest
|
||||
|
||||
|
||||
async def cmd_status(msg: Message):
|
||||
@@ -21,23 +24,30 @@ async def cmd_status(msg: Message):
|
||||
minutes, _ = divmod(rem, 60)
|
||||
|
||||
load1 = psutil.getloadavg()[0]
|
||||
load_warn = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
||||
high_warn = float(cfg.get("thresholds", {}).get("high_load_warn", load_warn * 1.5))
|
||||
|
||||
cpu_icon = "🟢"
|
||||
if load1 > 2.0:
|
||||
if load1 > high_warn:
|
||||
cpu_icon = "🔴"
|
||||
elif load1 > 1.0:
|
||||
elif load1 > load_warn:
|
||||
cpu_icon = "🟡"
|
||||
|
||||
mem = psutil.virtual_memory()
|
||||
cpu_percent = psutil.cpu_percent(interval=None)
|
||||
|
||||
disks = format_disks()
|
||||
net_lines = await _network_snapshot()
|
||||
|
||||
await msg.answer(
|
||||
"📊 **Server status**\n\n"
|
||||
f"🖥 **Host:** `{socket.gethostname()}`\n"
|
||||
f"⏱ **Uptime:** {days}d {hours}h {minutes}m\n"
|
||||
f"{cpu_icon} **Load (1m):** {load1:.2f}\n"
|
||||
f"🧮 **CPU:** {cpu_percent:.0f}%\n"
|
||||
f"🧠 **RAM:** {mem.used // (1024**3)} / {mem.total // (1024**3)} GiB ({mem.percent}%)\n\n"
|
||||
f"{disks}",
|
||||
f"{disks}\n\n"
|
||||
f"{net_lines}",
|
||||
reply_markup=menu_kb,
|
||||
parse_mode="Markdown",
|
||||
)
|
||||
@@ -67,3 +77,96 @@ async def h(msg: Message):
|
||||
async def st(msg: Message):
|
||||
if is_admin_msg(msg):
|
||||
await cmd_status(msg)
|
||||
|
||||
|
||||
@dp.message(F.text == "/status_short")
|
||||
async def st_short(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
now = time.time()
|
||||
uptime_sec = int(now - psutil.boot_time())
|
||||
days, rem = divmod(uptime_sec, 86400)
|
||||
hours, rem = divmod(rem, 3600)
|
||||
minutes, _ = divmod(rem, 60)
|
||||
load1, load5, load15 = psutil.getloadavg()
|
||||
mem = psutil.virtual_memory()
|
||||
disks = format_disks().splitlines()
|
||||
disk_line = disks[1] if len(disks) > 1 else "Disks: n/a"
|
||||
await msg.answer(
|
||||
"📋 **Status (short)**\n"
|
||||
f"🖥 `{socket.gethostname()}`\n"
|
||||
f"⏱ Uptime: {days}d {hours}h {minutes}m\n"
|
||||
f"⚙️ Load: {load1:.2f} {load5:.2f} {load15:.2f}\n"
|
||||
f"🧠 RAM: {mem.percent}% ({mem.used // (1024**3)} / {mem.total // (1024**3)} GiB)\n"
|
||||
f"💾 {disk_line}",
|
||||
reply_markup=menu_kb,
|
||||
parse_mode="Markdown",
|
||||
)
|
||||
|
||||
|
||||
@dp.message(F.text == "/health_short")
|
||||
async def health_short(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
try:
|
||||
text = await asyncio.to_thread(health, cfg, DOCKER_MAP)
|
||||
except Exception as e:
|
||||
await msg.answer(f"❌ Health failed: {type(e).__name__}: {e}", reply_markup=menu_kb)
|
||||
return
|
||||
lines = [ln for ln in text.splitlines() if ln.strip()]
|
||||
brief = " | ".join(lines[1:5]) if len(lines) > 1 else text
|
||||
await msg.answer(f"🩺 Health (short)\n{brief}", reply_markup=menu_kb)
|
||||
|
||||
|
||||
@dp.message(F.text.in_({"🧪 Self-test", "/selftest"}))
|
||||
async def selftest(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
return
|
||||
|
||||
await msg.answer("⏳ Self-test…", reply_markup=menu_kb)
|
||||
|
||||
async def worker():
|
||||
text, _ok = await run_selftest(cfg, DOCKER_MAP)
|
||||
await msg.answer(text, reply_markup=menu_kb)
|
||||
|
||||
asyncio.create_task(worker())
|
||||
|
||||
|
||||
def _rate_str(value: float) -> str:
|
||||
if value >= 1024 * 1024:
|
||||
return f"{value / (1024 * 1024):.2f} MiB/s"
|
||||
if value >= 1024:
|
||||
return f"{value / 1024:.1f} KiB/s"
|
||||
return f"{value:.0f} B/s"
|
||||
|
||||
|
||||
async def _network_snapshot(interval: float = 1.0, prefix: str = "enp") -> str:
    """Sample per-NIC I/O counters over *interval* seconds and report the
    busiest interfaces.

    Only NICs whose name starts with *prefix* are considered (wired
    ``enp*`` interfaces by default); loopback and other devices fail the
    prefix test, so the previous explicit ``lo`` check was redundant and
    has been removed.  The prefix is now a parameter so other naming
    schemes (``eth``, ``wlan``, …) can reuse this helper.

    Returns a Markdown-formatted summary of the top three interfaces by
    combined traffic; errors are weighted at 1 KiB apiece so a flaky NIC
    surfaces in the ranking even at low throughput.
    """
    start = psutil.net_io_counters(pernic=True)
    await asyncio.sleep(interval)
    end = psutil.net_io_counters(pernic=True)

    rows = []
    for nic, s in end.items():
        if not nic.startswith(prefix):
            continue
        e = start.get(nic)
        if not e:
            # NIC appeared mid-sample; no baseline to diff against.
            continue
        # max(0, ...) guards against counter resets between the samples.
        rx = max(0, s.bytes_recv - e.bytes_recv)
        tx = max(0, s.bytes_sent - e.bytes_sent)
        err = max(0, (s.errin - e.errin) + (s.errout - e.errout))
        score = rx + tx + (err * 1024)
        rows.append((score, nic, rx, tx, err))

    rows.sort(reverse=True)
    top = rows[:3]
    if not top:
        return "📡 **Network (1s):** no data"

    lines = ["📡 **Network (1s):**"]
    for _score, nic, rx, tx, err in top:
        err_part = f", err {err}" if err else ""
        lines.append(f"- {nic}: RX {_rate_str(rx / interval)}, TX {_rate_str(tx / interval)}{err_part}")
    return "\n".join(lines)
|
||||
|
||||
1068
handlers/system.py
1068
handlers/system.py
File diff suppressed because it is too large
Load Diff
97
keyboards.py
97
keyboards.py
@@ -10,7 +10,7 @@ menu_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🩺 Health"), KeyboardButton(text="📊 Статус")],
|
||||
[KeyboardButton(text="🐳 Docker"), KeyboardButton(text="📦 Backup")],
|
||||
[KeyboardButton(text="🧉 Artifacts"), KeyboardButton(text="⚙️ System")],
|
||||
[KeyboardButton(text="⚙️ System")],
|
||||
[KeyboardButton(text="ℹ️ Help")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
@@ -18,8 +18,17 @@ menu_kb = ReplyKeyboardMarkup(
|
||||
|
||||
docker_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🐳 Status")],
|
||||
[KeyboardButton(text="🐳 Status"), KeyboardButton(text="🧰 Arcane")],
|
||||
[KeyboardButton(text="🔄 Restart"), KeyboardButton(text="📜 Logs")],
|
||||
[KeyboardButton(text="📈 Stats"), KeyboardButton(text="♻️ Restarts")],
|
||||
[KeyboardButton(text="⬅️ Назад")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
arcane_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🔄 Refresh")],
|
||||
[KeyboardButton(text="⬅️ Назад")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
@@ -29,8 +38,8 @@ backup_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="📦 Status"), KeyboardButton(text="📦 Last snapshot")],
|
||||
[KeyboardButton(text="📊 Repo stats"), KeyboardButton(text="🧯 Restore help")],
|
||||
[KeyboardButton(text="▶️ Run backup"), KeyboardButton(text="🧾 Queue")],
|
||||
[KeyboardButton(text="⬅️ Назад")],
|
||||
[KeyboardButton(text="▶️ Run backup"), KeyboardButton(text="🧾 Queue"), KeyboardButton(text="📊 Queue SLA")],
|
||||
[KeyboardButton(text="📉 Backup SLA"), KeyboardButton(text="📜 History"), KeyboardButton(text="⬅️ Назад")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
@@ -44,13 +53,85 @@ artifacts_kb = ReplyKeyboardMarkup(
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_kb = ReplyKeyboardMarkup(
|
||||
system_menu_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="ℹ️ Info"), KeyboardButton(text="🛠 Ops")],
|
||||
[KeyboardButton(text="📄 Logs"), KeyboardButton(text="⬅️ Назад")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_info_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="💽 Disks"), KeyboardButton(text="🔐 Security")],
|
||||
[KeyboardButton(text="🌐 URLs"), KeyboardButton(text="📦 Updates")],
|
||||
[KeyboardButton(text="⬆️ Upgrade")],
|
||||
[KeyboardButton(text="📈 Metrics"), KeyboardButton(text="🧱 Hardware")],
|
||||
[KeyboardButton(text="🧪 SMART test"), KeyboardButton(text="🧪 SMART status")],
|
||||
[KeyboardButton(text="📡 OpenWrt"), KeyboardButton(text="⬅️ System")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_ops_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="📦 Updates"), KeyboardButton(text="⬆️ Upgrade")],
|
||||
[KeyboardButton(text="🔄 Reboot")],
|
||||
[KeyboardButton(text="⬅️ Назад")],
|
||||
[KeyboardButton(text="⬅️ System")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_logs_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🧾 Audit/Incidents"), KeyboardButton(text="🔒 Security")],
|
||||
[KeyboardButton(text="🧩 Integrations"), KeyboardButton(text="🧰 Processes")],
|
||||
[KeyboardButton(text="📣 Summary"), KeyboardButton(text="🔥 Heatmap")],
|
||||
[KeyboardButton(text="⬅️ System")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_logs_audit_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🧾 Audit"), KeyboardButton(text="📣 Incidents")],
|
||||
[KeyboardButton(text="🆕 Diff"), KeyboardButton(text="📤 Export")],
|
||||
[KeyboardButton(text="📦 Export all"), KeyboardButton(text="🧰 Alerts log")],
|
||||
[KeyboardButton(text="⬅️ Logs")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_logs_security_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🔑 SSH log"), KeyboardButton(text="🔒 SSL")],
|
||||
[KeyboardButton(text="🌍 External"), KeyboardButton(text="🌐 URLs")],
|
||||
[KeyboardButton(text="⬅️ Logs")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_logs_integrations_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🧩 NPMplus"), KeyboardButton(text="🍵 Gitea")],
|
||||
[KeyboardButton(text="⬅️ Logs")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
system_logs_tools_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🧰 Processes")],
|
||||
[KeyboardButton(text="⬅️ Logs")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
# OpenWrt submenu (4 rows)
|
||||
openwrt_kb = ReplyKeyboardMarkup(
|
||||
keyboard=[
|
||||
[KeyboardButton(text="🌐 WAN fast"), KeyboardButton(text="📡 Full status")],
|
||||
[KeyboardButton(text="📶 Wi-Fi clients"), KeyboardButton(text="🧾 Leases")],
|
||||
[KeyboardButton(text="🔀 Leases diff")],
|
||||
[KeyboardButton(text="⬅️ System")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from pathlib import Path
|
||||
import os
|
||||
import time
|
||||
|
||||
LOCK_DIR = Path("/var/run/tg-bot")
|
||||
@@ -11,9 +12,14 @@ def lock_path(name: str) -> Path:
|
||||
|
||||
def acquire_lock(name: str) -> bool:
    """Atomically create the lock file for *name*.

    Creation and the existence check are a single O_CREAT | O_EXCL
    syscall, so there is no check-then-create (TOCTOU) race.  The current
    timestamp is written into the file to help diagnose stale locks.

    This also removes the leftover pre-patch lines that were interleaved
    here (a body-less ``if p.exists():`` and a duplicate
    ``p.write_text(...)``), which made the block syntactically invalid.

    Returns True when the lock was acquired, False when it is already held.
    """
    p = lock_path(name)
    try:
        fd = os.open(str(p), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        return False
    try:
        os.write(fd, str(time.time()).encode("ascii", errors="ignore"))
    finally:
        os.close(fd)
    return True
|
||||
|
||||
|
||||
|
||||
66
main.py
66
main.py
@@ -1,12 +1,20 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import socket
|
||||
from datetime import datetime
|
||||
from app import bot, dp, cfg, ADMIN_ID
|
||||
from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS
|
||||
from keyboards import menu_kb
|
||||
from services.docker import discover_containers, docker_watchdog
|
||||
from services.alerts import monitor_resources, monitor_smart
|
||||
from services.queue import worker as queue_worker
|
||||
from services.alerts import monitor_resources, monitor_smart, monitor_raid
|
||||
from services.metrics import MetricsStore, start_sampler
|
||||
from services.queue import worker as queue_worker, configure as queue_configure
|
||||
from services.notify import notify
|
||||
from services.audit import AuditMiddleware, audit_start
|
||||
from services.ssl_alerts import monitor_ssl
|
||||
from services.external_checks import monitor_external
|
||||
from services.incidents import log_incident
|
||||
from services.logging_setup import setup_logging
|
||||
from services.selftest import schedule_selftest
|
||||
import state
|
||||
import handlers.menu
|
||||
import handlers.status
|
||||
@@ -16,6 +24,41 @@ import handlers.artifacts
|
||||
import handlers.system
|
||||
import handlers.help
|
||||
import handlers.callbacks
|
||||
import handlers.arcane
|
||||
import handlers.processes
|
||||
from services.weekly_report import weekly_reporter
|
||||
import handlers.alerts_admin
|
||||
import handlers.config_check
|
||||
|
||||
|
||||
def _handle_async_exception(_loop, context):
    """asyncio loop exception handler: record unhandled task errors as incidents.

    A sliding one-hour window of occurrences is kept on attributes of the
    function object itself; when 3 or more errors land within the hour, an
    extra "exception_flood" incident is logged (rate-limited to once per
    hour).  Every individual error is also logged as its own incident and
    echoed to the "asyncio" logger.
    """
    msg = context.get("message") or "Unhandled exception"
    exc = context.get("exception")
    if exc:
        text = f"❌ {msg}: {type(exc).__name__}: {exc}"
    else:
        text = f"❌ {msg}"
    now = datetime.now()
    # Lazily initialise mutable state on the function object so the handler
    # needs no module-level globals.
    if not hasattr(_handle_async_exception, "_recent"):
        _handle_async_exception._recent = []
        _handle_async_exception._last_alert = None
    recent = _handle_async_exception._recent
    recent.append(now)
    # keep last hour
    _handle_async_exception._recent = [t for t in recent if (now - t).total_seconds() < 3600]
    if len(_handle_async_exception._recent) >= 3:
        last_alert = getattr(_handle_async_exception, "_last_alert", None)
        if not last_alert or (now - last_alert).total_seconds() > 3600:
            try:
                log_incident(cfg, "exception_flood", category="system")
            except Exception:
                # Incident logging must never take down the exception handler.
                pass
            _handle_async_exception._last_alert = now
    try:
        log_incident(cfg, text, category="system")
    except Exception:
        pass
    logging.getLogger("asyncio").error(text)
|
||||
|
||||
|
||||
async def notify_start():
|
||||
@@ -27,6 +70,10 @@ async def notify_start():
|
||||
|
||||
|
||||
async def main():
|
||||
setup_logging(cfg)
|
||||
dp.message.middleware(AuditMiddleware(cfg))
|
||||
dp.callback_query.middleware(AuditMiddleware(cfg))
|
||||
audit_start(cfg)
|
||||
state.DOCKER_MAP.clear()
|
||||
state.DOCKER_MAP.update(await discover_containers(cfg))
|
||||
if cfg.get("docker", {}).get("watchdog", True):
|
||||
@@ -35,7 +82,20 @@ async def main():
|
||||
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
|
||||
if cfg.get("alerts", {}).get("smart_enabled", True):
|
||||
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
||||
if cfg.get("alerts", {}).get("raid_enabled", True):
|
||||
asyncio.create_task(monitor_raid(cfg, notify, bot, ADMIN_ID))
|
||||
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
||||
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
||||
if cfg.get("external_checks", {}).get("enabled", True):
|
||||
asyncio.create_task(monitor_external(cfg))
|
||||
state.METRICS_STORE = MetricsStore()
|
||||
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
||||
queue_configure(cfg.get("queue", {}), cfg)
|
||||
asyncio.create_task(queue_worker())
|
||||
asyncio.create_task(weekly_reporter(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
||||
asyncio.create_task(schedule_selftest(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
||||
loop = asyncio.get_running_loop()
|
||||
loop.set_exception_handler(_handle_async_exception)
|
||||
await notify_start()
|
||||
await dp.start_polling(bot)
|
||||
|
||||
|
||||
93
services/alert_mute.py
Normal file
93
services/alert_mute.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import time
|
||||
from typing import Dict
|
||||
from services.runtime_state import get_state, set_state
|
||||
|
||||
# category -> unix timestamp until muted
|
||||
|
||||
|
||||
def _mutes() -> Dict[str, float]:
    """Return the persisted mute table: category -> unix time muted until."""
    state = get_state()
    return state.get("mutes", {})
|
||||
|
||||
|
||||
def _save(mutes: Dict[str, float]):
    """Persist the mute table back into shared runtime state."""
    set_state("mutes", mutes)
|
||||
|
||||
|
||||
def _cleanup() -> None:
    """Drop mute entries whose deadline has already passed, then persist."""
    mutes = _mutes()
    cutoff = time.time()
    for category in [k for k, until in mutes.items() if until <= cutoff]:
        mutes.pop(category, None)
    _save(mutes)
|
||||
|
||||
|
||||
def set_mute(category: str, seconds: int) -> float:
    """Mute *category* for *seconds* from now; returns the expiry timestamp.

    Negative durations are clamped to zero (mute expires immediately).
    """
    _cleanup()
    mutes = _mutes()
    deadline = time.time() + max(0, seconds)
    mutes[category] = deadline
    _save(mutes)
    return deadline
|
||||
|
||||
|
||||
def clear_mute(category: str) -> None:
    """Remove any mute for *category*; a no-op when it is not muted."""
    mutes = _mutes()
    mutes.pop(category, None)
    _save(mutes)
|
||||
|
||||
|
||||
def is_muted(category: str | None) -> bool:
    """Return True while *category* is actively muted.

    A None/empty category is never muted.  An entry that expired between
    the _cleanup() pass and the lookup is purged eagerly.
    """
    if not category:
        return False
    _cleanup()
    mutes = _mutes()
    deadline = mutes.get(category)
    if deadline is None:
        return False
    if deadline > time.time():
        return True
    # Deadline passed since _cleanup() ran; drop the stale entry.
    mutes.pop(category, None)
    _save(mutes)
    return False
|
||||
|
||||
|
||||
def list_mutes() -> dict[str, int]:
    """Return the active mutes as category -> whole seconds remaining."""
    _cleanup()
    reference = time.time()
    return {category: int(until - reference) for category, until in _mutes().items()}
|
||||
|
||||
|
||||
def is_auto_muted(cfg: dict, category: str | None) -> bool:
|
||||
if not category:
|
||||
return False
|
||||
auto_list = cfg.get("alerts", {}).get("auto_mute", [])
|
||||
if not isinstance(auto_list, list):
|
||||
return False
|
||||
now = time.localtime()
|
||||
now_minutes = now.tm_hour * 60 + now.tm_min
|
||||
for item in auto_list:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
cat = item.get("category")
|
||||
if cat != category:
|
||||
continue
|
||||
start = item.get("start", "00:00")
|
||||
end = item.get("end", "00:00")
|
||||
try:
|
||||
sh, sm = [int(x) for x in start.split(":")]
|
||||
eh, em = [int(x) for x in end.split(":")]
|
||||
except Exception:
|
||||
continue
|
||||
start_min = sh * 60 + sm
|
||||
end_min = eh * 60 + em
|
||||
if start_min == end_min:
|
||||
continue
|
||||
if start_min < end_min:
|
||||
if start_min <= now_minutes < end_min:
|
||||
return True
|
||||
else:
|
||||
if now_minutes >= start_min or now_minutes < end_min:
|
||||
return True
|
||||
return False
|
||||
@@ -1,8 +1,9 @@
|
||||
import asyncio
|
||||
import time
|
||||
import psutil
|
||||
from system_checks import list_disks, smart_health, disk_temperature
|
||||
from system_checks import list_disks, smart_health, disk_temperature, list_md_arrays, md_array_status
|
||||
from services.system import worst_disk_usage
|
||||
from services.disk_report import build_disk_report
|
||||
|
||||
|
||||
async def monitor_resources(cfg, notify, bot, chat_id):
|
||||
@@ -10,12 +11,17 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
||||
interval = int(alerts_cfg.get("interval_sec", 60))
|
||||
cooldown = int(alerts_cfg.get("cooldown_sec", 900))
|
||||
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
|
||||
load_only_critical = bool(alerts_cfg.get("load_only_critical", False))
|
||||
auto_mute_high_load_sec = int(alerts_cfg.get("auto_mute_on_high_load_sec", 0))
|
||||
|
||||
disk_warn = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
||||
snapshot_warn = int(cfg.get("disk_report", {}).get("threshold", disk_warn))
|
||||
snapshot_cooldown = int(cfg.get("disk_report", {}).get("cooldown_sec", 21600))
|
||||
load_warn = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
|
||||
high_warn = float(cfg.get("thresholds", {}).get("high_load_warn", load_warn * 1.5))
|
||||
|
||||
last_sent = {"disk": 0.0, "load": 0.0, "disk_na": 0.0}
|
||||
state = {"disk_high": False, "load_high": False, "disk_na": False}
|
||||
last_sent = {"disk": 0.0, "load": 0.0, "disk_na": 0.0, "disk_report": 0.0}
|
||||
state = {"disk_high": False, "disk_na": False, "load_level": 0}
|
||||
|
||||
while True:
|
||||
now = time.time()
|
||||
@@ -23,34 +29,55 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
||||
usage, mount = worst_disk_usage()
|
||||
if usage is None:
|
||||
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
|
||||
await notify(bot, chat_id, "⚠️ Disk usage n/a")
|
||||
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na", category="disk")
|
||||
state["disk_na"] = True
|
||||
last_sent["disk_na"] = now
|
||||
else:
|
||||
if state["disk_na"] and notify_recovery:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
|
||||
if state["disk_na"] and notify_recovery and not load_only_critical:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
|
||||
state["disk_na"] = False
|
||||
|
||||
if usage >= disk_warn:
|
||||
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
|
||||
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})")
|
||||
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high", category="disk")
|
||||
state["disk_high"] = True
|
||||
last_sent["disk"] = now
|
||||
else:
|
||||
if state["disk_high"] and notify_recovery:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
|
||||
if state["disk_high"] and notify_recovery and not load_only_critical:
|
||||
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
|
||||
state["disk_high"] = False
|
||||
|
||||
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
|
||||
report = await build_disk_report(cfg, mount or "/", usage)
|
||||
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot", category="disk")
|
||||
last_sent["disk_report"] = now
|
||||
|
||||
load = psutil.getloadavg()[0]
|
||||
if load >= load_warn:
|
||||
if not state["load_high"] or now - last_sent["load"] >= cooldown:
|
||||
await notify(bot, chat_id, f"🟡 Load high: {load:.2f}")
|
||||
state["load_high"] = True
|
||||
last_sent["load"] = now
|
||||
if load >= high_warn:
|
||||
level = 2
|
||||
elif load >= load_warn:
|
||||
level = 1
|
||||
else:
|
||||
if state["load_high"] and notify_recovery:
|
||||
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}")
|
||||
state["load_high"] = False
|
||||
level = 0
|
||||
if load_only_critical and level == 1:
|
||||
level = 0
|
||||
|
||||
if level == 0:
|
||||
if state["load_level"] > 0 and notify_recovery and not load_only_critical:
|
||||
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok", category="load")
|
||||
state["load_level"] = 0
|
||||
else:
|
||||
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
|
||||
icon = "🔴" if level == 2 else "🟡"
|
||||
level_name = "critical" if level == 2 else "warn"
|
||||
key = "load_high_crit" if level == 2 else "load_high_warn"
|
||||
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
|
||||
last_sent["load"] = now
|
||||
if level == 2 and auto_mute_high_load_sec > 0:
|
||||
from services.alert_mute import set_mute
|
||||
|
||||
set_mute("load", auto_mute_high_load_sec)
|
||||
state["load_level"] = level
|
||||
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
@@ -74,7 +101,14 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
||||
continue
|
||||
|
||||
if "FAILED" in health:
|
||||
await notify(bot, chat_id, f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}")
|
||||
await notify(
|
||||
bot,
|
||||
chat_id,
|
||||
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
|
||||
level="critical",
|
||||
key=f"smart_fail:{dev}",
|
||||
category="smart",
|
||||
)
|
||||
last_sent[key] = now
|
||||
continue
|
||||
|
||||
@@ -84,8 +118,66 @@ async def monitor_smart(cfg, notify, bot, chat_id):
|
||||
except ValueError:
|
||||
t = None
|
||||
if t is not None and t >= temp_warn:
|
||||
await notify(bot, chat_id, f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}")
|
||||
await notify(
|
||||
bot,
|
||||
chat_id,
|
||||
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
|
||||
level="warn",
|
||||
key=f"smart_hot:{dev}",
|
||||
category="smart",
|
||||
)
|
||||
last_sent[key] = now
|
||||
continue
|
||||
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
|
||||
async def monitor_raid(cfg, notify, bot, chat_id):
    """Background loop that alerts on mdadm RAID array problems.

    Polls every ``alerts.raid_interval_sec`` seconds (default 300).  An
    array whose status contains "inactive" is critical; "degraded" is a
    warning.  While an array stays bad, repeat alerts are throttled to
    once per ``alerts.raid_cooldown_sec`` (default 1800); a single
    recovery message is sent when it returns to normal (if
    ``alerts.notify_recovery``).  Runs forever — intended to be wrapped in
    asyncio.create_task().
    """
    alerts_cfg = cfg.get("alerts", {})
    interval = int(alerts_cfg.get("raid_interval_sec", 300))
    cooldown = int(alerts_cfg.get("raid_cooldown_sec", 1800))
    notify_recovery = bool(alerts_cfg.get("notify_recovery", True))

    # Per-device bookkeeping: when we last alerted, and whether it was bad.
    last_sent: dict[str, float] = {}
    bad_state: dict[str, bool] = {}

    while True:
        now = time.time()
        for dev in list_md_arrays():
            status = md_array_status(dev)
            lower = status.lower()
            level = None
            key_suffix = None
            if "inactive" in lower:
                level = "critical"
                key_suffix = "inactive"
            elif "degraded" in lower:
                level = "warn"
                key_suffix = "degraded"

            if level:
                # Alert immediately on the good->bad transition; while the
                # device stays bad, re-alert only after the cooldown.
                if not bad_state.get(dev) or (now - last_sent.get(dev, 0.0) >= cooldown):
                    icon = "🔴" if level == "critical" else "🟡"
                    await notify(
                        bot,
                        chat_id,
                        f"{icon} RAID {dev}: {status}",
                        level=level,
                        key=f"raid_{key_suffix}:{dev}",
                        category="raid",
                    )
                    last_sent[dev] = now
                bad_state[dev] = True
            else:
                # Recovery message only on the bad->good transition.
                if bad_state.get(dev) and notify_recovery:
                    await notify(
                        bot,
                        chat_id,
                        f"🟢 RAID {dev}: {status}",
                        level="info",
                        key=f"raid_ok:{dev}",
                        category="raid",
                    )
                bad_state[dev] = False

        await asyncio.sleep(interval)
|
||||
|
||||
104
services/arcane.py
Normal file
104
services/arcane.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import json
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError, HTTPError
|
||||
|
||||
|
||||
def list_projects(base_url: str, api_key: str, env_id: int, timeout: int = 10) -> tuple[bool, str, list[dict]]:
    """Fetch the project list for an Arcane environment.

    Returns ``(ok, detail, projects)``; on any failure *projects* is ``[]``
    and *detail* carries a short error description.  The response body is
    expected to be ``{"success": bool, "data": [...]}``.
    """
    url = f"{base_url.rstrip('/')}/api/environments/{env_id}/projects"
    req = Request(url, headers={"X-Api-Key": api_key})
    try:
        with urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
    except HTTPError as e:
        return False, f"HTTP {e.code}", []
    except URLError as e:
        return False, f"URL error: {e.reason}", []
    except Exception as e:
        # Catch-all so a single bad call never crashes the handler.
        return False, f"Error: {e}", []

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return False, "Invalid JSON", []

    if not payload.get("success"):
        return False, "API returned success=false", []

    return True, "OK", payload.get("data", [])
|
||||
|
||||
|
||||
def restart_project(base_url: str, api_key: str, env_id: int, project_id: str, timeout: int = 20) -> tuple[bool, str]:
    """Restart an Arcane project.

    Thin wrapper over set_project_state() with ``action="restart"`` —
    previously this duplicated the same POST/parse logic verbatim.

    Returns ``(ok, detail)``.
    """
    return set_project_state(base_url, api_key, env_id, project_id, "restart", timeout=timeout)
|
||||
|
||||
|
||||
def set_project_state(
    base_url: str,
    api_key: str,
    env_id: int,
    project_id: str,
    action: str,
    timeout: int = 20,
) -> tuple[bool, str]:
    """POST an Arcane project lifecycle action (e.g. "up", "down", "restart").

    *action* becomes the last URL path segment.  Returns ``(ok, detail)``.
    A body that parses as JSON with ``success: false`` is treated as a
    failure; an empty or unparseable body is accepted as success, since
    some endpoints return no payload.
    """
    url = f"{base_url.rstrip('/')}/api/environments/{env_id}/projects/{project_id}/{action}"
    req = Request(url, method="POST", headers={"X-Api-Key": api_key})
    try:
        with urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
    except HTTPError as e:
        return False, f"HTTP {e.code}"
    except URLError as e:
        return False, f"URL error: {e.reason}"
    except Exception as e:
        return False, f"Error: {e}"

    try:
        payload = json.loads(raw) if raw else {}
    except json.JSONDecodeError:
        payload = {}

    # Lenient default: only an explicit success=false is a failure.
    if payload and not payload.get("success", True):
        return False, "API returned success=false"
    return True, "OK"
|
||||
|
||||
|
||||
def get_project_details(base_url: str, api_key: str, env_id: int, project_id: str, timeout: int = 10) -> tuple[bool, str, dict]:
    """Fetch a single Arcane project's details.

    Returns ``(ok, detail, data)``; *data* is ``{}`` on any failure and
    *detail* carries a short error description.
    """
    url = f"{base_url.rstrip('/')}/api/environments/{env_id}/projects/{project_id}"
    req = Request(url, headers={"X-Api-Key": api_key})
    try:
        with urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
    except HTTPError as e:
        return False, f"HTTP {e.code}", {}
    except URLError as e:
        return False, f"URL error: {e.reason}", {}
    except Exception as e:
        return False, f"Error: {e}", {}

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return False, "Invalid JSON", {}

    if not payload.get("success"):
        return False, "API returned success=false", {}

    return True, "OK", payload.get("data", {})
|
||||
138
services/audit.py
Normal file
138
services/audit.py
Normal file
@@ -0,0 +1,138 @@
|
||||
import logging
|
||||
import os
|
||||
from collections import deque
|
||||
from datetime import datetime, timezone
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
from typing import Any, Optional
|
||||
|
||||
from aiogram import BaseMiddleware
|
||||
from aiogram.types import CallbackQuery, Message
|
||||
|
||||
|
||||
def _get_audit_path(cfg: dict[str, Any]) -> str:
|
||||
return cfg.get("audit", {}).get("path", "/var/server-bot/audit.log")
|
||||
|
||||
|
||||
def get_audit_logger(cfg: dict[str, Any]) -> logging.Logger:
    """Return the process-wide "audit" logger, configuring it on first use.

    The ``logger.handlers`` check makes the call idempotent: subsequent
    calls reuse the already-attached rotating handler instead of stacking
    duplicates (which would double every audit line).
    """
    logger = logging.getLogger("audit")
    if logger.handlers:
        return logger

    path = _get_audit_path(cfg)
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # "W0" = rotate weekly on Monday; 8 backups keep roughly two months.
    rotate_when = cfg.get("audit", {}).get("rotate_when", "W0")
    backup_count = int(cfg.get("audit", {}).get("backup_count", 8))
    handler = TimedRotatingFileHandler(
        path,
        when=rotate_when,
        interval=1,
        backupCount=backup_count,
        encoding="utf-8",
        utc=True,
    )
    formatter = logging.Formatter("%(asctime)s\t%(message)s")
    handler.setFormatter(formatter)

    logger.setLevel(logging.INFO)
    logger.addHandler(handler)
    # Keep audit lines out of the root logger / regular application log.
    logger.propagate = False
    return logger
|
||||
|
||||
|
||||
def audit_health(cfg: dict[str, Any]) -> tuple[bool, str]:
    """Verify the audit log file is writable, creating it if needed.

    Returns ``(True, path)`` on success, ``(False, reason)`` otherwise.
    """
    path = _get_audit_path(cfg)
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if not os.path.exists(path):
            # Touch the file so the writability check below is meaningful.
            with open(path, "a", encoding="utf-8"):
                pass
        if not os.access(path, os.W_OK):
            return False, f"Audit log not writable: {path}"
    except Exception as e:
        return False, f"Audit log error: {e}"
    return True, path
|
||||
|
||||
|
||||
def audit_start(cfg: dict[str, Any]) -> None:
    """Write a startup marker, including the audit-log health result."""
    logger = get_audit_logger(cfg)
    ok, detail = audit_health(cfg)
    logger.info("startup\tstatus=%s\tpath=%s", "ok" if ok else "error", detail)
|
||||
|
||||
|
||||
def _user_label(message: Message | CallbackQuery) -> str:
|
||||
user = message.from_user
|
||||
if not user:
|
||||
return "unknown"
|
||||
parts = [user.username, user.first_name, user.last_name]
|
||||
label = " ".join(p for p in parts if p)
|
||||
return label or str(user.id)
|
||||
|
||||
|
||||
def _normalize_action(text: str, limit: int = 200) -> str:
|
||||
cleaned = " ".join(text.split())
|
||||
if len(cleaned) > limit:
|
||||
return cleaned[:limit] + "…"
|
||||
return cleaned
|
||||
|
||||
|
||||
class AuditMiddleware(BaseMiddleware):
    """aiogram middleware that writes one audit line per message/callback.

    Logging is best-effort and must never prevent the update from reaching
    its handler.
    """

    def __init__(self, cfg: dict[str, Any]) -> None:
        self.cfg = cfg
        self.logger = get_audit_logger(cfg)

    async def __call__(self, handler, event, data):
        """Log who did what, then pass the event through unchanged."""
        if not self.cfg.get("audit", {}).get("enabled", True):
            return await handler(event, data)

        action: Optional[str] = None
        if isinstance(event, Message):
            if event.text:
                action = _normalize_action(event.text)
            elif event.caption:
                action = _normalize_action(event.caption)
            else:
                # Non-text content (photo, sticker, ...): log the type only.
                action = f"<{event.content_type}>"
        elif isinstance(event, CallbackQuery):
            if event.data:
                action = _normalize_action(f"callback:{event.data}")
            else:
                action = "callback:<empty>"

        if action:
            if isinstance(event, Message):
                chat_id = event.chat.id
            else:
                # CallbackQuery.message can be None (e.g. inaccessible or
                # too-old message); don't crash the pipeline over an audit
                # line — the old code raised AttributeError here.
                chat_id = event.message.chat.id if event.message else "unknown"
            user_id = event.from_user.id if event.from_user else "unknown"
            label = _user_label(event)
            self.logger.info(
                "user_id=%s\tuser=%s\tchat_id=%s\taction=%s",
                user_id,
                label,
                chat_id,
                action,
            )

        return await handler(event, data)
|
||||
|
||||
|
||||
def read_audit_tail(cfg: dict[str, Any], limit: int = 200) -> str:
    """Render the last *limit* audit-log lines as a Markdown code block.

    The body is additionally capped at ~3500 characters so the resulting
    message stays within Telegram's size limit.
    """
    path = _get_audit_path(cfg)
    if not os.path.exists(path):
        return "⚠️ Audit log not found"

    tail: deque = deque(maxlen=limit)
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for raw in fh:
            tail.append(raw.rstrip())

    if not tail:
        return "ℹ️ Audit log is empty"

    header = f"🧾 Audit log ({datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC})"
    body = "\n".join(tail)
    max_body = 3500
    if len(body) > max_body:
        body = "...(truncated)\n" + body[-max_body:]
    return f"{header}\n```\n{body}\n```"
|
||||
35
services/config_check.py
Normal file
35
services/config_check.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import os
|
||||
from typing import Any, Tuple, List
|
||||
|
||||
|
||||
def validate_cfg(cfg: dict[str, Any]) -> Tuple[List[str], List[str]]:
    """Sanity-check the bot configuration.

    Returns ``(errors, warnings)``: errors are settings the bot cannot run
    without (token, at least one admin); warnings are suspicious but
    survivable gaps.
    """
    errors: List[str] = []
    warnings: List[str] = []

    telegram = cfg.get("telegram", {})
    if not telegram.get("token"):
        errors.append("telegram.token is missing")
    admin_list = telegram.get("admin_ids")
    # Either a single admin_id or a non-empty admin_ids list is acceptable.
    if not telegram.get("admin_id") and not (isinstance(admin_list, list) and admin_list):
        errors.append("telegram.admin_id is missing")

    configured = cfg.get("thresholds", {})
    warnings.extend(
        f"thresholds.{name} not set"
        for name in ("disk_warn", "load_warn", "high_load_warn")
        if name not in configured
    )

    restic_env = cfg.get("paths", {}).get("restic_env")
    if restic_env and not os.path.exists(restic_env):
        warnings.append(f"paths.restic_env not found: {restic_env}")

    npm = cfg.get("npmplus", {})
    if npm and not npm.get("token") and (not npm.get("identity") or not npm.get("secret")):
        warnings.append("npmplus: token missing and identity/secret missing")

    openwrt = cfg.get("openwrt", {})
    if openwrt and not openwrt.get("host"):
        warnings.append("openwrt.host is missing")

    return errors, warnings
|
||||
78
services/disk_report.py
Normal file
78
services/disk_report.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import os
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from services.runner import run_cmd
|
||||
|
||||
|
||||
def _top_dirs_cmd(path: str, limit: int) -> list[str]:
|
||||
_ = limit
|
||||
return ["du", "-x", "-h", "-d", "1", path]
|
||||
|
||||
|
||||
_SIZE_RE = re.compile(r"^\s*([0-9]+(?:\.[0-9]+)?)([KMGTP]?)(i?B?)?$", re.IGNORECASE)
|
||||
|
||||
|
||||
def _size_to_bytes(value: str) -> float:
|
||||
m = _SIZE_RE.match(value.strip())
|
||||
if not m:
|
||||
return -1.0
|
||||
num = float(m.group(1))
|
||||
unit = (m.group(2) or "").upper()
|
||||
mul = {
|
||||
"": 1,
|
||||
"K": 1024,
|
||||
"M": 1024**2,
|
||||
"G": 1024**3,
|
||||
"T": 1024**4,
|
||||
"P": 1024**5,
|
||||
}.get(unit, 1)
|
||||
return num * mul
|
||||
|
||||
|
||||
def _format_top_dirs(raw: str, limit: int) -> str:
    """Keep the *limit* largest entries of `du` output, smallest first.

    Each kept line is re-emitted as "<size>\t<name>"; malformed or blank
    lines are silently skipped.
    """
    parsed: list[tuple[float, str]] = []
    for entry in raw.splitlines():
        entry = entry.strip()
        if not entry:
            continue
        fields = entry.split(maxsplit=1)
        if len(fields) != 2:
            continue
        size_str, dir_name = fields
        parsed.append((_size_to_bytes(size_str), f"{size_str}\t{dir_name}"))
    # Ascending by byte count; the tail slice is therefore the largest dirs.
    parsed.sort(key=lambda pair: pair[0])
    keep = max(1, limit)
    return "\n".join(text for _unused, text in parsed[-keep:])
|
||||
|
||||
|
||||
async def _disk_section(path: str, title: str, limit: int) -> list[str]:
    """Run `du` over *path* and return report lines ("" separator, title, rows).

    Returns an empty list when the command fails or produces no output, so
    the caller can append unconditionally.
    """
    rc, out = await run_cmd(_top_dirs_cmd(path, limit), timeout=30)
    if rc != 0 or not out.strip():
        return []
    return ["", title, _format_top_dirs(out, limit)]


async def build_disk_report(cfg: dict[str, Any], mount: str, usage: int) -> str:
    """Build a human-readable disk report for *mount* (currently at *usage* %).

    Sections: top-level directories of the mount, then the Docker data dir
    and the logs dir when they exist. Refactored: the three copy-pasted
    run/format/append sequences now share the `_disk_section` helper.
    """
    report_cfg = cfg.get("disk_report", {})
    limit = int(report_cfg.get("top_dirs", 8))

    lines = ["🧱 Disk report", f"💽 {mount}: {usage}%"]
    lines += await _disk_section(mount, "Top directories:", limit)

    docker_dir = report_cfg.get("docker_dir", "/var/lib/docker")
    if docker_dir and os.path.exists(docker_dir):
        lines += await _disk_section(docker_dir, f"Docker dir: {docker_dir}", limit)

    logs_dir = report_cfg.get("logs_dir", "/var/log")
    if logs_dir and os.path.exists(logs_dir):
        lines += await _disk_section(logs_dir, f"Logs dir: {logs_dir}", limit)

    return "\n".join(lines)
|
||||
@@ -101,16 +101,35 @@ async def docker_cmd(args: list[str], timeout: int = 20):
|
||||
async def docker_watchdog(container_map, notify, bot, chat_id):
|
||||
last = {}
|
||||
while True:
|
||||
if not last:
|
||||
for alias, real in container_map.items():
|
||||
rc, raw = await docker_cmd(
|
||||
["inspect", "-f", "{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}n/a{{end}}", real],
|
||||
timeout=10
|
||||
)
|
||||
if rc != 0:
|
||||
last[alias] = ("error", "n/a")
|
||||
continue
|
||||
parts = raw.strip().split("|", 1)
|
||||
status = parts[0] if parts else "unknown"
|
||||
health = parts[1] if len(parts) > 1 else "n/a"
|
||||
last[alias] = (status, health)
|
||||
await asyncio.sleep(120)
|
||||
continue
|
||||
for alias, real in container_map.items():
|
||||
rc, state = await docker_cmd(
|
||||
["inspect", "-f", "{{.State.Status}}", real],
|
||||
rc, raw = await docker_cmd(
|
||||
["inspect", "-f", "{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}n/a{{end}}", real],
|
||||
timeout=10
|
||||
)
|
||||
if rc != 0:
|
||||
state = "error"
|
||||
state = state.strip()
|
||||
if last.get(alias) != state:
|
||||
if state != "running":
|
||||
status, health = "error", "n/a"
|
||||
else:
|
||||
parts = raw.strip().split("|", 1)
|
||||
status = parts[0] if parts else "unknown"
|
||||
health = parts[1] if len(parts) > 1 else "n/a"
|
||||
|
||||
if last.get(alias) != (status, health):
|
||||
if status != "running":
|
||||
kb = InlineKeyboardMarkup(
|
||||
inline_keyboard=[[
|
||||
InlineKeyboardButton(
|
||||
@@ -121,10 +140,26 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
|
||||
)
|
||||
await bot.send_message(
|
||||
chat_id,
|
||||
f"🐳 {alias}: {state}",
|
||||
f"🐳 {alias}: {status}",
|
||||
reply_markup=kb,
|
||||
)
|
||||
elif health not in ("healthy", "n/a"):
|
||||
await notify(
|
||||
bot,
|
||||
chat_id,
|
||||
f"⚠️ {alias} health: {health}",
|
||||
level="warn",
|
||||
key=f"docker_health:{alias}",
|
||||
category="docker",
|
||||
)
|
||||
else:
|
||||
await notify(bot, chat_id, f"🐳 {alias}: {state}")
|
||||
last[alias] = state
|
||||
await notify(
|
||||
bot,
|
||||
chat_id,
|
||||
f"🐳 {alias}: {status}",
|
||||
level="info",
|
||||
key=f"docker_status:{alias}:{status}",
|
||||
category="docker",
|
||||
)
|
||||
last[alias] = (status, health)
|
||||
await asyncio.sleep(120)
|
||||
|
||||
143
services/external_checks.py
Normal file
143
services/external_checks.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
|
||||
def _state_path(cfg: dict[str, Any]) -> str:
|
||||
return cfg.get("external_checks", {}).get("state_path", "/var/server-bot/external_checks.json")
|
||||
|
||||
|
||||
def _load_state(cfg: dict[str, Any]) -> dict[str, Any]:
    """Read the persisted check counters; fall back to an empty state.

    A missing, unreadable or corrupted file resets the counters rather
    than crashing the check loop.
    """
    path = _state_path(cfg)
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as fh:
                return json.load(fh)
        except Exception:
            pass  # deliberate best-effort: start over on corruption
    return {"services": {}, "total_checks": 0, "ok_checks": 0}
|
||||
|
||||
|
||||
def _save_state(cfg: dict[str, Any], state: dict[str, Any]) -> None:
    """Persist *state* as pretty-printed JSON at the configured path.

    Bugfix: only create the parent directory when the path actually has
    one — os.makedirs("") raises FileNotFoundError for bare filenames.
    """
    path = _state_path(cfg)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(state, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def _check_http(url: str, timeout: int) -> tuple[bool, str]:
    """GET *url*; ok when the response status is below 400.

    Never raises: every failure mode is reported as (False, detail).
    """
    request = Request(url, headers={"User-Agent": "tg-admin-bot"})
    try:
        with urlopen(request, timeout=timeout) as response:
            code = int(response.status)
    except HTTPError as err:
        return False, f"HTTP {int(err.code)}"
    except URLError as err:
        return False, str(err.reason)
    except Exception as err:
        return False, str(err)
    return code < 400, f"HTTP {code}"
|
||||
|
||||
|
||||
def _check_tcp(host: str, port: int, timeout: int) -> tuple[bool, str]:
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=timeout):
|
||||
return True, "TCP ok"
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
|
||||
|
||||
def _check_ping(host: str, timeout: int) -> tuple[bool, str]:
    """Cheap reachability probe: DNS resolution first, TCP port 80 as fallback.

    (No real ICMP ping — that would need raw sockets / root.)
    """
    try:
        socket.gethostbyname(host)
    except Exception:
        return _check_tcp(host, 80, timeout)
    return True, "DNS ok"
|
||||
|
||||
|
||||
def run_checks(cfg: dict[str, Any]) -> dict[str, Any]:
    """Probe every configured service once and update persisted counters.

    Each entry in external_checks.services has a "type" of http / tcp /
    ping plus the matching fields; unknown or incomplete entries count as
    failed with detail "n/a". Returns {"results": [...], "state": state}.
    """
    checks_cfg = cfg.get("external_checks", {})
    timeout = int(checks_cfg.get("timeout_sec", 5))

    state = _load_state(cfg)
    per_service = state.setdefault("services", {})

    results: list[dict[str, Any]] = []
    for entry in checks_cfg.get("services", []):
        name = entry.get("name") or "unknown"
        kind = entry.get("type", "http")
        ok, detail = False, "n/a"

        if kind == "http":
            url = entry.get("url")
            if url:
                ok, detail = _check_http(url, timeout)
        elif kind == "tcp":
            host = entry.get("host")
            port = int(entry.get("port", 0))
            if host and port:
                ok, detail = _check_tcp(host, port, timeout)
        elif kind == "ping":
            host = entry.get("host")
            if host:
                ok, detail = _check_ping(host, timeout)

        counters = per_service.setdefault(name, {"ok": 0, "total": 0})
        counters["total"] += 1
        state["total_checks"] = state.get("total_checks", 0) + 1
        if ok:
            counters["ok"] += 1
            state["ok_checks"] = state.get("ok_checks", 0) + 1

        results.append({"name": name, "ok": ok, "detail": detail})

    _save_state(cfg, state)
    return {"results": results, "state": state}
|
||||
|
||||
|
||||
def format_report(cfg: dict[str, Any]) -> str:
    """Run the external checks now and render a one-message summary."""
    checks_cfg = cfg.get("external_checks", {})
    if not checks_cfg.get("services", []):
        return "🌍 External checks\n\nℹ️ No services configured"

    outcome = run_checks(cfg)
    state = outcome["state"]

    total = state.get("total_checks", 0) or 1  # avoid div-by-zero on first run
    uptime_pct = 100.0 * state.get("ok_checks", 0) / total

    report = ["🌍 External checks", ""]
    for result in outcome["results"]:
        bullet = "🟢" if result["ok"] else "🔴"
        report.append(f"{bullet} {result['name']}: {result['detail']}")

    report.append("")
    report.append(f"📈 Uptime (global): {uptime_pct:.2f}%")
    report.append(f"🕒 {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}")
    return "\n".join(report)
|
||||
|
||||
|
||||
async def monitor_external(cfg: dict[str, Any]):
    """Background task: re-run the external checks every interval_sec.

    Returns immediately when external_checks.enabled is false.
    """
    checks_cfg = cfg.get("external_checks", {})
    if not checks_cfg.get("enabled", True):
        return
    delay = int(checks_cfg.get("interval_sec", 300))
    while True:
        run_checks(cfg)
        await asyncio.sleep(delay)
|
||||
88
services/gitea.py
Normal file
88
services/gitea.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import json
|
||||
import ssl
|
||||
from typing import Any
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
|
||||
def _request(url: str, headers: dict[str, str], verify_tls: bool) -> tuple[int, str]:
    """GET *url* and return (status, body); HTTP error statuses are returned.

    Raises RuntimeError on transport-level failures (DNS, refused, TLS).
    """
    context = ssl._create_unverified_context() if not verify_tls else None  # nosec - config-controlled
    try:
        with urlopen(Request(url, headers=headers), timeout=10, context=context) as resp:
            return int(resp.status), resp.read().decode("utf-8")
    except HTTPError as err:
        try:
            text = err.read().decode("utf-8")
        except Exception:
            text = ""
        return int(err.code), text
    except URLError as err:
        raise RuntimeError(str(err.reason)) from err
|
||||
|
||||
|
||||
def _api_base(cfg: dict[str, Any]) -> str:
|
||||
g_cfg = cfg.get("gitea", {})
|
||||
base = (g_cfg.get("base_url") or "").rstrip("/")
|
||||
return base
|
||||
|
||||
|
||||
def get_gitea_health(cfg: dict[str, Any]) -> str:
    """Query the Gitea healthz and version endpoints and render a summary.

    Bugfix: `_request` raises RuntimeError when the server is unreachable
    (DNS failure, connection refused, TLS error); previously that exception
    propagated and crashed the caller. It is now caught and reported.
    """
    g_cfg = cfg.get("gitea", {})
    base = _api_base(cfg)
    verify_tls = g_cfg.get("verify_tls", True)
    if not base:
        return "⚠️ Gitea base_url not configured"

    token = (g_cfg.get("token") or "").strip()
    headers = {"User-Agent": "tg-admin-bot"}
    if token:
        headers["Authorization"] = f"token {token}"

    lines = ["🍵 Gitea\n"]

    # Older Gitea exposes /api/healthz, newer /api/v1/healthz — try both.
    health_paths = ["/api/healthz", "/api/v1/healthz"]
    health_status = None
    health_payload = None
    try:
        for path in health_paths:
            status, body = _request(f"{base}{path}", headers, verify_tls)
            if status == 200:
                health_status = (status, path)
                try:
                    health_payload = json.loads(body)
                except json.JSONDecodeError:
                    health_payload = None
                break
            if status not in (404, 405):
                # A real answer (auth error, 5xx, ...) — stop probing.
                health_status = (status, path)
                break
    except RuntimeError as e:
        lines.append(f"🔴 API unreachable: {e}")
        return "\n".join(lines)

    if health_status:
        status, path = health_status
        icon = "🟢" if status == 200 else "🔴"
        if status == 200 and isinstance(health_payload, dict):
            state = health_payload.get("status") or "ok"
            checks = health_payload.get("checks") or {}
            checks_total = len(checks) if isinstance(checks, dict) else 0
            lines.append(f"{icon} API health: {state} ({checks_total} checks)")
        else:
            lines.append(f"{icon} API health: {status} ({path})")
    else:
        lines.append("🟡 API health: endpoint not found")

    try:
        ver_status, ver_body = _request(f"{base}/api/v1/version", headers, verify_tls)
    except RuntimeError as e:
        lines.append(f"🔴 Version: unreachable ({e})")
        return "\n".join(lines)

    if ver_status == 200:
        try:
            payload = json.loads(ver_body)
        except json.JSONDecodeError:
            payload = {}
        version = payload.get("version") or "unknown"
        lines.append(f"ℹ️ Version: {version}")
    else:
        lines.append(f"🟡 Version: HTTP {ver_status}")

    return "\n".join(lines)
|
||||
@@ -1,6 +1,9 @@
|
||||
import os
|
||||
import os
|
||||
import ssl
|
||||
import subprocess
|
||||
import psutil
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
from app import RESTIC_ENV
|
||||
from services.system import worst_disk_usage
|
||||
|
||||
@@ -9,9 +12,35 @@ def _containers_from_cfg(cfg) -> dict:
|
||||
return cfg.get("docker", {}).get("containers", {})
|
||||
|
||||
|
||||
def _request_status(url: str, verify_tls: bool) -> int | None:
|
||||
context = None
|
||||
if not verify_tls:
|
||||
context = ssl._create_unverified_context() # nosec - config-controlled
|
||||
req = Request(url, headers={"User-Agent": "tg-admin-bot"})
|
||||
try:
|
||||
with urlopen(req, timeout=8, context=context) as resp:
|
||||
return int(resp.status)
|
||||
except HTTPError as e:
|
||||
return int(e.code)
|
||||
except URLError:
|
||||
return None
|
||||
|
||||
|
||||
def _npm_api_base(cfg) -> str | None:
|
||||
npm_cfg = cfg.get("npmplus", {})
|
||||
base = (npm_cfg.get("base_url") or "").rstrip("/")
|
||||
if not base:
|
||||
return None
|
||||
if not base.endswith("/api"):
|
||||
base = f"{base}/api"
|
||||
return base
|
||||
|
||||
|
||||
def health(cfg, container_map: dict | None = None) -> str:
|
||||
lines = ["🩺 Health check\n"]
|
||||
|
||||
thresholds = cfg.get("thresholds", {})
|
||||
disk_warn = int(thresholds.get("disk_warn", 80))
|
||||
load_warn = float(thresholds.get("load_warn", 2.0))
|
||||
try:
|
||||
env = os.environ.copy()
|
||||
env.update(RESTIC_ENV)
|
||||
@@ -30,15 +59,47 @@ def health(cfg, container_map: dict | None = None) -> str:
|
||||
else:
|
||||
lines.append(f"🟢 {alias} OK")
|
||||
|
||||
npm_cfg = cfg.get("npmplus", {})
|
||||
npm_base = _npm_api_base(cfg)
|
||||
if npm_base:
|
||||
npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
|
||||
if npm_status == 200:
|
||||
lines.append("🟢 NPMplus API OK")
|
||||
elif npm_status is None:
|
||||
lines.append("🔴 NPMplus API unreachable")
|
||||
else:
|
||||
lines.append(f"🟡 NPMplus API HTTP {npm_status}")
|
||||
|
||||
g_cfg = cfg.get("gitea", {})
|
||||
g_base = (g_cfg.get("base_url") or "").rstrip("/")
|
||||
if g_base:
|
||||
health_paths = ["/api/healthz", "/api/v1/healthz"]
|
||||
g_status = None
|
||||
for path in health_paths:
|
||||
status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True))
|
||||
if status == 200:
|
||||
g_status = status
|
||||
break
|
||||
if status not in (404, 405):
|
||||
g_status = status
|
||||
break
|
||||
if g_status == 200:
|
||||
lines.append("🟢 Gitea API OK")
|
||||
elif g_status is None:
|
||||
lines.append("🔴 Gitea API unreachable")
|
||||
else:
|
||||
lines.append(f"🟡 Gitea API HTTP {g_status}")
|
||||
|
||||
usage, mount = worst_disk_usage()
|
||||
if usage is None:
|
||||
lines.append("⚠️ Disk n/a")
|
||||
elif usage > cfg["thresholds"]["disk_warn"]:
|
||||
elif usage > disk_warn:
|
||||
lines.append(f"🟡 Disk {usage}% ({mount})")
|
||||
else:
|
||||
lines.append(f"🟢 Disk {usage}% ({mount})")
|
||||
|
||||
load = psutil.getloadavg()[0]
|
||||
lines.append(f"{'🟢' if load < cfg['thresholds']['load_warn'] else '🟡'} Load {load}")
|
||||
lines.append(f"{'🟢' if load < load_warn else '🟡'} Load {load}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
118
services/incidents.py
Normal file
118
services/incidents.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import logging
|
||||
import os
|
||||
from collections import deque
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
from typing import Any
|
||||
from services import runtime_state
|
||||
|
||||
|
||||
def _get_path(cfg: dict[str, Any]) -> str:
|
||||
return cfg.get("incidents", {}).get("path", "/var/server-bot/incidents.log")
|
||||
|
||||
|
||||
def incidents_path(cfg: dict[str, Any]) -> str:
    """Public alias for the configured incidents log path."""
    return _get_path(cfg)
|
||||
|
||||
|
||||
def _get_logger(cfg: dict[str, Any]) -> logging.Logger:
    """Return the shared "incidents" logger, configuring it on first use.

    The logger writes tab-separated "<UTC timestamp>\t<message>" lines with
    timed rotation. Bugfix: only create the parent directory when the
    configured path has one — os.makedirs("") raises FileNotFoundError for
    bare filenames.
    """
    logger = logging.getLogger("incidents")
    if logger.handlers:
        return logger  # already configured by an earlier call

    path = _get_path(cfg)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    incidents_cfg = cfg.get("incidents", {})
    handler = TimedRotatingFileHandler(
        path,
        when=incidents_cfg.get("rotate_when", "W0"),
        interval=1,
        backupCount=int(incidents_cfg.get("backup_count", 8)),
        encoding="utf-8",
        utc=True,
    )
    handler.setFormatter(
        logging.Formatter("%(asctime)s\t%(message)s", datefmt="%Y-%m-%dT%H:%M:%SZ")
    )

    logger.setLevel(logging.INFO)
    logger.addHandler(handler)
    # Keep incidents out of the root logger / main bot log.
    logger.propagate = False
    return logger
|
||||
|
||||
|
||||
def log_incident(cfg: dict[str, Any], text: str, category: str | None = None) -> None:
    """Append one line to the incidents log (no-op when disabled).

    A "category=<cat>" prefix is added unless the text already carries one.
    """
    if not cfg.get("incidents", {}).get("enabled", True):
        return
    message = text
    if category and "category=" not in message:
        message = f"category={category} {message}"
    _get_logger(cfg).info(message)
|
||||
|
||||
|
||||
def _parse_line(line: str) -> tuple[datetime | None, str]:
|
||||
if "\t" not in line:
|
||||
return None, line.strip()
|
||||
ts, msg = line.split("\t", 1)
|
||||
try:
|
||||
dt = datetime.strptime(ts.strip(), "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
|
||||
except ValueError:
|
||||
dt = None
|
||||
return dt, msg.strip()
|
||||
|
||||
|
||||
def read_recent(cfg: dict[str, Any], hours: int, limit: int = 200) -> list[str]:
    """Recent incident lines rendered as "YYYY-MM-DD HH:MM message"."""
    rendered = []
    for when, message in read_raw(cfg, hours, limit=limit):
        rendered.append(f"{when:%Y-%m-%d %H:%M} {message}")
    return rendered
|
||||
|
||||
|
||||
def read_raw(cfg: dict[str, Any], hours: int, limit: int = 200, *, include_old: bool = False) -> list[tuple[datetime, str]]:
    """Return up to *limit* newest (timestamp, message) pairs from the log.

    Lines without a parsable timestamp are dropped. Lines older than
    *hours* are dropped too unless include_old is set (the limit still
    applies either way, keeping the newest entries).
    """
    path = _get_path(cfg)
    if not os.path.exists(path):
        return []

    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
    # Bounded deque keeps only the newest *limit* accepted entries.
    window: deque = deque(maxlen=limit)
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for raw_line in fh:
            when, message = _parse_line(raw_line.rstrip())
            if when is None:
                continue
            if when < cutoff and not include_old:
                continue
            window.append((when, message))
    return list(window)
|
||||
|
||||
|
||||
def infer_category(text: str) -> str | None:
|
||||
lower = text.lower()
|
||||
if "category=" in lower:
|
||||
import re
|
||||
|
||||
m = re.search(r"category=([a-z0-9_-]+)", lower)
|
||||
if m:
|
||||
return m.group(1)
|
||||
if "load" in lower:
|
||||
return "load"
|
||||
if "docker" in lower:
|
||||
return "docker"
|
||||
if "restic" in lower or "backup" in lower:
|
||||
return "backup"
|
||||
if "smart" in lower:
|
||||
return "smart"
|
||||
if "ssl" in lower or "cert" in lower:
|
||||
return "ssl"
|
||||
if "npmplus" in lower:
|
||||
return "npmplus"
|
||||
if "gitea" in lower:
|
||||
return "gitea"
|
||||
if "openwrt" in lower:
|
||||
return "openwrt"
|
||||
if "queue" in lower:
|
||||
return "queue"
|
||||
if "selftest" in lower:
|
||||
return "selftest"
|
||||
return None
|
||||
35
services/logging_setup.py
Normal file
35
services/logging_setup.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import logging
|
||||
import os
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
|
||||
|
||||
def setup_logging(cfg: dict) -> None:
    """Attach a timed-rotating file handler to the root logger.

    Idempotent: a second call with the same path does nothing. No-op when
    logging.enabled is false. Bugfix: only create the parent directory
    when the configured path has one — os.makedirs("") raises
    FileNotFoundError for bare filenames like "bot.log".
    """
    log_cfg = cfg.get("logging", {})
    if not log_cfg.get("enabled", True):
        return

    path = log_cfg.get("path", "/var/server-bot/bot.log")
    rotate_when = log_cfg.get("rotate_when", "W0")
    backup_count = int(log_cfg.get("backup_count", 8))
    level = str(log_cfg.get("level", "INFO")).upper()

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    root = logging.getLogger()
    for existing in root.handlers:
        # Already configured for this file — don't attach a duplicate.
        if isinstance(existing, TimedRotatingFileHandler) and existing.baseFilename == path:
            return

    handler = TimedRotatingFileHandler(
        path,
        when=rotate_when,
        interval=1,
        backupCount=backup_count,
        encoding="utf-8",
        utc=True,
    )
    handler.setFormatter(logging.Formatter("%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s"))

    root.setLevel(level)
    root.addHandler(handler)
|
||||
86
services/metrics.py
Normal file
86
services/metrics.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import asyncio
|
||||
import time
|
||||
from collections import deque
|
||||
import psutil
|
||||
|
||||
|
||||
class MetricsStore:
    """Rolling in-memory buffer of system metric samples.

    Capacity-bounded: once *maxlen* samples are stored, the oldest ones
    fall off automatically.
    """

    def __init__(self, maxlen: int = 720):
        # Each sample is a dict produced by start_sampler.
        self.samples = deque(maxlen=maxlen)
        # Sampling period in seconds; overwritten by start_sampler.
        self.interval = 5

    def add(self, sample: dict):
        """Append one sample dict to the buffer."""
        self.samples.append(sample)
|
||||
|
||||
|
||||
async def start_sampler(store: MetricsStore, interval: int = 5):
    """Sample CPU / load / network forever, appending to *store*.

    Network counters are stored as per-interval deltas plus derived
    bytes-per-second rates.
    """
    store.interval = interval
    # Prime cpu_percent: the very first non-blocking reading is meaningless.
    psutil.cpu_percent(interval=None)
    prev_counters = psutil.net_io_counters()

    while True:
        sampled_at = time.time()
        counters = psutil.net_io_counters()
        rx_delta = counters.bytes_recv - prev_counters.bytes_recv
        tx_delta = counters.bytes_sent - prev_counters.bytes_sent
        prev_counters = counters

        store.add({
            "ts": sampled_at,
            "cpu": psutil.cpu_percent(interval=None),
            "load1": psutil.getloadavg()[0],
            "rx_bytes": rx_delta,
            "tx_bytes": tx_delta,
            "rx_rate": rx_delta / interval,
            "tx_rate": tx_delta / interval,
        })

        await asyncio.sleep(interval)
|
||||
|
||||
|
||||
def summarize(store: MetricsStore, minutes: int = 15) -> str:
    """Render avg/max CPU, load and network traffic over the last *minutes*."""
    since = time.time() - minutes * 60
    window = [sample for sample in store.samples if sample["ts"] >= since]
    if not window:
        return "📈 Metrics\n\n⚠️ No data yet"

    def mean(values):
        return sum(values) / len(values) if values else 0.0

    def human_rate(bps):
        if bps > 1024**2:
            return f"{bps / (1024**2):.2f} MiB/s"
        if bps > 1024:
            return f"{bps / 1024:.2f} KiB/s"
        return f"{bps:.0f} B/s"

    def human_bytes(count):
        if count > 1024**3:
            return f"{count / (1024**3):.2f} GiB"
        if count > 1024**2:
            return f"{count / (1024**2):.2f} MiB"
        if count > 1024:
            return f"{count / 1024:.2f} KiB"
        return f"{count} B"

    cpu = [s["cpu"] for s in window]
    load1 = [s["load1"] for s in window]
    rx = [s["rx_rate"] for s in window]
    tx = [s["tx_rate"] for s in window]
    rx_total = sum(s["rx_bytes"] for s in window)
    tx_total = sum(s["tx_bytes"] for s in window)

    return (
        f"📈 Metrics (last {minutes}m)\n\n"
        f"🧠 CPU avg: {mean(cpu):.1f}% | max: {max(cpu):.1f}%\n"
        f"⚙️ Load avg: {mean(load1):.2f} | max: {max(load1):.2f}\n"
        f"⬇️ RX avg: {human_rate(mean(rx))} | max: {human_rate(max(rx))} | total: {human_bytes(rx_total)}\n"
        f"⬆️ TX avg: {human_rate(mean(tx))} | max: {human_rate(max(tx))} | total: {human_bytes(tx_total)}"
    )
|
||||
@@ -1,8 +1,83 @@
|
||||
import time
|
||||
from datetime import datetime
|
||||
from aiogram import Bot
|
||||
from app import cfg
|
||||
from services.alert_mute import is_muted, is_auto_muted
|
||||
from services.incidents import log_incident
|
||||
|
||||
|
||||
async def notify(bot: Bot, chat_id: int, text: str):
|
||||
_LAST_SENT: dict[str, float] = {}
|
||||
|
||||
|
||||
def _parse_hhmm(value: str) -> int | None:
|
||||
try:
|
||||
hours, minutes = value.strip().split(":", 1)
|
||||
h = int(hours)
|
||||
m = int(minutes)
|
||||
except Exception:
|
||||
return None
|
||||
if not (0 <= h <= 23 and 0 <= m <= 59):
|
||||
return None
|
||||
return h * 60 + m
|
||||
|
||||
|
||||
def _in_quiet_hours(alerts_cfg: dict) -> bool:
    """True when the local wall clock falls inside the configured quiet window.

    The window may wrap past midnight (start > end). Disabled, malformed
    or zero-length windows never suppress.
    """
    quiet = alerts_cfg.get("quiet_hours", {})
    if not quiet.get("enabled", False):
        return False
    start = _parse_hhmm(quiet.get("start", "23:00"))
    end = _parse_hhmm(quiet.get("end", "08:00"))
    if start is None or end is None or start == end:
        return False
    current = datetime.now()
    minute_of_day = current.hour * 60 + current.minute
    if start < end:
        return start <= minute_of_day < end
    # Window wraps past midnight.
    return minute_of_day >= start or minute_of_day < end
|
||||
|
||||
|
||||
async def notify(
    bot: Bot,
    chat_id: int,
    text: str,
    level: str = "info",
    key: str | None = None,
    category: str | None = None,
):
    """Send an alert unless muted, auto-muted, in quiet hours, or deduplicated.

    Suppressed alerts are still written to the incident log with the
    reason; delivered alerts are both sent and logged (each best-effort).
    """
    alerts_cfg = cfg.get("alerts", {})

    reason = None
    if category and is_muted(category):
        reason = "muted"
    elif category and is_auto_muted(cfg, category):
        reason = "auto_mute"
    elif _in_quiet_hours(alerts_cfg):
        critical_ok = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
        if not (critical_ok and level == "critical"):
            reason = "quiet_hours"

    if reason is not None:
        try:
            log_incident(cfg, f"[suppressed:{reason}] {text}", category=category)
        except Exception:
            pass
        return

    cooldown = int(alerts_cfg.get("notify_cooldown_sec", alerts_cfg.get("cooldown_sec", 900)))
    if cooldown > 0:
        dedup_key = key or text
        now = time.time()
        if now - _LAST_SENT.get(dedup_key, 0) < cooldown:
            return  # identical alert sent recently
        _LAST_SENT[dedup_key] = now

    try:
        await bot.send_message(chat_id, text)
    except Exception:
        pass
    try:
        log_incident(cfg, text, category=category)
    except Exception:
        pass
|
||||
|
||||
234
services/npmplus.py
Normal file
234
services/npmplus.py
Normal file
@@ -0,0 +1,234 @@
|
||||
import json
|
||||
import ssl
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Any
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
import state
|
||||
|
||||
|
||||
def _parse_expiry(value: str | None) -> datetime | None:
|
||||
if not value:
|
||||
return None
|
||||
cleaned = value.strip()
|
||||
try:
|
||||
if cleaned.endswith("Z"):
|
||||
dt = datetime.fromisoformat(cleaned.replace("Z", "+00:00"))
|
||||
else:
|
||||
dt = datetime.fromisoformat(cleaned)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt
|
||||
except ValueError:
|
||||
for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
|
||||
try:
|
||||
return datetime.strptime(cleaned, fmt).replace(tzinfo=timezone.utc)
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def _request_json(
    url: str,
    headers: dict[str, str],
    data: dict[str, Any] | None,
    verify_tls: bool,
    method: str | None = None,
) -> Any:
    """Call an NPMplus endpoint and decode the JSON response.

    *data* (when given) is sent as a JSON body. Raises RuntimeError with a
    whitespace-collapsed, length-capped server message on HTTP errors, and
    with the reason string on transport failures.
    """
    encoded = json.dumps(data).encode("utf-8") if data is not None else None
    request = Request(url, headers=headers, data=encoded, method=method)
    context = ssl._create_unverified_context() if not verify_tls else None  # nosec - config-controlled

    try:
        with urlopen(request, timeout=10, context=context) as response:
            raw = response.read().decode("utf-8")
    except HTTPError as err:
        try:
            server_msg = err.read().decode("utf-8").strip()
        except Exception:
            server_msg = ""
        detail = f"HTTP {err.code}"
        if server_msg:
            server_msg = " ".join(server_msg.split())
            if len(server_msg) > 300:
                server_msg = server_msg[:300] + "..."
            detail = f"{detail}: {server_msg}"
        raise RuntimeError(f"{detail} ({url})") from err
    except URLError as err:
        raise RuntimeError(str(err.reason)) from err

    return json.loads(raw)
|
||||
|
||||
|
||||
def _api_base(cfg: dict[str, Any]) -> str:
|
||||
npm_cfg = cfg.get("npmplus", {})
|
||||
base = (npm_cfg.get("base_url") or "").rstrip("/")
|
||||
if not base:
|
||||
return ""
|
||||
if not base.endswith("/api"):
|
||||
base = f"{base}/api"
|
||||
return base
|
||||
|
||||
|
||||
def _get_token(cfg: dict[str, Any]) -> str:
    """Return a valid NPMplus API token.

    Order of preference:
      1. cached token still valid for more than one minute;
      2. best-effort refresh via the existing (possibly stale) token;
      3. fresh login with identity/secret;
      4. static token from config.
    Raises ValueError when nothing is configured.
    """
    npm_cfg = cfg.get("npmplus", {})
    base_url = _api_base(cfg)
    verify_tls = npm_cfg.get("verify_tls", True)

    now = datetime.now(timezone.utc)
    cached = state.NPMPLUS_TOKEN
    cached_token = cached.get("token")
    cached_expiry = cached.get("expires_at")

    # 1) Cached token that is not about to expire.
    if cached_token and isinstance(cached_expiry, datetime) and cached_expiry - now > timedelta(minutes=1):
        return cached_token

    # 2) Try to refresh using the old token; fall through on any failure.
    if cached_token and base_url:
        headers = {
            "Authorization": f"Bearer {cached_token}",
            "User-Agent": "tg-admin-bot",
        }
        try:
            payload = _request_json(f"{base_url}/tokens", headers, None, verify_tls)
            refreshed = payload.get("token")
            if refreshed:
                expiry = _parse_expiry(payload.get("expires"))
                if expiry is None:
                    expiry = now + timedelta(hours=1)  # API gave no expiry; assume 1h
                state.NPMPLUS_TOKEN = {"token": refreshed, "expires_at": expiry}
                return refreshed
        except Exception:
            pass

    # 3) Fresh login with identity/secret.
    identity = npm_cfg.get("identity")
    secret = npm_cfg.get("secret")
    if identity and secret and base_url:
        headers = {
            "Content-Type": "application/json",
            "User-Agent": "tg-admin-bot",
        }
        payload = _request_json(
            f"{base_url}/tokens", headers, {"identity": identity, "secret": secret}, verify_tls
        )
        fresh = payload.get("token")
        if not fresh:
            raise RuntimeError("Token not returned")
        expiry = _parse_expiry(payload.get("expires"))
        if expiry is None:
            expiry = now + timedelta(hours=1)
        state.NPMPLUS_TOKEN = {"token": fresh, "expires_at": expiry}
        return fresh

    # 4) Static token from config, if any.
    static_token = npm_cfg.get("token")
    if static_token:
        return static_token

    raise ValueError("NPMplus identity/secret or token not configured")
|
||||
|
||||
|
||||
def fetch_certificates(cfg: dict[str, Any]) -> list[dict[str, Any]]:
    """Fetch all certificates from the NPMplus API.

    Raises ValueError when base_url is unset and RuntimeError when the
    API does not return a list.
    """
    npm_cfg = cfg.get("npmplus", {})
    base_url = _api_base(cfg)
    if not base_url:
        raise ValueError("NPMplus base_url not configured")

    headers = {
        "Authorization": f"Bearer {_get_token(cfg)}",
        "User-Agent": "tg-admin-bot",
    }
    payload = _request_json(
        f"{base_url}/nginx/certificates", headers, None, npm_cfg.get("verify_tls", True)
    )
    if not isinstance(payload, list):
        raise RuntimeError("Unexpected API response")
    return payload
|
||||
|
||||
|
||||
def list_proxy_hosts(cfg: dict[str, Any]) -> list[dict[str, Any]]:
    """Fetch all proxy hosts from the NPMplus API.

    Raises ValueError when base_url is unset and RuntimeError when the
    API does not return a list.
    """
    npm_cfg = cfg.get("npmplus", {})
    base_url = _api_base(cfg)
    if not base_url:
        raise ValueError("NPMplus base_url not configured")

    headers = {
        "Authorization": f"Bearer {_get_token(cfg)}",
        "User-Agent": "tg-admin-bot",
    }
    payload = _request_json(
        f"{base_url}/nginx/proxy-hosts", headers, None, npm_cfg.get("verify_tls", True)
    )
    if not isinstance(payload, list):
        raise RuntimeError("Unexpected API response")
    return payload
|
||||
|
||||
|
||||
def set_proxy_host(cfg: dict[str, Any], host_id: int, enabled: bool) -> tuple[bool, str]:
    """Enable or disable a proxy host; returns (success, message)."""
    npm_cfg = cfg.get("npmplus", {})
    base_url = _api_base(cfg)
    verify_tls = npm_cfg.get("verify_tls", True)
    if not base_url:
        return False, "NPMplus base_url not configured"

    token = _get_token(cfg)
    endpoint = "enable" if enabled else "disable"
    url = f"{base_url}/nginx/proxy-hosts/{host_id}/{endpoint}"
    headers = {
        "Authorization": f"Bearer {token}",
        "User-Agent": "tg-admin-bot",
    }
    try:
        payload = _request_json(url, headers, None, verify_tls, method="POST")
    except Exception as exc:
        return False, str(exc)

    # A bare `true` or a dict without an explicit success=false counts as OK.
    succeeded = payload is True or (isinstance(payload, dict) and payload.get("success", True))
    return (True, "OK") if succeeded else (False, "API returned error")
|
||||
|
||||
|
||||
def format_certificates(certs: list[dict[str, Any]]) -> str:
    """Render a certificate-expiry overview, soonest-expiring first."""
    if not certs:
        return "🔒 SSL certificates\n\nℹ️ No certificates found"

    now = datetime.now(timezone.utc)
    rows: list[tuple[datetime | None, str]] = []

    for cert in certs:
        label = cert.get("nice_name")
        if not label:
            domains = cert.get("domain_names") or []
            if isinstance(domains, list):
                label = ", ".join(domains)
        if not label:
            label = "unknown"

        expiry = _parse_expiry(cert.get("expires_on"))
        if expiry is None:
            rows.append((None, f"⚠️ {label}: unknown expiry"))
            continue

        days = (expiry - now).days
        date_str = expiry.astimezone(timezone.utc).strftime("%Y-%m-%d")
        if days < 0:
            text = f"🔴 {label}: expired {-days}d ago ({date_str})"
        elif days <= 7:
            text = f"🟠 {label}: {days}d ({date_str})"
        elif days <= 30:
            text = f"🟡 {label}: {days}d ({date_str})"
        else:
            text = f"🟢 {label}: {days}d ({date_str})"
        rows.append((expiry, text))

    # Unknown-expiry rows (None) sort last via the max-datetime sentinel.
    sentinel = datetime.max.replace(tzinfo=timezone.utc)
    rows.sort(key=lambda row: row[0] or sentinel)
    return "\n".join(["🔒 SSL certificates\n"] + [text for _, text in rows])
|
||||
504
services/openwrt.py
Normal file
504
services/openwrt.py
Normal file
@@ -0,0 +1,504 @@
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from services.runner import run_cmd, run_cmd_full
|
||||
|
||||
|
||||
def _format_uptime(seconds: int | float | None) -> str:
|
||||
if seconds is None:
|
||||
return "unknown"
|
||||
total = int(seconds)
|
||||
days, rem = divmod(total, 86400)
|
||||
hours, rem = divmod(rem, 3600)
|
||||
minutes, _ = divmod(rem, 60)
|
||||
if days > 0:
|
||||
return f"{days}d {hours:02d}:{minutes:02d}"
|
||||
return f"{hours:02d}:{minutes:02d}"
|
||||
|
||||
|
||||
def _format_load(load: list[Any] | None) -> str:
|
||||
if not load or not isinstance(load, list):
|
||||
return "unknown"
|
||||
values = []
|
||||
for raw in load[:3]:
|
||||
try:
|
||||
values.append(float(raw))
|
||||
except (TypeError, ValueError):
|
||||
values.append(0.0)
|
||||
scale = 1.0
|
||||
if values and max(values) > 1000:
|
||||
scale = 1 / 65536.0
|
||||
return " ".join(f"{val * scale:.2f}" for val in values)
|
||||
|
||||
|
||||
def _format_rate(rate: Any) -> str:
|
||||
try:
|
||||
val = float(rate)
|
||||
except (TypeError, ValueError):
|
||||
return "?"
|
||||
if val <= 0:
|
||||
return "?"
|
||||
if val >= 1_000_000:
|
||||
return f"{val / 1_000_000:.1f}M"
|
||||
if val >= 1_000:
|
||||
return f"{val / 1_000:.1f}K"
|
||||
return f"{val:.0f}b"
|
||||
|
||||
|
||||
def _extract_wan_ip(wan: dict[str, Any]) -> str | None:
|
||||
if not isinstance(wan, dict):
|
||||
return None
|
||||
addrs = wan.get("ipv4-address") or []
|
||||
if isinstance(addrs, list):
|
||||
for item in addrs:
|
||||
if isinstance(item, dict):
|
||||
ip = item.get("address")
|
||||
if ip:
|
||||
return str(ip)
|
||||
return None
|
||||
|
||||
|
||||
def _extract_wifi_clients(wireless: dict[str, Any]) -> list[str]:
    """Collect per-client summary lines from a network.wireless status payload.

    Supports both the dict-shaped ``assoclist`` and the list-shaped
    ``stations`` payloads.  Lines look like
    ``"<ifname> <mac> <signal>dBm rx:<rate> tx:<rate>"``.
    """

    def _client_line(ifname: str, mac: Any, meta: dict[str, Any]) -> str:
        # Shared formatter for both payload shapes (was duplicated inline).
        signal = meta.get("signal")
        rx = _format_rate((meta.get("rx") or {}).get("rate"))
        tx = _format_rate((meta.get("tx") or {}).get("rate"))
        sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
        return f"{ifname} {mac} {sig} rx:{rx} tx:{tx}"

    clients: list[str] = []
    if not isinstance(wireless, dict):
        return clients
    for radio in wireless.values():
        if not isinstance(radio, dict):
            continue
        for iface in radio.get("interfaces", []) or []:
            if not isinstance(iface, dict):
                continue
            ifname = iface.get("ifname") or "wifi"
            assoclist = iface.get("assoclist")
            stations = iface.get("stations")
            if isinstance(assoclist, dict):
                for mac, meta in assoclist.items():
                    if isinstance(meta, dict):
                        clients.append(_client_line(ifname, mac, meta))
            elif isinstance(stations, list):
                for meta in stations:
                    if isinstance(meta, dict):
                        clients.append(_client_line(ifname, meta.get("mac") or "?", meta))
    return clients
|
||||
|
||||
|
||||
def _extract_leases(leases: dict[str, Any]) -> list[str]:
|
||||
items = None
|
||||
if isinstance(leases, dict):
|
||||
items = leases.get("leases") or leases.get("dhcp_leases") or leases.get("ipv4_leases")
|
||||
elif isinstance(leases, list):
|
||||
items = leases
|
||||
if not isinstance(items, list):
|
||||
return []
|
||||
out = []
|
||||
for lease in items:
|
||||
if not isinstance(lease, dict):
|
||||
continue
|
||||
ipaddr = lease.get("ipaddr") or "?"
|
||||
host = lease.get("hostname") or "unknown"
|
||||
mac = lease.get("macaddr") or "?"
|
||||
out.append(f"{ipaddr} {host} ({mac})")
|
||||
return out
|
||||
|
||||
|
||||
def _extract_lease_name_map(leases: Any) -> dict[str, str]:
|
||||
items = None
|
||||
if isinstance(leases, dict):
|
||||
items = leases.get("leases") or leases.get("dhcp_leases") or leases.get("ipv4_leases")
|
||||
elif isinstance(leases, list):
|
||||
items = leases
|
||||
if not isinstance(items, list):
|
||||
return {}
|
||||
out: dict[str, str] = {}
|
||||
for lease in items:
|
||||
if not isinstance(lease, dict):
|
||||
continue
|
||||
mac = lease.get("macaddr")
|
||||
if not mac:
|
||||
continue
|
||||
host = lease.get("hostname") or "unknown"
|
||||
out[str(mac).lower()] = str(host)
|
||||
return out
|
||||
|
||||
|
||||
def _extract_lease_name_map_fallback(raw: str) -> dict[str, str]:
|
||||
out: dict[str, str] = {}
|
||||
for line in raw.splitlines():
|
||||
parts = line.strip().split()
|
||||
if len(parts) < 4:
|
||||
continue
|
||||
_expiry, mac, _ipaddr, host = parts[:4]
|
||||
host = host if host != "*" else "unknown"
|
||||
out[str(mac).lower()] = str(host)
|
||||
return out
|
||||
|
||||
|
||||
def _extract_ifnames(wireless: dict[str, Any]) -> list[str]:
|
||||
ifnames: list[str] = []
|
||||
if not isinstance(wireless, dict):
|
||||
return ifnames
|
||||
for radio in wireless.values():
|
||||
if not isinstance(radio, dict):
|
||||
continue
|
||||
for iface in radio.get("interfaces", []) or []:
|
||||
if not isinstance(iface, dict):
|
||||
continue
|
||||
ifname = iface.get("ifname")
|
||||
if ifname:
|
||||
ifnames.append(str(ifname))
|
||||
return ifnames
|
||||
|
||||
|
||||
def _extract_ifname_meta(wireless: dict[str, Any]) -> dict[str, dict[str, str]]:
|
||||
meta: dict[str, dict[str, str]] = {}
|
||||
if not isinstance(wireless, dict):
|
||||
return meta
|
||||
for radio in wireless.values():
|
||||
if not isinstance(radio, dict):
|
||||
continue
|
||||
band = None
|
||||
cfg = radio.get("config") or {}
|
||||
if isinstance(cfg, dict):
|
||||
band = cfg.get("band")
|
||||
band_label = None
|
||||
if band == "2g":
|
||||
band_label = "2.4GHz"
|
||||
elif band == "5g":
|
||||
band_label = "5GHz"
|
||||
elif band:
|
||||
band_label = str(band)
|
||||
for iface in radio.get("interfaces", []) or []:
|
||||
if not isinstance(iface, dict):
|
||||
continue
|
||||
ifname = iface.get("ifname")
|
||||
if not ifname:
|
||||
continue
|
||||
iface_cfg = iface.get("config") or {}
|
||||
ssid = None
|
||||
if isinstance(iface_cfg, dict):
|
||||
ssid = iface_cfg.get("ssid")
|
||||
meta[str(ifname)] = {
|
||||
"ssid": str(ssid) if ssid else "",
|
||||
"band": band_label or "",
|
||||
}
|
||||
return meta
|
||||
|
||||
|
||||
def _extract_hostapd_ifnames(raw: str) -> list[str]:
|
||||
ifnames: list[str] = []
|
||||
for line in raw.splitlines():
|
||||
name = line.strip()
|
||||
if not name or name == "hostapd":
|
||||
continue
|
||||
ifnames.append(name)
|
||||
return ifnames
|
||||
|
||||
|
||||
def _net_label_for_ifname(ifname: str, ifname_meta: dict[str, dict[str, str]]) -> str:
|
||||
meta = ifname_meta.get(ifname, {})
|
||||
ssid = meta.get("ssid") or ""
|
||||
band = meta.get("band") or ""
|
||||
if ssid and band:
|
||||
return f"{ssid} ({band})"
|
||||
if ssid:
|
||||
return ssid
|
||||
if band:
|
||||
return band
|
||||
return ifname
|
||||
|
||||
|
||||
def _safe_json_load(raw: str) -> Any | None:
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
start = raw.find("{")
|
||||
end = raw.rfind("}")
|
||||
if start == -1 or end == -1 or end <= start:
|
||||
return None
|
||||
try:
|
||||
return json.loads(raw[start : end + 1])
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
|
||||
def _parse_hostapd_clients(
    payload: Any,
    ifname: str,
    *,
    name_map: dict[str, str] | None = None,
    ifname_meta: dict[str, dict[str, str]] | None = None,
) -> list[tuple[str, int | None, str]]:
    """Parse hostapd get_clients output into (line, signal, net_label) tuples.

    ``name_map`` (lowercase MAC -> hostname) substitutes hostnames for MACs
    when known; ``ifname_meta`` supplies SSID/band for the network label.
    Fixes: the per-client loop variable no longer shadows the interface
    meta dict, and the label logic reuses _net_label_for_ifname instead of
    duplicating it.
    """
    if not isinstance(payload, dict):
        return []
    data = payload.get("clients")
    if not isinstance(data, dict):
        return []

    name_map = name_map or {}
    net_label = _net_label_for_ifname(ifname, ifname_meta or {})

    clients: list[tuple[str, int | None, str]] = []
    for mac, info in data.items():
        if not isinstance(info, dict):
            continue
        signal = info.get("signal")
        rate = info.get("rate") or {}
        rx = _format_rate((rate or {}).get("rx"))
        tx = _format_rate((rate or {}).get("tx"))
        sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
        host = name_map.get(str(mac).lower())
        # Prefer a real hostname over the MAC; "unknown" is not informative.
        client_label = host if host and host != "unknown" else str(mac)
        line = f"{net_label} {client_label} {sig} rx:{rx} tx:{tx}"
        clients.append((line, signal if isinstance(signal, (int, float)) else None, net_label))
    return clients
|
||||
|
||||
|
||||
def _parse_proc_fallback(raw: str) -> tuple[int | None, list[float] | None]:
|
||||
uptime = None
|
||||
load = None
|
||||
for line in raw.splitlines():
|
||||
parts = line.split()
|
||||
if len(parts) >= 2 and uptime is None:
|
||||
try:
|
||||
uptime = int(float(parts[0]))
|
||||
except ValueError:
|
||||
uptime = None
|
||||
if len(parts) >= 3 and load is None:
|
||||
try:
|
||||
load = [float(parts[0]), float(parts[1]), float(parts[2])]
|
||||
except ValueError:
|
||||
load = None
|
||||
return uptime, load
|
||||
|
||||
|
||||
def _parse_leases_fallback(raw: str) -> list[str]:
|
||||
out = []
|
||||
for line in raw.splitlines():
|
||||
parts = line.strip().split()
|
||||
if len(parts) < 4:
|
||||
continue
|
||||
_expiry, mac, ipaddr, host = parts[:4]
|
||||
host = host if host != "*" else "unknown"
|
||||
out.append(f"{ipaddr} {host} ({mac})")
|
||||
return out
|
||||
|
||||
|
||||
async def get_openwrt_status(cfg: dict[str, Any], mode: str = "full") -> str:
    """Build an OpenWrt status report over SSH.

    ``mode`` selects the sections: "wan" (header only), "clients"
    (header + Wi-Fi), "leases" (header + DHCP), anything else = full report.
    Returns a user-facing message string; SSH failures are reported as
    "⚠️ ..." strings rather than raised.
    """
    ow_cfg = cfg.get("openwrt", {})
    host = ow_cfg.get("host")
    user = ow_cfg.get("user", "root")
    port = ow_cfg.get("port", 22)
    identity_file = ow_cfg.get("identity_file")
    timeout_sec = ow_cfg.get("timeout_sec", 8)
    strict = ow_cfg.get("strict_host_key_checking", True)

    if not host:
        return "⚠️ OpenWrt host not configured"

    # Non-interactive SSH: BatchMode forbids password prompts.
    ssh_cmd = [
        "ssh",
        "-o",
        "BatchMode=yes",
        "-o",
        f"ConnectTimeout={timeout_sec}",
        "-o",
        "LogLevel=ERROR",
    ]
    if not strict:
        ssh_cmd += ["-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
    if identity_file:
        ssh_cmd += ["-i", str(identity_file)]
    ssh_cmd += ["-p", str(port), f"{user}@{host}"]

    # Four queries in one SSH round-trip, separated by __SEP__ markers.
    # Each ubus call falls back to /proc or /tmp files on older firmwares.
    remote = (
        "ubus call system info 2>/dev/null || (cat /proc/uptime; echo; cat /proc/loadavg); "
        "echo __SEP__;"
        "ubus call network.interface.wan status 2>/dev/null; echo __SEP__;"
        "ubus call network.wireless status 2>/dev/null; echo __SEP__;"
        "ubus call luci-rpc getDHCPLeases '{\"family\":4}' 2>/dev/null || cat /tmp/dhcp.leases"
    )
    cmd = ssh_cmd + ["sh", "-c", remote]
    rc, out = await run_cmd_full(cmd, timeout=timeout_sec + 15)
    if rc == 124:  # run_cmd_full's timeout sentinel
        return "⚠️ OpenWrt SSH error: timeout"
    if rc != 0:
        return f"⚠️ OpenWrt SSH error: {out.strip() or 'unknown error'}"

    parts = [p.strip() for p in out.split("__SEP__")]
    if len(parts) < 4:
        return "⚠️ OpenWrt response incomplete"

    sys_info = _safe_json_load(parts[0])
    wan_status = _safe_json_load(parts[1]) or {}
    wireless = _safe_json_load(parts[2]) or {}
    leases = _safe_json_load(parts[3])
    # Keep the raw text only when JSON parsing failed (dnsmasq lease file).
    leases_fallback = "" if leases is not None else parts[3]

    if isinstance(sys_info, dict):
        uptime_raw = sys_info.get("uptime")
        load_raw = sys_info.get("load")
    else:
        # ubus unavailable: parse the /proc/uptime + /proc/loadavg text.
        uptime_raw, load_raw = _parse_proc_fallback(parts[0])
    uptime = _format_uptime(uptime_raw)
    load = _format_load(load_raw)
    wan_ip = _extract_wan_ip(wan_status) or "unknown"
    wan_up = wan_status.get("up") if isinstance(wan_status, dict) else None
    wan_state = "up" if wan_up else "down"

    wifi_clients = _extract_wifi_clients(wireless)
    ifnames = _extract_ifnames(wireless)
    ifname_meta = _extract_ifname_meta(wireless)
    # Also discover AP interfaces directly from hostapd's ubus objects.
    rc_l, out_l = await run_cmd_full(
        ssh_cmd + ["sh", "-c", r"ubus -S list | awk -F. '/^hostapd\.phy/{print $2}'"],
        timeout=timeout_sec + 15,
    )
    if rc_l == 0 and out_l.strip():
        ifnames.extend(_extract_hostapd_ifnames(out_l))
    ifnames = sorted({name for name in ifnames if name})
    lease_name_map = _extract_lease_name_map(leases or {})
    if leases_fallback:
        lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
    wifi_net_counts: dict[str, int] = {}
    wifi_signals: dict[str, list[int]] = {}
    if ifnames:
        # One extra SSH round-trip per AP interface for its client list.
        for ifname in ifnames:
            cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
            rc2, out2 = await run_cmd_full(cmd_clients, timeout=timeout_sec + 15)
            if rc2 == 124:
                return f"⚠️ OpenWrt SSH error (wifi clients {ifname}): timeout"
            if rc2 == 0:
                payload = _safe_json_load(out2)
                if payload:
                    clients_payload = payload.get("clients") if isinstance(payload, dict) else None
                    if isinstance(clients_payload, dict):
                        label = _net_label_for_ifname(ifname, ifname_meta)
                        wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
                    parsed = _parse_hostapd_clients(
                        payload,
                        ifname,
                        name_map=lease_name_map,
                        ifname_meta=ifname_meta,
                    )
                    wifi_clients.extend([p[0] for p in parsed])
                    for _line, sig, net_label in parsed:
                        if sig is not None and net_label:
                            wifi_signals.setdefault(net_label, []).append(sig)

    if leases:
        leases_list = _extract_leases(leases)
    else:
        leases_list = _parse_leases_fallback(leases_fallback)

    header = [
        "📡 OpenWrt",
        f"🕒 Uptime: {uptime}",
        f"⚙️ Load: {load}",
        f"🌐 WAN: {wan_ip} ({wan_state})",
        "",
    ]
    wifi_section: list[str] = []
    if wifi_net_counts:
        wifi_section.append("📶 Wi-Fi networks:")
        for label, count in sorted(wifi_net_counts.items()):
            sigs = wifi_signals.get(label) or []
            if sigs:
                avg_sig = sum(sigs) / len(sigs)
                min_sig = min(sigs)
                wifi_section.append(f" - {label}: {count} (avg {avg_sig:.0f}dBm, min {min_sig}dBm)")
            else:
                wifi_section.append(f" - {label}: {count}")
        wifi_section.append("")

    wifi_section.append(f"📶 Wi-Fi clients: {len(wifi_clients)}")
    if wifi_clients:
        # Cap the listing at 20 entries to keep the message compact.
        for line in wifi_clients[:20]:
            wifi_section.append(f" - {line}")
        if len(wifi_clients) > 20:
            wifi_section.append(f" … and {len(wifi_clients) - 20} more")
    else:
        wifi_section.append(" (none)")

    lease_section: list[str] = ["", f"🧾 DHCP leases: {len(leases_list)}"]
    if leases_list:
        for line in leases_list[:20]:
            lease_section.append(f" - {line}")
        if len(leases_list) > 20:
            lease_section.append(f" … and {len(leases_list) - 20} more")
    else:
        lease_section.append(" (none)")

    if mode == "wan":
        return "\n".join(header)
    if mode == "clients":
        return "\n".join(header + wifi_section)
    if mode == "leases":
        return "\n".join(header + lease_section)
    return "\n".join(header + wifi_section + lease_section)
|
||||
|
||||
|
||||
async def fetch_openwrt_leases(cfg: dict[str, Any]) -> list[str]:
    """
    Fetch DHCP leases as list of strings "IP hostname (MAC)".

    Connects to the router over SSH and queries luci-rpc; falls back to
    reading /tmp/dhcp.leases when the ubus call is unavailable.
    Raises RuntimeError when the host is not configured or SSH fails.
    """
    ow_cfg = cfg.get("openwrt", {})
    host = ow_cfg.get("host")
    user = ow_cfg.get("user", "root")
    port = ow_cfg.get("port", 22)
    identity_file = ow_cfg.get("identity_file")
    timeout_sec = ow_cfg.get("timeout_sec", 8)
    strict = ow_cfg.get("strict_host_key_checking", True)

    if not host:
        raise RuntimeError("OpenWrt host not configured")

    # Non-interactive SSH: BatchMode forbids password prompts.
    ssh_cmd = [
        "ssh",
        "-o",
        "BatchMode=yes",
        "-o",
        f"ConnectTimeout={timeout_sec}",
        "-o",
        "LogLevel=ERROR",
    ]
    if not strict:
        ssh_cmd += ["-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
    if identity_file:
        ssh_cmd += ["-i", str(identity_file)]
    ssh_cmd += ["-p", str(port), f"{user}@{host}"]

    remote = "ubus call luci-rpc getDHCPLeases '{\"family\":4}' 2>/dev/null || cat /tmp/dhcp.leases"
    rc, out = await run_cmd_full(ssh_cmd + ["sh", "-c", remote], timeout=timeout_sec + 10)
    if rc == 124:  # run_cmd_full's timeout sentinel
        raise RuntimeError("timeout")
    if rc != 0:
        raise RuntimeError(out.strip() or f"ssh rc={rc}")
    leases = _safe_json_load(out)
    if leases:
        return _extract_leases(leases)
    # Non-JSON output: parse the plain-text dnsmasq lease file.
    return _parse_leases_fallback(out)
|
||||
88
services/processes.py
Normal file
88
services/processes.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
def _safe_name(info: dict[str, Any]) -> str:
|
||||
name = info.get("name") or "unknown"
|
||||
return str(name)
|
||||
|
||||
|
||||
def get_top_processes(limit: int = 5, interval: float = 0.2) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Sample CPU/memory usage and return (top_by_cpu, top_by_mem) dict lists.

    Primes psutil's per-process CPU counters, sleeps ``interval`` seconds,
    then reads the usage accumulated over that window.
    """
    procs = list(psutil.process_iter(attrs=["pid", "name"]))

    # First cpu_percent(None) call only establishes the baseline.
    for proc in procs:
        try:
            proc.cpu_percent(None)
        except Exception:
            continue

    time.sleep(interval)

    sampled: list[dict[str, Any]] = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent(None)
            mem = proc.memory_percent()
            info = proc.info
            sampled.append({
                "pid": info.get("pid"),
                "name": _safe_name(info),
                "cpu": cpu,
                "mem": mem,
            })
        except Exception:
            # Process vanished or access denied — skip it.
            continue

    by_cpu = sorted(sampled, key=lambda item: item["cpu"], reverse=True)
    by_mem = sorted(sampled, key=lambda item: item["mem"], reverse=True)
    return by_cpu[:limit], by_mem[:limit]
|
||||
|
||||
|
||||
def search_processes(query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Case-insensitive substring search over process names and command lines."""
    needle = query.lower().strip()
    if not needle:
        return []

    matches: list[dict[str, Any]] = []
    for proc in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
        try:
            info = proc.info
            name = _safe_name(info)
            cmdline = " ".join(info.get("cmdline") or [])
        except Exception:
            # Process vanished or access denied — skip it.
            continue
        if needle in f"{name} {cmdline}".lower():
            matches.append({
                "pid": info.get("pid"),
                "name": name,
                "cmdline": cmdline,
            })

    return matches[:limit]
|
||||
|
||||
|
||||
def terminate_process(pid: int, timeout: float = 5.0) -> str:
    """Gracefully terminate a process, escalating to SIGKILL on timeout.

    Returns a human-readable result message; never raises.
    """
    try:
        proc = psutil.Process(pid)
    except Exception:
        return f"Process {pid} not found"

    try:
        proc.terminate()
        proc.wait(timeout=timeout)
    except psutil.TimeoutExpired:
        # Graceful stop timed out; force-kill and wait again.
        try:
            proc.kill()
            proc.wait(timeout=timeout)
        except Exception as e:
            return f"Kill failed for {pid}: {e}"
        return f"Process {pid} killed"
    except Exception as e:
        return f"Terminate failed for {pid}: {e}"
    return f"Process {pid} terminated"
|
||||
@@ -1,34 +1,209 @@
|
||||
import asyncio
|
||||
from typing import Awaitable, Callable
|
||||
import logging
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import Awaitable, Callable, Any
|
||||
from services import runtime_state
|
||||
from services.incidents import log_incident
|
||||
|
||||
|
||||
_queue: asyncio.Queue = asyncio.Queue()
|
||||
_current_label: str | None = None
|
||||
_current_meta: dict[str, Any] | None = None
|
||||
_pending: deque[tuple[str, float]] = deque()
|
||||
_stats: dict[str, Any] = runtime_state.get("queue_stats", {}) or {
|
||||
"processed": 0,
|
||||
"avg_wait_sec": 0.0,
|
||||
"avg_runtime_sec": 0.0,
|
||||
"last_label": "",
|
||||
"last_finished_at": 0.0,
|
||||
}
|
||||
_history: deque[dict[str, Any]] = deque(runtime_state.get("queue_history", []) or [], maxlen=50)
|
||||
_alert_cfg: dict[str, Any] = {
|
||||
"max_pending": None,
|
||||
"avg_wait": None,
|
||||
"cooldown": 300,
|
||||
"last_sent": 0.0,
|
||||
}
|
||||
_cfg: dict[str, Any] | None = None
|
||||
_logger = logging.getLogger("queue")
|
||||
|
||||
|
||||
def _save_stats():
    """Persist queue statistics and recent job history via the runtime-state store."""
    runtime_state.set_state("queue_stats", _stats)
    runtime_state.set_state("queue_history", list(_history))
|
||||
|
||||
|
||||
def configure(queue_cfg: dict[str, Any], cfg: dict[str, Any]):
    """Install queue alert thresholds and keep the app config for incident logging."""
    global _cfg
    _cfg = cfg
    _alert_cfg["max_pending"] = queue_cfg.get("max_pending_alert")
    _alert_cfg["avg_wait"] = queue_cfg.get("avg_wait_alert")
    _alert_cfg["cooldown"] = queue_cfg.get("cooldown_sec", 300)
|
||||
|
||||
|
||||
def _check_congestion(pending_count: int, avg_wait: float | None):
    """Log a queue-congestion incident when thresholds are exceeded.

    Alerts are rate-limited by the configured cooldown; does nothing when
    no thresholds are configured or the app config is not installed yet.
    """
    max_pending = _alert_cfg.get("max_pending")
    avg_wait_thr = _alert_cfg.get("avg_wait")
    cooldown = _alert_cfg.get("cooldown", 300)
    now = time.time()
    # Stay silent during the cooldown window after the last alert.
    if now - _alert_cfg.get("last_sent", 0) < cooldown:
        return
    reason = None
    if max_pending and pending_count >= max_pending:
        reason = f"pending={pending_count} >= {max_pending}"
    if avg_wait_thr and avg_wait is not None and avg_wait >= avg_wait_thr:
        reason = reason or f"avg_wait={avg_wait:.1f}s >= {avg_wait_thr}s"
    if reason and _cfg:
        try:
            log_incident(_cfg, f"queue_congested {reason}", category="queue")
        except Exception:
            # Alerting is best-effort; never break the queue over it.
            pass
        _alert_cfg["last_sent"] = now
|
||||
|
||||
|
||||
async def enqueue(label: str, job: Callable[[], Awaitable[None]]) -> int:
    """Queue a job and return the number of pending entries.

    The span contained leftover pre-refactor lines (the old two-line body
    put a 2-tuple and returned qsize) interleaved with the current
    implementation; only the current timestamped version is kept.
    """
    enqueued_at = time.time()
    await _queue.put((label, job, enqueued_at))
    # Mirror of queued items so status views can show labels and wait times.
    _pending.append((label, enqueued_at))
    _check_congestion(len(_pending), None)
    return len(_pending)
|
||||
|
||||
|
||||
async def worker():
    """Queue consumer: runs jobs sequentially, tracking wait/runtime stats and history.

    The span contained leftover pre-refactor lines (old 2-tuple unpack and
    the single-name ``global``) interleaved with the current implementation;
    only the current version is kept.
    """
    global _current_label, _current_meta
    while True:
        label, job, enqueued_at = await _queue.get()
        # Drop the matching entry from the pending mirror (fast path: head).
        if _pending:
            if _pending[0] == (label, enqueued_at):
                _pending.popleft()
            else:
                try:
                    _pending.remove((label, enqueued_at))
                except ValueError:
                    pass
        _current_label = label
        _current_meta = {"enqueued_at": enqueued_at, "started_at": time.time()}
        status = "ok"
        try:
            await job()
        except Exception as e:
            status = "err"
            _logger.exception("Queue job failed: label=%s", label)
            if _cfg:
                try:
                    log_incident(
                        _cfg,
                        f"queue_job_failed label={label} error={type(e).__name__}: {e}",
                        category="queue",
                    )
                except Exception:
                    # Incident logging is best-effort.
                    pass
        finally:
            finished_at = time.time()
            if _current_meta:
                wait_sec = max(0.0, _current_meta["started_at"] - _current_meta["enqueued_at"])
                runtime_sec = max(0.0, finished_at - _current_meta["started_at"])
                # Running averages updated incrementally over all processed jobs.
                n_prev = int(_stats.get("processed", 0))
                _stats["processed"] = n_prev + 1
                _stats["avg_wait_sec"] = (
                    (_stats.get("avg_wait_sec", 0.0) * n_prev) + wait_sec
                ) / _stats["processed"]
                _stats["avg_runtime_sec"] = (
                    (_stats.get("avg_runtime_sec", 0.0) * n_prev) + runtime_sec
                ) / _stats["processed"]
                _stats["last_label"] = label
                _stats["last_finished_at"] = finished_at
                _history.appendleft(
                    {
                        "label": label,
                        "wait_sec": int(wait_sec),
                        "runtime_sec": int(runtime_sec),
                        "finished_at": int(finished_at),
                        "status": status,
                    }
                )
                _save_stats()
                _check_congestion(len(_pending), _stats.get("avg_wait_sec"))
            _current_label = None
            _current_meta = None
            _queue.task_done()
|
||||
|
||||
|
||||
def format_status() -> str:
    """Short queue summary: running job, pending count/preview, aggregate stats.

    The span contained leftover pre-refactor lines (reading the private
    ``_queue._queue`` and joining bare labels) interleaved with the current
    implementation; only the current _pending-based version is kept.
    """
    pending = list(_pending)
    lines = ["🧾 Queue"]
    lines.append(f"🔄 Running: {_current_label or 'idle'}")
    lines.append(f"⏳ Pending: {len(pending)}")
    if pending:
        preview = ", ".join([p[0] for p in pending[:5]])
        lines.append(f"➡️ Next: {preview}")
    if _stats.get("processed"):
        lines.append(
            f"📈 Done: {_stats.get('processed')} | "
            f"avg wait {int(_stats.get('avg_wait_sec', 0))}s | "
            f"avg run {int(_stats.get('avg_runtime_sec', 0))}s"
        )
    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_details(limit: int = 10) -> str:
    """Detailed queue view: running job with runtime, pending table, stats, last jobs."""
    now = time.time()
    lines = ["🧾 Queue details"]
    if _current_label:
        started_at = _current_meta.get("started_at") if _current_meta else None
        runtime = f"{int(now - started_at)}s" if started_at else "n/a"
        lines.append(f"🔄 Running: {_current_label} ({runtime})")
    else:
        lines.append("🔄 Running: idle")

    pending = list(_pending)
    lines.append(f"⏳ Pending: {len(pending)}")
    if pending:
        lines.append("🔢 Position | Label | Wait")
        for i, (label, enqueued_at) in enumerate(pending[:limit], start=1):
            wait = int(now - enqueued_at)
            lines.append(f"{i:>3} | {label} | {wait}s")
    if _stats.get("processed"):
        lines.append("")
        lines.append(
            "📈 Stats: "
            f"{_stats.get('processed')} done, "
            f"avg wait {int(_stats.get('avg_wait_sec', 0))}s, "
            f"avg run {int(_stats.get('avg_runtime_sec', 0))}s"
        )
        last_label = _stats.get("last_label")
        if last_label:
            lines.append(f"Last: {last_label}")
    if _history:
        lines.append("")
        lines.append("🗂 Last jobs:")
        # History is newest-first; show the five most recent entries.
        for item in list(_history)[:5]:
            t = time.strftime("%H:%M:%S", time.localtime(item["finished_at"]))
            lines.append(
                f"- {t} {item['label']} {item['status']} "
                f"(wait {item['wait_sec']}s, run {item['runtime_sec']}s)"
            )
    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_history(limit: int = 20) -> str:
    """Render the most recent queue jobs, newest first."""
    lines = ["🗂 Queue history"]
    if not _history:
        return "\n".join(lines + ["(empty)"])
    for item in list(_history)[:limit]:
        stamp = time.strftime("%m-%d %H:%M:%S", time.localtime(item["finished_at"]))
        lines.append(
            f"{stamp} {item['label']} {item['status']} "
            f"(wait {item['wait_sec']}s, run {item['runtime_sec']}s)"
        )
    return "\n".join(lines)
|
||||
|
||||
|
||||
def get_history_raw() -> list[dict[str, Any]]:
    """Return a snapshot copy of the recent-jobs history (newest first)."""
    return list(_history)
|
||||
|
||||
|
||||
def get_stats() -> dict[str, Any]:
    """Return a shallow copy of the aggregate queue statistics."""
    return dict(_stats)
|
||||
|
||||
@@ -22,3 +22,24 @@ async def run_cmd(cmd: list[str], *, use_restic_env: bool = False, timeout: int
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
return 124, "❌ timeout"
|
||||
|
||||
|
||||
async def run_cmd_full(cmd: list[str], *, use_restic_env: bool = False, timeout: int = 60):
    """Run a command and return (returncode, combined stdout+stderr text).

    On timeout the process is killed and (124, "❌ timeout") is returned.
    ``use_restic_env`` overlays the restic credentials onto the environment.
    """
    env = os.environ.copy()
    # Pin PATH to the standard system locations so admin tools resolve predictably.
    env["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
    if use_restic_env:
        env.update(RESTIC_ENV)

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,  # merge stderr into stdout
        env=env,
    )

    try:
        out, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        return proc.returncode, out.decode(errors="ignore")
    except asyncio.TimeoutError:
        proc.kill()
        return 124, "❌ timeout"
|
||||
|
||||
73
services/runtime_state.py
Normal file
73
services/runtime_state.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import tempfile
|
||||
from typing import Any, Dict
|
||||
|
||||
_PATH = "/var/server-bot/runtime.json"
|
||||
_STATE: Dict[str, Any] = {}
|
||||
_LOCK = threading.RLock()
|
||||
_LOADED = False
|
||||
|
||||
|
||||
def configure(path: str | None):
    """Override the on-disk state file location (call before the first read)."""
    global _PATH
    if path:
        _PATH = path
|
||||
|
||||
|
||||
def _load_from_disk():
    """Populate the in-memory state from _PATH; missing/corrupt files yield {}."""
    global _STATE, _LOADED
    if not os.path.exists(_PATH):
        _STATE = {}
        _LOADED = True
        return
    try:
        with open(_PATH, "r", encoding="utf-8") as f:
            _STATE = json.load(f)
    except Exception:
        # A corrupt or unreadable state file is discarded rather than crashing.
        _STATE = {}
    _LOADED = True
|
||||
|
||||
|
||||
def _save():
    """Atomically persist the state to disk (tempfile + os.replace); best-effort."""
    directory = os.path.dirname(_PATH) or "."
    os.makedirs(directory, exist_ok=True)
    try:
        fd, tmp_path = tempfile.mkstemp(prefix=".runtime.", suffix=".json", dir=directory)
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(_STATE, f, ensure_ascii=False)
                f.flush()
                os.fsync(f.fileno())  # ensure data hits disk before the rename
            os.replace(tmp_path, _PATH)  # atomic swap on POSIX
        finally:
            # Remove the temp file when the replace did not happen.
            if os.path.exists(tmp_path):
                try:
                    os.unlink(tmp_path)
                except Exception:
                    pass
    except Exception:
        # Persistence is best-effort; losing one write must not break callers.
        pass
|
||||
|
||||
|
||||
def get_state() -> Dict[str, Any]:
    """Return the live state dict, loading it lazily on first access.

    NOTE(review): callers receive the shared dict, not a copy — mutations
    are visible globally but not persisted until set_state() is called.
    """
    with _LOCK:
        if not _LOADED:
            _load_from_disk()
        return _STATE
|
||||
|
||||
|
||||
def set_state(key: str, value: Any):
    """Store a value under key and synchronously persist the whole state to disk."""
    with _LOCK:
        if not _LOADED:
            _load_from_disk()
        _STATE[key] = value
        _save()
|
||||
|
||||
|
||||
def get(key: str, default: Any = None) -> Any:
    """Return the persisted value for *key*, or *default* when absent."""
    with _LOCK:
        if not _LOADED:
            _load_from_disk()
        return _STATE.get(key, default)
|
||||
95
services/selftest.py
Normal file
95
services/selftest.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from services.health import health
|
||||
from services.runner import run_cmd_full
|
||||
from services.incidents import log_incident
|
||||
from services import runtime_state
|
||||
|
||||
|
||||
def _save_history(entry: dict[str, Any]) -> None:
    """Prepend *entry* to the persisted self-test history (20 entries kept).

    Builds a new list instead of mutating the stored one, and truncates
    once — the previous code sliced to 50 elements and then to 20, making
    the first slice dead work.
    """
    hist = runtime_state.get("selftest_history", [])
    if not isinstance(hist, list):
        hist = []
    runtime_state.set_state("selftest_history", ([entry] + hist)[:20])
|
||||
|
||||
|
||||
async def run_selftest(cfg: dict[str, Any], docker_map: dict[str, str]) -> tuple[str, bool]:
    """Run a quick health + backup sanity check.

    Returns (report_text, ok): a multi-line emoji-tagged report and a flag
    that is False when any probe failed.
    """
    lines = ["🧪 Self-test"]
    ok = True

    # health — the synchronous health() helper runs off the event loop.
    try:
        htext = await asyncio.to_thread(health, cfg, docker_map)
        h_lines = [ln for ln in htext.splitlines() if ln.strip()]
        # First line of health() output is a header; summarize the next few.
        brief = " | ".join(h_lines[1:5]) if len(h_lines) > 1 else h_lines[0] if h_lines else "n/a"
        lines.append(f"🟢 Health: {brief}")
    except Exception as e:
        lines.append(f"🔴 Health failed: {e}")
        ok = False

    # restic snapshots check
    rc, out = await run_cmd_full(["restic", "snapshots", "--json"], use_restic_env=True, timeout=40)
    if rc == 0:
        try:
            snaps = json.loads(out)
            if isinstance(snaps, list) and snaps:
                # Newest snapshot first (restic times sort lexicographically).
                snaps.sort(key=lambda s: s.get("time", ""), reverse=True)
                last = snaps[0]
                # "2024-01-02T03:04:05Z" -> "2024-01-02 03:04"
                t = last.get("time", "?").replace("Z", "").replace("T", " ")[:16]
                lines.append(f"🟢 Restic snapshots: {len(snaps)}, last {t}")
            else:
                lines.append("🟡 Restic snapshots: empty")
        except Exception:
            lines.append("🟡 Restic snapshots: invalid JSON")
    else:
        lines.append(f"🔴 Restic snapshots error: {out.strip() or rc}")
        ok = False

    result_text = "\n".join(lines)
    # History persistence is best-effort; never fail the self-test over it.
    try:
        _save_history(
            {
                "ts": datetime.now().isoformat(),
                "ok": ok,
                "summary": result_text.splitlines()[1] if len(lines) > 1 else "",
            }
        )
    except Exception:
        pass

    return result_text, ok
|
||||
|
||||
|
||||
async def schedule_selftest(cfg: dict[str, Any], bot, admin_ids: list[int], docker_map: dict[str, str]):
    """
    Run selftest daily at configured time.

    Sleeps until the next HH:MM slot, runs the self-test, sends the report
    to every admin, and logs an incident when the test failed. Runs forever.
    """
    sched_cfg = cfg.get("selftest", {}).get("schedule", {})
    if not sched_cfg.get("enabled", False):
        return
    time_str = sched_cfg.get("time", "03:30")
    try:
        hh, mm = [int(x) for x in time_str.split(":")]
    except Exception:
        # Malformed "HH:MM" in config: fall back to 03:30.
        hh, mm = 3, 30

    while True:
        now = datetime.now()
        run_at = now.replace(hour=hh, minute=mm, second=0, microsecond=0)
        if run_at <= now:
            # Today's slot already passed; schedule for tomorrow.
            run_at += timedelta(days=1)
        await asyncio.sleep((run_at - now).total_seconds())
        text, ok = await run_selftest(cfg, docker_map)
        for chat_id in admin_ids:
            try:
                await bot.send_message(chat_id, text)
            except Exception:
                # One unreachable admin must not block delivery to the rest.
                pass
        if not ok:
            try:
                log_incident(cfg, "selftest failed", category="selftest")
            except Exception:
                pass
|
||||
61
services/ssl_alerts.py
Normal file
61
services/ssl_alerts.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from services.npmplus import fetch_certificates, _parse_expiry
|
||||
|
||||
|
||||
async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
    """Background loop: warn about NPM+ certificates nearing expiry.

    Every `interval_sec`, fetches the certificate list and, per certificate,
    fires at most one alert for the largest crossed day-threshold, rate
    limited by `cooldown_sec` per (certificate, threshold) pair.
    """
    npm_cfg = cfg.get("npmplus", {})
    alert_cfg = npm_cfg.get("alerts", {})
    if not alert_cfg.get("enabled", True):
        return

    # Thresholds deduplicated and checked largest-first; the `break` below
    # makes only the first (largest) crossed threshold fire per cert.
    days_list = alert_cfg.get("days", [30, 14, 7, 1])
    days_list = sorted({int(x) for x in days_list if int(x) >= 0}, reverse=True)
    cooldown = int(alert_cfg.get("cooldown_sec", 86400))
    interval = int(alert_cfg.get("interval_sec", 3600))

    # "name:threshold" -> epoch seconds of the last alert sent for that pair.
    last_sent: dict[str, float] = {}

    while True:
        now = datetime.now(timezone.utc)
        try:
            certs = fetch_certificates(cfg)
        except Exception:
            # NPM+ unreachable: retry next tick instead of crashing the task.
            await asyncio.sleep(interval)
            continue

        for cert in certs:
            # Display name: nice_name, else joined domain list, else "unknown".
            name = cert.get("nice_name")
            if not name:
                domains = cert.get("domain_names") or []
                if isinstance(domains, list):
                    name = ", ".join(domains)
                if not name:
                    name = "unknown"

            expiry = _parse_expiry(cert.get("expires_on"))
            if expiry is None:
                # Unparseable expiry: nothing sensible to alert on.
                continue

            days_left = (expiry - now).days
            for threshold in days_list:
                if days_left <= threshold:
                    key = f"{name}:{threshold}"
                    last_time = last_sent.get(key, 0)
                    if time.time() - last_time >= cooldown:
                        level = "critical" if days_left <= 1 else "warn"
                        await notify(
                            bot,
                            chat_id,
                            f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
                            level=level,
                            key=f"ssl:{name}:{threshold}",
                            category="ssl",
                        )
                        last_sent[key] = time.time()
                    # Stop at the largest crossed threshold even when the
                    # cooldown suppressed the actual send.
                    break

        await asyncio.sleep(interval)
|
||||
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import re
|
||||
from services.runner import run_cmd
|
||||
|
||||
|
||||
@@ -14,55 +15,65 @@ def detect_pkg_manager() -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
async def list_updates() -> tuple[str, list[str]]:
    """Return (title, lines) describing pending OS package updates.

    The title carries a status emoji; lines hold one package per entry, or
    a single error / "No updates" message. Supports apt, dnf, yum, pacman.
    """
    pm = detect_pkg_manager()
    if not pm:
        return "⚠️ Updates", ["No supported package manager found"]

    if pm == "apt":
        # Refresh indexes first so the upgradable list is current.
        await run_cmd(["sudo", "apt", "update"], timeout=300)
        rc, out = await run_cmd(["apt", "list", "--upgradable"], timeout=120)
        if rc != 0:
            return "❌ Updates (apt)", [f"apt list failed: {out}"]

        lines = []
        for line in out.splitlines():
            if not line or line.startswith("Listing..."):
                continue
            if "upgradable from:" not in line:
                continue
            # Format: name/repo new_ver arch [upgradable from: old]
            m = re.match(r"^(\S+)\s+(\S+)\s", line)
            if not m:
                continue
            name_repo, new_ver = m.group(1), m.group(2)
            name = name_repo.split("/", 1)[0] if "/" in name_repo else name_repo

            old_ver = None
            m_old = re.search(r"upgradable from:\s*([^\]]+)", line)
            if m_old:
                old_ver = m_old.group(1).strip()
            if old_ver:
                lines.append(f"{name}: {old_ver} -> {new_ver}")
            else:
                lines.append(f"{name}: -> {new_ver}")

        if not lines:
            lines = ["No updates"]
        return "📦 Updates (apt)", lines

    if pm == "dnf":
        # dnf check-update exits 100 when updates are available, 0 when none.
        rc, out = await run_cmd(["sudo", "dnf", "check-update"], timeout=300)
        if rc in (0, 100):
            lines = out.splitlines() or ["No updates"]
            return "📦 Updates (dnf)", lines
        return "❌ Updates (dnf)", [f"dnf check-update failed: {out}"]

    if pm == "yum":
        # yum mirrors dnf's 0/100 exit-code convention.
        rc, out = await run_cmd(["sudo", "yum", "check-update"], timeout=300)
        if rc in (0, 100):
            lines = out.splitlines() or ["No updates"]
            return "📦 Updates (yum)", lines
        return "❌ Updates (yum)", [f"yum check-update failed: {out}"]

    if pm == "pacman":
        rc, out = await run_cmd(["pacman", "-Qu"], timeout=120)
        if rc == 0:
            lines = out.splitlines() or ["No updates"]
            return "📦 Updates (pacman)", lines
        return "❌ Updates (pacman)", [f"pacman -Qu failed: {out}"]

    return "⚠️ Updates", ["Unsupported package manager"]
|
||||
|
||||
|
||||
async def apply_updates() -> str:
|
||||
|
||||
107
services/weekly_report.py
Normal file
107
services/weekly_report.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import asyncio
|
||||
import socket
|
||||
from datetime import datetime, timedelta
|
||||
import psutil
|
||||
from services.system import worst_disk_usage
|
||||
from services.alert_mute import list_mutes
|
||||
from services.incidents import read_recent
|
||||
from services.docker import docker_cmd
|
||||
|
||||
|
||||
def _parse_hhmm(value: str) -> tuple[int, int]:
|
||||
try:
|
||||
h, m = value.split(":", 1)
|
||||
h = int(h)
|
||||
m = int(m)
|
||||
if 0 <= h <= 23 and 0 <= m <= 59:
|
||||
return h, m
|
||||
except Exception:
|
||||
pass
|
||||
return 8, 0
|
||||
|
||||
|
||||
def _next_run(day: str, time_str: str) -> datetime:
    """Next wall-clock datetime falling on *day* (e.g. "Sun") at *time_str*
    ("HH:MM"); unknown day names default to Sunday."""
    weekday_by_name = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
    target_wd = weekday_by_name.get((day or "Sun").lower()[:3], 6)
    hour, minute = _parse_hhmm(time_str or "08:00")
    now = datetime.now()
    candidate = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
    # Walk forward one day at a time until the slot is strictly in the
    # future AND on the requested weekday.
    while candidate <= now or candidate.weekday() != target_wd:
        candidate += timedelta(days=1)
        candidate = candidate.replace(hour=hour, minute=minute, second=0, microsecond=0)
    return candidate
|
||||
|
||||
|
||||
async def _docker_running_counts(docker_map: dict) -> tuple[int, int]:
    """Return (running, total) for the containers in *docker_map* by probing
    `docker inspect` for each one's state."""
    total = len(docker_map)
    running = 0
    for container in docker_map.values():
        rc, state = await docker_cmd(["inspect", "-f", "{{.State.Status}}", container], timeout=10)
        if rc == 0 and state.strip() == "running":
            running += 1
    return running, total
|
||||
|
||||
|
||||
def _format_uptime(seconds: int) -> str:
|
||||
days, rem = divmod(seconds, 86400)
|
||||
hours, rem = divmod(rem, 3600)
|
||||
minutes, _ = divmod(rem, 60)
|
||||
return f"{days}d {hours:02d}:{minutes:02d}"
|
||||
|
||||
|
||||
async def build_weekly_report(cfg, docker_map: dict) -> str:
    """Assemble the weekly status summary text: host, uptime, load, RAM,
    worst disk, Docker container counts, incident totals, and alert mutes."""
    host = socket.gethostname()
    uptime = int(datetime.now().timestamp() - psutil.boot_time())
    load1, load5, load15 = psutil.getloadavg()
    mem = psutil.virtual_memory()
    disk_usage, disk_mount = worst_disk_usage()
    running, total = await _docker_running_counts(docker_map)
    mutes = list_mutes()
    # Incident counts over the trailing 24 hours and 7 days.
    incidents_24 = len(read_recent(cfg, 24, limit=1000))
    incidents_7d = len(read_recent(cfg, 24 * 7, limit=2000))

    lines = [
        f"🧾 Weekly report — {host}",
        f"⏱ Uptime: {_format_uptime(uptime)}",
        f"⚙️ Load: {load1:.2f} {load5:.2f} {load15:.2f}",
        f"🧠 RAM: {mem.percent}%",
    ]
    if disk_usage is None:
        lines.append("💾 Disk: n/a")
    else:
        lines.append(f"💾 Disk: {disk_usage}% ({disk_mount})")

    lines.append(f"🐳 Docker: {running}/{total} running")
    lines.append(f"📓 Incidents: 24h={incidents_24}, 7d={incidents_7d}")

    if mutes:
        lines.append("🔕 Active mutes:")
        for cat, secs in mutes.items():
            # secs is remaining mute time; clamp negatives from clock skew.
            mins = max(0, secs) // 60
            lines.append(f"- {cat}: {mins}m left")
    else:
        lines.append("🔔 Mutes: none")

    return "\n".join(lines)
|
||||
|
||||
|
||||
async def weekly_reporter(cfg, bot, admin_ids: list[int], docker_map: dict):
    """Background loop: deliver the weekly report to every admin at the
    configured weekday/time. No-op unless reports.weekly.enabled is true."""
    reports_cfg = cfg.get("reports", {}).get("weekly", {})
    if not reports_cfg.get("enabled", False):
        return
    day = reports_cfg.get("day", "Sun")
    time_str = reports_cfg.get("time", "08:00")
    while True:
        target = _next_run(day, time_str)
        wait_sec = (target - datetime.now()).total_seconds()
        if wait_sec > 0:
            await asyncio.sleep(wait_sec)
        try:
            text = await build_weekly_report(cfg, docker_map)
            for admin_id in admin_ids:
                await bot.send_message(admin_id, text)
        except Exception:
            # Best-effort delivery; a failed send is retried next week.
            pass
        await asyncio.sleep(60)  # small delay to avoid tight loop if time skew
|
||||
7
state.py
7
state.py
@@ -2,3 +2,10 @@ from typing import Dict
|
||||
|
||||
DOCKER_MAP: Dict[str, str] = {}
|
||||
LOG_FILTER_PENDING: Dict[int, dict] = {}
|
||||
UPDATES_CACHE: Dict[int, dict] = {}
|
||||
ARCANE_CACHE: Dict[int, dict] = {}
|
||||
REBOOT_PENDING: Dict[int, dict] = {}
|
||||
METRICS_STORE = None
|
||||
NPMPLUS_TOKEN: Dict[str, object] = {}
|
||||
PROC_SEARCH_PENDING: Dict[int, dict] = {}
|
||||
PROC_KILL_PENDING: Dict[int, dict] = {}
|
||||
|
||||
208
system_checks.py
208
system_checks.py
@@ -1,4 +1,6 @@
|
||||
import subprocess
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
def _cmd(cmd: str) -> str:
|
||||
@@ -19,10 +21,49 @@ def security() -> str:
|
||||
if not out or "ERROR:" in out:
|
||||
return "🔐 Security\n\n⚠️ permitrootlogin not found"
|
||||
|
||||
if "no" in out.lower():
|
||||
return "🔐 Security\n\n🟢 Root login disabled"
|
||||
lines = ["🔐 Security\n"]
|
||||
|
||||
return "🔐 Security\n\n🔴 Root login ENABLED"
|
||||
if "no" in out.lower():
|
||||
lines.append("🟢 Root login disabled")
|
||||
else:
|
||||
lines.append("🔴 Root login ENABLED")
|
||||
|
||||
pass_auth = _cmd("sshd -T | grep -i '^passwordauthentication'")
|
||||
if pass_auth and "ERROR:" not in pass_auth:
|
||||
lines.append("🔴 Password auth enabled" if "yes" in pass_auth.lower() else "🟢 Password auth disabled")
|
||||
|
||||
pubkey_auth = _cmd("sshd -T | grep -i '^pubkeyauthentication'")
|
||||
if pubkey_auth and "ERROR:" not in pubkey_auth:
|
||||
lines.append("🟢 Pubkey auth enabled" if "yes" in pubkey_auth.lower() else "🔴 Pubkey auth disabled")
|
||||
|
||||
sec_updates = _cmd("apt list --upgradable 2>/dev/null | grep -i security | wc -l")
|
||||
if sec_updates and "ERROR:" not in sec_updates:
|
||||
try:
|
||||
count = int(sec_updates.strip())
|
||||
lines.append(f"🔔 Security updates: {count}")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
time_info = _cmd("timedatectl")
|
||||
if time_info and "ERROR:" not in time_info:
|
||||
tz = None
|
||||
ntp = None
|
||||
synced = None
|
||||
for line in time_info.splitlines():
|
||||
if "Time zone:" in line:
|
||||
tz = line.split("Time zone:", 1)[1].strip()
|
||||
if "NTP service:" in line:
|
||||
ntp = line.split("NTP service:", 1)[1].strip()
|
||||
if "System clock synchronized:" in line:
|
||||
synced = line.split("System clock synchronized:", 1)[1].strip()
|
||||
if tz:
|
||||
lines.append(f"🕒 Time zone: {tz}")
|
||||
if ntp:
|
||||
lines.append(f"🔧 NTP service: {ntp}")
|
||||
if synced:
|
||||
lines.append(f"⏱ Clock synced: {synced}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------- DISKS ----------
|
||||
@@ -42,6 +83,62 @@ def list_disks() -> list[str]:
|
||||
return disks
|
||||
|
||||
|
||||
def list_md_arrays() -> list[str]:
    """Enumerate md RAID arrays as sorted /dev/mdN paths.

    /proc/mdstat is the primary source — it reliably lists active md arrays
    even when lsblk tree/filters differ across distros. A /dev node scan is
    used only when mdstat parsing yields nothing.
    """
    found: set[str] = set()
    for line in _cmd("cat /proc/mdstat").splitlines():
        hit = re.match(r"^\s*(md\d+)\s*:", line)
        if hit:
            found.add(f"/dev/{hit.group(1)}")

    if found:
        return sorted(found)

    # Fallback for environments where mdstat parsing is unavailable.
    for entry in _cmd("ls -1 /dev/md* 2>/dev/null").splitlines():
        candidate = entry.strip()
        if candidate and re.match(r"^/dev/md\d+$", candidate):
            found.add(candidate)
    return sorted(found)
|
||||
|
||||
|
||||
def md_array_status(dev: str) -> str:
    """Report one md array's health from /proc/mdstat.

    Returns an emoji-tagged status: inactive, degraded (any '_' in the
    [UU...] member map), active, or a warning when mdstat is unreadable or
    the array is absent.
    """
    out = _cmd("cat /proc/mdstat")
    if not out or "ERROR:" in out:
        return "⚠️ n/a"

    name = dev.rsplit("/", 1)[-1]
    lines = out.splitlines()
    header = None
    idx = -1
    for i, line in enumerate(lines):
        s = line.strip()
        if s.startswith(f"{name} :"):
            header = s
            idx = i
            break

    if not header:
        return "⚠️ not found in /proc/mdstat"

    if "inactive" in header:
        return "🔴 inactive"

    # Collect the array's stanza (up to the next blank line); mdstat prints
    # the member map like [UU] / [U_] there, '_' marking a failed member.
    block = [header]
    for line in lines[idx + 1:]:
        if not line.strip():
            break
        block.append(line.strip())
    block_text = " ".join(block)
    # Fix: the old check matched only "[U_" / "[_U" prefixes and missed
    # failures in the middle or tail of wider arrays (e.g. "[UU_]", "[U_U]").
    # Inspect the whole member map instead.
    m = re.search(r"\[([U_]+)\]", block_text)
    if m and "_" in m.group(1):
        return "🟡 degraded"
    return "🟢 active"
|
||||
|
||||
|
||||
def smart_health(dev: str) -> str:
|
||||
out = _cmd(f"smartctl -H {dev}")
|
||||
|
||||
@@ -82,10 +179,25 @@ def disk_temperature(dev: str) -> str:
|
||||
return "n/a"
|
||||
|
||||
|
||||
def smart_last_test(dev: str) -> str:
    """Return the most recent SMART self-test log line for *dev*, or a short
    status ("no tests" / "n/a") when none is available."""
    out = _cmd(f"smartctl -l selftest {dev}")
    if not out or "ERROR:" in out:
        return "n/a"

    for entry in out.splitlines():
        if "No self-tests have been logged" in entry:
            return "no tests"
        # Log rows look like "# 1  Short offline ..."; the first one printed
        # is the newest test.
        if entry.lstrip().startswith("#"):
            return entry.strip()

    return "n/a"
|
||||
|
||||
|
||||
def disks() -> str:
|
||||
disks = list_disks()
|
||||
md_arrays = list_md_arrays()
|
||||
|
||||
if not disks:
|
||||
if not disks and not md_arrays:
|
||||
return "💽 Disks\n\n❌ No disks found"
|
||||
|
||||
lines = ["💽 Disks (SMART)\n"]
|
||||
@@ -104,4 +216,92 @@ def disks() -> str:
|
||||
|
||||
lines.append(f"{icon} {d} — {health}, 🌡 {temp}")
|
||||
|
||||
if md_arrays:
|
||||
lines.append("")
|
||||
lines.append("🧱 RAID (md)")
|
||||
for md in md_arrays:
|
||||
lines.append(f"{md} — {md_array_status(md)}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def hardware() -> str:
    """Build a human-readable hardware summary: CPU model, core count, RAM,
    swap, architecture, kernel, plus GPU details when detectable."""
    cpu_model = "n/a"
    try:
        with open("/proc/cpuinfo", "r") as f:
            for row in f:
                if row.lower().startswith("model name"):
                    cpu_model = row.split(":", 1)[1].strip()
                    break
    except Exception:
        pass

    mem_total = "n/a"
    swap_total = "n/a"
    try:
        with open("/proc/meminfo", "r") as f:
            for row in f:
                # Values are reported in KiB; convert to GiB.
                if row.startswith("MemTotal:"):
                    mem_total = f"{int(row.split()[1]) / (1024**2):.2f} GiB"
                if row.startswith("SwapTotal:"):
                    swap_total = f"{int(row.split()[1]) / (1024**2):.2f} GiB"
    except Exception:
        pass

    cores = os.cpu_count() or "n/a"
    uname = os.uname()

    report = [
        "🧱 Hardware",
        "",
        f"🧠 CPU: {cpu_model}",
        f"🧩 Cores: {cores}",
        f"💾 RAM: {mem_total}",
        f"🌀 Swap: {swap_total}",
        f"🧬 Arch: {uname.machine}",
        f"🐧 Kernel: {uname.release}",
    ]
    gpu_section = _gpu_info()
    if gpu_section:
        report.append("")
        report.extend(gpu_section)
    return "\n".join(report)
|
||||
|
||||
|
||||
def _gpu_info() -> list[str]:
    """Best-effort GPU summary lines; empty list when nothing is detected."""
    # 1) NVIDIA: use nvidia-smi if available for model + memory
    smi = _cmd("nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader")
    if smi and "ERROR:" not in smi and "not found" not in smi.lower():
        lines = ["🎮 GPU (NVIDIA)"]
        for line in smi.splitlines():
            # CSV row: "<name>, <mem total>, <driver version>"
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 2:
                name = parts[0]
                mem = parts[1]
                drv = parts[2] if len(parts) > 2 else "n/a"
                lines.append(f"• {name} | {mem} | driver {drv}")
        return lines

    # 2) Generic: lspci (VGA/3D/Display)
    lspci = _cmd("lspci -mm | egrep -i 'vga|3d|display'")
    if lspci and "ERROR:" not in lspci and "not found" not in lspci.lower():
        lines = ["🎮 GPU"]
        for line in lspci.splitlines():
            # Format: "00:02.0" "VGA compatible controller" "Intel Corporation" "..."
            parts = [p.strip().strip('"') for p in line.split('"') if p.strip()]
            if len(parts) >= 4:
                vendor = parts[2]
                model = parts[3]
                lines.append(f"• {vendor} {model}")
            elif line.strip():
                # Unparseable row: show it raw rather than dropping it.
                lines.append(f"• {line.strip()}")
        # Try AMD VRAM from sysfs if present
        vram = _cmd("cat /sys/class/drm/card*/device/mem_info_vram_total 2>/dev/null | head -n 1")
        if vram and vram.strip().isdigit():
            bytes_val = int(vram.strip())
            lines.append(f"• VRAM: {bytes_val / (1024**3):.2f} GiB")
        return lines

    return []
|
||||
|
||||
20
tests/test_config_check.py
Normal file
20
tests/test_config_check.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import unittest
|
||||
|
||||
from services.config_check import validate_cfg
|
||||
|
||||
|
||||
class ConfigCheckTests(unittest.TestCase):
    """Unit tests for services.config_check.validate_cfg."""

    def test_admin_ids_without_admin_id_is_valid(self):
        # A config supplying the plural admin_ids list but no singular
        # admin_id must validate with zero errors.
        cfg = {
            "telegram": {
                "token": "x",
                "admin_ids": [1, 2],
            }
        }
        errors, warnings = validate_cfg(cfg)
        self.assertEqual(errors, [])
        self.assertIsInstance(warnings, list)
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
21
tests/test_disk_report.py
Normal file
21
tests/test_disk_report.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import unittest
|
||||
import types
|
||||
import sys
|
||||
|
||||
# Avoid runtime import of real app/aiogram in services.runner.
|
||||
sys.modules.setdefault("app", types.SimpleNamespace(RESTIC_ENV={}))
|
||||
|
||||
from services.disk_report import _top_dirs_cmd
|
||||
|
||||
|
||||
class DiskReportTests(unittest.TestCase):
    """Tests for services.disk_report command construction."""

    def test_top_dirs_cmd_uses_exec_args_without_shell(self):
        # The du invocation must be an argv list (no shell wrapper) so paths
        # containing spaces pass through verbatim without quoting/injection.
        cmd = _top_dirs_cmd("/tmp/path with spaces", 5)
        self.assertEqual(cmd[:4], ["du", "-x", "-h", "-d"])
        self.assertNotIn("bash", cmd)
        self.assertNotIn("-lc", cmd)
        self.assertEqual(cmd[-1], "/tmp/path with spaces")
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
59
tests/test_queue.py
Normal file
59
tests/test_queue.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import asyncio
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from services import runtime_state
|
||||
from services import queue as queue_service
|
||||
|
||||
|
||||
# Fix: `import contextlib` previously appeared BELOW the class that uses it;
# that only worked because the method body runs after module import finishes.
# Imports belong at the top (PEP 8).
import contextlib


class QueueTests(unittest.IsolatedAsyncioTestCase):
    """Worker/queue behavior tests for services.queue."""

    async def asyncSetUp(self):
        # Isolate runtime_state persistence in a throwaway directory.
        self.tmp = tempfile.TemporaryDirectory()
        runtime_state.configure(f"{self.tmp.name}/runtime.json")

        # Reset the queue module's private state so tests don't interact.
        queue_service._pending.clear()  # type: ignore[attr-defined]
        queue_service._history.clear()  # type: ignore[attr-defined]
        queue_service._stats = {  # type: ignore[attr-defined]
            "processed": 0,
            "avg_wait_sec": 0.0,
            "avg_runtime_sec": 0.0,
            "last_label": "",
            "last_finished_at": 0.0,
        }
        queue_service._cfg = {"incidents": {"enabled": True}}  # type: ignore[attr-defined]

    async def asyncTearDown(self):
        self.tmp.cleanup()

    async def test_worker_logs_failed_job_to_incidents(self):
        logged = []

        def fake_log_incident(cfg, text, category=None):
            logged.append((text, category))

        # Monkeypatch incident logging; restored in the finally block below.
        orig = queue_service.log_incident
        queue_service.log_incident = fake_log_incident

        async def boom():
            raise RuntimeError("boom")

        worker_task = asyncio.create_task(queue_service.worker())
        try:
            await queue_service.enqueue("broken-job", boom)
            await asyncio.wait_for(queue_service._queue.join(), timeout=2.0)  # type: ignore[attr-defined]
        finally:
            worker_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await worker_task
            queue_service.log_incident = orig

        self.assertEqual(queue_service._stats.get("processed"), 1)  # type: ignore[attr-defined]
        self.assertTrue(any("queue_job_failed label=broken-job" in t for t, _c in logged))
        self.assertTrue(any(c == "queue" for _t, c in logged))
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
28
tests/test_runtime_state.py
Normal file
28
tests/test_runtime_state.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from services import runtime_state
|
||||
|
||||
|
||||
class RuntimeStateTests(unittest.TestCase):
    """Persistence round-trip tests for services.runtime_state."""

    def test_set_and_get_persist_between_loads(self):
        with tempfile.TemporaryDirectory() as tmp:
            path = Path(tmp) / "runtime.json"
            runtime_state.configure(str(path))

            runtime_state.set_state("foo", {"bar": 1})
            self.assertEqual(runtime_state.get("foo"), {"bar": 1})

            # Force a fresh in-memory state and load from disk again.
            runtime_state._STATE = {}  # type: ignore[attr-defined]
            runtime_state._LOADED = False  # type: ignore[attr-defined]
            self.assertEqual(runtime_state.get("foo"), {"bar": 1})

            # The on-disk JSON must contain exactly what was stored.
            raw = json.loads(path.read_text(encoding="utf-8"))
            self.assertEqual(raw.get("foo"), {"bar": 1})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user