Compare commits

..

101 Commits

Author SHA1 Message Date
b84107463c Add dedicated RAID alert category and monitor 2026-02-25 01:43:10 +03:00
ee361abb99 Detect md arrays via /proc/mdstat for RAID status 2026-02-25 01:39:11 +03:00
2ad423fb6a Fix md RAID detection for lsblk raid* types 2026-02-25 01:36:59 +03:00
efa5dd9644 Fix mojibake text and add md RAID checks 2026-02-25 01:32:55 +03:00
678332e6d0 Add lightweight unittest coverage for stability fixes 2026-02-15 01:25:11 +03:00
7c56430f32 Unify admin callback checks and log queue job failures 2026-02-15 01:20:55 +03:00
b54a094185 Add safe config fallbacks for app init and health checks 2026-02-15 01:16:58 +03:00
6d5fb9c258 Harden docker callback parsing and remove duplicate /openwrt handler 2026-02-15 01:12:45 +03:00
5099ae4fe2 Fix critical race conditions and unsafe disk report command 2026-02-15 01:12:41 +03:00
568cd86844 Fix heatmap button args 2026-02-15 00:51:09 +03:00
b138ee316d Import backup keyboard for SLA handlers 2026-02-15 00:46:53 +03:00
fa98a96b34 Route any SLA text to corresponding handler 2026-02-15 00:45:56 +03:00
1dba6d4a0f Match SLA buttons via regex 2026-02-15 00:44:14 +03:00
b784deb02b Ack SLA requests immediately 2026-02-15 00:35:32 +03:00
5ae54618e8 Broaden SLA button matching 2026-02-15 00:32:09 +03:00
3fc99bdcfc Handle SLA buttons without emojis 2026-02-15 00:30:39 +03:00
c1d69adbc5 Make incidents diff resilient and send sample if empty 2026-02-09 04:21:27 +03:00
a14fb8fccd Show recent sample when incidents diff is empty 2026-02-09 04:19:59 +03:00
4ba8f48228 Auto-reset incidents diff marker if ahead of log 2026-02-09 04:18:05 +03:00
10bf265c29 Add reset option to /incidents_diff 2026-02-09 04:16:28 +03:00
fd179d24e8 Remove Incidents entry from main keyboard 2026-02-09 04:13:47 +03:00
2905528677 Keep incidents summary inside logs keyboard 2026-02-09 04:12:44 +03:00
2b87ce04a3 Keep backup/queue SLA and OpenWrt leases diff in their menus 2026-02-09 04:10:04 +03:00
02b8e2bb55 Keep docker restarts inside docker keyboard 2026-02-09 04:08:27 +03:00
f0fb2aad0e Split OpenWrt menu vs full status actions 2026-02-09 04:06:49 +03:00
219776c642 Disambiguate OpenWrt menu vs full status button 2026-02-09 04:05:25 +03:00
28caa551bd Narrow /docker_health match to avoid summary collisions 2026-02-09 04:03:17 +03:00
783f4abd98 Use icon buttons for incidents, queue and OpenWrt actions 2026-02-09 04:00:04 +03:00
f71c02835a Adjust keyboards with incidents and OpenWrt submenus 2026-02-09 03:45:13 +03:00
f7081b78e1 Add incident exports, queue SLA, and OpenWrt diff utilities 2026-02-09 02:57:16 +03:00
0fbd374823 Log docker restarts as incidents 2026-02-09 02:45:06 +03:00
c3db70160c Use semicolon delimiter in incidents_export CSV 2026-02-09 02:32:50 +03:00
1b9d260530 Use BufferedInputFile for incidents_export 2026-02-09 02:31:24 +03:00
040a6c96e4 Seek to start before sending incidents export files 2026-02-09 02:30:17 +03:00
4f6d6dd549 Fix incidents_export file delivery 2026-02-09 02:28:49 +03:00
2e0bf0c6ea Add incidents export, queue alerts, and health summaries 2026-02-09 02:24:08 +03:00
5a4234f59d Log incidents even when alerts are muted 2026-02-09 02:09:32 +03:00
1d24caa2a2 Fix docker_status log_incident indentation 2026-02-09 02:04:15 +03:00
c91c961134 Tag incidents with categories for summaries 2026-02-09 02:03:04 +03:00
75113b6182 Add selftest scheduler, queue history, and OpenWrt signal stats 2026-02-09 01:56:27 +03:00
aa7bd85687 Filter restic forget parsing to ignore summary rows 2026-02-09 01:41:11 +03:00
ff65e15509 Beautify restic forget table in backup history 2026-02-09 01:39:06 +03:00
08fa95dffd Trim backup history output to fit Telegram 2026-02-09 01:35:41 +03:00
b0a4413671 Add runtime state, auto-mute schedules, and backup retries 2026-02-09 01:14:37 +03:00
9399be4168 Update help with alert shortcuts and docker/openwrt commands 2026-02-08 23:34:27 +03:00
2e35885a5e Fix cfg import in docker handler 2026-02-08 23:31:38 +03:00
4d4e3767bc Add weekly report, multi-admin, docker health cmd, backup tail, openwrt filters 2026-02-08 23:27:23 +03:00
b78dc3cd5c Limit /alerts handler to exact command (fix alias collisions) 2026-02-08 23:09:09 +03:00
20cd56a8c0 Add inline alerts menu with callbacks 2026-02-08 23:07:39 +03:00
7d251a7078 Fix alerts command dispatch indentation 2026-02-08 23:04:35 +03:00
2ee9756d12 Add shortcut commands for alerts, backup, docker, openwrt 2026-02-08 23:01:33 +03:00
77571da4d9 Add /help alias for inline help 2026-02-08 22:54:50 +03:00
d4a19d309f Add multi-page inline help 2026-02-08 22:52:40 +03:00
972c8eb6a7 Add alert tools, mutes, short status, and backup summary 2026-02-08 22:43:16 +03:00
ae2d085214 Allow critical-only load alerts 2026-02-08 18:51:45 +03:00
5da7125fbb Filter status network to enp interfaces 2026-02-08 04:30:57 +03:00
65682ca162 Add quiet hours, health checks, and logging 2026-02-08 04:19:28 +03:00
8bcc3c6878 Preserve restic env for backup commands 2026-02-08 04:02:35 +03:00
ab58592523 Use full restic JSON output 2026-02-08 03:56:15 +03:00
a98292604a Harden backup JSON parsing and fix queue display 2026-02-08 03:54:51 +03:00
97524b92a2 Fix 2026-02-08 03:48:45 +03:00
0a761e5799 Fix OpenWrt rate/lease mapping and queue pending 2026-02-08 03:48:04 +03:00
d242dafb9b Fix OpenWrt hostapd regex warning 2026-02-08 03:35:59 +03:00
7db336f2aa Hide MACs when hostname present 2026-02-08 03:33:43 +03:00
b4a243e72f Improve OpenWrt Wi-Fi client display 2026-02-08 03:32:10 +03:00
01c539fad9 Fix OpenWrt hostapd client fetch 2026-02-08 03:27:55 +03:00
8cec8ae53e Fix OpenWrt Wi-Fi client discovery 2026-02-08 03:23:35 +03:00
e36bf49f1c Harden OpenWrt JSON parsing 2026-02-08 03:20:45 +03:00
a029bbfa7a Fix OpenWrt Wi-Fi client parsing 2026-02-08 03:12:33 +03:00
ad8a6bff69 Allow full command output for OpenWrt 2026-02-08 03:10:19 +03:00
64d899d971 Increase OpenWrt SSH timeouts 2026-02-08 03:07:15 +03:00
8b08b5418f Fetch OpenWrt data via separate SSH calls 2026-02-08 03:04:51 +03:00
7a5e3d46cf Use luci-rpc leases and hostapd clients 2026-02-08 03:01:35 +03:00
c31a194651 Fix OpenWrt SSH data parsing 2026-02-08 02:58:37 +03:00
5e01a8d596 Add OpenWrt SSH status 2026-02-08 02:54:09 +03:00
fc061ece30 Split Logs menu into subcategories 2026-02-08 02:48:25 +03:00
0f7f53cb27 Add Gitea health check 2026-02-08 02:41:50 +03:00
857fa86e85 Fix NPMplus enable/disable request method 2026-02-08 02:35:31 +03:00
ea6ad1d5b2 Improve NPMplus HTTP error details 2026-02-08 02:34:06 +03:00
e1b0f1153e Normalize NPMplus base_url to /api 2026-02-08 02:28:41 +03:00
054d1d0d50 Fix NPMplus button f-string 2026-02-08 02:26:50 +03:00
200b8104a6 Add NPMplus proxy hosts controls 2026-02-08 02:26:07 +03:00
e7a120657b Add Arcane deploy status view 2026-02-08 02:23:26 +03:00
c34a142698 Add disk usage snapshot reports 2026-02-08 02:21:15 +03:00
3df9db3bf7 Add external checks with uptime 2026-02-08 02:16:42 +03:00
aab54d4108 Add SSL expiry alerts 2026-02-08 02:12:54 +03:00
45756636b9 Fix queue details callback admin check 2026-02-08 02:07:37 +03:00
51b24be0be Move queue details to inline button 2026-02-08 02:06:21 +03:00
1d7262eb78 Add queue details view 2026-02-08 02:03:34 +03:00
f7ebdfe325 Format docker stats output 2026-02-08 02:01:31 +03:00
9ced16cfbd Add docker stats view 2026-02-08 01:59:59 +03:00
c8db1be2d8 Format SSH login log entries 2026-02-08 01:58:40 +03:00
dbf9b1fd2f Fix SSH log journalctl filter 2026-02-08 01:56:38 +03:00
118d4bf7f2 Add SSH login log viewer 2026-02-08 01:52:53 +03:00
a7d5fb5459 Add SMART short test and status 2026-02-08 01:50:39 +03:00
48dc1f38ac Add processes service and state wiring 2026-02-08 01:47:51 +03:00
4a00deadc3 Split System menu into submenus 2026-02-08 01:46:24 +03:00
c51e2d4a59 Add network snapshot to status 2026-02-08 01:35:15 +03:00
4e79c401a9 Add incidents log and viewer 2026-02-08 01:33:14 +03:00
4eb202c2ed Document high_load_warn threshold 2026-02-08 01:22:53 +03:00
4989314e2b Tune load thresholds for status and alerts 2026-02-08 01:21:25 +03:00
49 changed files with 5448 additions and 143 deletions

View File

@@ -6,27 +6,53 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
- `token` (string, required): Telegram bot token.
- `admin_id` (int, required): Telegram user id with admin access.
- `admin_ids` (list<int>): Optional list of admins (first is primary for alerts).
## paths
- `artifact_state` (string): JSON file for artifact state.
- `runtime_state` (string): File for runtime state (mutes, metrics, etc.).
- `restic_env` (string): Path to a file with RESTIC_* environment variables.
## thresholds
- `disk_warn` (int, percent): Disk usage warning threshold.
- `load_warn` (float): System load warning threshold.
- `high_load_warn` (float): Critical load threshold.
## alerts
- `enabled` (bool): Enable resource alerts.
- `interval_sec` (int): Poll interval.
- `cooldown_sec` (int): Cooldown between alerts.
- `notify_cooldown_sec` (int): Global alert dedup cooldown (defaults to `cooldown_sec`).
- `load_only_critical` (bool): Only send critical load alerts (no warn/OK).
- `quiet_hours` (object): Quiet hours for noncritical alerts.
- `enabled` (bool): Enable quiet hours.
- `start` (string): Start time `HH:MM` (e.g. `23:00`).
- `end` (string): End time `HH:MM` (e.g. `08:00`).
- `allow_critical` (bool): Allow critical alerts during quiet hours.
- `auto_mute` (list): Per-category auto mutes by time window.
- `category` (string): load/disk/smart/raid/ssl/docker/test.
- `start` (string): Start `HH:MM`.
- `end` (string): End `HH:MM` (can wrap over midnight).
- `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
- `notify_recovery` (bool): Send recovery notifications.
- `smart_enabled` (bool): Enable SMART health polling.
- `smart_interval_sec` (int): SMART poll interval.
- `smart_cooldown_sec` (int): SMART alert cooldown.
- `smart_temp_warn` (int): SMART temperature warning (C).
- `raid_enabled` (bool): Enable md RAID polling (`/proc/mdstat`).
- `raid_interval_sec` (int): RAID poll interval.
- `raid_cooldown_sec` (int): RAID alert cooldown.
## disk_report
- `threshold` (int): Disk usage threshold for auto snapshot.
- `cooldown_sec` (int): Cooldown between snapshots.
- `top_dirs` (int): How many directories to show.
- `docker_dir` (string): Path to docker data.
- `logs_dir` (string): Path to logs.
## audit
@@ -35,6 +61,54 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
- `backup_count` (int): How many rotated files to keep.
## incidents
- `enabled` (bool): Enable incidents logging.
- `path` (string): Log file path. Default `/var/server-bot/incidents.log`.
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
- `backup_count` (int): How many rotated files to keep.
## logging
- `enabled` (bool): Enable bot logging.
- `path` (string): Log file path. Default `/var/server-bot/bot.log`.
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
- `backup_count` (int): How many rotated files to keep.
- `level` (string): Log level (`INFO`, `WARNING`, `ERROR`).
## safety
- `dry_run` (bool): If `true`, dangerous actions (upgrade/reboot/backup) are skipped.
## reports
- `weekly.enabled` (bool): Enable weekly report.
- `weekly.day` (string): Weekday `Mon`..`Sun` (default `Sun`).
- `weekly.time` (string): Local time `HH:MM` (default `08:00`).
## selftest
- `schedule.enabled` (bool): Enable auto self-test.
- `schedule.time` (string): Local time `HH:MM` (default `03:30`).
## queue
- `max_pending_alert` (int): Alert if pending tasks >= this value.
- `avg_wait_alert` (int): Alert if average wait exceeds N seconds.
- `cooldown_sec` (int): Cooldown between queue alerts (default 300s).
## external_checks
- `enabled` (bool): Enable background checks.
- `state_path` (string): State file for uptime, default `/var/server-bot/external_checks.json`.
- `timeout_sec` (int): Check timeout in seconds.
- `interval_sec` (int): Background check interval.
- `services` (list): List of checks.
- `name` (string): Service name.
- `type` (string): `http`, `tcp`, `ping`.
- `url` (string): URL for `http`.
- `host` (string): Host for `tcp`/`ping`.
- `port` (int): Port for `tcp`.
## arcane
- `base_url` (string): Arcane API base url.
@@ -50,12 +124,31 @@ Used for SSL certificate status.
- `secret` (string): Login password.
- `token` (string): Optional static token (not recommended if it expires).
- `verify_tls` (bool): Set to `false` for self-signed TLS.
- `alerts.enabled` (bool): Enable expiry notifications.
- `alerts.days` (list): Thresholds in days (e.g. 30/14/7/1).
- `alerts.cooldown_sec` (int): Cooldown between identical alerts.
- `alerts.interval_sec` (int): Check interval.
Token flow:
- First token: `POST /api/tokens` with `identity` and `secret`.
- Refresh: `GET /api/tokens` using the cached token.
## gitea
- `base_url` (string): Gitea base url, for example `http://localhost:3000`.
- `token` (string): Optional API token.
- `verify_tls` (bool): Set to `false` for self-signed TLS.
## openwrt
- `host` (string): Router address, for example `10.10.10.1`.
- `user` (string): SSH user (usually `root`).
- `port` (int): SSH port (usually `22`).
- `identity_file` (string): Path to SSH key (optional).
- `strict_host_key_checking` (bool): Set to `false` to skip key confirmation.
- `timeout_sec` (int): SSH request timeout.
## security
- `reboot_password` (string): Password required before reboot.

View File

@@ -6,27 +6,53 @@
- `token` (string, обяз.): токен бота.
- `admin_id` (int, обяз.): Telegram user id администратора.
- `admin_ids` (list<int>): список админов (первый используется как основной для уведомлений).
## paths
- `artifact_state` (string): JSON файл состояния артефактов.
- `runtime_state` (string): файл с runtime-состоянием (мьюты, метрики и т.п.).
- `restic_env` (string): путь к файлу с RESTIC_* переменными.
## thresholds
- `disk_warn` (int, %): порог предупреждения по диску.
- `load_warn` (float): порог предупреждения по нагрузке.
- `high_load_warn` (float): порог для критической нагрузки.
## alerts
- `enabled` (bool): включить алерты.
- `interval_sec` (int): интервал опроса.
- `cooldown_sec` (int): кулдаун между алертами.
- `notify_cooldown_sec` (int): глобальный дедуп алертов (по умолчанию `cooldown_sec`).
- `load_only_critical` (bool): слать только критичные алерты по нагрузке (без warn/OK).
- `quiet_hours` (object): тихие часы для не‑критичных уведомлений.
- `enabled` (bool): включить тихие часы.
- `start` (string): начало, формат `HH:MM` (например `23:00`).
- `end` (string): конец, формат `HH:MM` (например `08:00`).
- `allow_critical` (bool): слать критичные алерты в тишину.
- `auto_mute` (list): авто‑мьюты по категориям и времени.
- `category` (string): load/disk/smart/raid/ssl/docker/test.
- `start` (string): начало `HH:MM`.
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
- `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
- `notify_recovery` (bool): уведомлять о восстановлении.
- `smart_enabled` (bool): SMART проверки.
- `smart_interval_sec` (int): интервал SMART.
- `smart_cooldown_sec` (int): кулдаун SMART.
- `smart_temp_warn` (int): порог температуры (C).
- `raid_enabled` (bool): RAID проверки (`/proc/mdstat`).
- `raid_interval_sec` (int): интервал RAID.
- `raid_cooldown_sec` (int): кулдаун RAID алертов.
## disk_report
- `threshold` (int): порог диска для авто‑снимка.
- `cooldown_sec` (int): кулдаун между снимками.
- `top_dirs` (int): сколько директорий показывать.
- `docker_dir` (string): путь к docker данным.
- `logs_dir` (string): путь к логам.
## audit
@@ -35,6 +61,55 @@
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
- `backup_count` (int): сколько файлов хранить.
## incidents
- `enabled` (bool): включить лог инцидентов.
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/incidents.log`.
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
- `backup_count` (int): сколько файлов хранить.
## logging
- `enabled` (bool): включить лог бота.
- `path` (string): путь к лог-файлу. По умолчанию `/var/server-bot/bot.log`.
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
- `backup_count` (int): сколько файлов хранить.
- `level` (string): уровень логирования (`INFO`, `WARNING`, `ERROR`).
## safety
- `dry_run` (bool): если `true`, опасные действия (upgrade/reboot/backup) не выполняются.
## reports
- `weekly.enabled` (bool): включить еженедельный отчёт.
- `weekly.day` (string): день недели (`Mon`..`Sun`), по умолчанию `Sun`.
- `weekly.time` (string): локальное время `HH:MM`, по умолчанию `08:00`.
## selftest
- `schedule.enabled` (bool): включить авто self-test.
- `schedule.time` (string): локальное время `HH:MM`, по умолчанию `03:30`.
## queue
- `max_pending_alert` (int): алерт, если задач в очереди >= этому значению.
- `avg_wait_alert` (int): алерт, если среднее ожидание превышает N секунд.
- `cooldown_sec` (int): кулдаун между алертами очереди, по умолчанию 300с.
## external_checks
- `enabled` (bool): включить фоновые проверки.
- `state_path` (string): файл состояния для аптайма, по умолчанию `/var/server-bot/external_checks.json`.
- `timeout_sec` (int): таймаут проверки в секундах.
- `interval_sec` (int): интервал фоновых проверок.
- `services` (list): список проверок.
- `name` (string): название сервиса.
- `type` (string): `http`, `tcp`, `ping`.
- `url` (string): URL для `http`.
- `host` (string): хост для `tcp`/`ping`.
- `port` (int): порт для `tcp`.
## arcane
- `base_url` (string): base url API Arcane.
@@ -50,12 +125,31 @@
- `secret` (string): пароль.
- `token` (string): опционально статический токен (не рекомендуется, так как токен может истечь).
- `verify_tls` (bool): `false` для self-signed TLS.
- `alerts.enabled` (bool): включить уведомления по истечению.
- `alerts.days` (list): пороги в днях (например 30/14/7/1).
- `alerts.cooldown_sec` (int): кулдаун между одинаковыми алертами.
- `alerts.interval_sec` (int): интервал проверки.
Логика токена:
- первый токен: `POST /api/tokens` с `identity` и `secret`.
- refresh: `GET /api/tokens` с текущим токеном.
## gitea
- `base_url` (string): base url Gitea, например `http://localhost:3000`.
- `token` (string): опциональный API токен.
- `verify_tls` (bool): `false` для self-signed TLS.
## openwrt
- `host` (string): адрес роутера, например `10.10.10.1`.
- `user` (string): SSH пользователь (обычно `root`).
- `port` (int): SSH порт (обычно `22`).
- `identity_file` (string): путь к SSH ключу (опционально).
- `strict_host_key_checking` (bool): `false` чтобы не спрашивать подтверждение ключа.
- `timeout_sec` (int): таймаут SSH запроса.
## security
- `reboot_password` (string): пароль для подтверждения reboot.

674
LICENSE Normal file
View File

@@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.

View File

@@ -8,8 +8,9 @@ Telegram admin bot for Linux servers. Provides quick status checks, backup contr
- Arcane: list projects, refresh, up/down, restart.
- Backups (restic): snapshots, repo stats, run backup, queue, restic check, weekly report.
- System: disks, security, URLs health, metrics, package updates, upgrade, reboot, hardware info, SSL cert status (NPMplus).
- Alerts: disk/load and SMART monitoring with cooldown.
- Alerts: disk/load/SMART with cooldown and quiet hours.
- Audit log: all button presses and messages (weekly rotation).
- Logs: bot log rotation and incidents.
## Requirements
@@ -68,4 +69,5 @@ GNU GPL v3.0. Full text in `LICENSE`.
- For NPMplus with self-signed TLS, set `npmplus.verify_tls: false`.
- The bot uses `sudo` for certain actions (reboot, upgrade, backup scripts). Ensure the service user has the required permissions.
- Enable `safety.dry_run` if you want a safe mode without actions.
- Audit log default path is `/var/server-bot/audit.log`.

View File

@@ -8,8 +8,9 @@ Telegram-бот администратора для Linux-серверов. Да
- Arcane: список проектов, refresh, up/down, restart.
- Бэкапы (restic): снапшоты, статистика репозитория, запуск бэкапа, очередь, restic check, weekly report.
- Система: диски, безопасность, проверка URL, метрики, обновления, upgrade, reboot, железо, SSL (NPMplus).
- Алерты: диск/нагрузка и SMART с cooldown.
- Алерты: диск/нагрузка/SMART с cooldown и quiet hours.
- Аудит: все нажатия и сообщения (ротация раз в неделю).
- Логи: ротация логов бота и инциденты.
## Требования
@@ -68,4 +69,5 @@ GNU GPL v3.0. Полный текст в `LICENSE`.
- Для NPMplus с self-signed TLS установи `npmplus.verify_tls: false`.
- Бот использует `sudo` для части операций — настрой права.
- Включи `safety.dry_run`, если хочешь безопасный режим без действий.
- Аудит по умолчанию пишется в `/var/server-bot/audit.log`.

15
app.py
View File

@@ -1,13 +1,22 @@
from aiogram import Bot, Dispatcher
from config import load_cfg, load_env
from services import runtime_state
cfg = load_cfg()
TOKEN = cfg["telegram"]["token"]
ADMIN_ID = cfg["telegram"]["admin_id"]
admin_ids_cfg = cfg["telegram"].get("admin_ids")
if isinstance(admin_ids_cfg, list) and admin_ids_cfg:
ADMIN_IDS = [int(x) for x in admin_ids_cfg]
ADMIN_ID = ADMIN_IDS[0]
else:
ADMIN_ID = int(cfg["telegram"]["admin_id"])
ADMIN_IDS = [ADMIN_ID]
ARTIFACT_STATE = cfg["paths"]["artifact_state"]
RESTIC_ENV = load_env(cfg["paths"].get("restic_env", "/etc/restic/restic.env"))
paths_cfg = cfg.get("paths", {})
runtime_state.configure(paths_cfg.get("runtime_state", "/var/server-bot/runtime.json"))
ARTIFACT_STATE = paths_cfg.get("artifact_state", "/opt/tg-bot/state.json")
RESTIC_ENV = load_env(paths_cfg.get("restic_env", "/etc/restic/restic.env"))
DISK_WARN = int(cfg.get("thresholds", {}).get("disk_warn", 80))
LOAD_WARN = float(cfg.get("thresholds", {}).get("load_warn", 2.0))

View File

@@ -1,10 +1,10 @@
from aiogram.types import Message, CallbackQuery
from app import ADMIN_ID
from app import ADMIN_IDS
def is_admin_msg(msg: Message) -> bool:
return msg.from_user and msg.from_user.id == ADMIN_ID
return msg.from_user and msg.from_user.id in ADMIN_IDS
def is_admin_cb(cb: CallbackQuery) -> bool:
return cb.from_user and cb.from_user.id == ADMIN_ID
return cb.from_user and cb.from_user.id in ADMIN_IDS

View File

@@ -1,26 +1,58 @@
telegram:
token: "YOUR_TELEGRAM_BOT_TOKEN"
admin_id: 123456789
# Optional list of admins (first is primary for alerts)
admin_ids:
- 123456789
paths:
# JSON state file for artifacts
artifact_state: "/opt/tg-bot/state.json"
runtime_state: "/var/server-bot/runtime.json"
# Optional env file with RESTIC_* variables
restic_env: "/etc/restic/restic.env"
thresholds:
disk_warn: 80
load_warn: 2.0
high_load_warn: 3.0
alerts:
enabled: true
interval_sec: 60
cooldown_sec: 900
# Optional global dedup cooldown for notify() calls
notify_cooldown_sec: 900
# If true, only critical load alerts are sent (no warn/OK)
load_only_critical: false
# Optional auto-mute windows per category
auto_mute:
- category: "load"
start: "23:00"
end: "08:00"
# Auto-mute load when critical load fires (seconds)
auto_mute_on_high_load_sec: 600
quiet_hours:
enabled: false
start: "23:00"
end: "08:00"
# Allow critical alerts during quiet hours
allow_critical: true
notify_recovery: true
smart_enabled: true
smart_interval_sec: 3600
smart_cooldown_sec: 21600
smart_temp_warn: 50
raid_enabled: true
raid_interval_sec: 300
raid_cooldown_sec: 1800
disk_report:
threshold: 90
cooldown_sec: 21600
top_dirs: 8
docker_dir: "/var/lib/docker"
logs_dir: "/var/log"
audit:
enabled: true
@@ -28,6 +60,53 @@ audit:
rotate_when: "W0"
backup_count: 8
incidents:
enabled: true
path: "/var/server-bot/incidents.log"
rotate_when: "W0"
backup_count: 8
logging:
enabled: true
path: "/var/server-bot/bot.log"
rotate_when: "W0"
backup_count: 8
level: "INFO"
safety:
# If true, dangerous actions will be skipped
dry_run: false
reports:
weekly:
enabled: false
day: "Sun" # Mon/Tue/Wed/Thu/Fri/Sat/Sun
time: "08:00" # HH:MM server local time
selftest:
schedule:
enabled: false
time: "03:30"
queue:
max_pending_alert: 5
avg_wait_alert: 120
cooldown_sec: 300
external_checks:
enabled: true
state_path: "/var/server-bot/external_checks.json"
timeout_sec: 5
interval_sec: 300
services:
- name: "example-site"
type: "http"
url: "https://example.com"
- name: "example-ssh"
type: "tcp"
host: "example.com"
port: 22
arcane:
base_url: "http://localhost:3552"
api_key: "arc_..."
@@ -40,6 +119,31 @@ npmplus:
# Optional static token (not recommended if it expires)
token: ""
verify_tls: true
alerts:
enabled: true
days:
- 30
- 14
- 7
- 1
cooldown_sec: 86400
interval_sec: 3600
gitea:
base_url: "http://localhost:3000"
# Optional API token for private instances
token: ""
verify_tls: true
openwrt:
host: "10.10.10.1"
user: "root"
port: 22
# Optional identity file for SSH
identity_file: ""
# Disable strict host key checking for auto-accept
strict_host_key_checking: false
timeout_sec: 8
security:
reboot_password: "CHANGE_ME"

9
deploy.sh Normal file
View File

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Deploy the bot: fast-forward the server checkout to the latest commit and
# restart the systemd unit. Run from a workstation that has SSH access.
set -euo pipefail
# Target host, non-default SSH port, and the bot's git checkout directory.
SSH_HOST="root@10.10.10.10"
SSH_PORT="1090"
APP_DIR="/opt/tg-bot"
# --ff-only aborts on divergent history, so a deploy never silently merges.
ssh -p "$SSH_PORT" "$SSH_HOST" \
"cd \"$APP_DIR\" && git pull --ff-only && systemctl restart tg-bot"

162
handlers/alerts_admin.py Normal file
View File

@@ -0,0 +1,162 @@
import time
from datetime import datetime, timedelta, timezone

from aiogram import F
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton

from app import dp, bot, cfg, ADMIN_ID
from auth import is_admin_msg, is_admin_cb
from services.alert_mute import set_mute, clear_mute, list_mutes
from services.incidents import read_recent, log_incident
from services.notify import notify
# Usage text sent in reply to a bare /alerts or an unknown subcommand.
# The category list must stay in sync with the monitors that call notify().
HELP_TEXT = (
    "Alerts:\n"
    "/alerts test <critical|warn|info> - send test alert\n"
    "/alerts mute <category> <minutes> - mute alerts for category\n"
    "/alerts unmute <category> - unmute category\n"
    "/alerts list - show active mutes\n"
    "/alerts recent [hours] - show incidents log (default 24h)\n"
    "Categories: load, disk, smart, raid, ssl, docker, test\n"
)
def _dispatch(msg: Message, action: str, args: list[str]):
    """Bundle a parsed /alerts subcommand into a plain dict.

    NOTE(review): no caller is visible in this file — looks unused; confirm
    before removing.
    """
    return dict(action=action, args=args)
async def _handle_alerts(msg: Message, action: str, args: list[str]):
    """Execute one /alerts subcommand (test/mute/unmute/list/recent).

    Replies to *msg* and appends an incident-log entry for state-changing
    actions. Unknown actions fall through to the help text.
    """
    if action == "test":
        # Default and sanitize the requested level; only three are valid.
        level = args[0].lower() if args else "info"
        if level not in ("critical", "warn", "info"):
            level = "info"
        # Timestamped key so notify()'s dedup/cooldown never swallows a test.
        key = f"test:{level}:{int(time.time())}"
        await notify(bot, msg.chat.id, f"[TEST] {level.upper()} alert", level=level, key=key, category="test")
        await msg.answer(f"Sent test alert: {level}")
        log_incident(cfg, f"alert_test level={level} by {msg.from_user.id}", category="test")
        return
    if action == "mute":
        if len(args) < 1:
            await msg.answer("Usage: /alerts mute <category> <minutes>")
            return
        category = args[0].lower()
        # Duration defaults to 60 minutes; bad/missing numbers fall back too.
        minutes = 60
        if len(args) >= 2:
            try:
                minutes = max(1, int(args[1]))
            except ValueError:
                minutes = 60
        # set_mute() returns the expiry as a unix timestamp.
        until = set_mute(category, minutes * 60)
        # Render the expiry in the server's local timezone.
        dt = datetime.fromtimestamp(until, tz=timezone.utc).astimezone()
        await msg.answer(f"🔕 Muted {category} for {minutes}m (until {dt:%Y-%m-%d %H:%M:%S})")
        log_incident(cfg, f"alert_mute category={category} minutes={minutes} by {msg.from_user.id}", category=category)
        return
    if action == "unmute":
        if len(args) < 1:
            await msg.answer("Usage: /alerts unmute <category>")
            return
        category = args[0].lower()
        clear_mute(category)
        await msg.answer(f"🔔 Unmuted {category}")
        log_incident(cfg, f"alert_unmute category={category} by {msg.from_user.id}", category=category)
        return
    if action in ("list", "mutes"):
        # list_mutes() maps category -> seconds remaining.
        mutes = list_mutes()
        if not mutes:
            await msg.answer("🔔 No active mutes")
            return
        lines = ["🔕 Active mutes:"]
        for cat, secs in mutes.items():
            # Clamp negatives (already-expired entries) to 0 minutes.
            mins = max(0, secs) // 60
            lines.append(f"- {cat}: {mins}m left")
        await msg.answer("\n".join(lines))
        return
    if action == "recent":
        # Window defaults to 24h; bad numbers fall back to the default.
        hours = 24
        if args:
            try:
                hours = max(1, int(args[0]))
            except ValueError:
                hours = 24
        rows = read_recent(cfg, hours, limit=50)
        if not rows:
            await msg.answer(f"No incidents in last {hours}h")
            return
        await msg.answer("🧾 Incidents:\n" + "\n".join(rows))
        return
    # Unknown subcommand: show usage.
    await msg.answer(HELP_TEXT)
# Quick-action inline keyboard attached to the /alerts help message.
# callback_data values are parsed by alerts_cb as "alerts:action[:arg...]".
ALERTS_KB = InlineKeyboardMarkup(
    inline_keyboard=[
        [
            InlineKeyboardButton(text="List", callback_data="alerts:list"),
            InlineKeyboardButton(text="Recent 24h", callback_data="alerts:recent:24"),
        ],
        [
            InlineKeyboardButton(text="Mute load 60m", callback_data="alerts:mute:load:60"),
            InlineKeyboardButton(text="Unmute load", callback_data="alerts:unmute:load"),
        ],
        [
            InlineKeyboardButton(text="Test CRIT", callback_data="alerts:test:critical"),
            InlineKeyboardButton(text="Test WARN", callback_data="alerts:test:warn"),
            InlineKeyboardButton(text="Test INFO", callback_data="alerts:test:info"),
        ],
    ]
)
@dp.message(F.text.regexp(r"^/alerts(\s|$)"))
async def alerts_cmd(msg: Message):
    """Entry point for /alerts: parse the subcommand and dispatch it.

    Fix: the original pattern was r"^/alerts(\\s|$)" — inside a raw string a
    doubled backslash is the regex atom for a *literal* backslash, so
    "/alerts test" never matched and only the bare "/alerts" worked. "\\s"
    must be the single-backslash whitespace class.
    """
    if not is_admin_msg(msg):
        return
    parts = msg.text.split()
    if len(parts) < 2:
        # Bare /alerts: show help plus the quick-action inline keyboard.
        await msg.answer(HELP_TEXT, reply_markup=ALERTS_KB)
        return
    action = parts[1].lower()
    args = parts[2:]
    await _handle_alerts(msg, action, args)
@dp.message(F.text == "/alerts_list")
async def alerts_list(msg: Message):
    """Shortcut command: same as `/alerts list`."""
    if not is_admin_msg(msg):
        return
    await _handle_alerts(msg, "list", [])
@dp.message(F.text == "/alerts_recent")
async def alerts_recent(msg: Message):
    """Shortcut command: same as `/alerts recent 24`."""
    if not is_admin_msg(msg):
        return
    await _handle_alerts(msg, "recent", ["24"])
@dp.message(F.text == "/alerts_mute_load")
async def alerts_mute_load(msg: Message):
    """Shortcut command: same as `/alerts mute load 60`."""
    if not is_admin_msg(msg):
        return
    await _handle_alerts(msg, "mute", ["load", "60"])
@dp.callback_query(F.data.startswith("alerts:"))
async def alerts_cb(cb: CallbackQuery):
    """Dispatch inline-keyboard presses; data format: alerts:action[:arg...].

    Fix: the original compared cb.from_user.id against the single ADMIN_ID,
    which locked secondary admins (telegram.admin_ids) out of these buttons
    while every other callback handler already accepts any configured admin.
    Unified on is_admin_cb(), which checks membership in ADMIN_IDS.
    """
    if not is_admin_cb(cb):
        # Still answer so the client's loading spinner stops.
        await cb.answer()
        return
    parts = cb.data.split(":")
    if len(parts) < 2:
        await cb.answer()
        return
    action = parts[1]
    args = parts[2:] if len(parts) > 2 else []
    # Reuse the message-command handler on the message the keyboard hangs off.
    await _handle_alerts(cb.message, action, args)
    await cb.answer()

View File

@@ -2,7 +2,7 @@ import asyncio
from datetime import datetime
from aiogram import F
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
from app import dp, cfg
from app import dp, cfg, ADMIN_IDS
from auth import is_admin_msg
from keyboards import docker_kb, arcane_kb
from services.arcane import list_projects, restart_project, set_project_state, get_project_details
@@ -27,6 +27,7 @@ def _arcane_kb(page: int, total_pages: int, items: list[dict]) -> InlineKeyboard
rows.append([
InlineKeyboardButton(text=f"🔄 {name}", callback_data=f"arcane:restart:{pid}"),
InlineKeyboardButton(text="", callback_data=f"arcane:details:{pid}"),
InlineKeyboardButton(text="📦", callback_data=f"arcane:deploy:{pid}"),
InlineKeyboardButton(text=action_text, callback_data=f"arcane:{action}:{pid}"),
])
@@ -114,7 +115,7 @@ async def arcane_refresh(msg: Message):
@dp.callback_query(F.data == "arcane:refresh")
async def arcane_refresh_inline(cb: CallbackQuery):
if cb.from_user.id != cfg["telegram"]["admin_id"]:
if cb.from_user.id not in ADMIN_IDS:
return
await cb.answer()
await cmd_arcane_projects(cb.message, edit=True)
@@ -122,7 +123,7 @@ async def arcane_refresh_inline(cb: CallbackQuery):
@dp.callback_query(F.data.startswith("arcane:page:"))
async def arcane_page(cb: CallbackQuery):
if cb.from_user.id != cfg["telegram"]["admin_id"]:
if cb.from_user.id not in ADMIN_IDS:
return
try:
page = int(cb.data.split(":", 2)[2])
@@ -140,7 +141,7 @@ async def arcane_page(cb: CallbackQuery):
@dp.callback_query(F.data.startswith("arcane:restart:"))
async def arcane_restart(cb: CallbackQuery):
if cb.from_user.id != cfg["telegram"]["admin_id"]:
if cb.from_user.id not in ADMIN_IDS:
return
_, _, pid = cb.data.split(":", 2)
@@ -159,7 +160,7 @@ async def arcane_restart(cb: CallbackQuery):
@dp.callback_query(F.data.startswith("arcane:details:"))
async def arcane_details(cb: CallbackQuery):
if cb.from_user.id != cfg["telegram"]["admin_id"]:
if cb.from_user.id not in ADMIN_IDS:
return
_, _, pid = cb.data.split(":", 2)
@@ -205,9 +206,55 @@ async def arcane_details(cb: CallbackQuery):
await cb.message.answer("\n".join(lines), parse_mode="Markdown", reply_markup=arcane_kb)
@dp.callback_query(F.data.startswith("arcane:deploy:"))
async def arcane_deploy_status(cb: CallbackQuery):
    """Show deployment status details for one Arcane project (admin only)."""
    if cb.from_user.id not in ADMIN_IDS:
        return
    project_id = cb.data.split(":", 2)[2]
    base_url, api_key, env_id = _arcane_cfg()
    if not (base_url and api_key):
        await cb.answer("Arcane config missing")
        return
    await cb.answer("Loading…")
    ok, info, data = await asyncio.to_thread(get_project_details, base_url, api_key, env_id, project_id)
    if not ok:
        await cb.message.answer(f"❌ Arcane deploy status failed: {info}", reply_markup=arcane_kb)
        return
    status = data.get("status", "unknown")
    running = data.get("runningCount", 0)
    total = data.get("serviceCount", 0)
    marker = "🟢" if status == "running" else "🟡"
    report = [
        f"📦 **Deploy status: {data.get('name', '?')}**",
        f"{marker} Status: {status} ({running}/{total})",
    ]
    # Optional fields are shown only when the API returned them.
    optional = [
        ("⚠️ {}", data.get("statusReason")),
        ("🕒 Updated: {}", data.get("updatedAt")),
        ("📁 Path: {}", data.get("path")),
        ("🔗 Repo: {}", data.get("gitRepositoryURL")),
        ("🧾 Commit: {}", data.get("lastSyncCommit")),
    ]
    for template, value in optional:
        if value:
            report.append(template.format(value))
    await cb.message.answer("\n".join(report), parse_mode="Markdown", reply_markup=arcane_kb)
@dp.callback_query(F.data.startswith("arcane:up:"))
async def arcane_up(cb: CallbackQuery):
if cb.from_user.id != cfg["telegram"]["admin_id"]:
if cb.from_user.id not in ADMIN_IDS:
return
_, _, pid = cb.data.split(":", 2)
@@ -226,7 +273,7 @@ async def arcane_up(cb: CallbackQuery):
@dp.callback_query(F.data.startswith("arcane:down:"))
async def arcane_down(cb: CallbackQuery):
if cb.from_user.id != cfg["telegram"]["admin_id"]:
if cb.from_user.id not in ADMIN_IDS:
return
_, _, pid = cb.data.split(":", 2)

View File

@@ -1,15 +1,17 @@
import asyncio
import json
import os
from datetime import datetime
from aiogram import F
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton
from app import dp
from auth import is_admin_msg
from aiogram.types import Message, InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
from app import dp, cfg
from auth import is_admin_msg, is_admin_cb
from keyboards import backup_kb
from lock_utils import acquire_lock, release_lock
from services.queue import enqueue, format_status
from services.queue import enqueue, format_status, format_details, format_history
from services.backup import backup_badge, restore_help
from services.runner import run_cmd
from services.runner import run_cmd, run_cmd_full
from services.incidents import log_incident
def _parse_systemctl_kv(raw: str) -> dict[str, str]:
@@ -30,6 +32,156 @@ async def _unit_status(unit: str, props: list[str]) -> dict[str, str]:
return _parse_systemctl_kv(out)
def _sudo_cmd(cmd: list[str]) -> list[str]:
if os.geteuid() == 0:
return cmd
return ["sudo", "-E"] + cmd
def _format_backup_result(rc: int, out: str) -> str:
log_path = "/var/log/backup-auto.log"
header = "✅ Backup finished" if rc == 0 else "❌ Backup failed"
lines = out.strip().splitlines()
body = "\n".join(lines[:20])
if len(lines) > 20:
body += f"\n… trimmed {len(lines) - 20} lines"
extra = ""
if rc != 0 and os.path.exists(log_path):
try:
tail = ""
with open(log_path, "r", encoding="utf-8", errors="replace") as f:
tail_lines = f.readlines()[-40:]
tail = "".join(tail_lines).strip()
if tail:
extra = "\n\nLog tail:\n" + tail
except Exception:
pass
base = f"{header} (rc={rc})\nlog: {log_path}"
if body:
base += "\n\n" + body
if extra:
base += extra
return base
def _tail(path: str, lines: int = 120) -> str:
if not os.path.exists(path):
return f"⚠️ Log not found: {path}"
try:
with open(path, "r", encoding="utf-8", errors="replace") as f:
data = f.readlines()[-lines:]
except Exception as e:
return f"⚠️ Failed to read log: {e}"
return "".join(data).strip() or "(empty)"
def _beautify_restic_forget(raw: str) -> str | None:
    """
    Parse restic forget output tables into a compact bullet list.

    Returns the reformatted text, or None when *raw* does not look like
    restic "forget" table output (no parsable ID/Reasons/Paths table).
    """
    # Cheap pre-check before attempting the fixed-column parse.
    if "Reasons" not in raw or "Paths" not in raw:
        return None
    import re
    lines = raw.splitlines()
    # Indices of table header rows ("ID ... Reasons ... Paths ...").
    headers = []
    for idx, line in enumerate(lines):
        if line.startswith("ID") and "Reasons" in line and "Paths" in line:
            headers.append(idx)
    if not headers:
        return None
    def _valid_id(val: str) -> bool:
        # restic snapshot IDs are 7..64 lowercase-hex characters.
        return bool(re.fullmatch(r"[0-9a-f]{7,64}", val.strip()))
    def parse_block(start_idx: int, end_idx: int) -> list[dict]:
        # Column boundaries come from the header label positions; each data
        # row is sliced at those fixed offsets (restic aligns its tables).
        header = lines[start_idx]
        cols = ["ID", "Time", "Host", "Tags", "Reasons", "Paths", "Size"]
        positions = []
        for name in cols:
            pos = header.find(name)
            if pos == -1:
                return []
            positions.append(pos)
        positions.append(len(header))
        entries: list[dict] = []
        current: dict | None = None
        # start_idx + 2 skips the header row and its separator line.
        for line in lines[start_idx + 2 : end_idx]:
            if not line.strip():
                continue
            segments = []
            for i in range(len(cols)):
                segments.append(line[positions[i] : positions[i + 1]].strip())
            row = dict(zip(cols, segments))
            if row["ID"] and _valid_id(row["ID"]):
                # A valid snapshot ID starts a new entry.
                current = {
                    "id": row["ID"],
                    "time": row["Time"],
                    "host": row["Host"],
                    "size": row["Size"],
                    "tags": row["Tags"],
                    "reasons": [],
                    "paths": [],
                }
                if row["Reasons"]:
                    current["reasons"].append(row["Reasons"])
                if row["Paths"]:
                    current["paths"].append(row["Paths"])
                entries.append(current)
            elif current:
                # Continuation row: extra reasons/paths belonging to the
                # current entry (dashes are table rules, not data).
                if row["Reasons"] and not row["Reasons"].startswith("-"):
                    current["reasons"].append(row["Reasons"])
                if row["Paths"] and not row["Paths"].startswith("-"):
                    current["paths"].append(row["Paths"])
        return entries
    blocks = []
    for i, start in enumerate(headers):
        end = headers[i + 1] if i + 1 < len(headers) else len(lines)
        entries = parse_block(start, end)
        if not entries:
            continue
        # Label each table from the nearest preceding "keep"/"snapshots" line.
        label = "Plan"
        prev_line = lines[start - 1].lower() if start - 1 >= 0 else ""
        prev2 = lines[start - 2].lower() if start - 2 >= 0 else ""
        if "keep" in prev_line:
            label = prev_line.strip()
        elif "keep" in prev2:
            label = prev2.strip()
        elif "snapshots" in prev_line:
            label = prev_line.strip()
        blocks.append((label, entries))
    if not blocks:
        return None
    out_lines = []
    for label, entries in blocks:
        out_lines.append(f"📦 {label}")
        for e in entries:
            head = f"🧉 {e['id']} | {e['time']} | {e['host']} | {e['size'] or 'n/a'}"
            out_lines.append(head)
            if e["reasons"]:
                out_lines.append(" 📌 " + "; ".join(e["reasons"]))
            if e["paths"]:
                for p in e["paths"]:
                    out_lines.append(f"{p}")
        # Blank separator between tables; trailing one removed by rstrip below.
        out_lines.append("")
    return "\n".join(out_lines).rstrip()
def _load_json(raw: str, label: str) -> tuple[bool, object | None, str]:
if not raw or not raw.strip():
return False, None, f"? {label} returned empty output"
try:
return True, json.loads(raw), ""
except json.JSONDecodeError:
preview = raw.strip().splitlines()
head = preview[0] if preview else "invalid output"
return False, None, f"? {label} invalid JSON: {head}"
async def send_backup_jobs_status(msg: Message):
services = [
("backup-auto", "backup-auto.timer"),
@@ -69,7 +221,7 @@ async def cmd_repo_stats(msg: Message):
await msg.answer("⏳ Loading repo stats…", reply_markup=backup_kb)
# --- restore-size stats ---
rc1, raw1 = await run_cmd(
rc1, raw1 = await run_cmd_full(
["restic", "stats", "--json"],
use_restic_env=True,
timeout=30
@@ -78,10 +230,14 @@ async def cmd_repo_stats(msg: Message):
await msg.answer(raw1, reply_markup=backup_kb)
return
restore = json.loads(raw1)
ok, restore, err = _load_json(raw1, "restic stats")
if not ok:
await msg.answer(err, reply_markup=backup_kb)
return
# --- raw-data stats ---
rc2, raw2 = await run_cmd(
rc2, raw2 = await run_cmd_full(
["restic", "stats", "--json", "--mode", "raw-data"],
use_restic_env=True,
timeout=30
@@ -90,15 +246,26 @@ async def cmd_repo_stats(msg: Message):
await msg.answer(raw2, reply_markup=backup_kb)
return
raw = json.loads(raw2)
ok, raw, err = _load_json(raw2, "restic stats raw-data")
if not ok:
await msg.answer(err, reply_markup=backup_kb)
return
# --- snapshots count ---
rc3, raw_snaps = await run_cmd(
rc3, raw_snaps = await run_cmd_full(
["restic", "snapshots", "--json"],
use_restic_env=True,
timeout=20
)
snaps = len(json.loads(raw_snaps)) if rc3 == 0 else "n/a"
if rc3 != 0:
snaps = "n/a"
else:
ok, snap_data, err = _load_json(raw_snaps, "restic snapshots")
if ok and isinstance(snap_data, list):
snaps = len(snap_data)
else:
snaps = "n/a"
msg_text = (
"📦 **Repository stats**\n\n"
@@ -115,7 +282,7 @@ async def cmd_backup_status(msg: Message):
await msg.answer("⏳ Loading snapshots…", reply_markup=backup_kb)
async def worker():
rc, raw = await run_cmd(
rc, raw = await run_cmd_full(
["restic", "snapshots", "--json"],
use_restic_env=True,
timeout=30
@@ -124,7 +291,10 @@ async def cmd_backup_status(msg: Message):
await msg.answer(raw, reply_markup=backup_kb)
return
snaps = json.loads(raw)
ok, snaps, err = _load_json(raw, "restic snapshots")
if not ok or not isinstance(snaps, list):
await msg.answer(err, reply_markup=backup_kb)
return
if not snaps:
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
return
@@ -163,7 +333,14 @@ async def cmd_backup_status(msg: Message):
async def cmd_backup_now(msg: Message):
    # Thin wrapper: queueing, dry-run and lock handling live in schedule_backup().
    await schedule_backup(msg)
async def schedule_backup(msg: Message):
async def job():
if cfg.get("safety", {}).get("dry_run", False):
await msg.answer("🧪 Dry-run: backup skipped", reply_markup=backup_kb)
return
if not acquire_lock("backup"):
await msg.answer("⚠️ Backup уже выполняется", reply_markup=backup_kb)
return
@@ -171,20 +348,36 @@ async def cmd_backup_now(msg: Message):
await msg.answer("▶️ Backup запущен", reply_markup=backup_kb)
try:
rc, out = await run_cmd(["sudo", "/usr/local/bin/backup.py", "restic-backup"], timeout=6 * 3600)
await msg.answer(("✅ OK\n" if rc == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
rc, out = await run_cmd(
_sudo_cmd(["/usr/local/bin/backup.py", "restic-backup"]),
use_restic_env=True,
timeout=6 * 3600,
)
kb = backup_kb
if rc != 0:
kb = InlineKeyboardMarkup(
inline_keyboard=[
[InlineKeyboardButton(text="🔁 Retry backup", callback_data="backup:retry")]
]
)
await msg.answer(_format_backup_result(rc, out), reply_markup=kb)
finally:
release_lock("backup")
pos = await enqueue("backup", job)
await msg.answer(f"🕓 Backup queued (#{pos})", reply_markup=backup_kb)
try:
from services.incidents import log_incident
log_incident(cfg, f"backup_queued by {msg.from_user.id}", category="backup")
except Exception:
pass
async def cmd_last_snapshot(msg: Message):
await msg.answer("⏳ Loading last snapshot…", reply_markup=backup_kb)
async def worker():
rc, raw = await run_cmd(
rc, raw = await run_cmd_full(
["restic", "snapshots", "--json"],
use_restic_env=True,
timeout=20
@@ -193,7 +386,10 @@ async def cmd_last_snapshot(msg: Message):
await msg.answer(raw, reply_markup=backup_kb)
return
snaps = json.loads(raw)
ok, snaps, err = _load_json(raw, "restic snapshots")
if not ok or not isinstance(snaps, list):
await msg.answer(err, reply_markup=backup_kb)
return
if not snaps:
await msg.answer("📦 Snapshots: none", reply_markup=backup_kb)
return
@@ -203,7 +399,7 @@ async def cmd_last_snapshot(msg: Message):
t = datetime.fromisoformat(s["time"].replace("Z", "+00:00"))
short_id = s["short_id"]
rc2, raw2 = await run_cmd(
rc2, raw2 = await run_cmd_full(
["restic", "stats", short_id, "--json"],
use_restic_env=True,
timeout=20
@@ -212,7 +408,10 @@ async def cmd_last_snapshot(msg: Message):
await msg.answer(raw2, reply_markup=backup_kb)
return
stats = json.loads(raw2)
ok, stats, err = _load_json(raw2, f"restic stats {short_id}")
if not ok or not isinstance(stats, dict):
await msg.answer(err, reply_markup=backup_kb)
return
msg_text = (
"📦 **Last snapshot**\n\n"
@@ -247,7 +446,20 @@ async def ls(msg: Message):
@dp.message(F.text == "🧾 Queue")
async def qb(msg: Message):
if is_admin_msg(msg):
await msg.answer(format_status(), reply_markup=backup_kb)
kb = InlineKeyboardMarkup(
inline_keyboard=[
[InlineKeyboardButton(text="Details", callback_data="queue:details")],
]
)
await msg.answer(format_status(), reply_markup=kb)
@dp.callback_query(F.data == "queue:details")
async def qd(cb: CallbackQuery):
    """Inline "Details" button: show the queued jobs breakdown (admin only)."""
    if is_admin_cb(cb):
        await cb.answer()
        await cb.message.answer(format_details(), reply_markup=backup_kb)
@dp.message(F.text == "▶️ Run backup")
@@ -256,6 +468,12 @@ async def br(msg: Message):
await cmd_backup_now(msg)
@dp.message(F.text == "/backup_run")
async def br_cmd(msg: Message):
    """Slash-command twin of the "Run backup" button."""
    if not is_admin_msg(msg):
        return
    await schedule_backup(msg)
@dp.message(F.text == "🧪 Restic check")
async def rc(msg: Message):
if not is_admin_msg(msg):
@@ -263,8 +481,19 @@ async def rc(msg: Message):
async def job():
await msg.answer("🧪 Restic check запущен", reply_markup=backup_kb)
rc2, out = await run_cmd(["sudo", "/usr/local/bin/restic-check.sh"], timeout=6 * 3600)
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
rc2, out = await run_cmd(
_sudo_cmd(["/usr/local/bin/restic-check.sh"]),
use_restic_env=True,
timeout=6 * 3600,
)
kb = backup_kb
if rc2 != 0:
kb = InlineKeyboardMarkup(
inline_keyboard=[
[InlineKeyboardButton(text="🔁 Retry restic check", callback_data="backup:retry_check")]
]
)
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=kb)
pos = await enqueue("restic-check", job)
await msg.answer(f"🕓 Restic check queued (#{pos})", reply_markup=backup_kb)
@@ -277,7 +506,11 @@ async def wr(msg: Message):
async def job():
await msg.answer("📬 Weekly report запущен", reply_markup=backup_kb)
rc2, out = await run_cmd(["sudo", "/usr/local/bin/weekly-report.sh"], timeout=3600)
rc2, out = await run_cmd(
_sudo_cmd(["/usr/local/bin/weekly-report.sh"]),
use_restic_env=True,
timeout=3600,
)
await msg.answer(("✅ OK\n" if rc2 == 0 else "❌ FAIL\n") + out, reply_markup=backup_kb)
pos = await enqueue("weekly-report", job)
@@ -288,3 +521,55 @@ async def wr(msg: Message):
async def rh(msg: Message):
if is_admin_msg(msg):
await msg.answer(restore_help(), reply_markup=backup_kb)
@dp.message(F.text == "📜 History")
@dp.message(F.text == "/backup_history")
async def backup_history(msg: Message):
    """Show the backup log tail, pretty-printing restic forget tables when possible."""
    if not is_admin_msg(msg):
        return
    log_path = "/var/log/backup-auto.log"
    content = _tail(log_path, lines=160)
    if content.startswith("⚠️"):
        # _tail() reported a read problem — forward it as-is.
        await msg.answer(content, reply_markup=backup_kb)
        return
    # Run the pretty formatter on the full tail, before any trimming.
    pretty = _beautify_restic_forget(content)
    max_len = 3500
    trimmed = len(content) > max_len
    if trimmed:
        content = content[-max_len:]
    header = "📜 Backup history (tail)" + (" (trimmed)" if trimmed else "")
    if pretty:
        await msg.answer(f"{header}\n`{log_path}`\n\n{pretty}", reply_markup=backup_kb)
        return
    await msg.answer(
        f"{header}\n`{log_path}`\n```\n{content}\n```",
        reply_markup=backup_kb,
        parse_mode="Markdown",
    )
@dp.message(F.text == "/queue_history")
async def queue_history(msg: Message):
    """Show the recently finished queue jobs (admin only)."""
    if is_admin_msg(msg):
        await msg.answer(format_history(), reply_markup=backup_kb)
@dp.callback_query(F.data == "backup:retry")
async def backup_retry(cb: CallbackQuery):
    """Inline retry after a failed backup: re-queue the backup job."""
    if is_admin_cb(cb):
        await cb.answer("Queuing backup…")
        await schedule_backup(cb.message)
@dp.callback_query(F.data == "backup:retry_check")
async def backup_retry_check(cb: CallbackQuery):
    """Inline retry after a failed restic check: re-run the check handler."""
    if is_admin_cb(cb):
        await cb.answer("Queuing restic check…")
        # Reuse the message handler; cb.message stands in for the user message.
        await rc(cb.message)

View File

@@ -2,8 +2,10 @@ import json
import time
from aiogram import F
from aiogram.types import CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
from app import dp, ADMIN_ID
from app import dp, ADMIN_ID, cfg
from services.docker import docker_cmd
from services.incidents import log_incident
from services.runner import run_cmd
from state import DOCKER_MAP, LOG_FILTER_PENDING
from handlers.backup import cmd_backup_status
@@ -13,8 +15,15 @@ async def docker_callback(cb: CallbackQuery):
if cb.from_user.id != ADMIN_ID:
return
try:
_, action, alias = cb.data.split(":", 2)
real = DOCKER_MAP[alias]
except ValueError:
await cb.answer("Bad request")
return
real = DOCKER_MAP.get(alias)
if not real:
await cb.answer("Container not found")
return
if action == "restart":
await cb.answer("Restarting…")
@@ -24,6 +33,10 @@ async def docker_callback(cb: CallbackQuery):
f"🔄 **{alias} restarted**\n```{out}```",
parse_mode="Markdown"
)
try:
log_incident(cfg, f"docker_restart {alias}", category="docker")
except Exception:
pass
elif action == "logs":
await cb.answer()
@@ -54,7 +67,7 @@ async def snapshot_details(cb: CallbackQuery):
snap_id = cb.data.split(":", 1)[1]
await cb.answer("Loading snapshot…")
# получаем статистику snapshot
# получаем статистику snapshot
rc, raw = await run_cmd(
["restic", "stats", snap_id, "--json"],
use_restic_env=True,

24
handlers/config_check.py Normal file
View File

@@ -0,0 +1,24 @@
from aiogram import F
from aiogram.types import Message
from app import dp, cfg
from auth import is_admin_msg
from services.config_check import validate_cfg
@dp.message(F.text == "/config_check")
async def config_check(msg: Message):
    """Run the config validator and report errors/warnings to the admin."""
    if not is_admin_msg(msg):
        return
    errors, warnings = validate_cfg(cfg)
    report: list[str] = []
    if errors:
        report.append("❌ Config errors:")
        report.extend(f"- {e}" for e in errors)
    if warnings:
        if report:
            report.append("")  # blank line between the two sections
        report.append("⚠️ Warnings:")
        report.extend(f"- {w}" for w in warnings)
    await msg.answer("\n".join(report) if report else "✅ Config looks OK")

View File

@@ -1,11 +1,13 @@
from aiogram import F
from aiogram.types import Message
from app import dp
from app import dp, cfg
from auth import is_admin_msg
from keyboards import docker_kb, docker_inline_kb
from services.docker import container_uptime, docker_cmd
from services.incidents import log_incident
from state import DOCKER_MAP, LOG_FILTER_PENDING
import time
import json
async def cmd_docker_status(msg: Message):
@@ -42,7 +44,7 @@ async def cmd_docker_status(msg: Message):
lines.append(f"{icon} {alias}: {status} ({up})")
await msg.answer("\n".join(lines), reply_markup=docker_kb)
log_incident(cfg, f"docker_status by {msg.from_user.id}", category="docker")
except Exception as e:
# ⬅️ КРИТИЧЕСКИ ВАЖНО
await msg.answer(
@@ -77,6 +79,137 @@ async def ds(msg: Message):
await cmd_docker_status(msg)
@dp.message(F.text == "/docker_status")
async def ds_cmd(msg: Message):
if is_admin_msg(msg):
await cmd_docker_status(msg)
@dp.message(F.text, F.func(lambda m: (m.text or "").split()[0] == "/docker_health"))
async def docker_health(msg: Message):
    """/docker_health <alias> — show a container's healthcheck state and recent probe logs."""
    if not is_admin_msg(msg):
        return
    parts = msg.text.split()
    if len(parts) < 2:
        await msg.answer("Usage: /docker_health <alias>")
        return
    alias = parts[1]
    real = DOCKER_MAP.get(alias)
    if not real:
        await msg.answer(f"⚠️ Unknown container: {alias}", reply_markup=docker_kb)
        return
    rc, out = await docker_cmd(["inspect", "-f", "{{json .State.Health}}", real], timeout=10)
    if rc != 0 or not out.strip():
        await msg.answer(f"⚠️ Failed to get health for {alias}", reply_markup=docker_kb)
        return
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        await msg.answer(f"⚠️ Invalid health JSON for {alias}", reply_markup=docker_kb)
        return
    # Fix: a container without a HEALTHCHECK renders .State.Health as JSON
    # null -> json.loads gives None, and data.get() would raise AttributeError.
    if not isinstance(data, dict):
        await msg.answer(f"⚠️ No healthcheck configured for {alias}", reply_markup=docker_kb)
        return
    status = data.get("Status", "n/a")
    fail = data.get("FailingStreak", "n/a")
    logs = data.get("Log") or []
    lines = [f"🐳 {alias} health", f"Status: {status}", f"Failing streak: {fail}"]
    if logs:
        lines.append("Recent logs:")
        for entry in logs[-5:]:
            if not isinstance(entry, dict):
                continue
            ts = entry.get("Start") or entry.get("End") or ""
            exitc = entry.get("ExitCode", "")
            out_line = entry.get("Output", "").strip()
            lines.append(f"- {ts} rc={exitc} {out_line}")
    await msg.answer("\n".join(lines), reply_markup=docker_kb)
    log_incident(cfg, f"docker_health alias={alias} by {msg.from_user.id}", category="docker")
@dp.message(F.text == "/docker_health_summary")
async def docker_health_summary(msg: Message):
    """Scan all mapped containers; report a one-line summary plus any problems."""
    if not is_admin_msg(msg):
        return
    if not DOCKER_MAP:
        await msg.answer("⚠️ DOCKER_MAP пуст", reply_markup=docker_kb)
        return
    issues: list[str] = []
    for alias, real in DOCKER_MAP.items():
        rc, out = await docker_cmd(["inspect", "-f", "{{json .State}}", real], timeout=10)
        if rc != 0:
            issues.append(f"{alias}: inspect error")
            continue
        try:
            state = json.loads(out)
        except Exception:
            issues.append(f"{alias}: bad JSON")
            continue
        status = state.get("Status", "n/a")
        health = (state.get("Health") or {}).get("Status", "n/a")
        # A container with no healthcheck reports "none"; treat that as healthy.
        healthy = status == "running" and health in ("healthy", "none")
        if not healthy:
            issues.append(f"{alias}: {status}/{health}")
    total = len(DOCKER_MAP)
    report = [f"🐳 Docker health: 🟢 {total - len(issues)}/{total} healthy, 🔴 {len(issues)} issues"]
    if issues:
        report.append("Problems:")
        report.extend(f"- {p}" for p in issues)
    await msg.answer("\n".join(report), reply_markup=docker_kb)
@dp.message(F.text == "📈 Stats")
async def dstats(msg: Message):
    """Show per-container CPU/MEM/NET/IO usage, sorted by CPU descending."""
    if not is_admin_msg(msg):
        return
    if not DOCKER_MAP:
        await msg.answer(
            "⚠️ DOCKER_MAP пуст.\n"
            "Контейнеры не обнаружены.",
            reply_markup=docker_kb,
        )
        return
    names = list(DOCKER_MAP.values())
    fmt = "{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.NetIO}}|{{.BlockIO}}"
    rc, out = await docker_cmd(["stats", "--no-stream", "--format", fmt] + names)
    if rc != 0:
        await msg.answer(out, reply_markup=docker_kb)
        return
    raw_rows = [ln.strip() for ln in out.splitlines() if ln.strip()]
    if not raw_rows:
        await msg.answer("📈 Stats\n\n(no data)", reply_markup=docker_kb)
        return
    alias_by_name = {real: alias for alias, real in DOCKER_MAP.items()}
    parsed = []
    for raw in raw_rows:
        fields = [p.strip() for p in raw.split("|")]
        if len(fields) != 5:
            continue  # skip malformed lines
        name, cpu, mem, net, blk = fields
        display = alias_by_name.get(name, name)
        try:
            cpu_val = float(cpu.strip("%"))
        except ValueError:
            cpu_val = 0.0  # unparsable CPU sorts last
        parsed.append((cpu_val, display, cpu, mem, net, blk))
    if not parsed:
        await msg.answer("📈 Stats\n\n(no data)", reply_markup=docker_kb)
        return
    parsed.sort(key=lambda r: r[0], reverse=True)
    table = [f"{'NAME':<18} {'CPU':>6} {'MEM':>18} {'NET':>16} {'IO':>16}"]
    for _cpu, display, cpu, mem, net, blk in parsed:
        table.append(f"{display[:18]:<18} {cpu:>6} {mem:>18} {net:>16} {blk:>16}")
    body = "\n".join(table)
    await msg.answer(
        f"📈 **Docker stats**\n```\n{body}\n```",
        reply_markup=docker_kb,
        parse_mode="Markdown",
    )
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in LOG_FILTER_PENDING))
async def log_filter_input(msg: Message):
if not is_admin_msg(msg):

View File

@@ -1,24 +1,164 @@
from aiogram import F
from aiogram.types import Message
from app import dp
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
from app import dp, ADMIN_ID
from auth import is_admin_msg
from keyboards import menu_kb
@dp.message(F.text.in_({" Help", " Help", "Help"}))
# Paged help content: list of (title, markdown_body) tuples.
# Titles are internal labels; _help_text() sends only the body, and
# _help_kb() paginates by index.
HELP_PAGES = [
    (
        "Overview",
        " **Help — Overview**\n\n"
        "🩺 *Health* — быстрый health-check.\n"
        "📊 *Статус* — общая загрузка.\n"
        "📋 */status_short* — кратко (load/RAM/диски).\n"
        "🩺 */health_short* — краткий health.\n"
        "🧪 */selftest* — health + restic snapshot probe.\n"
        "🔧 Разделы: Docker, Backup, Artifacts, System, OpenWrt.",
    ),
    (
        "Alerts",
        "🚨 **Alerts & Mute**\n\n"
        "Команды:\n"
        "• `/alerts test <critical|warn|info>`\n"
        "• `/alerts mute <cat> <minutes>` / `/alerts unmute <cat>` / `/alerts list`\n"
        "• `/alerts recent [hours]`\n"
        "Шорткаты: `/alerts_list`, `/alerts_recent`, `/alerts_mute_load` (60м).\n"
        "Категории: load, disk, smart, raid, ssl, docker, test.\n"
        "Quiet hours: `alerts.quiet_hours` для не‑критичных.\n"
        "Авто-мьют: `alerts.auto_mute` со слотами времени.\n"
        "Только красные load: `alerts.load_only_critical: true`.\n"
        "Валидатор конфига: `/config_check`.",
    ),
    (
        "Backup",
        "💾 **Backup (restic)**\n\n"
        "Кнопки: Status, Last snapshot, Repo stats, Run backup, Queue, Restic check, Weekly report, History.\n"
        "History — хвост `/var/log/backup-auto.log`.\n"
        "Fail → кнопка Retry (backup/check).\n"
        "Run backup/Check учитывают `safety.dry_run`.\n"
        "После бэкапа приходит TL;DR + путь к логу `/var/log/backup-auto.log`.\n"
        "Queue → Details показывает отложенные задачи.",
    ),
    (
        "Docker & System",
        "🐳 **Docker**\n"
        "Status/Restart/Logs/Stats — клавиатура Docker.\n"
        "Команды: `/docker_status`, `/docker_health <alias>`.\n\n"
        "🖥 **System**\n"
        "Info: Disks/Security/Metrics/Hardware/SMART/OpenWrt.\n"
        "Ops: Updates/Upgrade/Reboot.\n"
        "Logs: Audit/Incidents/Security/Integrations/Processes.\n"
        "OpenWrt: `/openwrt`, `/openwrt_wan`, `/openwrt_clients`, `/openwrt_leases`.",
    ),
    (
        "Admin",
        "🛠 **Admin & Deploy**\n\n"
        "Config: `/config_check`, файл `config.yaml` (см. config.example.yaml).\n"
        "Deploy: `deploy.sh` (ssh 10.10.10.10:1090 → git pull → systemctl restart tg-bot).\n"
        "Incidents: `/incidents_summary`, `/incidents_diff [hours]`.\n"
        "Export: `/incidents_export [hours] [csv|json]`, `/export_all [hours]` (zip).\n"
        "Alerts log/heatmap: `/alerts_log [hours]`, `/alerts_heatmap [hours] [cat]`.\n"
        "Backup SLA: `/backup_sla`; Docker restarts: `/docker_restarts [hours]`.\n"
        "Disk snapshot: `/disk_snapshot`.\n"
        "Queue: `/queue_history`, `/queue_sla`.\n"
        "Self-test history: `/selftest_history`.\n"
        "OpenWrt leases diff: `/openwrt_leases_diff`.\n"
        "BotFather list: `/botfather_list`.\n"
        "Безопасность: `safety.dry_run: true` блокирует опасные действия.\n"
        "OpenWrt: кнопка в System → Info.",
    ),
]
def _help_kb(idx: int) -> InlineKeyboardMarkup:
    """Build the prev / page-counter / next inline row for help page *idx* (0-based)."""
    row: list[InlineKeyboardButton] = []
    if idx > 0:
        row.append(InlineKeyboardButton(text="◀️ Prev", callback_data=f"help:{idx-1}"))
    # The counter button is a no-op ("help:noop").
    row.append(InlineKeyboardButton(text=f"{idx+1}/{len(HELP_PAGES)}", callback_data="help:noop"))
    if idx < len(HELP_PAGES) - 1:
        row.append(InlineKeyboardButton(text="Next ▶️", callback_data=f"help:{idx+1}"))
    return InlineKeyboardMarkup(inline_keyboard=[row])
def _help_text(idx: int) -> str:
    """Return the body text of help page *idx*; the title is not shown to users."""
    return HELP_PAGES[idx][1]
@dp.message(F.text.in_({" Help", " Help", "Help", "/help"}))
async def help_cmd(msg: Message):
if not is_admin_msg(msg):
return
idx = 0
await msg.answer(
" **Help / Справка**\n\n"
"🩺 Health — быстрый health-check сервера\n"
"📊 Статус — общая загрузка сервера\n"
"🐳 Docker — управление контейнерами\n"
"📦 Backup — restic бэкапы\n"
"🧉 Artifacts — критичные образы (Clonezilla, NAND)\n"
"⚙️ System — диски, безопасность, URL, metrics, reboot\n\n"
"Inline-кнопки используются для выбора контейнеров.",
reply_markup=menu_kb,
_help_text(idx),
reply_markup=_help_kb(idx),
parse_mode="Markdown",
)
@dp.callback_query(F.data.startswith("help:"))
async def help_cb(cb: CallbackQuery):
    """Paginate the help message in place via the inline prev/next buttons."""
    if cb.from_user.id != ADMIN_ID:
        await cb.answer()
        return
    payload = cb.data.split(":", 1)[1]
    if payload == "noop":
        # The page-counter button does nothing.
        await cb.answer()
        return
    try:
        page = int(payload)
    except ValueError:
        await cb.answer()
        return
    page = max(0, min(page, len(HELP_PAGES) - 1))  # clamp into valid range
    await cb.message.edit_text(
        _help_text(page),
        reply_markup=_help_kb(page),
        parse_mode="Markdown",
    )
    await cb.answer()
# Ready-to-paste command list for BotFather's /setcommands
# (format: "command - description", one per line).
BOTFATHER_LIST = """\
help - Show help pages
status_short - Compact host status
health_short - Compact health report
selftest - Health + restic snapshot probe
alerts - Manage alerts
alerts_list - List active mutes
alerts_recent - Show recent incidents (24h)
alerts_mute_load - Mute load alerts for 60m
alerts_log - Show suppressed alerts
alerts_heatmap - Hourly incidents heatmap
backup_run - Run backup (queued)
backup_history - Show backup log tail
queue_history - Show queue recent jobs
queue_sla - Queue SLA stats
docker_status - Docker summary
docker_health - Docker inspect/health by alias
docker_health_summary - Docker health summary (problems only)
openwrt - Full OpenWrt status
openwrt_wan - OpenWrt WAN only
openwrt_clients - OpenWrt wifi clients
openwrt_leases - OpenWrt DHCP leases
openwrt_fast - OpenWrt quick WAN view
openwrt_leases_diff - OpenWrt DHCP diff
incidents_summary - Incidents counters (24h/7d)
incidents_export - Export incidents (hours fmt)
incidents_diff - Show incidents since last check
export_all - Zip with incidents/queue/selftest
backup_sla - Backup SLA check
docker_restarts - Docker restart history
selftest_history - Self-test history
disk_snapshot - Disk usage snapshot
config_check - Validate config
"""
@dp.message(F.text == "/botfather_list")
async def botfather_list(msg: Message):
    """Dump the command list in BotFather /setcommands format (admin only)."""
    if is_admin_msg(msg):
        await msg.answer(f"Commands for BotFather:\n```\n{BOTFATHER_LIST}\n```", parse_mode="Markdown")

View File

@@ -2,7 +2,19 @@ from aiogram import F
from aiogram.types import Message
from app import dp
from auth import is_admin_msg
from keyboards import menu_kb, docker_kb, backup_kb, artifacts_kb, system_kb
from keyboards import (
menu_kb,
docker_kb,
backup_kb,
artifacts_kb,
system_menu_kb,
system_info_kb,
system_ops_kb,
system_logs_kb,
system_logs_audit_kb,
system_logs_security_kb,
system_logs_integrations_kb,
)
@dp.message(F.text == "/start")
@@ -38,4 +50,53 @@ async def am(msg: Message):
@dp.message(F.text == "⚙️ System")
async def sm(msg: Message):
if is_admin_msg(msg):
await msg.answer("⚙️ System", reply_markup=system_kb)
await msg.answer("⚙️ System", reply_markup=system_menu_kb)
@dp.message(F.text == "⬅️ System")
async def back_system(msg: Message):
    """Back button: return to the System menu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("⚙️ System", reply_markup=system_menu_kb)
@dp.message(F.text == " Info")
async def sys_info(msg: Message):
    """Open the System → Info submenu."""
    # NOTE(review): the leading space in " Info" / " System info" looks like a
    # lost emoji from encoding mangling — confirm; kept byte-identical here.
    if not is_admin_msg(msg):
        return
    await msg.answer(" System info", reply_markup=system_info_kb)
@dp.message(F.text == "🛠 Ops")
async def sys_ops(msg: Message):
    """Open the System → Ops submenu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("🛠 System ops", reply_markup=system_ops_kb)
@dp.message(F.text == "📄 Logs")
async def sys_logs(msg: Message):
    """Open the System → Logs submenu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("📄 System logs", reply_markup=system_logs_kb)
@dp.message(F.text == "⬅️ Logs")
async def back_logs(msg: Message):
    """Back button: return to the Logs submenu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("📄 System logs", reply_markup=system_logs_kb)
@dp.message(F.text == "🧾 Audit/Incidents")
async def logs_audit_menu(msg: Message):
    """Open the Logs → Audit/Incidents submenu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("🧾 Logs: Audit/Incidents", reply_markup=system_logs_audit_kb)
@dp.message(F.text == "🔒 Security")
async def logs_security_menu(msg: Message):
    """Open the Logs → Security submenu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("🔒 Logs: Security", reply_markup=system_logs_security_kb)
@dp.message(F.text == "🧩 Integrations")
async def logs_integrations_menu(msg: Message):
    """Open the Logs → Integrations submenu."""
    if not is_admin_msg(msg):
        return
    await msg.answer("🧩 Logs: Integrations", reply_markup=system_logs_integrations_kb)

141
handlers/processes.py Normal file
View File

@@ -0,0 +1,141 @@
import asyncio
from aiogram import F
from aiogram.types import Message, CallbackQuery, InlineKeyboardMarkup, InlineKeyboardButton
from app import dp, ADMIN_ID
from auth import is_admin_msg
from keyboards import system_logs_tools_kb
from services.processes import get_top_processes, search_processes, terminate_process
from state import PROC_SEARCH_PENDING, PROC_KILL_PENDING
def _proc_kb() -> InlineKeyboardMarkup:
    """One-row inline keyboard with Refresh / Search / Kill actions."""
    actions = [
        InlineKeyboardButton(text="🔄 Refresh", callback_data="proc:refresh"),
        InlineKeyboardButton(text="🔍 Search", callback_data="proc:search"),
        InlineKeyboardButton(text="🛑 Kill", callback_data="proc:kill"),
    ]
    return InlineKeyboardMarkup(inline_keyboard=[actions])
def _format_top(title: str, rows: list[dict]) -> str:
if not rows:
return f"{title}\n(no data)"
lines = ["PID CPU% MEM% NAME"]
for row in rows:
lines.append(
f"{row['pid']:<5} {row['cpu']:<5.1f} {row['mem']:<5.1f} {row['name']}"
)
return f"{title}\n" + "\n".join(lines)
async def send_processes(msg: Message, edit: bool = False):
    """Render the Top-CPU / Top-RAM process tables; edit the message in place when *edit*."""
    top_cpu, top_mem = await asyncio.to_thread(get_top_processes)
    text = (
        "🧰 **Processes**\n\n"
        "```\n"
        f"{_format_top('Top CPU', top_cpu)}\n\n"
        f"{_format_top('Top RAM', top_mem)}\n"
        "```"
    )
    sender = msg.edit_text if edit else msg.answer
    await sender(text, reply_markup=_proc_kb(), parse_mode="Markdown")
@dp.message(F.text == "🧰 Processes")
async def proc_menu(msg: Message):
    """Show the process overview when the menu button is pressed (admin only)."""
    if not is_admin_msg(msg):
        return
    await send_processes(msg, edit=False)
@dp.callback_query(F.data.startswith("proc:"))
async def proc_actions(cb: CallbackQuery):
    """Dispatch the refresh/search/kill inline buttons of the process view."""
    if cb.from_user.id != ADMIN_ID:
        return
    await cb.answer()
    action = cb.data.split(":", 1)[1]
    if action == "refresh":
        await send_processes(cb.message, edit=True)
    elif action == "search":
        # Mark this admin as "awaiting a search string" for the text handler.
        PROC_SEARCH_PENDING[cb.from_user.id] = {}
        await cb.message.answer("🔍 Send search text", reply_markup=system_logs_tools_kb)
    elif action == "kill":
        # Mark this admin as "awaiting a PID" for the text handler.
        PROC_KILL_PENDING[cb.from_user.id] = {}
        await cb.message.answer("🛑 Send PID to terminate", reply_markup=system_logs_tools_kb)
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in PROC_SEARCH_PENDING))
async def proc_search(msg: Message):
    """Handle the pending process-search prompt: run the search and reply.

    Fires only for users flagged in PROC_SEARCH_PENDING (set by proc_actions);
    the pending flag is consumed immediately so the next message is normal.
    """
    if not is_admin_msg(msg):
        return
    PROC_SEARCH_PENDING.pop(msg.from_user.id, None)
    query = (msg.text or "").strip()
    if not query:
        await msg.answer("⚠️ Empty search", reply_markup=system_logs_tools_kb)
        return
    rows = await asyncio.to_thread(search_processes, query)
    if not rows:
        await msg.answer("🔍 No matches", reply_markup=system_logs_tools_kb)
        return
    lines = ["PID NAME CMD"]
    for row in rows:
        cmd = row["cmdline"] or "-"
        if len(cmd) > 80:
            # Fix: the truncation marker was lost to mojibake ("" appended
            # nothing) — restore the ellipsis so cut command lines are visible.
            cmd = cmd[:80] + "…"
        lines.append(f"{row['pid']:<5} {row['name']:<6} {cmd}")
    text = "🔍 **Search results**\n```\n" + "\n".join(lines) + "\n```"
    await msg.answer(text, reply_markup=system_logs_tools_kb, parse_mode="Markdown")
@dp.message(F.text, F.func(lambda msg: msg.from_user and msg.from_user.id in PROC_KILL_PENDING))
async def proc_kill_pid(msg: Message):
    """Parse the PID the admin typed and ask for an inline kill confirmation."""
    if not is_admin_msg(msg):
        return
    PROC_KILL_PENDING.pop(msg.from_user.id, None)
    raw = (msg.text or "").strip()
    try:
        pid = int(raw)
    except ValueError:
        await msg.answer("⚠️ Invalid PID", reply_markup=system_logs_tools_kb)
        return
    confirm_btn = InlineKeyboardButton(text="✅ Confirm", callback_data=f"prockill:{pid}:confirm")
    cancel_btn = InlineKeyboardButton(text="✖ Cancel", callback_data="prockill:cancel")
    kb = InlineKeyboardMarkup(inline_keyboard=[[confirm_btn, cancel_btn]])
    await msg.answer(f"⚠️ Terminate PID `{pid}`?", reply_markup=kb, parse_mode="Markdown")
@dp.callback_query(F.data.startswith("prockill:"))
async def proc_kill_confirm(cb: CallbackQuery):
    """Confirm/cancel step for terminating a process.

    Callback payloads: "prockill:<pid>:confirm" executes the kill,
    "prockill:cancel" aborts and removes the confirmation message.
    Only the configured admin may trigger either branch.
    """
    if cb.from_user.id != ADMIN_ID:
        # Silently ignore non-admin taps (no callback answer on purpose).
        return
    parts = cb.data.split(":")
    if len(parts) < 2:
        await cb.answer("Bad request")
        return
    if parts[1] == "cancel":
        await cb.answer("Cancelled")
        await cb.message.delete()
        return
    if len(parts) != 3 or parts[2] != "confirm":
        await cb.answer("Bad request")
        return
    try:
        pid = int(parts[1])
    except ValueError:
        await cb.answer("Bad PID")
        return
    await cb.answer()
    # terminate_process is blocking; keep it off the event loop.
    result = await asyncio.to_thread(terminate_process, pid)
    await cb.message.answer(result, reply_markup=system_logs_tools_kb)

View File

@@ -1,4 +1,5 @@
import asyncio
import json
import socket
import time
import psutil
@@ -10,6 +11,8 @@ from keyboards import menu_kb
from services.system import format_disks
from services.health import health
from state import DOCKER_MAP
from services.runner import run_cmd_full
from services.selftest import run_selftest
async def cmd_status(msg: Message):
@@ -21,16 +24,20 @@ async def cmd_status(msg: Message):
minutes, _ = divmod(rem, 60)
load1 = psutil.getloadavg()[0]
load_warn = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
high_warn = float(cfg.get("thresholds", {}).get("high_load_warn", load_warn * 1.5))
cpu_icon = "🟢"
if load1 > 2.0:
if load1 > high_warn:
cpu_icon = "🔴"
elif load1 > 1.0:
elif load1 > load_warn:
cpu_icon = "🟡"
mem = psutil.virtual_memory()
cpu_percent = psutil.cpu_percent(interval=None)
disks = format_disks()
net_lines = await _network_snapshot()
await msg.answer(
"📊 **Server status**\n\n"
@@ -39,7 +46,8 @@ async def cmd_status(msg: Message):
f"{cpu_icon} **Load (1m):** {load1:.2f}\n"
f"🧮 **CPU:** {cpu_percent:.0f}%\n"
f"🧠 **RAM:** {mem.used // (1024**3)} / {mem.total // (1024**3)} GiB ({mem.percent}%)\n\n"
f"{disks}",
f"{disks}\n\n"
f"{net_lines}",
reply_markup=menu_kb,
parse_mode="Markdown",
)
@@ -69,3 +77,96 @@ async def h(msg: Message):
async def st(msg: Message):
if is_admin_msg(msg):
await cmd_status(msg)
@dp.message(F.text == "/status_short")
async def st_short(msg: Message):
    """Compact one-message status: host, uptime, load averages, RAM, worst disk."""
    if not is_admin_msg(msg):
        return
    uptime_sec = int(time.time() - psutil.boot_time())
    days, rest = divmod(uptime_sec, 86400)
    hours, rest = divmod(rest, 3600)
    minutes = rest // 60
    load1, load5, load15 = psutil.getloadavg()
    mem = psutil.virtual_memory()
    disk_lines = format_disks().splitlines()
    # format_disks() starts with a header line; the second line is the worst disk.
    disk_line = disk_lines[1] if len(disk_lines) > 1 else "Disks: n/a"
    text = (
        "📋 **Status (short)**\n"
        f"🖥 `{socket.gethostname()}`\n"
        f"⏱ Uptime: {days}d {hours}h {minutes}m\n"
        f"⚙️ Load: {load1:.2f} {load5:.2f} {load15:.2f}\n"
        f"🧠 RAM: {mem.percent}% ({mem.used // (1024**3)} / {mem.total // (1024**3)} GiB)\n"
        f"💾 {disk_line}"
    )
    await msg.answer(text, reply_markup=menu_kb, parse_mode="Markdown")
@dp.message(F.text == "/health_short")
async def health_short(msg: Message):
    """Condensed health report: a few non-empty lines joined with pipes."""
    if not is_admin_msg(msg):
        return
    try:
        report = await asyncio.to_thread(health, cfg, DOCKER_MAP)
    except Exception as e:
        await msg.answer(f"❌ Health failed: {type(e).__name__}: {e}", reply_markup=menu_kb)
        return
    meaningful = [line for line in report.splitlines() if line.strip()]
    # Skip the title line and keep the next four items; fall back to the full text.
    brief = " | ".join(meaningful[1:5]) if len(meaningful) > 1 else report
    await msg.answer(f"🩺 Health (short)\n{brief}", reply_markup=menu_kb)
# Strong references to in-flight self-test tasks: the event loop keeps only
# weak references to Tasks, so a bare create_task() result can be garbage
# collected before it finishes (see asyncio.create_task documentation).
_SELFTEST_TASKS: set = set()


@dp.message(F.text.in_({"🧪 Self-test", "/selftest"}))
async def selftest(msg: Message):
    """Acknowledge immediately and run the self-test in the background."""
    if not is_admin_msg(msg):
        return
    await msg.answer("⏳ Self-test…", reply_markup=menu_kb)

    async def worker():
        # run_selftest may take a while; result is delivered as a second message.
        text, _ok = await run_selftest(cfg, DOCKER_MAP)
        await msg.answer(text, reply_markup=menu_kb)

    task = asyncio.create_task(worker())
    _SELFTEST_TASKS.add(task)
    task.add_done_callback(_SELFTEST_TASKS.discard)
def _rate_str(value: float) -> str:
if value >= 1024 * 1024:
return f"{value / (1024 * 1024):.2f} MiB/s"
if value >= 1024:
return f"{value / 1024:.1f} KiB/s"
return f"{value:.0f} B/s"
async def _network_snapshot(interval: float = 1.0, prefixes: tuple[str, ...] = ("enp",)) -> str:
    """Sample per-NIC counters over *interval* seconds and report top talkers.

    Only interfaces whose name starts with one of *prefixes* are considered
    (the default preserves the previously hard-coded "enp" physical-NIC
    filter); loopback is always skipped. Interface errors are weighted
    heavily (1 err ~ 1 KiB) so a NIC with link problems surfaces even at
    low throughput. Returns a Markdown-formatted multi-line string.
    """
    start = psutil.net_io_counters(pernic=True)
    await asyncio.sleep(interval)
    end = psutil.net_io_counters(pernic=True)
    rows = []
    for nic, snap in end.items():
        if nic.startswith("lo") or not nic.startswith(prefixes):
            continue
        base = start.get(nic)
        if not base:
            # Interface appeared mid-sample; no delta available.
            continue
        # Clamp to 0 in case counters reset between the two samples.
        rx = max(0, snap.bytes_recv - base.bytes_recv)
        tx = max(0, snap.bytes_sent - base.bytes_sent)
        err = max(0, (snap.errin - base.errin) + (snap.errout - base.errout))
        score = rx + tx + (err * 1024)
        rows.append((score, nic, rx, tx, err))
    rows.sort(reverse=True)
    top = rows[:3]
    if not top:
        return "📡 **Network (1s):** no data"
    lines = ["📡 **Network (1s):**"]
    for _score, nic, rx, tx, err in top:
        err_part = f", err {err}" if err else ""
        lines.append(f"- {nic}: RX {_rate_str(rx / interval)}, TX {_rate_str(tx / interval)}{err_part}")
    return "\n".join(lines)

File diff suppressed because it is too large Load Diff

View File

@@ -10,7 +10,7 @@ menu_kb = ReplyKeyboardMarkup(
keyboard=[
[KeyboardButton(text="🩺 Health"), KeyboardButton(text="📊 Статус")],
[KeyboardButton(text="🐳 Docker"), KeyboardButton(text="📦 Backup")],
[KeyboardButton(text="🧉 Artifacts"), KeyboardButton(text="⚙️ System")],
[KeyboardButton(text="⚙️ System")],
[KeyboardButton(text=" Help")],
],
resize_keyboard=True,
@@ -20,6 +20,7 @@ docker_kb = ReplyKeyboardMarkup(
keyboard=[
[KeyboardButton(text="🐳 Status"), KeyboardButton(text="🧰 Arcane")],
[KeyboardButton(text="🔄 Restart"), KeyboardButton(text="📜 Logs")],
[KeyboardButton(text="📈 Stats"), KeyboardButton(text="♻️ Restarts")],
[KeyboardButton(text="⬅️ Назад")],
],
resize_keyboard=True,
@@ -37,8 +38,8 @@ backup_kb = ReplyKeyboardMarkup(
keyboard=[
[KeyboardButton(text="📦 Status"), KeyboardButton(text="📦 Last snapshot")],
[KeyboardButton(text="📊 Repo stats"), KeyboardButton(text="🧯 Restore help")],
[KeyboardButton(text="▶️ Run backup"), KeyboardButton(text="🧾 Queue")],
[KeyboardButton(text="🧪 Restic check"), KeyboardButton(text="📬 Weekly report"), KeyboardButton(text="⬅️ Назад")],
[KeyboardButton(text="▶️ Run backup"), KeyboardButton(text="🧾 Queue"), KeyboardButton(text="📊 Queue SLA")],
[KeyboardButton(text="📉 Backup SLA"), KeyboardButton(text="📜 History"), KeyboardButton(text="⬅️ Назад")],
],
resize_keyboard=True,
)
@@ -52,12 +53,85 @@ artifacts_kb = ReplyKeyboardMarkup(
resize_keyboard=True,
)
system_kb = ReplyKeyboardMarkup(
system_menu_kb = ReplyKeyboardMarkup(
keyboard=[
[KeyboardButton(text=" Info"), KeyboardButton(text="🛠 Ops")],
[KeyboardButton(text="📄 Logs"), KeyboardButton(text="⬅️ Назад")],
],
resize_keyboard=True,
)
system_info_kb = ReplyKeyboardMarkup(
keyboard=[
[KeyboardButton(text="💽 Disks"), KeyboardButton(text="🔐 Security")],
[KeyboardButton(text="📈 Metrics"), KeyboardButton(text="🧱 Hardware")],
[KeyboardButton(text="🧪 SMART test"), KeyboardButton(text="🧪 SMART status")],
[KeyboardButton(text="📡 OpenWrt"), KeyboardButton(text="⬅️ System")],
],
resize_keyboard=True,
)
system_ops_kb = ReplyKeyboardMarkup(
keyboard=[
[KeyboardButton(text="💽 Disks"), KeyboardButton(text="🔐 Security"), KeyboardButton(text="🧾 Audit")],
[KeyboardButton(text="🌐 URLs"), KeyboardButton(text="📈 Metrics"), KeyboardButton(text="🔒 SSL")],
[KeyboardButton(text="📦 Updates"), KeyboardButton(text="⬆️ Upgrade")],
[KeyboardButton(text="🧱 Hardware"), KeyboardButton(text="🔄 Reboot"), KeyboardButton(text="⬅️ Назад")],
[KeyboardButton(text="🔄 Reboot")],
[KeyboardButton(text="⬅️ System")],
],
resize_keyboard=True,
)
# --- System → Logs submenus -------------------------------------------------

# Top-level logs menu: per-category submenus plus summary/heatmap shortcuts.
system_logs_kb = ReplyKeyboardMarkup(
    keyboard=[
        [KeyboardButton(text="🧾 Audit/Incidents"), KeyboardButton(text="🔒 Security")],
        [KeyboardButton(text="🧩 Integrations"), KeyboardButton(text="🧰 Processes")],
        [KeyboardButton(text="📣 Summary"), KeyboardButton(text="🔥 Heatmap")],
        [KeyboardButton(text="⬅️ System")],
    ],
    resize_keyboard=True,
)

# Audit/incident actions: viewing, diffing against the last read mark, exports.
system_logs_audit_kb = ReplyKeyboardMarkup(
    keyboard=[
        [KeyboardButton(text="🧾 Audit"), KeyboardButton(text="📣 Incidents")],
        [KeyboardButton(text="🆕 Diff"), KeyboardButton(text="📤 Export")],
        [KeyboardButton(text="📦 Export all"), KeyboardButton(text="🧰 Alerts log")],
        [KeyboardButton(text="⬅️ Logs")],
    ],
    resize_keyboard=True,
)

# Security-related views: SSH auth log, SSL, external checks and URL checks.
system_logs_security_kb = ReplyKeyboardMarkup(
    keyboard=[
        [KeyboardButton(text="🔑 SSH log"), KeyboardButton(text="🔒 SSL")],
        [KeyboardButton(text="🌍 External"), KeyboardButton(text="🌐 URLs")],
        [KeyboardButton(text="⬅️ Logs")],
    ],
    resize_keyboard=True,
)

# Third-party integration health (NPMplus proxy, Gitea).
system_logs_integrations_kb = ReplyKeyboardMarkup(
    keyboard=[
        [KeyboardButton(text="🧩 NPMplus"), KeyboardButton(text="🍵 Gitea")],
        [KeyboardButton(text="⬅️ Logs")],
    ],
    resize_keyboard=True,
)

# Minimal keyboard shown while the process search/kill prompts are active.
system_logs_tools_kb = ReplyKeyboardMarkup(
    keyboard=[
        [KeyboardButton(text="🧰 Processes")],
        [KeyboardButton(text="⬅️ Logs")],
    ],
    resize_keyboard=True,
)

# OpenWrt submenu (4 rows)
openwrt_kb = ReplyKeyboardMarkup(
    keyboard=[
        [KeyboardButton(text="🌐 WAN fast"), KeyboardButton(text="📡 Full status")],
        [KeyboardButton(text="📶 Wi-Fi clients"), KeyboardButton(text="🧾 Leases")],
        [KeyboardButton(text="🔀 Leases diff")],
        [KeyboardButton(text="⬅️ System")],
    ],
    resize_keyboard=True,
)

View File

@@ -1,4 +1,5 @@
from pathlib import Path
import os
import time
LOCK_DIR = Path("/var/run/tg-bot")
@@ -11,9 +12,14 @@ def lock_path(name: str) -> Path:
def acquire_lock(name: str) -> bool:
p = lock_path(name)
if p.exists():
try:
fd = os.open(str(p), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
except FileExistsError:
return False
p.write_text(str(time.time()))
try:
os.write(fd, str(time.time()).encode("ascii", errors="ignore"))
finally:
os.close(fd)
return True

58
main.py
View File

@@ -1,14 +1,20 @@
import asyncio
import logging
import socket
from datetime import datetime
from app import bot, dp, cfg, ADMIN_ID
from app import bot, dp, cfg, ADMIN_ID, ADMIN_IDS
from keyboards import menu_kb
from services.docker import discover_containers, docker_watchdog
from services.alerts import monitor_resources, monitor_smart
from services.alerts import monitor_resources, monitor_smart, monitor_raid
from services.metrics import MetricsStore, start_sampler
from services.queue import worker as queue_worker
from services.queue import worker as queue_worker, configure as queue_configure
from services.notify import notify
from services.audit import AuditMiddleware, audit_start
from services.ssl_alerts import monitor_ssl
from services.external_checks import monitor_external
from services.incidents import log_incident
from services.logging_setup import setup_logging
from services.selftest import schedule_selftest
import state
import handlers.menu
import handlers.status
@@ -19,6 +25,40 @@ import handlers.system
import handlers.help
import handlers.callbacks
import handlers.arcane
import handlers.processes
from services.weekly_report import weekly_reporter
import handlers.alerts_admin
import handlers.config_check
def _handle_async_exception(_loop, context):
    """asyncio loop exception handler: log and record unhandled task errors.

    Every unhandled exception is appended to the incidents log (category
    "system"). If 3+ errors land within a rolling hour an extra
    "exception_flood" incident is recorded, rate-limited to once per hour.
    State lives on function attributes (_recent, _last_alert) so the handler
    is self-contained.
    """
    msg = context.get("message") or "Unhandled exception"
    exc = context.get("exception")
    if exc:
        text = f"{msg}: {type(exc).__name__}: {exc}"
    else:
        text = f"{msg}"
    now = datetime.now()
    # Lazily initialise the per-process error history on first call.
    if not hasattr(_handle_async_exception, "_recent"):
        _handle_async_exception._recent = []
        _handle_async_exception._last_alert = None
    recent = _handle_async_exception._recent
    recent.append(now)
    # keep last hour
    _handle_async_exception._recent = [t for t in recent if (now - t).total_seconds() < 3600]
    if len(_handle_async_exception._recent) >= 3:
        last_alert = getattr(_handle_async_exception, "_last_alert", None)
        if not last_alert or (now - last_alert).total_seconds() > 3600:
            try:
                log_incident(cfg, "exception_flood", category="system")
            except Exception:
                pass  # incident logging must never raise inside the handler
            _handle_async_exception._last_alert = now
    try:
        log_incident(cfg, text, category="system")
    except Exception:
        pass  # best-effort; fall through to plain logging below
    logging.getLogger("asyncio").error(text)
async def notify_start():
@@ -30,6 +70,7 @@ async def notify_start():
async def main():
setup_logging(cfg)
dp.message.middleware(AuditMiddleware(cfg))
dp.callback_query.middleware(AuditMiddleware(cfg))
audit_start(cfg)
@@ -41,9 +82,20 @@ async def main():
asyncio.create_task(monitor_resources(cfg, notify, bot, ADMIN_ID))
if cfg.get("alerts", {}).get("smart_enabled", True):
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
if cfg.get("alerts", {}).get("raid_enabled", True):
asyncio.create_task(monitor_raid(cfg, notify, bot, ADMIN_ID))
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
if cfg.get("external_checks", {}).get("enabled", True):
asyncio.create_task(monitor_external(cfg))
state.METRICS_STORE = MetricsStore()
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
queue_configure(cfg.get("queue", {}), cfg)
asyncio.create_task(queue_worker())
asyncio.create_task(weekly_reporter(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
asyncio.create_task(schedule_selftest(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
loop = asyncio.get_running_loop()
loop.set_exception_handler(_handle_async_exception)
await notify_start()
await dp.start_polling(bot)

93
services/alert_mute.py Normal file
View File

@@ -0,0 +1,93 @@
import time
from typing import Dict
from services.runtime_state import get_state, set_state
# category -> unix timestamp until muted
def _mutes() -> Dict[str, float]:
    """Return the mute table (category -> unix ts until muted) from runtime state."""
    state = get_state()
    return state.get("mutes", {})
def _save(mutes: Dict[str, float]):
    """Persist the mute table back into shared runtime state."""
    set_state("mutes", mutes)
def _cleanup() -> None:
    """Drop expired mute entries; persist only when something was removed.

    The original unconditionally rewrote runtime state on every call even
    when nothing expired; skipping the no-op save avoids needless writes.
    """
    mutes = _mutes()
    now = time.time()
    expired = [k for k, until in mutes.items() if until <= now]
    for k in expired:
        mutes.pop(k, None)
    if expired:
        _save(mutes)
def set_mute(category: str, seconds: int) -> float:
    """Mute *category* for *seconds* (clamped to >= 0); return the expiry timestamp."""
    _cleanup()
    mutes = _mutes()
    expiry = time.time() + max(0, seconds)
    mutes[category] = expiry
    _save(mutes)
    return expiry
def clear_mute(category: str) -> None:
    """Remove any mute for *category* (no-op if absent) and persist."""
    mutes = _mutes()
    if category in mutes:
        del mutes[category]
    _save(mutes)
def is_muted(category: str | None) -> bool:
    """True if *category* is currently muted; expired entries are pruned lazily."""
    if not category:
        return False
    _cleanup()
    mutes = _mutes()
    until = mutes.get(category)
    if until is None:
        return False
    if until > time.time():
        return True
    # Expired between _cleanup() and now — prune it eagerly.
    mutes.pop(category, None)
    _save(mutes)
    return False
def list_mutes() -> dict[str, int]:
    """Return active mutes as category -> remaining whole seconds."""
    _cleanup()
    now = time.time()
    return {cat: int(until - now) for cat, until in _mutes().items()}
def is_auto_muted(cfg: dict, category: str | None) -> bool:
if not category:
return False
auto_list = cfg.get("alerts", {}).get("auto_mute", [])
if not isinstance(auto_list, list):
return False
now = time.localtime()
now_minutes = now.tm_hour * 60 + now.tm_min
for item in auto_list:
if not isinstance(item, dict):
continue
cat = item.get("category")
if cat != category:
continue
start = item.get("start", "00:00")
end = item.get("end", "00:00")
try:
sh, sm = [int(x) for x in start.split(":")]
eh, em = [int(x) for x in end.split(":")]
except Exception:
continue
start_min = sh * 60 + sm
end_min = eh * 60 + em
if start_min == end_min:
continue
if start_min < end_min:
if start_min <= now_minutes < end_min:
return True
else:
if now_minutes >= start_min or now_minutes < end_min:
return True
return False

View File

@@ -1,8 +1,9 @@
import asyncio
import time
import psutil
from system_checks import list_disks, smart_health, disk_temperature
from system_checks import list_disks, smart_health, disk_temperature, list_md_arrays, md_array_status
from services.system import worst_disk_usage
from services.disk_report import build_disk_report
async def monitor_resources(cfg, notify, bot, chat_id):
@@ -10,12 +11,17 @@ async def monitor_resources(cfg, notify, bot, chat_id):
interval = int(alerts_cfg.get("interval_sec", 60))
cooldown = int(alerts_cfg.get("cooldown_sec", 900))
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
load_only_critical = bool(alerts_cfg.get("load_only_critical", False))
auto_mute_high_load_sec = int(alerts_cfg.get("auto_mute_on_high_load_sec", 0))
disk_warn = int(cfg.get("thresholds", {}).get("disk_warn", 80))
snapshot_warn = int(cfg.get("disk_report", {}).get("threshold", disk_warn))
snapshot_cooldown = int(cfg.get("disk_report", {}).get("cooldown_sec", 21600))
load_warn = float(cfg.get("thresholds", {}).get("load_warn", 2.0))
high_warn = float(cfg.get("thresholds", {}).get("high_load_warn", load_warn * 1.5))
last_sent = {"disk": 0.0, "load": 0.0, "disk_na": 0.0}
state = {"disk_high": False, "load_high": False, "disk_na": False}
last_sent = {"disk": 0.0, "load": 0.0, "disk_na": 0.0, "disk_report": 0.0}
state = {"disk_high": False, "disk_na": False, "load_level": 0}
while True:
now = time.time()
@@ -23,34 +29,55 @@ async def monitor_resources(cfg, notify, bot, chat_id):
usage, mount = worst_disk_usage()
if usage is None:
if not state["disk_na"] or now - last_sent["disk_na"] >= cooldown:
await notify(bot, chat_id, "⚠️ Disk usage n/a")
await notify(bot, chat_id, "⚠️ Disk usage n/a", level="warn", key="disk_na", category="disk")
state["disk_na"] = True
last_sent["disk_na"] = now
else:
if state["disk_na"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
if state["disk_na"] and notify_recovery and not load_only_critical:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
state["disk_na"] = False
if usage >= disk_warn:
if not state["disk_high"] or now - last_sent["disk"] >= cooldown:
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})")
await notify(bot, chat_id, f"🟡 Disk usage {usage}% ({mount})", level="warn", key="disk_high", category="disk")
state["disk_high"] = True
last_sent["disk"] = now
else:
if state["disk_high"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})")
if state["disk_high"] and notify_recovery and not load_only_critical:
await notify(bot, chat_id, f"🟢 Disk usage OK ({usage}% {mount})", level="info", key="disk_ok", category="disk")
state["disk_high"] = False
if usage >= snapshot_warn and now - last_sent["disk_report"] >= snapshot_cooldown:
report = await build_disk_report(cfg, mount or "/", usage)
await notify(bot, chat_id, f"📦 Disk snapshot\n\n{report}", level="info", key="disk_snapshot", category="disk")
last_sent["disk_report"] = now
load = psutil.getloadavg()[0]
if load >= load_warn:
if not state["load_high"] or now - last_sent["load"] >= cooldown:
await notify(bot, chat_id, f"🟡 Load high: {load:.2f}")
state["load_high"] = True
last_sent["load"] = now
if load >= high_warn:
level = 2
elif load >= load_warn:
level = 1
else:
if state["load_high"] and notify_recovery:
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}")
state["load_high"] = False
level = 0
if load_only_critical and level == 1:
level = 0
if level == 0:
if state["load_level"] > 0 and notify_recovery and not load_only_critical:
await notify(bot, chat_id, f"🟢 Load OK: {load:.2f}", level="info", key="load_ok", category="load")
state["load_level"] = 0
else:
if level != state["load_level"] or now - last_sent["load"] >= cooldown:
icon = "🔴" if level == 2 else "🟡"
level_name = "critical" if level == 2 else "warn"
key = "load_high_crit" if level == 2 else "load_high_warn"
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
last_sent["load"] = now
if level == 2 and auto_mute_high_load_sec > 0:
from services.alert_mute import set_mute
set_mute("load", auto_mute_high_load_sec)
state["load_level"] = level
await asyncio.sleep(interval)
@@ -74,7 +101,14 @@ async def monitor_smart(cfg, notify, bot, chat_id):
continue
if "FAILED" in health:
await notify(bot, chat_id, f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}")
await notify(
bot,
chat_id,
f"🔴 SMART FAIL {dev}: {health}, 🌡 {temp}",
level="critical",
key=f"smart_fail:{dev}",
category="smart",
)
last_sent[key] = now
continue
@@ -84,8 +118,66 @@ async def monitor_smart(cfg, notify, bot, chat_id):
except ValueError:
t = None
if t is not None and t >= temp_warn:
await notify(bot, chat_id, f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}")
await notify(
bot,
chat_id,
f"🟡 SMART HOT {dev}: {health}, 🌡 {temp}",
level="warn",
key=f"smart_hot:{dev}",
category="smart",
)
last_sent[key] = now
continue
await asyncio.sleep(interval)
async def monitor_raid(cfg, notify, bot, chat_id):
    """Background loop watching md RAID arrays.

    Every raid_interval_sec seconds each md array's status string is
    inspected: "inactive" raises a critical alert, "degraded" a warning
    ("inactive" is checked first and outranks "degraded"). Repeat alerts for
    a still-unhealthy array are throttled by raid_cooldown_sec; a recovery
    message is sent once when the array returns to a clean state (if
    notify_recovery is enabled).
    """
    alerts_cfg = cfg.get("alerts", {})
    interval = int(alerts_cfg.get("raid_interval_sec", 300))
    cooldown = int(alerts_cfg.get("raid_cooldown_sec", 1800))
    notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
    last_sent: dict[str, float] = {}   # device -> timestamp of last alert
    bad_state: dict[str, bool] = {}    # device -> currently unhealthy?
    while True:
        now = time.time()
        for dev in list_md_arrays():
            status = md_array_status(dev)
            lower = status.lower()
            level = None
            key_suffix = None
            if "inactive" in lower:
                level = "critical"
                key_suffix = "inactive"
            elif "degraded" in lower:
                level = "warn"
                key_suffix = "degraded"
            if level:
                # Alert on first detection, then at most once per cooldown.
                if not bad_state.get(dev) or (now - last_sent.get(dev, 0.0) >= cooldown):
                    icon = "🔴" if level == "critical" else "🟡"
                    await notify(
                        bot,
                        chat_id,
                        f"{icon} RAID {dev}: {status}",
                        level=level,
                        key=f"raid_{key_suffix}:{dev}",
                        category="raid",
                    )
                    last_sent[dev] = now
                bad_state[dev] = True
            else:
                if bad_state.get(dev) and notify_recovery:
                    await notify(
                        bot,
                        chat_id,
                        f"🟢 RAID {dev}: {status}",
                        level="info",
                        key=f"raid_ok:{dev}",
                        category="raid",
                    )
                bad_state[dev] = False
        await asyncio.sleep(interval)

35
services/config_check.py Normal file
View File

@@ -0,0 +1,35 @@
import os
from typing import Any, Tuple, List
def validate_cfg(cfg: dict[str, Any]) -> Tuple[List[str], List[str]]:
    """Sanity-check the bot config; return (errors, warnings).

    Errors are fatal misconfigurations (missing bot token / admin id);
    warnings flag optional-but-recommended settings that are absent or
    point at missing files.
    """
    errors: List[str] = []
    warnings: List[str] = []

    telegram_cfg = cfg.get("telegram", {})
    if not telegram_cfg.get("token"):
        errors.append("telegram.token is missing")
    admin_ids = telegram_cfg.get("admin_ids")
    admins_listed = isinstance(admin_ids, list) and bool(admin_ids)
    # Either a single admin_id or a non-empty admin_ids list is acceptable.
    if not telegram_cfg.get("admin_id") and not admins_listed:
        errors.append("telegram.admin_id is missing")

    thresholds = cfg.get("thresholds", {})
    warnings.extend(
        f"thresholds.{name} not set"
        for name in ("disk_warn", "load_warn", "high_load_warn")
        if name not in thresholds
    )

    restic_env = cfg.get("paths", {}).get("restic_env")
    if restic_env and not os.path.exists(restic_env):
        warnings.append(f"paths.restic_env not found: {restic_env}")

    npm = cfg.get("npmplus", {})
    if npm and not npm.get("token") and (not npm.get("identity") or not npm.get("secret")):
        warnings.append("npmplus: token missing and identity/secret missing")

    openwrt = cfg.get("openwrt", {})
    if openwrt and not openwrt.get("host"):
        warnings.append("openwrt.host is missing")

    return errors, warnings

78
services/disk_report.py Normal file
View File

@@ -0,0 +1,78 @@
import os
import re
from typing import Any
from services.runner import run_cmd
def _top_dirs_cmd(path: str, limit: int) -> list[str]:
_ = limit
return ["du", "-x", "-h", "-d", "1", path]
_SIZE_RE = re.compile(r"^\s*([0-9]+(?:\.[0-9]+)?)([KMGTP]?)(i?B?)?$", re.IGNORECASE)
def _size_to_bytes(value: str) -> float:
m = _SIZE_RE.match(value.strip())
if not m:
return -1.0
num = float(m.group(1))
unit = (m.group(2) or "").upper()
mul = {
"": 1,
"K": 1024,
"M": 1024**2,
"G": 1024**3,
"T": 1024**4,
"P": 1024**5,
}.get(unit, 1)
return num * mul
def _format_top_dirs(raw: str, limit: int) -> str:
rows: list[tuple[float, str]] = []
for line in raw.splitlines():
line = line.strip()
if not line:
continue
parts = line.split(maxsplit=1)
if len(parts) != 2:
continue
size, name = parts
rows.append((_size_to_bytes(size), f"{size}\t{name}"))
rows.sort(key=lambda x: x[0])
return "\n".join(line for _sz, line in rows[-max(1, limit):])
async def build_disk_report(cfg: dict[str, Any], mount: str, usage: int) -> str:
    """Assemble a human-readable disk usage report for *mount*.

    Includes the largest first-level directories of the mount, plus the same
    breakdown for the configured docker and logs directories when they exist.
    The three near-identical `du` stanzas of the original are collapsed into
    one helper.
    """
    report_cfg = cfg.get("disk_report", {})
    limit = int(report_cfg.get("top_dirs", 8))
    lines = ["🧱 Disk report", f"💽 {mount}: {usage}%"]

    async def _append_section(path: str, header: str) -> None:
        # Best-effort: silently skip a section when `du` fails or prints nothing.
        rc, out = await run_cmd(_top_dirs_cmd(path, limit), timeout=30)
        if rc != 0 or not out.strip():
            return
        lines.append("")
        lines.append(header)
        lines.append(_format_top_dirs(out, limit))

    await _append_section(mount, "Top directories:")

    docker_dir = report_cfg.get("docker_dir", "/var/lib/docker")
    if docker_dir and os.path.exists(docker_dir):
        await _append_section(docker_dir, f"Docker dir: {docker_dir}")

    logs_dir = report_cfg.get("logs_dir", "/var/log")
    if logs_dir and os.path.exists(logs_dir):
        await _append_section(logs_dir, f"Logs dir: {logs_dir}")

    return "\n".join(lines)

View File

@@ -144,8 +144,22 @@ async def docker_watchdog(container_map, notify, bot, chat_id):
reply_markup=kb,
)
elif health not in ("healthy", "n/a"):
await notify(bot, chat_id, f"⚠️ {alias} health: {health}")
await notify(
bot,
chat_id,
f"⚠️ {alias} health: {health}",
level="warn",
key=f"docker_health:{alias}",
category="docker",
)
else:
await notify(bot, chat_id, f"🐳 {alias}: {status}")
await notify(
bot,
chat_id,
f"🐳 {alias}: {status}",
level="info",
key=f"docker_status:{alias}:{status}",
category="docker",
)
last[alias] = (status, health)
await asyncio.sleep(120)

143
services/external_checks.py Normal file
View File

@@ -0,0 +1,143 @@
import asyncio
import json
import os
import socket
import time
from datetime import datetime, timezone
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
def _state_path(cfg: dict[str, Any]) -> str:
return cfg.get("external_checks", {}).get("state_path", "/var/server-bot/external_checks.json")
def _load_state(cfg: dict[str, Any]) -> dict[str, Any]:
path = _state_path(cfg)
if not os.path.exists(path):
return {"services": {}, "total_checks": 0, "ok_checks": 0}
try:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except Exception:
return {"services": {}, "total_checks": 0, "ok_checks": 0}
def _save_state(cfg: dict[str, Any], state: dict[str, Any]) -> None:
path = _state_path(cfg)
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
json.dump(state, f, ensure_ascii=False, indent=2)
def _check_http(url: str, timeout: int) -> tuple[bool, str]:
    """GET *url*; success is any HTTP status < 400. Returns (ok, detail)."""
    request = Request(url, headers={"User-Agent": "tg-admin-bot"})
    try:
        with urlopen(request, timeout=timeout) as resp:
            code = int(resp.status)
    except HTTPError as err:
        return False, f"HTTP {int(err.code)}"
    except URLError as err:
        return False, str(err.reason)
    except Exception as err:
        return False, str(err)
    return code < 400, f"HTTP {code}"
def _check_tcp(host: str, port: int, timeout: int) -> tuple[bool, str]:
try:
with socket.create_connection((host, port), timeout=timeout):
return True, "TCP ok"
except Exception as e:
return False, str(e)
def _check_ping(host: str, timeout: int) -> tuple[bool, str]:
try:
socket.gethostbyname(host)
return True, "DNS ok"
except Exception:
pass
return _check_tcp(host, 80, timeout)
def _run_single(entry: dict[str, Any], timeout: int) -> tuple[bool, str]:
    """Dispatch one configured service entry to the matching checker."""
    kind = entry.get("type", "http")
    if kind == "http":
        url = entry.get("url")
        if url:
            return _check_http(url, timeout)
    elif kind == "tcp":
        host = entry.get("host")
        port = int(entry.get("port", 0))
        if host and port:
            return _check_tcp(host, port, timeout)
    elif kind == "ping":
        host = entry.get("host")
        if host:
            return _check_ping(host, timeout)
    # Unknown type or incomplete entry: counted as a failed check.
    return False, "n/a"


def run_checks(cfg: dict[str, Any]) -> dict[str, Any]:
    """Run every configured external check once and update persisted stats.

    Returns {"results": [{name, ok, detail}, ...], "state": <updated state>}.
    """
    checks_cfg = cfg.get("external_checks", {})
    timeout = int(checks_cfg.get("timeout_sec", 5))
    state = _load_state(cfg)
    services_state = state.setdefault("services", {})
    results = []
    for entry in checks_cfg.get("services", []):
        name = entry.get("name") or "unknown"
        ok, detail = _run_single(entry, timeout)
        per_service = services_state.setdefault(name, {"ok": 0, "total": 0})
        per_service["total"] += 1
        state["total_checks"] = state.get("total_checks", 0) + 1
        if ok:
            per_service["ok"] += 1
            state["ok_checks"] = state.get("ok_checks", 0) + 1
        results.append({"name": name, "ok": ok, "detail": detail})
    _save_state(cfg, state)
    return {"results": results, "state": state}
def format_report(cfg: dict[str, Any]) -> str:
    """Render the external-checks report with per-service status and uptime."""
    services = cfg.get("external_checks", {}).get("services", [])
    if not services:
        return "🌍 External checks\n\n No services configured"
    data = run_checks(cfg)
    state = data["state"]
    denominator = state.get("total_checks", 0) or 1  # avoid div-by-zero
    uptime_pct = 100.0 * state.get("ok_checks", 0) / denominator
    lines = ["🌍 External checks", ""]
    for item in data["results"]:
        status_icon = "🟢" if item["ok"] else "🔴"
        lines.append(f"{status_icon} {item['name']}: {item['detail']}")
    lines.append("")
    lines.append(f"📈 Uptime (global): {uptime_pct:.2f}%")
    lines.append(f"🕒 {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}")
    return "\n".join(lines)
async def monitor_external(cfg: dict[str, Any]):
    """Background loop: run the external checks every interval_sec seconds."""
    checks_cfg = cfg.get("external_checks", {})
    if not checks_cfg.get("enabled", True):
        return
    delay = int(checks_cfg.get("interval_sec", 300))
    while True:
        run_checks(cfg)
        await asyncio.sleep(delay)

88
services/gitea.py Normal file
View File

@@ -0,0 +1,88 @@
import json
import ssl
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
def _request(url: str, headers: dict[str, str], verify_tls: bool) -> tuple[int, str]:
    """GET *url* and return (status, body); HTTP errors are returned, not raised.

    Raises RuntimeError for transport-level failures (DNS, refused, TLS).
    """
    context = ssl._create_unverified_context() if not verify_tls else None  # nosec - config-controlled
    request = Request(url, headers=headers)
    try:
        with urlopen(request, timeout=10, context=context) as resp:
            return int(resp.status), resp.read().decode("utf-8")
    except HTTPError as err:
        try:
            body = err.read().decode("utf-8")
        except Exception:
            body = ""
        return int(err.code), body
    except URLError as err:
        raise RuntimeError(str(err.reason)) from err
def _api_base(cfg: dict[str, Any]) -> str:
g_cfg = cfg.get("gitea", {})
base = (g_cfg.get("base_url") or "").rstrip("/")
return base
def get_gitea_health(cfg: dict[str, Any]) -> str:
    """Return a multi-line Gitea health summary.

    Probes the healthz endpoints (old and new API paths) and the version
    endpoint. Transport failures from `_request` (RuntimeError) are
    reported inline instead of propagating, so the bot can always render
    a status message for the user.
    """
    g_cfg = cfg.get("gitea", {})
    base = _api_base(cfg)
    verify_tls = g_cfg.get("verify_tls", True)
    if not base:
        return "⚠️ Gitea base_url not configured"
    token = (g_cfg.get("token") or "").strip()
    headers = {"User-Agent": "tg-admin-bot"}
    if token:
        headers["Authorization"] = f"token {token}"
    lines = ["🍵 Gitea\n"]
    # Older Gitea exposes /api/healthz, newer also /api/v1/healthz.
    health_paths = ["/api/healthz", "/api/v1/healthz"]
    health_status = None
    health_payload = None
    for path in health_paths:
        try:
            status, body = _request(f"{base}{path}", headers, verify_tls)
        except RuntimeError as e:
            # Transport failure (DNS/conn/TLS): report instead of raising.
            lines.append(f"🔴 API health: unreachable ({e})")
            return "\n".join(lines)
        if status == 200:
            health_status = (status, path)
            try:
                health_payload = json.loads(body)
            except json.JSONDecodeError:
                health_payload = None
            break
        # 404/405 means "endpoint absent on this version" - try next path.
        if status not in (404, 405):
            health_status = (status, path)
            break
    if health_status:
        status, path = health_status
        icon = "🟢" if status == 200 else "🔴"
        if status == 200 and isinstance(health_payload, dict):
            state = health_payload.get("status") or "ok"
            checks = health_payload.get("checks") or {}
            checks_total = len(checks) if isinstance(checks, dict) else 0
            lines.append(f"{icon} API health: {state} ({checks_total} checks)")
        else:
            lines.append(f"{icon} API health: {status} ({path})")
    else:
        lines.append("🟡 API health: endpoint not found")
    try:
        ver_status, ver_body = _request(f"{base}/api/v1/version", headers, verify_tls)
    except RuntimeError as e:
        lines.append(f"🟡 Version: unreachable ({e})")
        return "\n".join(lines)
    if ver_status == 200:
        try:
            payload = json.loads(ver_body)
        except json.JSONDecodeError:
            payload = {}
        version = payload.get("version") or "unknown"
        lines.append(f" Version: {version}")
    else:
        lines.append(f"🟡 Version: HTTP {ver_status}")
    return "\n".join(lines)

View File

@@ -1,6 +1,9 @@
import os
import os
import ssl
import subprocess
import psutil
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from app import RESTIC_ENV
from services.system import worst_disk_usage
@@ -9,9 +12,35 @@ def _containers_from_cfg(cfg) -> dict:
return cfg.get("docker", {}).get("containers", {})
def _request_status(url: str, verify_tls: bool) -> int | None:
context = None
if not verify_tls:
context = ssl._create_unverified_context() # nosec - config-controlled
req = Request(url, headers={"User-Agent": "tg-admin-bot"})
try:
with urlopen(req, timeout=8, context=context) as resp:
return int(resp.status)
except HTTPError as e:
return int(e.code)
except URLError:
return None
def _npm_api_base(cfg) -> str | None:
npm_cfg = cfg.get("npmplus", {})
base = (npm_cfg.get("base_url") or "").rstrip("/")
if not base:
return None
if not base.endswith("/api"):
base = f"{base}/api"
return base
def health(cfg, container_map: dict | None = None) -> str:
lines = ["🩺 Health check\n"]
thresholds = cfg.get("thresholds", {})
disk_warn = int(thresholds.get("disk_warn", 80))
load_warn = float(thresholds.get("load_warn", 2.0))
try:
env = os.environ.copy()
env.update(RESTIC_ENV)
@@ -30,15 +59,47 @@ def health(cfg, container_map: dict | None = None) -> str:
else:
lines.append(f"🟢 {alias} OK")
npm_cfg = cfg.get("npmplus", {})
npm_base = _npm_api_base(cfg)
if npm_base:
npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
if npm_status == 200:
lines.append("🟢 NPMplus API OK")
elif npm_status is None:
lines.append("🔴 NPMplus API unreachable")
else:
lines.append(f"🟡 NPMplus API HTTP {npm_status}")
g_cfg = cfg.get("gitea", {})
g_base = (g_cfg.get("base_url") or "").rstrip("/")
if g_base:
health_paths = ["/api/healthz", "/api/v1/healthz"]
g_status = None
for path in health_paths:
status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True))
if status == 200:
g_status = status
break
if status not in (404, 405):
g_status = status
break
if g_status == 200:
lines.append("🟢 Gitea API OK")
elif g_status is None:
lines.append("🔴 Gitea API unreachable")
else:
lines.append(f"🟡 Gitea API HTTP {g_status}")
usage, mount = worst_disk_usage()
if usage is None:
lines.append("⚠️ Disk n/a")
elif usage > cfg["thresholds"]["disk_warn"]:
elif usage > disk_warn:
lines.append(f"🟡 Disk {usage}% ({mount})")
else:
lines.append(f"🟢 Disk {usage}% ({mount})")
load = psutil.getloadavg()[0]
lines.append(f"{'🟢' if load < cfg['thresholds']['load_warn'] else '🟡'} Load {load}")
lines.append(f"{'🟢' if load < load_warn else '🟡'} Load {load}")
return "\n".join(lines)

118
services/incidents.py Normal file
View File

@@ -0,0 +1,118 @@
import logging
import os
from collections import deque
from datetime import datetime, timedelta, timezone
from logging.handlers import TimedRotatingFileHandler
from typing import Any
from services import runtime_state
def _get_path(cfg: dict[str, Any]) -> str:
return cfg.get("incidents", {}).get("path", "/var/server-bot/incidents.log")
def incidents_path(cfg: dict[str, Any]) -> str:
    """Public accessor for the incident log path (same default as _get_path)."""
    return cfg.get("incidents", {}).get("path", "/var/server-bot/incidents.log")
def _get_logger(cfg: dict[str, Any]) -> logging.Logger:
    """Return the singleton 'incidents' logger, configuring it on first use.

    Attaches a TimedRotatingFileHandler that rotates per
    `incidents.rotate_when` (default weekly, Monday, UTC) and keeps
    `incidents.backup_count` archives (default 8).
    """
    logger = logging.getLogger("incidents")
    if logger.handlers:
        # Already configured by a previous call.
        return logger
    path = _get_path(cfg)
    # A bare filename has no directory component; os.makedirs("") raises
    # FileNotFoundError, so only create the directory when one exists.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    inc_cfg = cfg.get("incidents", {})
    handler = TimedRotatingFileHandler(
        path,
        when=inc_cfg.get("rotate_when", "W0"),
        interval=1,
        backupCount=int(inc_cfg.get("backup_count", 8)),
        encoding="utf-8",
        utc=True,
    )
    handler.setFormatter(
        logging.Formatter("%(asctime)s\t%(message)s", datefmt="%Y-%m-%dT%H:%M:%SZ")
    )
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)
    # Keep incident lines out of the root logger's output.
    logger.propagate = False
    return logger
def log_incident(cfg: dict[str, Any], text: str, category: str | None = None) -> None:
if not cfg.get("incidents", {}).get("enabled", True):
return
if category and "category=" not in text:
text = f"category={category} {text}"
logger = _get_logger(cfg)
logger.info(text)
def _parse_line(line: str) -> tuple[datetime | None, str]:
if "\t" not in line:
return None, line.strip()
ts, msg = line.split("\t", 1)
try:
dt = datetime.strptime(ts.strip(), "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
except ValueError:
dt = None
return dt, msg.strip()
def read_recent(cfg: dict[str, Any], hours: int, limit: int = 200) -> list[str]:
    """Recent incidents as pre-formatted 'YYYY-MM-DD HH:MM message' strings."""
    formatted = []
    for dt, msg in read_raw(cfg, hours, limit=limit):
        formatted.append(f"{dt:%Y-%m-%d %H:%M} {msg}")
    return formatted
def read_raw(cfg: dict[str, Any], hours: int, limit: int = 200, *, include_old: bool = False) -> list[tuple[datetime, str]]:
    """Read parsed (timestamp, message) pairs from the incident log.

    Keeps at most the newest `limit` matching entries (file order).
    Entries older than `hours` are dropped unless `include_old` is set;
    lines whose timestamp cannot be parsed are always skipped.
    """
    path = _get_path(cfg)
    if not os.path.exists(path):
        return []
    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
    # A bounded deque keeps only the tail of the log without slicing.
    window: deque = deque(maxlen=limit)
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for raw in fh:
            dt, msg = _parse_line(raw.rstrip())
            if dt is None:
                continue
            if dt < cutoff and not include_old:
                continue
            window.append((dt, msg))
    return list(window)
def infer_category(text: str) -> str | None:
lower = text.lower()
if "category=" in lower:
import re
m = re.search(r"category=([a-z0-9_-]+)", lower)
if m:
return m.group(1)
if "load" in lower:
return "load"
if "docker" in lower:
return "docker"
if "restic" in lower or "backup" in lower:
return "backup"
if "smart" in lower:
return "smart"
if "ssl" in lower or "cert" in lower:
return "ssl"
if "npmplus" in lower:
return "npmplus"
if "gitea" in lower:
return "gitea"
if "openwrt" in lower:
return "openwrt"
if "queue" in lower:
return "queue"
if "selftest" in lower:
return "selftest"
return None

35
services/logging_setup.py Normal file
View File

@@ -0,0 +1,35 @@
import logging
import os
from logging.handlers import TimedRotatingFileHandler
def setup_logging(cfg: dict) -> None:
    """Attach a timed rotating file handler to the root logger.

    Honors the `logging` config section (path, rotation schedule, backup
    count, level) and is idempotent: calling it again with the same path
    does not add a duplicate handler.
    """
    log_cfg = cfg.get("logging", {})
    if not log_cfg.get("enabled", True):
        return
    path = log_cfg.get("path", "/var/server-bot/bot.log")
    rotate_when = log_cfg.get("rotate_when", "W0")
    backup_count = int(log_cfg.get("backup_count", 8))
    level = str(log_cfg.get("level", "INFO")).upper()
    # A bare filename has no directory part; os.makedirs("") would raise.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    root = logging.getLogger()
    # Handler.baseFilename is stored as an absolute path, so compare
    # against the absolute form of the configured path - otherwise the
    # dedup check never matches for relative paths.
    abs_path = os.path.abspath(path)
    for handler in root.handlers:
        if isinstance(handler, TimedRotatingFileHandler) and handler.baseFilename == abs_path:
            return
    handler = TimedRotatingFileHandler(
        path,
        when=rotate_when,
        interval=1,
        backupCount=backup_count,
        encoding="utf-8",
        utc=True,
    )
    formatter = logging.Formatter("%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s")
    handler.setFormatter(formatter)
    root.setLevel(level)
    root.addHandler(handler)

View File

@@ -1,8 +1,83 @@
import time
from datetime import datetime
from aiogram import Bot
from app import cfg
from services.alert_mute import is_muted, is_auto_muted
from services.incidents import log_incident
async def notify(bot: Bot, chat_id: int, text: str):
_LAST_SENT: dict[str, float] = {}
def _parse_hhmm(value: str) -> int | None:
try:
hours, minutes = value.strip().split(":", 1)
h = int(hours)
m = int(minutes)
except Exception:
return None
if not (0 <= h <= 23 and 0 <= m <= 59):
return None
return h * 60 + m
def _in_quiet_hours(alerts_cfg: dict) -> bool:
quiet = alerts_cfg.get("quiet_hours", {})
if not quiet.get("enabled", False):
return False
start_min = _parse_hhmm(quiet.get("start", "23:00"))
end_min = _parse_hhmm(quiet.get("end", "08:00"))
if start_min is None or end_min is None:
return False
if start_min == end_min:
return False
now = datetime.now()
now_min = now.hour * 60 + now.minute
if start_min < end_min:
return start_min <= now_min < end_min
return now_min >= start_min or now_min < end_min
async def notify(
    bot: Bot,
    chat_id: int,
    text: str,
    level: str = "info",
    key: str | None = None,
    category: str | None = None,
):
    """Send an alert to Telegram with suppression, dedup and incident logging.

    Suppression precedence: manual mute > auto-mute > quiet hours
    (a "critical" level may bypass quiet hours via `allow_critical`).
    Suppressed and delivered alerts are both written to the incident log;
    duplicate alerts within the cooldown window are dropped silently.
    """
    alerts_cfg = cfg.get("alerts", {})
    suppressed_reason = None
    if category and is_muted(category):
        suppressed_reason = "muted"
    elif category and is_auto_muted(cfg, category):
        suppressed_reason = "auto_mute"
    elif _in_quiet_hours(alerts_cfg):
        allow_critical = bool(alerts_cfg.get("quiet_hours", {}).get("allow_critical", True))
        if not (allow_critical and level == "critical"):
            suppressed_reason = "quiet_hours"
    if suppressed_reason:
        # Record the suppressed alert so it still shows up in history.
        try:
            log_incident(cfg, f"[suppressed:{suppressed_reason}] {text}", category=category)
        except Exception:
            pass
        return
    # Deduplicate identical alerts (by explicit key or full text)
    # within the cooldown window; 0 disables deduplication.
    dedup_sec = int(alerts_cfg.get("notify_cooldown_sec", alerts_cfg.get("cooldown_sec", 900)))
    if dedup_sec > 0:
        dedup_key = key or text
        now = time.time()
        last_time = _LAST_SENT.get(dedup_key, 0)
        if now - last_time < dedup_sec:
            return
        _LAST_SENT[dedup_key] = now
    # Best-effort delivery: a Telegram failure must not crash the caller.
    try:
        await bot.send_message(chat_id, text)
    except Exception:
        pass
    try:
        log_incident(cfg, text, category=category)
    except Exception:
        pass

View File

@@ -34,11 +34,12 @@ def _request_json(
headers: dict[str, str],
data: dict[str, Any] | None,
verify_tls: bool,
method: str | None = None,
) -> Any:
body = None
if data is not None:
body = json.dumps(data).encode("utf-8")
req = Request(url, headers=headers, data=body)
req = Request(url, headers=headers, data=body, method=method)
context = None
if not verify_tls:
@@ -48,16 +49,36 @@ def _request_json(
with urlopen(req, timeout=10, context=context) as resp:
payload = resp.read().decode("utf-8")
except HTTPError as e:
raise RuntimeError(f"HTTP {e.code}") from e
detail = f"HTTP {e.code}"
try:
payload = e.read().decode("utf-8").strip()
except Exception:
payload = ""
if payload:
payload = " ".join(payload.split())
if len(payload) > 300:
payload = payload[:300] + "..."
detail = f"{detail}: {payload}"
raise RuntimeError(f"{detail} ({url})") from e
except URLError as e:
raise RuntimeError(str(e.reason)) from e
return json.loads(payload)
def _api_base(cfg: dict[str, Any]) -> str:
npm_cfg = cfg.get("npmplus", {})
base = (npm_cfg.get("base_url") or "").rstrip("/")
if not base:
return ""
if not base.endswith("/api"):
base = f"{base}/api"
return base
def _get_token(cfg: dict[str, Any]) -> str:
npm_cfg = cfg.get("npmplus", {})
base_url = (npm_cfg.get("base_url") or "").rstrip("/")
base_url = _api_base(cfg)
identity = npm_cfg.get("identity")
secret = npm_cfg.get("secret")
static_token = npm_cfg.get("token")
@@ -113,7 +134,7 @@ def _get_token(cfg: dict[str, Any]) -> str:
def fetch_certificates(cfg: dict[str, Any]) -> list[dict[str, Any]]:
npm_cfg = cfg.get("npmplus", {})
base_url = (npm_cfg.get("base_url") or "").rstrip("/")
base_url = _api_base(cfg)
verify_tls = npm_cfg.get("verify_tls", True)
if not base_url:
@@ -132,6 +153,48 @@ def fetch_certificates(cfg: dict[str, Any]) -> list[dict[str, Any]]:
return data
def list_proxy_hosts(cfg: dict[str, Any]) -> list[dict[str, Any]]:
    """Fetch all proxy hosts from the NPMplus API.

    Raises ValueError when base_url is missing and RuntimeError on an
    unexpected (non-list) API payload.
    """
    npm_cfg = cfg.get("npmplus", {})
    base_url = _api_base(cfg)
    if not base_url:
        raise ValueError("NPMplus base_url not configured")
    headers = {
        "Authorization": f"Bearer {_get_token(cfg)}",
        "User-Agent": "tg-admin-bot",
    }
    hosts = _request_json(
        f"{base_url}/nginx/proxy-hosts",
        headers,
        None,
        npm_cfg.get("verify_tls", True),
    )
    if not isinstance(hosts, list):
        raise RuntimeError("Unexpected API response")
    return hosts
def set_proxy_host(cfg: dict[str, Any], host_id: int, enabled: bool) -> tuple[bool, str]:
    """Enable or disable a proxy host via the NPMplus API.

    Returns (success, detail); API and transport errors come back as
    (False, message) rather than raising.
    """
    npm_cfg = cfg.get("npmplus", {})
    base_url = _api_base(cfg)
    if not base_url:
        return False, "NPMplus base_url not configured"
    token = _get_token(cfg)
    endpoint = "enable" if enabled else "disable"
    headers = {
        "Authorization": f"Bearer {token}",
        "User-Agent": "tg-admin-bot",
    }
    try:
        payload = _request_json(
            f"{base_url}/nginx/proxy-hosts/{host_id}/{endpoint}",
            headers,
            None,
            npm_cfg.get("verify_tls", True),
            method="POST",
        )
    except Exception as e:
        return False, str(e)
    ok = payload is True or (isinstance(payload, dict) and payload.get("success", True))
    return (True, "OK") if ok else (False, "API returned error")
def format_certificates(certs: list[dict[str, Any]]) -> str:
if not certs:
return "🔒 SSL certificates\n\n No certificates found"

504
services/openwrt.py Normal file
View File

@@ -0,0 +1,504 @@
import json
from typing import Any
from services.runner import run_cmd, run_cmd_full
def _format_uptime(seconds: int | float | None) -> str:
if seconds is None:
return "unknown"
total = int(seconds)
days, rem = divmod(total, 86400)
hours, rem = divmod(rem, 3600)
minutes, _ = divmod(rem, 60)
if days > 0:
return f"{days}d {hours:02d}:{minutes:02d}"
return f"{hours:02d}:{minutes:02d}"
def _format_load(load: list[Any] | None) -> str:
if not load or not isinstance(load, list):
return "unknown"
values = []
for raw in load[:3]:
try:
values.append(float(raw))
except (TypeError, ValueError):
values.append(0.0)
scale = 1.0
if values and max(values) > 1000:
scale = 1 / 65536.0
return " ".join(f"{val * scale:.2f}" for val in values)
def _format_rate(rate: Any) -> str:
try:
val = float(rate)
except (TypeError, ValueError):
return "?"
if val <= 0:
return "?"
if val >= 1_000_000:
return f"{val / 1_000_000:.1f}M"
if val >= 1_000:
return f"{val / 1_000:.1f}K"
return f"{val:.0f}b"
def _extract_wan_ip(wan: dict[str, Any]) -> str | None:
if not isinstance(wan, dict):
return None
addrs = wan.get("ipv4-address") or []
if isinstance(addrs, list):
for item in addrs:
if isinstance(item, dict):
ip = item.get("address")
if ip:
return str(ip)
return None
def _extract_wifi_clients(wireless: dict[str, Any]) -> list[str]:
clients: list[str] = []
if not isinstance(wireless, dict):
return clients
for radio in wireless.values():
if not isinstance(radio, dict):
continue
for iface in radio.get("interfaces", []) or []:
if not isinstance(iface, dict):
continue
ifname = iface.get("ifname") or "wifi"
assoclist = iface.get("assoclist")
stations = iface.get("stations")
if isinstance(assoclist, dict):
for mac, meta in assoclist.items():
if not isinstance(meta, dict):
continue
signal = meta.get("signal")
rx = _format_rate((meta.get("rx") or {}).get("rate"))
tx = _format_rate((meta.get("tx") or {}).get("rate"))
sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
clients.append(f"{ifname} {mac} {sig} rx:{rx} tx:{tx}")
elif isinstance(stations, list):
for meta in stations:
if not isinstance(meta, dict):
continue
mac = meta.get("mac") or "?"
signal = meta.get("signal")
rx = _format_rate((meta.get("rx") or {}).get("rate"))
tx = _format_rate((meta.get("tx") or {}).get("rate"))
sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
clients.append(f"{ifname} {mac} {sig} rx:{rx} tx:{tx}")
return clients
def _extract_leases(leases: dict[str, Any]) -> list[str]:
items = None
if isinstance(leases, dict):
items = leases.get("leases") or leases.get("dhcp_leases") or leases.get("ipv4_leases")
elif isinstance(leases, list):
items = leases
if not isinstance(items, list):
return []
out = []
for lease in items:
if not isinstance(lease, dict):
continue
ipaddr = lease.get("ipaddr") or "?"
host = lease.get("hostname") or "unknown"
mac = lease.get("macaddr") or "?"
out.append(f"{ipaddr} {host} ({mac})")
return out
def _extract_lease_name_map(leases: Any) -> dict[str, str]:
items = None
if isinstance(leases, dict):
items = leases.get("leases") or leases.get("dhcp_leases") or leases.get("ipv4_leases")
elif isinstance(leases, list):
items = leases
if not isinstance(items, list):
return {}
out: dict[str, str] = {}
for lease in items:
if not isinstance(lease, dict):
continue
mac = lease.get("macaddr")
if not mac:
continue
host = lease.get("hostname") or "unknown"
out[str(mac).lower()] = str(host)
return out
def _extract_lease_name_map_fallback(raw: str) -> dict[str, str]:
out: dict[str, str] = {}
for line in raw.splitlines():
parts = line.strip().split()
if len(parts) < 4:
continue
_expiry, mac, _ipaddr, host = parts[:4]
host = host if host != "*" else "unknown"
out[str(mac).lower()] = str(host)
return out
def _extract_ifnames(wireless: dict[str, Any]) -> list[str]:
ifnames: list[str] = []
if not isinstance(wireless, dict):
return ifnames
for radio in wireless.values():
if not isinstance(radio, dict):
continue
for iface in radio.get("interfaces", []) or []:
if not isinstance(iface, dict):
continue
ifname = iface.get("ifname")
if ifname:
ifnames.append(str(ifname))
return ifnames
def _extract_ifname_meta(wireless: dict[str, Any]) -> dict[str, dict[str, str]]:
meta: dict[str, dict[str, str]] = {}
if not isinstance(wireless, dict):
return meta
for radio in wireless.values():
if not isinstance(radio, dict):
continue
band = None
cfg = radio.get("config") or {}
if isinstance(cfg, dict):
band = cfg.get("band")
band_label = None
if band == "2g":
band_label = "2.4GHz"
elif band == "5g":
band_label = "5GHz"
elif band:
band_label = str(band)
for iface in radio.get("interfaces", []) or []:
if not isinstance(iface, dict):
continue
ifname = iface.get("ifname")
if not ifname:
continue
iface_cfg = iface.get("config") or {}
ssid = None
if isinstance(iface_cfg, dict):
ssid = iface_cfg.get("ssid")
meta[str(ifname)] = {
"ssid": str(ssid) if ssid else "",
"band": band_label or "",
}
return meta
def _extract_hostapd_ifnames(raw: str) -> list[str]:
ifnames: list[str] = []
for line in raw.splitlines():
name = line.strip()
if not name or name == "hostapd":
continue
ifnames.append(name)
return ifnames
def _net_label_for_ifname(ifname: str, ifname_meta: dict[str, dict[str, str]]) -> str:
meta = ifname_meta.get(ifname, {})
ssid = meta.get("ssid") or ""
band = meta.get("band") or ""
if ssid and band:
return f"{ssid} ({band})"
if ssid:
return ssid
if band:
return band
return ifname
def _safe_json_load(raw: str) -> Any | None:
if not raw:
return None
try:
return json.loads(raw)
except json.JSONDecodeError:
start = raw.find("{")
end = raw.rfind("}")
if start == -1 or end == -1 or end <= start:
return None
try:
return json.loads(raw[start : end + 1])
except json.JSONDecodeError:
return None
def _parse_hostapd_clients(
payload: Any,
ifname: str,
*,
name_map: dict[str, str] | None = None,
ifname_meta: dict[str, dict[str, str]] | None = None,
) -> list[tuple[str, int | None, str]]:
if not isinstance(payload, dict):
return []
data = payload.get("clients")
if isinstance(data, dict):
items = data.items()
else:
return []
clients: list[tuple[str, int | None, str]] = []
name_map = name_map or {}
meta = (ifname_meta or {}).get(ifname, {})
ssid = meta.get("ssid") or ""
band = meta.get("band") or ""
if ssid and band:
net_label = f"{ssid} ({band})"
elif ssid:
net_label = ssid
elif band:
net_label = band
else:
net_label = ifname
for mac, meta in items:
if not isinstance(meta, dict):
continue
signal = meta.get("signal")
rate = meta.get("rate") or {}
rx = _format_rate((rate or {}).get("rx"))
tx = _format_rate((rate or {}).get("tx"))
sig = f"{signal}dBm" if isinstance(signal, (int, float)) else "?"
host = name_map.get(str(mac).lower())
if host and host != "unknown":
client_label = host
else:
client_label = str(mac)
line = f"{net_label} {client_label} {sig} rx:{rx} tx:{tx}"
clients.append((line, signal if isinstance(signal, (int, float)) else None, net_label))
return clients
def _parse_proc_fallback(raw: str) -> tuple[int | None, list[float] | None]:
uptime = None
load = None
for line in raw.splitlines():
parts = line.split()
if len(parts) >= 2 and uptime is None:
try:
uptime = int(float(parts[0]))
except ValueError:
uptime = None
if len(parts) >= 3 and load is None:
try:
load = [float(parts[0]), float(parts[1]), float(parts[2])]
except ValueError:
load = None
return uptime, load
def _parse_leases_fallback(raw: str) -> list[str]:
out = []
for line in raw.splitlines():
parts = line.strip().split()
if len(parts) < 4:
continue
_expiry, mac, ipaddr, host = parts[:4]
host = host if host != "*" else "unknown"
out.append(f"{ipaddr} {host} ({mac})")
return out
async def get_openwrt_status(cfg: dict[str, Any], mode: str = "full") -> str:
    """Collect an OpenWrt status report over SSH.

    `mode` selects the rendered sections: "wan" (header only), "clients"
    (header + Wi-Fi), "leases" (header + DHCP leases); anything else
    returns the full report. Remote data comes from ubus calls with
    /proc and /tmp fallbacks, so the report degrades gracefully on
    minimal firmwares. Errors are returned as warning strings rather
    than raised.
    """
    ow_cfg = cfg.get("openwrt", {})
    host = ow_cfg.get("host")
    user = ow_cfg.get("user", "root")
    port = ow_cfg.get("port", 22)
    identity_file = ow_cfg.get("identity_file")
    timeout_sec = ow_cfg.get("timeout_sec", 8)
    strict = ow_cfg.get("strict_host_key_checking", True)
    if not host:
        return "⚠️ OpenWrt host not configured"
    # Non-interactive SSH: fail fast instead of prompting for a password.
    ssh_cmd = [
        "ssh",
        "-o",
        "BatchMode=yes",
        "-o",
        f"ConnectTimeout={timeout_sec}",
        "-o",
        "LogLevel=ERROR",
    ]
    if not strict:
        ssh_cmd += ["-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
    if identity_file:
        ssh_cmd += ["-i", str(identity_file)]
    ssh_cmd += ["-p", str(port), f"{user}@{host}"]
    # One SSH round-trip fetches system info, WAN, wireless and leases,
    # separated by __SEP__ markers; each ubus call has a plain-file fallback.
    remote = (
        "ubus call system info 2>/dev/null || (cat /proc/uptime; echo; cat /proc/loadavg); "
        "echo __SEP__;"
        "ubus call network.interface.wan status 2>/dev/null; echo __SEP__;"
        "ubus call network.wireless status 2>/dev/null; echo __SEP__;"
        "ubus call luci-rpc getDHCPLeases '{\"family\":4}' 2>/dev/null || cat /tmp/dhcp.leases"
    )
    cmd = ssh_cmd + ["sh", "-c", remote]
    rc, out = await run_cmd_full(cmd, timeout=timeout_sec + 15)
    # rc 124 is the conventional "timed out" exit code from the runner.
    if rc == 124:
        return "⚠️ OpenWrt SSH error: timeout"
    if rc != 0:
        return f"⚠️ OpenWrt SSH error: {out.strip() or 'unknown error'}"
    parts = [p.strip() for p in out.split("__SEP__")]
    if len(parts) < 4:
        return "⚠️ OpenWrt response incomplete"
    sys_info = _safe_json_load(parts[0])
    wan_status = _safe_json_load(parts[1]) or {}
    wireless = _safe_json_load(parts[2]) or {}
    leases = _safe_json_load(parts[3])
    # When the leases ubus call failed, parts[3] holds raw /tmp/dhcp.leases text.
    leases_fallback = "" if leases is not None else parts[3]
    if isinstance(sys_info, dict):
        uptime_raw = sys_info.get("uptime")
        load_raw = sys_info.get("load")
    else:
        # ubus unavailable: parse the concatenated /proc output instead.
        uptime_raw, load_raw = _parse_proc_fallback(parts[0])
    uptime = _format_uptime(uptime_raw)
    load = _format_load(load_raw)
    wan_ip = _extract_wan_ip(wan_status) or "unknown"
    wan_up = wan_status.get("up") if isinstance(wan_status, dict) else None
    wan_state = "up" if wan_up else "down"
    wifi_clients = _extract_wifi_clients(wireless)
    ifnames = _extract_ifnames(wireless)
    ifname_meta = _extract_ifname_meta(wireless)
    # Some builds omit assoclist in network.wireless status; enumerate
    # hostapd objects so their clients can be queried directly.
    rc_l, out_l = await run_cmd_full(
        ssh_cmd + ["sh", "-c", r"ubus -S list | awk -F. '/^hostapd\.phy/{print $2}'"],
        timeout=timeout_sec + 15,
    )
    if rc_l == 0 and out_l.strip():
        ifnames.extend(_extract_hostapd_ifnames(out_l))
    # De-duplicate and stabilize ordering of interface names.
    ifnames = sorted({name for name in ifnames if name})
    lease_name_map = _extract_lease_name_map(leases or {})
    if leases_fallback:
        lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
    wifi_net_counts: dict[str, int] = {}
    wifi_signals: dict[str, list[int]] = {}
    if ifnames:
        # One extra round-trip per AP interface to fetch its client list.
        for ifname in ifnames:
            cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
            rc2, out2 = await run_cmd_full(cmd_clients, timeout=timeout_sec + 15)
            if rc2 == 124:
                return f"⚠️ OpenWrt SSH error (wifi clients {ifname}): timeout"
            if rc2 == 0:
                payload = _safe_json_load(out2)
                if payload:
                    clients_payload = payload.get("clients") if isinstance(payload, dict) else None
                    if isinstance(clients_payload, dict):
                        label = _net_label_for_ifname(ifname, ifname_meta)
                        wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
                    parsed = _parse_hostapd_clients(
                        payload,
                        ifname,
                        name_map=lease_name_map,
                        ifname_meta=ifname_meta,
                    )
                    wifi_clients.extend([p[0] for p in parsed])
                    for _line, sig, net_label in parsed:
                        if sig is not None and net_label:
                            wifi_signals.setdefault(net_label, []).append(sig)
    if leases:
        leases_list = _extract_leases(leases)
    else:
        leases_list = _parse_leases_fallback(leases_fallback)
    header = [
        "📡 OpenWrt",
        f"🕒 Uptime: {uptime}",
        f"⚙️ Load: {load}",
        f"🌐 WAN: {wan_ip} ({wan_state})",
        "",
    ]
    wifi_section: list[str] = []
    if wifi_net_counts:
        wifi_section.append("📶 Wi-Fi networks:")
        for label, count in sorted(wifi_net_counts.items()):
            sigs = wifi_signals.get(label) or []
            if sigs:
                avg_sig = sum(sigs) / len(sigs)
                min_sig = min(sigs)
                wifi_section.append(f" - {label}: {count} (avg {avg_sig:.0f}dBm, min {min_sig}dBm)")
            else:
                wifi_section.append(f" - {label}: {count}")
        wifi_section.append("")
    wifi_section.append(f"📶 Wi-Fi clients: {len(wifi_clients)}")
    if wifi_clients:
        # Cap the listing to keep the Telegram message short.
        for line in wifi_clients[:20]:
            wifi_section.append(f" - {line}")
        if len(wifi_clients) > 20:
            wifi_section.append(f" … and {len(wifi_clients) - 20} more")
    else:
        wifi_section.append(" (none)")
    lease_section: list[str] = ["", f"🧾 DHCP leases: {len(leases_list)}"]
    if leases_list:
        for line in leases_list[:20]:
            lease_section.append(f" - {line}")
        if len(leases_list) > 20:
            lease_section.append(f" … and {len(leases_list) - 20} more")
    else:
        lease_section.append(" (none)")
    if mode == "wan":
        return "\n".join(header)
    if mode == "clients":
        return "\n".join(header + wifi_section)
    if mode == "leases":
        return "\n".join(header + lease_section)
    return "\n".join(header + wifi_section + lease_section)
async def fetch_openwrt_leases(cfg: dict[str, Any]) -> list[str]:
    """
    Fetch DHCP leases as list of strings "IP hostname (MAC)".

    Queries luci-rpc over SSH with a /tmp/dhcp.leases fallback.
    Raises RuntimeError on a missing host, SSH timeout, or a non-zero
    SSH exit status.
    """
    ow_cfg = cfg.get("openwrt", {})
    host = ow_cfg.get("host")
    user = ow_cfg.get("user", "root")
    port = ow_cfg.get("port", 22)
    identity_file = ow_cfg.get("identity_file")
    timeout_sec = ow_cfg.get("timeout_sec", 8)
    strict = ow_cfg.get("strict_host_key_checking", True)
    if not host:
        raise RuntimeError("OpenWrt host not configured")
    # Non-interactive SSH: fail fast instead of prompting for a password.
    ssh_cmd = [
        "ssh",
        "-o",
        "BatchMode=yes",
        "-o",
        f"ConnectTimeout={timeout_sec}",
        "-o",
        "LogLevel=ERROR",
    ]
    if not strict:
        ssh_cmd += ["-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
    if identity_file:
        ssh_cmd += ["-i", str(identity_file)]
    ssh_cmd += ["-p", str(port), f"{user}@{host}"]
    # Prefer the structured luci-rpc payload; fall back to the raw lease file.
    remote = "ubus call luci-rpc getDHCPLeases '{\"family\":4}' 2>/dev/null || cat /tmp/dhcp.leases"
    rc, out = await run_cmd_full(ssh_cmd + ["sh", "-c", remote], timeout=timeout_sec + 10)
    # rc 124 is the conventional "timed out" exit code from the runner.
    if rc == 124:
        raise RuntimeError("timeout")
    if rc != 0:
        raise RuntimeError(out.strip() or f"ssh rc={rc}")
    leases = _safe_json_load(out)
    if leases:
        return _extract_leases(leases)
    # Output was not JSON: parse it as raw dhcp.leases text.
    return _parse_leases_fallback(out)

88
services/processes.py Normal file
View File

@@ -0,0 +1,88 @@
import time
from typing import Any
import psutil
def _safe_name(info: dict[str, Any]) -> str:
name = info.get("name") or "unknown"
return str(name)
def get_top_processes(limit: int = 5, interval: float = 0.2) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Return (top_by_cpu, top_by_memory) process summaries.

    Primes psutil's per-process CPU counters, waits `interval` seconds,
    then samples; processes that vanish in between are skipped.
    """
    procs = list(psutil.process_iter(attrs=["pid", "name"]))
    # The first cpu_percent(None) call only primes the counter.
    for proc in procs:
        try:
            proc.cpu_percent(None)
        except Exception:
            continue
    time.sleep(interval)
    samples = []
    for proc in procs:
        try:
            samples.append({
                "pid": proc.info.get("pid"),
                "name": _safe_name(proc.info),
                "cpu": proc.cpu_percent(None),
                "mem": proc.memory_percent(),
            })
        except Exception:
            continue
    by_cpu = sorted(samples, key=lambda item: item["cpu"], reverse=True)[:limit]
    by_mem = sorted(samples, key=lambda item: item["mem"], reverse=True)[:limit]
    return by_cpu, by_mem
def search_processes(query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Case-insensitive substring search over process names and cmdlines.

    Returns up to `limit` matches as {'pid', 'name', 'cmdline'} dicts;
    an empty/blank query matches nothing.
    """
    needle = query.lower().strip()
    if not needle:
        return []
    matches: list[dict[str, Any]] = []
    for proc in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
        try:
            info = proc.info
            name = _safe_name(info)
            cmdline = " ".join(info.get("cmdline") or [])
            if needle in f"{name} {cmdline}".lower():
                matches.append({"pid": info.get("pid"), "name": name, "cmdline": cmdline})
        except Exception:
            continue
    return matches[:limit]
def terminate_process(pid: int, timeout: float = 5.0) -> str:
    """Politely terminate a process, escalating to SIGKILL on timeout.

    Returns a human-readable outcome string; never raises.
    """
    try:
        proc = psutil.Process(pid)
    except Exception:
        return f"Process {pid} not found"
    try:
        proc.terminate()
        proc.wait(timeout=timeout)
    except psutil.TimeoutExpired:
        # Graceful stop timed out - force kill.
        try:
            proc.kill()
            proc.wait(timeout=timeout)
        except Exception as e:
            return f"Kill failed for {pid}: {e}"
        return f"Process {pid} killed"
    except Exception as e:
        return f"Terminate failed for {pid}: {e}"
    return f"Process {pid} terminated"

View File

@@ -1,34 +1,209 @@
import asyncio
from typing import Awaitable, Callable
import logging
import time
from collections import deque
from typing import Awaitable, Callable, Any
from services import runtime_state
from services.incidents import log_incident
_queue: asyncio.Queue = asyncio.Queue()
_current_label: str | None = None
_current_meta: dict[str, Any] | None = None
_pending: deque[tuple[str, float]] = deque()
_stats: dict[str, Any] = runtime_state.get("queue_stats", {}) or {
"processed": 0,
"avg_wait_sec": 0.0,
"avg_runtime_sec": 0.0,
"last_label": "",
"last_finished_at": 0.0,
}
_history: deque[dict[str, Any]] = deque(runtime_state.get("queue_history", []) or [], maxlen=50)
_alert_cfg: dict[str, Any] = {
"max_pending": None,
"avg_wait": None,
"cooldown": 300,
"last_sent": 0.0,
}
_cfg: dict[str, Any] | None = None
_logger = logging.getLogger("queue")
def _save_stats():
    # Persist queue metrics and recent-job history so they survive restarts.
    runtime_state.set_state("queue_stats", _stats)
    runtime_state.set_state("queue_history", list(_history))
def configure(queue_cfg: dict[str, Any], cfg: dict[str, Any]):
    """Install queue congestion-alert thresholds from config.

    Also stashes the full bot config so workers can log incidents later.
    """
    global _cfg
    _cfg = cfg
    _alert_cfg.update(
        max_pending=queue_cfg.get("max_pending_alert"),
        avg_wait=queue_cfg.get("avg_wait_alert"),
        cooldown=queue_cfg.get("cooldown_sec", 300),
    )
def _check_congestion(pending_count: int, avg_wait: float | None):
    """Log a 'queue_congested' incident when thresholds are exceeded.

    Rate-limited by the configured cooldown; the pending-count threshold
    takes precedence in the logged reason. Incident logging is
    best-effort and never raises.
    """
    now = time.time()
    if now - _alert_cfg.get("last_sent", 0) < _alert_cfg.get("cooldown", 300):
        return
    max_pending = _alert_cfg.get("max_pending")
    avg_wait_thr = _alert_cfg.get("avg_wait")
    reason = None
    if max_pending and pending_count >= max_pending:
        reason = f"pending={pending_count} >= {max_pending}"
    if reason is None and avg_wait_thr and avg_wait is not None and avg_wait >= avg_wait_thr:
        reason = f"avg_wait={avg_wait:.1f}s >= {avg_wait_thr}s"
    if reason and _cfg:
        try:
            log_incident(_cfg, f"queue_congested {reason}", category="queue")
        except Exception:
            pass
        _alert_cfg["last_sent"] = now
async def enqueue(label: str, job: Callable[[], Awaitable[None]]) -> int:
await _queue.put((label, job))
return _queue.qsize()
enqueued_at = time.time()
await _queue.put((label, job, enqueued_at))
_pending.append((label, enqueued_at))
_check_congestion(len(_pending), None)
return len(_pending)
async def worker():
global _current_label
global _current_label, _current_meta
while True:
label, job = await _queue.get()
label, job, enqueued_at = await _queue.get()
if _pending:
if _pending[0] == (label, enqueued_at):
_pending.popleft()
else:
try:
_pending.remove((label, enqueued_at))
except ValueError:
pass
_current_label = label
_current_meta = {"enqueued_at": enqueued_at, "started_at": time.time()}
status = "ok"
try:
await job()
except Exception as e:
status = "err"
_logger.exception("Queue job failed: label=%s", label)
if _cfg:
try:
log_incident(
_cfg,
f"queue_job_failed label={label} error={type(e).__name__}: {e}",
category="queue",
)
except Exception:
pass
finally:
finished_at = time.time()
if _current_meta:
wait_sec = max(0.0, _current_meta["started_at"] - _current_meta["enqueued_at"])
runtime_sec = max(0.0, finished_at - _current_meta["started_at"])
n_prev = int(_stats.get("processed", 0))
_stats["processed"] = n_prev + 1
_stats["avg_wait_sec"] = (
(_stats.get("avg_wait_sec", 0.0) * n_prev) + wait_sec
) / _stats["processed"]
_stats["avg_runtime_sec"] = (
(_stats.get("avg_runtime_sec", 0.0) * n_prev) + runtime_sec
) / _stats["processed"]
_stats["last_label"] = label
_stats["last_finished_at"] = finished_at
_history.appendleft(
{
"label": label,
"wait_sec": int(wait_sec),
"runtime_sec": int(runtime_sec),
"finished_at": int(finished_at),
"status": status,
}
)
_save_stats()
_check_congestion(len(_pending), _stats.get("avg_wait_sec"))
_current_label = None
_current_meta = None
_queue.task_done()
def format_status() -> str:
    """Short queue summary: running label, pending count/preview, aggregates.

    Defect fixed: the diff-rendered source overlaid the pre-fix lines
    (a `pending` built from the private `_queue._queue` attribute, plus a
    duplicate `preview` assignment) on top of the current ones; the stale
    lines are removed so only the `_pending`-based implementation remains.
    """
    pending = list(_pending)
    lines = ["🧾 Queue"]
    lines.append(f"🔄 Running: {_current_label or 'idle'}")
    lines.append(f"⏳ Pending: {len(pending)}")
    if pending:
        # _pending holds (label, enqueued_at) tuples; show the first labels.
        preview = ", ".join([p[0] for p in pending[:5]])
        lines.append(f"➡️ Next: {preview}")
    if _stats.get("processed"):
        lines.append(
            f"📈 Done: {_stats.get('processed')} | "
            f"avg wait {int(_stats.get('avg_wait_sec', 0))}s | "
            f"avg run {int(_stats.get('avg_runtime_sec', 0))}s"
        )
    return "\n".join(lines)
def format_details(limit: int = 10) -> str:
    """Detailed queue view: running job + runtime, pending table, stats, recent jobs."""
    now = time.time()
    out = ["🧾 Queue details"]
    if _current_label:
        started = _current_meta.get("started_at") if _current_meta else None
        elapsed = f"{int(now - started)}s" if started else "n/a"
        out.append(f"🔄 Running: {_current_label} ({elapsed})")
    else:
        out.append("🔄 Running: idle")
    waiting = list(_pending)
    out.append(f"⏳ Pending: {len(waiting)}")
    if waiting:
        out.append("🔢 Position | Label | Wait")
        for pos, (job_label, queued_at) in enumerate(waiting[:limit], start=1):
            out.append(f"{pos:>3} | {job_label} | {int(now - queued_at)}s")
    if _stats.get("processed"):
        out.append("")
        out.append(
            "📈 Stats: "
            f"{_stats.get('processed')} done, "
            f"avg wait {int(_stats.get('avg_wait_sec', 0))}s, "
            f"avg run {int(_stats.get('avg_runtime_sec', 0))}s"
        )
        if _stats.get("last_label"):
            out.append(f"Last: {_stats.get('last_label')}")
    if _history:
        out.append("")
        out.append("🗂 Last jobs:")
        for job in list(_history)[:5]:
            stamp = time.strftime("%H:%M:%S", time.localtime(job["finished_at"]))
            out.append(
                f"- {stamp} {job['label']} {job['status']} "
                f"(wait {job['wait_sec']}s, run {job['runtime_sec']}s)"
            )
    return "\n".join(out)
def format_history(limit: int = 20) -> str:
    """Render up to *limit* completed queue jobs, newest first."""
    out = ["🗂 Queue history"]
    if not _history:
        out.append("(empty)")
        return "\n".join(out)
    for entry in list(_history)[:limit]:
        stamp = time.strftime("%m-%d %H:%M:%S", time.localtime(entry["finished_at"]))
        out.append(
            f"{stamp} {entry['label']} {entry['status']} "
            f"(wait {entry['wait_sec']}s, run {entry['runtime_sec']}s)"
        )
    return "\n".join(out)
def get_history_raw() -> list[dict[str, Any]]:
    """Return the in-memory job history as a plain list (newest first)."""
    return [*_history]
def get_stats() -> dict[str, Any]:
    """Return a shallow copy of the aggregate queue statistics."""
    return {**_stats}

View File

@@ -22,3 +22,24 @@ async def run_cmd(cmd: list[str], *, use_restic_env: bool = False, timeout: int
except asyncio.TimeoutError:
proc.kill()
return 124, "❌ timeout"
async def run_cmd_full(cmd: list[str], *, use_restic_env: bool = False, timeout: int = 60):
    """Run *cmd* (exec-style argv, no shell) and return (rc, combined output).

    stdout and stderr are merged; output is decoded leniently. A sanitized
    PATH is forced, and restic credentials are injected when
    *use_restic_env* is set. On timeout the child is killed and
    (124, "❌ timeout") is returned.

    Defect fixed: after ``proc.kill()`` the child was never awaited,
    leaving a zombie process and an un-reaped transport (ResourceWarning).
    """
    env = os.environ.copy()
    env["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
    if use_restic_env:
        env.update(RESTIC_ENV)
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,
        env=env,
    )
    try:
        out, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        return proc.returncode, out.decode(errors="ignore")
    except asyncio.TimeoutError:
        proc.kill()
        # Reap the killed child so it does not linger as a zombie.
        await proc.communicate()
        return 124, "❌ timeout"

73
services/runtime_state.py Normal file
View File

@@ -0,0 +1,73 @@
import json
import os
import threading
import tempfile
from typing import Any, Dict
# Default on-disk location of the persisted runtime state (overridable via configure()).
_PATH = "/var/server-bot/runtime.json"
# In-memory cache of the persisted state; all access goes through _LOCK.
_STATE: Dict[str, Any] = {}
# Re-entrant lock serializing state reads/writes across threads.
_LOCK = threading.RLock()
# Becomes True once _STATE has been populated from disk (lazy-loaded on first access).
_LOADED = False
def configure(path: str | None):
global _PATH
if path:
_PATH = path
def _load_from_disk():
    """(Re)populate _STATE from _PATH; a missing or corrupt file yields {}."""
    global _STATE, _LOADED
    loaded: Dict[str, Any] = {}
    if os.path.exists(_PATH):
        try:
            with open(_PATH, "r", encoding="utf-8") as fh:
                loaded = json.load(fh)
        except Exception:
            loaded = {}
    _STATE = loaded
    _LOADED = True
def _save():
    """Atomically persist _STATE to _PATH (temp file + rename); failures are ignored."""
    target_dir = os.path.dirname(_PATH) or "."
    os.makedirs(target_dir, exist_ok=True)
    try:
        fd, tmp_path = tempfile.mkstemp(prefix=".runtime.", suffix=".json", dir=target_dir)
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                json.dump(_STATE, fh, ensure_ascii=False)
                fh.flush()
                # Force the temp file to disk before the rename for durability.
                os.fsync(fh.fileno())
            os.replace(tmp_path, _PATH)
        finally:
            # Clean up the temp file if the rename never happened.
            if os.path.exists(tmp_path):
                try:
                    os.unlink(tmp_path)
                except Exception:
                    pass
    except Exception:
        # Persistence is best-effort by design; never propagate save errors.
        pass
def get_state() -> Dict[str, Any]:
    """Return the live (mutable) state mapping, lazily loading it from disk."""
    with _LOCK:
        if not _LOADED:
            _load_from_disk()
        return _STATE
def set_state(key: str, value: Any):
    """Store *value* under *key* and synchronously persist the full state."""
    with _LOCK:
        if not _LOADED:
            _load_from_disk()
        _STATE[key] = value
        _save()
def get(key: str, default: Any = None) -> Any:
    """Fetch one value from the persisted state, loading it on first use."""
    with _LOCK:
        if not _LOADED:
            _load_from_disk()
        value = _STATE.get(key, default)
    return value

95
services/selftest.py Normal file
View File

@@ -0,0 +1,95 @@
import json
from datetime import datetime, timedelta
import asyncio
from typing import Any
from services.health import health
from services.runner import run_cmd_full
from services.incidents import log_incident
from services import runtime_state
def _save_history(entry: dict[str, Any]) -> None:
    """Prepend *entry* to the persisted self-test history (kept to 20 items)."""
    previous = runtime_state.get("selftest_history", [])
    previous = previous[:50] if isinstance(previous, list) else []
    runtime_state.set_state("selftest_history", [entry, *previous][:20])
async def run_selftest(cfg: dict[str, Any], docker_map: dict[str, str]) -> tuple[str, bool]:
    """Run quick health and restic-backup checks.

    Returns (report_text, overall_ok). A failed health probe or a restic
    error flips overall_ok to False; an empty/unparseable snapshot list is
    only warned about.
    """
    lines = ["🧪 Self-test"]
    ok = True
    # health
    try:
        # health() is synchronous; run it off the event loop.
        htext = await asyncio.to_thread(health, cfg, docker_map)
        h_lines = [ln for ln in htext.splitlines() if ln.strip()]
        # Compact the multi-line health output into one summary line
        # (skip the title line, take up to four detail lines).
        brief = " | ".join(h_lines[1:5]) if len(h_lines) > 1 else h_lines[0] if h_lines else "n/a"
        lines.append(f"🟢 Health: {brief}")
    except Exception as e:
        lines.append(f"🔴 Health failed: {e}")
        ok = False
    # restic snapshots check
    rc, out = await run_cmd_full(["restic", "snapshots", "--json"], use_restic_env=True, timeout=40)
    if rc == 0:
        try:
            snaps = json.loads(out)
            if isinstance(snaps, list) and snaps:
                # Newest first — ISO-8601 timestamps sort lexicographically.
                snaps.sort(key=lambda s: s.get("time", ""), reverse=True)
                last = snaps[0]
                # Trim "2026-01-01T03:30:00Z" down to "2026-01-01 03:30".
                t = last.get("time", "?").replace("Z", "").replace("T", " ")[:16]
                lines.append(f"🟢 Restic snapshots: {len(snaps)}, last {t}")
            else:
                lines.append("🟡 Restic snapshots: empty")
        except Exception:
            lines.append("🟡 Restic snapshots: invalid JSON")
    else:
        lines.append(f"🔴 Restic snapshots error: {out.strip() or rc}")
        ok = False
    result_text = "\n".join(lines)
    # History persistence is best-effort; it must never break the self-test.
    try:
        _save_history(
            {
                "ts": datetime.now().isoformat(),
                "ok": ok,
                # First detail line (health summary) doubles as the history summary.
                "summary": result_text.splitlines()[1] if len(lines) > 1 else "",
            }
        )
    except Exception:
        pass
    return result_text, ok
async def schedule_selftest(cfg: dict[str, Any], bot, admin_ids: list[int], docker_map: dict[str, str]):
    """Sleep until the configured HH:MM each day, then run and report a self-test."""
    sched = cfg.get("selftest", {}).get("schedule", {})
    if not sched.get("enabled", False):
        return
    raw_time = sched.get("time", "03:30")
    try:
        hh, mm = (int(part) for part in raw_time.split(":"))
    except Exception:
        # Malformed schedule time: fall back to 03:30.
        hh, mm = 3, 30
    while True:
        now = datetime.now()
        run_at = now.replace(hour=hh, minute=mm, second=0, microsecond=0)
        if run_at <= now:
            run_at += timedelta(days=1)
        await asyncio.sleep((run_at - now).total_seconds())
        text, ok = await run_selftest(cfg, docker_map)
        for admin_chat in admin_ids:
            try:
                await bot.send_message(admin_chat, text)
            except Exception:
                pass
        if not ok:
            try:
                log_incident(cfg, "selftest failed", category="selftest")
            except Exception:
                pass

61
services/ssl_alerts.py Normal file
View File

@@ -0,0 +1,61 @@
import asyncio
import time
from datetime import datetime, timezone
from typing import Any
from services.npmplus import fetch_certificates, _parse_expiry
async def monitor_ssl(cfg: dict[str, Any], notify, bot, chat_id: int):
    """Poll NPM+ certificates forever and alert about upcoming expiry.

    Thresholds (days), poll interval, and per-alert cooldown come from
    cfg["npmplus"]["alerts"]. Returns immediately when alerts are disabled.
    """
    npm_cfg = cfg.get("npmplus", {})
    alert_cfg = npm_cfg.get("alerts", {})
    if not alert_cfg.get("enabled", True):
        return
    days_list = alert_cfg.get("days", [30, 14, 7, 1])
    # De-duplicate, drop negatives, and sort descending (largest checked first).
    days_list = sorted({int(x) for x in days_list if int(x) >= 0}, reverse=True)
    cooldown = int(alert_cfg.get("cooldown_sec", 86400))
    interval = int(alert_cfg.get("interval_sec", 3600))
    # Per "name:threshold" key: wall-clock time of the last alert sent.
    last_sent: dict[str, float] = {}
    while True:
        now = datetime.now(timezone.utc)
        try:
            certs = fetch_certificates(cfg)
        except Exception:
            # NPM+ unreachable: retry next interval instead of crashing the loop.
            await asyncio.sleep(interval)
            continue
        for cert in certs:
            # Prefer the nice name; fall back to the domain list, then "unknown".
            name = cert.get("nice_name")
            if not name:
                domains = cert.get("domain_names") or []
                if isinstance(domains, list):
                    name = ", ".join(domains)
            if not name:
                name = "unknown"
            expiry = _parse_expiry(cert.get("expires_on"))
            if expiry is None:
                continue
            days_left = (expiry - now).days
            for threshold in days_list:
                if days_left <= threshold:
                    key = f"{name}:{threshold}"
                    last_time = last_sent.get(key, 0)
                    if time.time() - last_time >= cooldown:
                        level = "critical" if days_left <= 1 else "warn"
                        await notify(
                            bot,
                            chat_id,
                            f"⚠️ SSL `{name}` expires in {days_left}d (threshold {threshold}d)",
                            level=level,
                            key=f"ssl:{name}:{threshold}",
                            category="ssl",
                        )
                        last_sent[key] = time.time()
                    # NOTE(review): thresholds are sorted descending and we
                    # break on the FIRST match, so the alert key always uses
                    # the largest matching threshold; the tighter thresholds
                    # (14d/7d/1d) never get their own cooldown keys. Confirm
                    # this is intended — escalation currently happens only via
                    # the days_left-based level above.
                    break
        await asyncio.sleep(interval)

107
services/weekly_report.py Normal file
View File

@@ -0,0 +1,107 @@
import asyncio
import socket
from datetime import datetime, timedelta
import psutil
from services.system import worst_disk_usage
from services.alert_mute import list_mutes
from services.incidents import read_recent
from services.docker import docker_cmd
def _parse_hhmm(value: str) -> tuple[int, int]:
try:
h, m = value.split(":", 1)
h = int(h)
m = int(m)
if 0 <= h <= 23 and 0 <= m <= 59:
return h, m
except Exception:
pass
return 8, 0
def _next_run(day: str, time_str: str) -> datetime:
    """Return the next future datetime matching *day* (e.g. "Sun") at *time_str* ("HH:MM")."""
    weekdays = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
    wanted = weekdays.get((day or "Sun").lower()[:3], 6)
    hour, minute = _parse_hhmm(time_str or "08:00")
    now = datetime.now()
    candidate = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
    # Step one day at a time until we land on the wanted weekday strictly
    # in the future (at most 7 iterations).
    while candidate <= now or candidate.weekday() != wanted:
        candidate = (candidate + timedelta(days=1)).replace(
            hour=hour, minute=minute, second=0, microsecond=0
        )
    return candidate
async def _docker_running_counts(docker_map: dict) -> tuple[int, int]:
    """Return (running, total) for the containers listed in *docker_map*."""
    running = 0
    for container in docker_map.values():
        rc, raw = await docker_cmd(["inspect", "-f", "{{.State.Status}}", container], timeout=10)
        if rc == 0 and raw.strip() == "running":
            running += 1
    return running, len(docker_map)
def _format_uptime(seconds: int) -> str:
days, rem = divmod(seconds, 86400)
hours, rem = divmod(rem, 3600)
minutes, _ = divmod(rem, 60)
return f"{days}d {hours:02d}:{minutes:02d}"
async def build_weekly_report(cfg, docker_map: dict) -> str:
    """Assemble the weekly status summary: host, load, RAM, disk, docker, incidents, mutes."""
    hostname = socket.gethostname()
    uptime_sec = int(datetime.now().timestamp() - psutil.boot_time())
    l1, l5, l15 = psutil.getloadavg()
    mem = psutil.virtual_memory()
    usage_pct, mount = worst_disk_usage()
    running, total = await _docker_running_counts(docker_map)
    active_mutes = list_mutes()
    day_count = len(read_recent(cfg, 24, limit=1000))
    week_count = len(read_recent(cfg, 24 * 7, limit=2000))
    report = [
        f"🧾 Weekly report — {hostname}",
        f"⏱ Uptime: {_format_uptime(uptime_sec)}",
        f"⚙️ Load: {l1:.2f} {l5:.2f} {l15:.2f}",
        f"🧠 RAM: {mem.percent}%",
    ]
    report.append(
        "💾 Disk: n/a" if usage_pct is None else f"💾 Disk: {usage_pct}% ({mount})"
    )
    report.append(f"🐳 Docker: {running}/{total} running")
    report.append(f"📓 Incidents: 24h={day_count}, 7d={week_count}")
    if active_mutes:
        report.append("🔕 Active mutes:")
        for category, remaining in active_mutes.items():
            report.append(f"- {category}: {max(0, remaining) // 60}m left")
    else:
        report.append("🔔 Mutes: none")
    return "\n".join(report)
async def weekly_reporter(cfg, bot, admin_ids: list[int], docker_map: dict):
    """Send the weekly report to every admin at the configured day/time, forever."""
    weekly_cfg = cfg.get("reports", {}).get("weekly", {})
    if not weekly_cfg.get("enabled", False):
        return
    day = weekly_cfg.get("day", "Sun")
    at_time = weekly_cfg.get("time", "08:00")
    while True:
        delay = (_next_run(day, at_time) - datetime.now()).total_seconds()
        if delay > 0:
            await asyncio.sleep(delay)
        try:
            text = await build_weekly_report(cfg, docker_map)
            for admin_id in admin_ids:
                await bot.send_message(admin_id, text)
        except Exception:
            pass
        await asyncio.sleep(60)  # small delay to avoid tight loop if time skew

View File

@@ -7,3 +7,5 @@ ARCANE_CACHE: Dict[int, dict] = {}
REBOOT_PENDING: Dict[int, dict] = {}
METRICS_STORE = None
NPMPLUS_TOKEN: Dict[str, object] = {}
PROC_SEARCH_PENDING: Dict[int, dict] = {}
PROC_KILL_PENDING: Dict[int, dict] = {}

View File

@@ -1,5 +1,6 @@
import subprocess
import os
import re
def _cmd(cmd: str) -> str:
@@ -82,6 +83,62 @@ def list_disks() -> list[str]:
return disks
def list_md_arrays() -> list[str]:
    """Discover active md RAID arrays, preferring /proc/mdstat over /dev globbing."""
    found: set[str] = set()
    # /proc/mdstat reliably lists active md arrays even when lsblk
    # tree/filters differ across distros.
    for line in _cmd("cat /proc/mdstat").splitlines():
        match = re.match(r"^\s*(md\d+)\s*:", line)
        if match:
            found.add(f"/dev/{match.group(1)}")
    if found:
        return sorted(found)
    # Fallback for environments where mdstat parsing is unavailable.
    for line in _cmd("ls -1 /dev/md* 2>/dev/null").splitlines():
        candidate = line.strip()
        if candidate and re.match(r"^/dev/md\d+$", candidate):
            found.add(candidate)
    return sorted(found)
def md_array_status(dev: str) -> str:
    """Return a one-glyph health summary for md array *dev* from /proc/mdstat.

    🟢 active, 🟡 degraded (any failed/missing member), 🔴 inactive,
    ⚠️ when mdstat is unreadable or the array is not listed.

    Defect fixed: degraded detection used the substring tests "[U_" / "[_U",
    which miss member maps with a failure past the first two slots, e.g.
    "[UU_]" on a three-disk array. The member map inside [ ] is now matched
    with a regex and flagged degraded whenever it contains "_".
    """
    out = _cmd("cat /proc/mdstat")
    if not out or "ERROR:" in out:
        return "⚠️ n/a"
    name = dev.rsplit("/", 1)[-1]
    lines = out.splitlines()
    header = None
    idx = -1
    for i, line in enumerate(lines):
        s = line.strip()
        if s.startswith(f"{name} :"):
            header = s
            idx = i
            break
    if not header:
        return "⚠️ not found in /proc/mdstat"
    if "inactive" in header:
        return "🔴 inactive"
    # Collect the array's stanza (header plus continuation lines up to the
    # next blank line); it carries the member-status marker, e.g. [UU].
    block = [header]
    for line in lines[idx + 1:]:
        if not line.strip():
            break
        block.append(line.strip())
    block_text = " ".join(block)
    # A "_" anywhere in the [U...] member map means a failed/missing member.
    marker = re.search(r"\[([U_]+)\]", block_text)
    if marker and "_" in marker.group(1):
        return "🟡 degraded"
    return "🟢 active"
def smart_health(dev: str) -> str:
out = _cmd(f"smartctl -H {dev}")
@@ -122,10 +179,25 @@ def disk_temperature(dev: str) -> str:
return "n/a"
def smart_last_test(dev: str) -> str:
    """Return the newest SMART self-test log line for *dev* ("no tests"/"n/a" otherwise)."""
    raw = _cmd(f"smartctl -l selftest {dev}")
    if not raw or "ERROR:" in raw:
        return "n/a"
    for raw_line in raw.splitlines():
        if "No self-tests have been logged" in raw_line:
            return "no tests"
        # smartctl prefixes each logged self-test entry with "# N ...".
        stripped = raw_line.strip()
        if stripped.startswith("#"):
            return stripped
    return "n/a"
def disks() -> str:
disks = list_disks()
md_arrays = list_md_arrays()
if not disks:
if not disks and not md_arrays:
return "💽 Disks\n\n❌ No disks found"
lines = ["💽 Disks (SMART)\n"]
@@ -144,6 +216,12 @@ def disks() -> str:
lines.append(f"{icon} {d}{health}, 🌡 {temp}")
if md_arrays:
lines.append("")
lines.append("🧱 RAID (md)")
for md in md_arrays:
lines.append(f"{md}{md_array_status(md)}")
return "\n".join(lines)

View File

@@ -0,0 +1,20 @@
import unittest
from services.config_check import validate_cfg
class ConfigCheckTests(unittest.TestCase):
    """Sanity checks for services.config_check.validate_cfg."""

    def test_admin_ids_without_admin_id_is_valid(self):
        # A config that only lists admin_ids (no legacy admin_id) must pass.
        minimal_cfg = {"telegram": {"token": "x", "admin_ids": [1, 2]}}
        errors, warnings = validate_cfg(minimal_cfg)
        self.assertEqual(errors, [])
        self.assertIsInstance(warnings, list)


if __name__ == "__main__":
    unittest.main()

21
tests/test_disk_report.py Normal file
View File

@@ -0,0 +1,21 @@
import unittest
import types
import sys
# Avoid runtime import of real app/aiogram in services.runner.
sys.modules.setdefault("app", types.SimpleNamespace(RESTIC_ENV={}))
from services.disk_report import _top_dirs_cmd
class DiskReportTests(unittest.TestCase):
    """Guards against reintroducing shell-based disk-usage commands."""

    def test_top_dirs_cmd_uses_exec_args_without_shell(self):
        cmd = _top_dirs_cmd("/tmp/path with spaces", 5)
        # du must be invoked exec-style (argv list), never through a shell.
        self.assertEqual(cmd[:4], ["du", "-x", "-h", "-d"])
        self.assertNotIn("bash", cmd)
        self.assertNotIn("-lc", cmd)
        # The path is the final argv element, passed verbatim (no quoting).
        self.assertEqual(cmd[-1], "/tmp/path with spaces")


if __name__ == "__main__":
    unittest.main()

59
tests/test_queue.py Normal file
View File

@@ -0,0 +1,59 @@
import asyncio
import tempfile
import unittest
from services import runtime_state
from services import queue as queue_service
# Defect fixed: `import contextlib` sat at the very bottom of the module,
# after the class whose test uses it (it only worked because module-level
# imports run before tests). Moved here, adjacent to its use.
import contextlib


class QueueTests(unittest.IsolatedAsyncioTestCase):
    """Exercises the queue worker's failure logging and stats accounting."""

    async def asyncSetUp(self):
        # Isolate persisted runtime state and reset all queue-module globals.
        self.tmp = tempfile.TemporaryDirectory()
        runtime_state.configure(f"{self.tmp.name}/runtime.json")
        queue_service._pending.clear()  # type: ignore[attr-defined]
        queue_service._history.clear()  # type: ignore[attr-defined]
        queue_service._stats = {  # type: ignore[attr-defined]
            "processed": 0,
            "avg_wait_sec": 0.0,
            "avg_runtime_sec": 0.0,
            "last_label": "",
            "last_finished_at": 0.0,
        }
        queue_service._cfg = {"incidents": {"enabled": True}}  # type: ignore[attr-defined]

    async def asyncTearDown(self):
        self.tmp.cleanup()

    async def test_worker_logs_failed_job_to_incidents(self):
        logged = []

        def fake_log_incident(cfg, text, category=None):
            logged.append((text, category))

        orig = queue_service.log_incident
        queue_service.log_incident = fake_log_incident

        async def boom():
            raise RuntimeError("boom")

        worker_task = asyncio.create_task(queue_service.worker())
        try:
            await queue_service.enqueue("broken-job", boom)
            await asyncio.wait_for(queue_service._queue.join(), timeout=2.0)  # type: ignore[attr-defined]
        finally:
            worker_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await worker_task
            # Always restore the real incident logger.
            queue_service.log_incident = orig
        self.assertEqual(queue_service._stats.get("processed"), 1)  # type: ignore[attr-defined]
        self.assertTrue(any("queue_job_failed label=broken-job" in t for t, _c in logged))
        self.assertTrue(any(c == "queue" for _t, c in logged))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,28 @@
import json
import tempfile
import unittest
from pathlib import Path
from services import runtime_state
class RuntimeStateTests(unittest.TestCase):
    """End-to-end persistence check for services.runtime_state."""

    def test_set_and_get_persist_between_loads(self):
        with tempfile.TemporaryDirectory() as tmp:
            state_path = Path(tmp) / "runtime.json"
            runtime_state.configure(str(state_path))
            runtime_state.set_state("foo", {"bar": 1})
            self.assertEqual(runtime_state.get("foo"), {"bar": 1})
            # Force a fresh in-memory state and load from disk again.
            runtime_state._STATE = {}  # type: ignore[attr-defined]
            runtime_state._LOADED = False  # type: ignore[attr-defined]
            self.assertEqual(runtime_state.get("foo"), {"bar": 1})
            on_disk = json.loads(state_path.read_text(encoding="utf-8"))
            self.assertEqual(on_disk.get("foo"), {"bar": 1})


if __name__ == "__main__":
    unittest.main()