Add selftest scheduler, queue history, and OpenWrt signal stats
This commit is contained in:
@@ -36,6 +36,7 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `category` (string): load/disk/smart/ssl/docker/test.
|
- `category` (string): load/disk/smart/ssl/docker/test.
|
||||||
- `start` (string): Start `HH:MM`.
|
- `start` (string): Start `HH:MM`.
|
||||||
- `end` (string): End `HH:MM` (can wrap over midnight).
|
- `end` (string): End `HH:MM` (can wrap over midnight).
|
||||||
|
- `auto_mute_on_high_load_sec` (int): auto-mute `load` category for N seconds on critical load (0 disables).
|
||||||
- `notify_recovery` (bool): Send recovery notifications.
|
- `notify_recovery` (bool): Send recovery notifications.
|
||||||
- `smart_enabled` (bool): Enable SMART health polling.
|
- `smart_enabled` (bool): Enable SMART health polling.
|
||||||
- `smart_interval_sec` (int): SMART poll interval.
|
- `smart_interval_sec` (int): SMART poll interval.
|
||||||
@@ -82,6 +83,11 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `weekly.day` (string): Weekday `Mon`..`Sun` (default `Sun`).
|
- `weekly.day` (string): Weekday `Mon`..`Sun` (default `Sun`).
|
||||||
- `weekly.time` (string): Local time `HH:MM` (default `08:00`).
|
- `weekly.time` (string): Local time `HH:MM` (default `08:00`).
|
||||||
|
|
||||||
|
## selftest
|
||||||
|
|
||||||
|
- `schedule.enabled` (bool): Enable auto self-test.
|
||||||
|
- `schedule.time` (string): Local time `HH:MM` (default `03:30`).
|
||||||
|
|
||||||
## external_checks
|
## external_checks
|
||||||
|
|
||||||
- `enabled` (bool): Enable background checks.
|
- `enabled` (bool): Enable background checks.
|
||||||
|
|||||||
@@ -36,6 +36,7 @@
|
|||||||
- `category` (string): load/disk/smart/ssl/docker/test.
|
- `category` (string): load/disk/smart/ssl/docker/test.
|
||||||
- `start` (string): начало `HH:MM`.
|
- `start` (string): начало `HH:MM`.
|
||||||
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
|
- `end` (string): конец `HH:MM` (интервал может пересекать ночь).
|
||||||
|
- `auto_mute_on_high_load_sec` (int): при critical load автоматически мьютить категорию `load` на N секунд (0 — выкл).
|
||||||
- `notify_recovery` (bool): уведомлять о восстановлении.
|
- `notify_recovery` (bool): уведомлять о восстановлении.
|
||||||
- `smart_enabled` (bool): SMART проверки.
|
- `smart_enabled` (bool): SMART проверки.
|
||||||
- `smart_interval_sec` (int): интервал SMART.
|
- `smart_interval_sec` (int): интервал SMART.
|
||||||
@@ -82,6 +83,11 @@
|
|||||||
- `weekly.day` (string): день недели (`Mon`..`Sun`), по умолчанию `Sun`.
|
- `weekly.day` (string): день недели (`Mon`..`Sun`), по умолчанию `Sun`.
|
||||||
- `weekly.time` (string): локальное время `HH:MM`, по умолчанию `08:00`.
|
- `weekly.time` (string): локальное время `HH:MM`, по умолчанию `08:00`.
|
||||||
|
|
||||||
|
## selftest
|
||||||
|
|
||||||
|
- `schedule.enabled` (bool): включить авто self-test.
|
||||||
|
- `schedule.time` (string): локальное время `HH:MM`, по умолчанию `03:30`.
|
||||||
|
|
||||||
## external_checks
|
## external_checks
|
||||||
|
|
||||||
- `enabled` (bool): включить фоновые проверки.
|
- `enabled` (bool): включить фоновые проверки.
|
||||||
|
|||||||
@@ -30,6 +30,8 @@ alerts:
|
|||||||
- category: "load"
|
- category: "load"
|
||||||
start: "23:00"
|
start: "23:00"
|
||||||
end: "08:00"
|
end: "08:00"
|
||||||
|
# Auto-mute load when critical load fires (seconds)
|
||||||
|
auto_mute_on_high_load_sec: 600
|
||||||
quiet_hours:
|
quiet_hours:
|
||||||
enabled: false
|
enabled: false
|
||||||
start: "23:00"
|
start: "23:00"
|
||||||
@@ -78,6 +80,11 @@ reports:
|
|||||||
day: "Sun" # Mon/Tue/Wed/Thu/Fri/Sat/Sun
|
day: "Sun" # Mon/Tue/Wed/Thu/Fri/Sat/Sun
|
||||||
time: "08:00" # HH:MM server local time
|
time: "08:00" # HH:MM server local time
|
||||||
|
|
||||||
|
selftest:
|
||||||
|
schedule:
|
||||||
|
enabled: false
|
||||||
|
time: "03:30"
|
||||||
|
|
||||||
external_checks:
|
external_checks:
|
||||||
enabled: true
|
enabled: true
|
||||||
state_path: "/var/server-bot/external_checks.json"
|
state_path: "/var/server-bot/external_checks.json"
|
||||||
|
|||||||
@@ -56,6 +56,9 @@ HELP_PAGES = [
|
|||||||
"🛠 **Admin & Deploy**\n\n"
|
"🛠 **Admin & Deploy**\n\n"
|
||||||
"Config: `/config_check`, файл `config.yaml` (см. config.example.yaml).\n"
|
"Config: `/config_check`, файл `config.yaml` (см. config.example.yaml).\n"
|
||||||
"Deploy: `deploy.sh` (ssh 10.10.10.10:1090 → git pull → systemctl restart tg-bot).\n"
|
"Deploy: `deploy.sh` (ssh 10.10.10.10:1090 → git pull → systemctl restart tg-bot).\n"
|
||||||
|
"Incidents summary: `/incidents_summary`.\n"
|
||||||
|
"Disk snapshot: `/disk_snapshot`.\n"
|
||||||
|
"BotFather list: `/botfather_list`.\n"
|
||||||
"Безопасность: `safety.dry_run: true` блокирует опасные действия.\n"
|
"Безопасность: `safety.dry_run: true` блокирует опасные действия.\n"
|
||||||
"OpenWrt: кнопка в System → Info.",
|
"OpenWrt: кнопка в System → Info.",
|
||||||
),
|
),
|
||||||
@@ -110,3 +113,33 @@ async def help_cb(cb: CallbackQuery):
|
|||||||
parse_mode="Markdown",
|
parse_mode="Markdown",
|
||||||
)
|
)
|
||||||
await cb.answer()
|
await cb.answer()
|
||||||
|
|
||||||
|
|
||||||
|
# Command list in "name - description" form, one command per line; the
# /botfather_list handler below sends it verbatim inside a code fence.
BOTFATHER_LIST = """\
help - Show help pages
status_short - Compact host status
health_short - Compact health report
selftest - Health + restic snapshot probe
alerts - Manage alerts
alerts_list - List active mutes
alerts_recent - Show recent incidents (24h)
alerts_mute_load - Mute load alerts for 60m
backup_run - Run backup (queued)
backup_history - Show backup log tail
docker_status - Docker summary
docker_health - Docker inspect/health by alias
openwrt - Full OpenWrt status
openwrt_wan - OpenWrt WAN only
openwrt_clients - OpenWrt wifi clients
openwrt_leases - OpenWrt DHCP leases
incidents_summary - Incidents counters (24h/7d)
disk_snapshot - Disk usage snapshot
config_check - Validate config
"""


@dp.message(F.text == "/botfather_list")
async def botfather_list(msg: Message):
    """Reply to an admin with BOTFATHER_LIST wrapped in a Markdown code fence.

    Messages from non-admins are ignored without a reply.
    """
    if is_admin_msg(msg):
        reply = f"Commands for BotFather:\n```\n{BOTFATHER_LIST}\n```"
        await msg.answer(reply, parse_mode="Markdown")
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from services.system import format_disks
|
|||||||
from services.health import health
|
from services.health import health
|
||||||
from state import DOCKER_MAP
|
from state import DOCKER_MAP
|
||||||
from services.runner import run_cmd_full
|
from services.runner import run_cmd_full
|
||||||
|
from services.selftest import run_selftest
|
||||||
|
|
||||||
|
|
||||||
async def cmd_status(msg: Message):
|
async def cmd_status(msg: Message):
|
||||||
@@ -125,34 +126,8 @@ async def selftest(msg: Message):
|
|||||||
await msg.answer("⏳ Self-test…", reply_markup=menu_kb)
|
await msg.answer("⏳ Self-test…", reply_markup=menu_kb)
|
||||||
|
|
||||||
async def worker():
|
async def worker():
|
||||||
lines = ["🧪 Self-test"]
|
text = await run_selftest(cfg, DOCKER_MAP)
|
||||||
# health
|
await msg.answer(text, reply_markup=menu_kb)
|
||||||
try:
|
|
||||||
htext = await asyncio.to_thread(health, cfg, DOCKER_MAP)
|
|
||||||
h_lines = [ln for ln in htext.splitlines() if ln.strip()]
|
|
||||||
brief = " | ".join(h_lines[1:5]) if len(h_lines) > 1 else h_lines[0] if h_lines else "n/a"
|
|
||||||
lines.append(f"🟢 Health: {brief}")
|
|
||||||
except Exception as e:
|
|
||||||
lines.append(f"🔴 Health failed: {e}")
|
|
||||||
|
|
||||||
# restic snapshots check
|
|
||||||
rc, out = await run_cmd_full(["restic", "snapshots", "--json"], use_restic_env=True, timeout=40)
|
|
||||||
if rc == 0:
|
|
||||||
try:
|
|
||||||
snaps = json.loads(out)
|
|
||||||
if isinstance(snaps, list) and snaps:
|
|
||||||
snaps.sort(key=lambda s: s.get("time", ""), reverse=True)
|
|
||||||
last = snaps[0]
|
|
||||||
t = last.get("time", "?").replace("Z", "").replace("T", " ")[:16]
|
|
||||||
lines.append(f"🟢 Restic snapshots: {len(snaps)}, last {t}")
|
|
||||||
else:
|
|
||||||
lines.append("🟡 Restic snapshots: empty")
|
|
||||||
except Exception:
|
|
||||||
lines.append("🟡 Restic snapshots: invalid JSON")
|
|
||||||
else:
|
|
||||||
lines.append(f"🔴 Restic snapshots error: {out.strip() or rc}")
|
|
||||||
|
|
||||||
await msg.answer("\n".join(lines), reply_markup=menu_kb)
|
|
||||||
|
|
||||||
asyncio.create_task(worker())
|
asyncio.create_task(worker())
|
||||||
|
|
||||||
|
|||||||
@@ -19,12 +19,14 @@ from services.runner import run_cmd
|
|||||||
from services.npmplus import fetch_certificates, format_certificates, list_proxy_hosts, set_proxy_host
|
from services.npmplus import fetch_certificates, format_certificates, list_proxy_hosts, set_proxy_host
|
||||||
from services.gitea import get_gitea_health
|
from services.gitea import get_gitea_health
|
||||||
from services.openwrt import get_openwrt_status
|
from services.openwrt import get_openwrt_status
|
||||||
|
from services.system import worst_disk_usage
|
||||||
import state
|
import state
|
||||||
from state import UPDATES_CACHE, REBOOT_PENDING
|
from state import UPDATES_CACHE, REBOOT_PENDING
|
||||||
from services.metrics import summarize
|
from services.metrics import summarize
|
||||||
from services.audit import read_audit_tail
|
from services.audit import read_audit_tail
|
||||||
from services.incidents import read_recent, incidents_path
|
from services.incidents import read_recent, incidents_path
|
||||||
from services.external_checks import format_report
|
from services.external_checks import format_report
|
||||||
|
from services.disk_report import build_disk_report
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "💽 Disks")
|
@dp.message(F.text == "💽 Disks")
|
||||||
@@ -308,6 +310,48 @@ async def incidents(msg: Message):
|
|||||||
await msg.answer(text, reply_markup=system_logs_audit_kb, parse_mode="Markdown")
|
await msg.answer(text, reply_markup=system_logs_audit_kb, parse_mode="Markdown")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/incidents_summary")
async def incidents_summary(msg: Message):
    """Send an admin the incident totals and top categories for 24h and 7d.

    Messages from non-admins are ignored without a reply.
    """
    if not is_admin_msg(msg):
        return

    day_lines = read_recent(cfg, hours=24, limit=2000)
    week_lines = read_recent(cfg, hours=24 * 7, limit=4000)

    def count(lines):
        """Return (line count, "cat:n, ..." for the five most frequent categories).

        A line contributes to a category only if it contains a
        ``category=<name>`` token; the second item is "n/a" when none do.
        """
        import re

        category_re = re.compile(r"category=([A-Za-z0-9_-]+)")
        per_category = {}
        for entry in lines:
            found = category_re.search(entry)
            if found is None:
                continue
            name = found.group(1)
            per_category[name] = per_category.get(name, 0) + 1
        # sorted() is stable, so equal counts keep first-seen order.
        ranked = sorted(per_category.items(), key=lambda pair: pair[1], reverse=True)
        summary = ", ".join(f"{name}:{hits}" for name, hits in ranked[:5])
        return len(lines), summary or "n/a"

    day_total, day_top = count(day_lines)
    week_total, week_top = count(week_lines)
    body = "\n".join(
        (
            "📣 Incidents summary",
            "",
            f"24h: {day_total} (top: {day_top})",
            f"7d: {week_total} (top: {week_top})",
        )
    )
    await msg.answer(body, reply_markup=system_logs_audit_kb)
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "/disk_snapshot")
async def disk_snapshot(msg: Message):
    """Send an admin a disk report for the mount returned by worst_disk_usage().

    Falls back to "/" when no mount is returned and to 0 when no usage value
    is returned. A failure while building the report is sent back as a
    warning message instead of propagating.
    """
    if not is_admin_msg(msg):
        return

    usage, mount = worst_disk_usage()
    if not mount:
        mount = "/"

    try:
        report = await build_disk_report(cfg, mount, usage if usage else 0)
    except Exception as exc:
        await msg.answer(f"⚠️ Disk snapshot error: {exc}")
    else:
        # Sent outside the try body so a send failure is not reported as a
        # snapshot error.
        await msg.answer(f"💽 Disk snapshot ({mount})\n\n{report}", reply_markup=system_info_kb)
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "🔒 SSL")
|
@dp.message(F.text == "🔒 SSL")
|
||||||
async def ssl_certs(msg: Message):
|
async def ssl_certs(msg: Message):
|
||||||
if not is_admin_msg(msg):
|
if not is_admin_msg(msg):
|
||||||
|
|||||||
2
main.py
2
main.py
@@ -14,6 +14,7 @@ from services.ssl_alerts import monitor_ssl
|
|||||||
from services.external_checks import monitor_external
|
from services.external_checks import monitor_external
|
||||||
from services.incidents import log_incident
|
from services.incidents import log_incident
|
||||||
from services.logging_setup import setup_logging
|
from services.logging_setup import setup_logging
|
||||||
|
from services.selftest import schedule_selftest
|
||||||
import state
|
import state
|
||||||
import handlers.menu
|
import handlers.menu
|
||||||
import handlers.status
|
import handlers.status
|
||||||
@@ -73,6 +74,7 @@ async def main():
|
|||||||
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
||||||
asyncio.create_task(queue_worker())
|
asyncio.create_task(queue_worker())
|
||||||
asyncio.create_task(weekly_reporter(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
asyncio.create_task(weekly_reporter(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
||||||
|
asyncio.create_task(schedule_selftest(cfg, bot, ADMIN_IDS, state.DOCKER_MAP))
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
loop.set_exception_handler(_handle_async_exception)
|
loop.set_exception_handler(_handle_async_exception)
|
||||||
await notify_start()
|
await notify_start()
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
cooldown = int(alerts_cfg.get("cooldown_sec", 900))
|
cooldown = int(alerts_cfg.get("cooldown_sec", 900))
|
||||||
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
|
notify_recovery = bool(alerts_cfg.get("notify_recovery", True))
|
||||||
load_only_critical = bool(alerts_cfg.get("load_only_critical", False))
|
load_only_critical = bool(alerts_cfg.get("load_only_critical", False))
|
||||||
|
auto_mute_high_load_sec = int(alerts_cfg.get("auto_mute_on_high_load_sec", 0))
|
||||||
|
|
||||||
disk_warn = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
disk_warn = int(cfg.get("thresholds", {}).get("disk_warn", 80))
|
||||||
snapshot_warn = int(cfg.get("disk_report", {}).get("threshold", disk_warn))
|
snapshot_warn = int(cfg.get("disk_report", {}).get("threshold", disk_warn))
|
||||||
@@ -72,6 +73,10 @@ async def monitor_resources(cfg, notify, bot, chat_id):
|
|||||||
key = "load_high_crit" if level == 2 else "load_high_warn"
|
key = "load_high_crit" if level == 2 else "load_high_warn"
|
||||||
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
|
await notify(bot, chat_id, f"{icon} Load high: {load:.2f}", level=level_name, key=key, category="load")
|
||||||
last_sent["load"] = now
|
last_sent["load"] = now
|
||||||
|
if level == 2 and auto_mute_high_load_sec > 0:
|
||||||
|
from services.alert_mute import set_mute
|
||||||
|
|
||||||
|
set_mute("load", auto_mute_high_load_sec)
|
||||||
state["load_level"] = level
|
state["load_level"] = level
|
||||||
|
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|||||||
@@ -240,7 +240,7 @@ def _parse_hostapd_clients(
|
|||||||
*,
|
*,
|
||||||
name_map: dict[str, str] | None = None,
|
name_map: dict[str, str] | None = None,
|
||||||
ifname_meta: dict[str, dict[str, str]] | None = None,
|
ifname_meta: dict[str, dict[str, str]] | None = None,
|
||||||
) -> list[str]:
|
) -> list[tuple[str, int | None, str]]:
|
||||||
if not isinstance(payload, dict):
|
if not isinstance(payload, dict):
|
||||||
return []
|
return []
|
||||||
data = payload.get("clients")
|
data = payload.get("clients")
|
||||||
@@ -248,7 +248,7 @@ def _parse_hostapd_clients(
|
|||||||
items = data.items()
|
items = data.items()
|
||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
clients: list[str] = []
|
clients: list[tuple[str, int | None, str]] = []
|
||||||
name_map = name_map or {}
|
name_map = name_map or {}
|
||||||
meta = (ifname_meta or {}).get(ifname, {})
|
meta = (ifname_meta or {}).get(ifname, {})
|
||||||
ssid = meta.get("ssid") or ""
|
ssid = meta.get("ssid") or ""
|
||||||
@@ -274,7 +274,8 @@ def _parse_hostapd_clients(
|
|||||||
client_label = host
|
client_label = host
|
||||||
else:
|
else:
|
||||||
client_label = str(mac)
|
client_label = str(mac)
|
||||||
clients.append(f"{net_label} {client_label} {sig} rx:{rx} tx:{tx}")
|
line = f"{net_label} {client_label} {sig} rx:{rx} tx:{tx}"
|
||||||
|
clients.append((line, signal if isinstance(signal, (int, float)) else None, net_label))
|
||||||
return clients
|
return clients
|
||||||
|
|
||||||
|
|
||||||
@@ -384,6 +385,7 @@ async def get_openwrt_status(cfg: dict[str, Any], mode: str = "full") -> str:
|
|||||||
if leases_fallback:
|
if leases_fallback:
|
||||||
lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
|
lease_name_map.update(_extract_lease_name_map_fallback(leases_fallback))
|
||||||
wifi_net_counts: dict[str, int] = {}
|
wifi_net_counts: dict[str, int] = {}
|
||||||
|
wifi_signals: dict[str, list[int]] = {}
|
||||||
if ifnames:
|
if ifnames:
|
||||||
for ifname in ifnames:
|
for ifname in ifnames:
|
||||||
cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
|
cmd_clients = ssh_cmd + ["ubus", "call", f"hostapd.{ifname}", "get_clients"]
|
||||||
@@ -397,14 +399,16 @@ async def get_openwrt_status(cfg: dict[str, Any], mode: str = "full") -> str:
|
|||||||
if isinstance(clients_payload, dict):
|
if isinstance(clients_payload, dict):
|
||||||
label = _net_label_for_ifname(ifname, ifname_meta)
|
label = _net_label_for_ifname(ifname, ifname_meta)
|
||||||
wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
|
wifi_net_counts[label] = wifi_net_counts.get(label, 0) + len(clients_payload)
|
||||||
wifi_clients.extend(
|
parsed = _parse_hostapd_clients(
|
||||||
_parse_hostapd_clients(
|
|
||||||
payload,
|
payload,
|
||||||
ifname,
|
ifname,
|
||||||
name_map=lease_name_map,
|
name_map=lease_name_map,
|
||||||
ifname_meta=ifname_meta,
|
ifname_meta=ifname_meta,
|
||||||
)
|
)
|
||||||
)
|
wifi_clients.extend([p[0] for p in parsed])
|
||||||
|
for _line, sig, net_label in parsed:
|
||||||
|
if sig is not None and net_label:
|
||||||
|
wifi_signals.setdefault(net_label, []).append(sig)
|
||||||
|
|
||||||
if leases:
|
if leases:
|
||||||
leases_list = _extract_leases(leases)
|
leases_list = _extract_leases(leases)
|
||||||
@@ -422,6 +426,12 @@ async def get_openwrt_status(cfg: dict[str, Any], mode: str = "full") -> str:
|
|||||||
if wifi_net_counts:
|
if wifi_net_counts:
|
||||||
wifi_section.append("📶 Wi-Fi networks:")
|
wifi_section.append("📶 Wi-Fi networks:")
|
||||||
for label, count in sorted(wifi_net_counts.items()):
|
for label, count in sorted(wifi_net_counts.items()):
|
||||||
|
sigs = wifi_signals.get(label) or []
|
||||||
|
if sigs:
|
||||||
|
avg_sig = sum(sigs) / len(sigs)
|
||||||
|
min_sig = min(sigs)
|
||||||
|
wifi_section.append(f" - {label}: {count} (avg {avg_sig:.0f}dBm, min {min_sig}dBm)")
|
||||||
|
else:
|
||||||
wifi_section.append(f" - {label}: {count}")
|
wifi_section.append(f" - {label}: {count}")
|
||||||
wifi_section.append("")
|
wifi_section.append("")
|
||||||
|
|
||||||
|
|||||||
@@ -16,10 +16,12 @@ _stats: dict[str, Any] = runtime_state.get("queue_stats", {}) or {
|
|||||||
"last_label": "",
|
"last_label": "",
|
||||||
"last_finished_at": 0.0,
|
"last_finished_at": 0.0,
|
||||||
}
|
}
|
||||||
|
_history: deque[dict[str, Any]] = deque(runtime_state.get("queue_history", []) or [], maxlen=50)
|
||||||
|
|
||||||
|
|
||||||
def _save_stats():
|
def _save_stats():
|
||||||
runtime_state.set_state("queue_stats", _stats)
|
runtime_state.set_state("queue_stats", _stats)
|
||||||
|
runtime_state.set_state("queue_history", list(_history))
|
||||||
|
|
||||||
|
|
||||||
async def enqueue(label: str, job: Callable[[], Awaitable[None]]) -> int:
|
async def enqueue(label: str, job: Callable[[], Awaitable[None]]) -> int:
|
||||||
@@ -43,8 +45,11 @@ async def worker():
|
|||||||
pass
|
pass
|
||||||
_current_label = label
|
_current_label = label
|
||||||
_current_meta = {"enqueued_at": enqueued_at, "started_at": time.time()}
|
_current_meta = {"enqueued_at": enqueued_at, "started_at": time.time()}
|
||||||
|
status = "ok"
|
||||||
try:
|
try:
|
||||||
await job()
|
await job()
|
||||||
|
except Exception:
|
||||||
|
status = "err"
|
||||||
finally:
|
finally:
|
||||||
finished_at = time.time()
|
finished_at = time.time()
|
||||||
if _current_meta:
|
if _current_meta:
|
||||||
@@ -60,6 +65,15 @@ async def worker():
|
|||||||
) / _stats["processed"]
|
) / _stats["processed"]
|
||||||
_stats["last_label"] = label
|
_stats["last_label"] = label
|
||||||
_stats["last_finished_at"] = finished_at
|
_stats["last_finished_at"] = finished_at
|
||||||
|
_history.appendleft(
|
||||||
|
{
|
||||||
|
"label": label,
|
||||||
|
"wait_sec": int(wait_sec),
|
||||||
|
"runtime_sec": int(runtime_sec),
|
||||||
|
"finished_at": int(finished_at),
|
||||||
|
"status": status,
|
||||||
|
}
|
||||||
|
)
|
||||||
_save_stats()
|
_save_stats()
|
||||||
_current_label = None
|
_current_label = None
|
||||||
_current_meta = None
|
_current_meta = None
|
||||||
@@ -111,4 +125,13 @@ def format_details(limit: int = 10) -> str:
|
|||||||
last_label = _stats.get("last_label")
|
last_label = _stats.get("last_label")
|
||||||
if last_label:
|
if last_label:
|
||||||
lines.append(f"Last: {last_label}")
|
lines.append(f"Last: {last_label}")
|
||||||
|
if _history:
|
||||||
|
lines.append("")
|
||||||
|
lines.append("🗂 Last jobs:")
|
||||||
|
for item in list(_history)[:5]:
|
||||||
|
t = time.strftime("%H:%M:%S", time.localtime(item["finished_at"]))
|
||||||
|
lines.append(
|
||||||
|
f"- {t} {item['label']} {item['status']} "
|
||||||
|
f"(wait {item['wait_sec']}s, run {item['runtime_sec']}s)"
|
||||||
|
)
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|||||||
66
services/selftest.py
Normal file
66
services/selftest.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import json
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import asyncio
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.health import health
|
||||||
|
from services.runner import run_cmd_full
|
||||||
|
|
||||||
|
|
||||||
|
async def run_selftest(cfg: dict[str, Any], docker_map: dict[str, str]) -> str:
    """Build the self-test report: a health summary and a restic snapshot probe.

    Args:
        cfg: Bot configuration, passed through to ``health``.
        docker_map: Docker mapping, passed through to ``health``.

    Returns:
        Newline-joined report: a title line, one health line and one restic line.
    """
    report = ["🧪 Self-test"]

    # Health probe, executed through asyncio.to_thread.
    try:
        health_text = await asyncio.to_thread(health, cfg, docker_map)
        non_blank = [row for row in health_text.splitlines() if row.strip()]
        if len(non_blank) > 1:
            # Skip the first line and keep at most the next four.
            summary = " | ".join(non_blank[1:5])
        elif non_blank:
            summary = non_blank[0]
        else:
            summary = "n/a"
        report.append(f"🟢 Health: {summary}")
    except Exception as exc:
        report.append(f"🔴 Health failed: {exc}")

    # Restic probe: list snapshots as JSON and report the count and newest time.
    rc, out = await run_cmd_full(["restic", "snapshots", "--json"], use_restic_env=True, timeout=40)
    if rc != 0:
        report.append(f"🔴 Restic snapshots error: {out.strip() or rc}")
        return "\n".join(report)

    try:
        snaps = json.loads(out)
        if not isinstance(snaps, list) or not snaps:
            report.append("🟡 Restic snapshots: empty")
        else:
            newest = sorted(snaps, key=lambda s: s.get("time", ""), reverse=True)[0]
            # Trim the timestamp to "YYYY-MM-DD HH:MM".
            stamp = newest.get("time", "?").replace("Z", "").replace("T", " ")[:16]
            report.append(f"🟢 Restic snapshots: {len(snaps)}, last {stamp}")
    except Exception:
        report.append("🟡 Restic snapshots: invalid JSON")

    return "\n".join(report)
|
||||||
|
|
||||||
|
|
||||||
|
async def schedule_selftest(cfg: dict[str, Any], bot, admin_ids: list[int], docker_map: dict[str, str]):
    """Run the self-test once a day at the configured local time.

    Reads ``selftest.schedule.enabled`` and ``selftest.schedule.time``
    (``HH:MM``, default ``03:30``) from ``cfg``. Returns immediately when the
    schedule is disabled or the section is missing or empty; otherwise loops
    forever, sending each report to every id in ``admin_ids``.

    Args:
        cfg: Bot configuration.
        bot: Object exposing an awaitable ``send_message(chat_id, text)``.
        admin_ids: Chat ids that receive the report.
        docker_map: Passed through to ``run_selftest``.
    """
    import logging

    log = logging.getLogger(__name__)

    # `selftest:` or `schedule:` left empty in YAML loads as None, not {}.
    sched_cfg = (cfg.get("selftest") or {}).get("schedule") or {}
    if not sched_cfg.get("enabled", False):
        return

    time_str = sched_cfg.get("time", "03:30")
    try:
        hh, mm = (int(part) for part in str(time_str).split(":"))
        # Validate here: an out-of-range value would otherwise make
        # datetime.replace() raise inside the loop and end the task.
        if not (0 <= hh <= 23 and 0 <= mm <= 59):
            raise ValueError("time out of range")
    except ValueError:
        log.warning("Invalid selftest.schedule.time %r, using 03:30", time_str)
        hh, mm = 3, 30

    while True:
        now = datetime.now()
        run_at = now.replace(hour=hh, minute=mm, second=0, microsecond=0)
        if run_at <= now:
            run_at += timedelta(days=1)
        await asyncio.sleep((run_at - now).total_seconds())

        # One failed run must not stop the scheduler; cancellation still
        # propagates because CancelledError is not an Exception subclass.
        try:
            text = await run_selftest(cfg, docker_map)
        except Exception:
            log.exception("Scheduled self-test failed")
            continue

        for chat_id in admin_ids:
            try:
                await bot.send_message(chat_id, text)
            except Exception:
                # Best-effort delivery: record the failure, keep notifying the rest.
                log.exception("Failed to send self-test report to %s", chat_id)
|
||||||
Reference in New Issue
Block a user