Add external checks with uptime
This commit is contained in:
13
CONFIG.en.md
13
CONFIG.en.md
@@ -43,6 +43,19 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||
- `backup_count` (int): How many rotated files to keep.
|
||||
|
||||
## external_checks
|
||||
|
||||
- `enabled` (bool): Enable background checks.
|
||||
- `state_path` (string): State file for uptime, default `/var/server-bot/external_checks.json`.
|
||||
- `timeout_sec` (int): Check timeout in seconds.
|
||||
- `interval_sec` (int): Background check interval in seconds, default `300`.
|
||||
- `services` (list): List of checks.
|
||||
- `name` (string): Service name.
|
||||
- `type` (string): `http`, `tcp`, `ping`.
|
||||
- `url` (string): URL for `http`.
|
||||
- `host` (string): Host for `tcp`/`ping`.
|
||||
- `port` (int): Port for `tcp`.
|
||||
|
||||
## arcane
|
||||
|
||||
- `base_url` (string): Arcane API base url.
|
||||
|
||||
13
CONFIG.md
13
CONFIG.md
@@ -43,6 +43,19 @@
|
||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||
- `backup_count` (int): сколько файлов хранить.
|
||||
|
||||
## external_checks
|
||||
|
||||
- `enabled` (bool): включить фоновые проверки.
|
||||
- `state_path` (string): файл состояния для аптайма, по умолчанию `/var/server-bot/external_checks.json`.
|
||||
- `timeout_sec` (int): таймаут проверки в секундах.
|
||||
- `interval_sec` (int): интервал фоновых проверок в секундах, по умолчанию `300`.
|
||||
- `services` (list): список проверок.
|
||||
- `name` (string): название сервиса.
|
||||
- `type` (string): `http`, `tcp`, `ping`.
|
||||
- `url` (string): URL для `http`.
|
||||
- `host` (string): хост для `tcp`/`ping`.
|
||||
- `port` (int): порт для `tcp`.
|
||||
|
||||
## arcane
|
||||
|
||||
- `base_url` (string): base url API Arcane.
|
||||
|
||||
@@ -35,6 +35,20 @@ incidents:
|
||||
rotate_when: "W0"
|
||||
backup_count: 8
|
||||
|
||||
external_checks:
|
||||
enabled: true
|
||||
state_path: "/var/server-bot/external_checks.json"
|
||||
timeout_sec: 5
|
||||
interval_sec: 300
|
||||
services:
|
||||
- name: "example-site"
|
||||
type: "http"
|
||||
url: "https://example.com"
|
||||
- name: "example-ssh"
|
||||
type: "tcp"
|
||||
host: "example.com"
|
||||
port: 22
|
||||
|
||||
arcane:
|
||||
base_url: "http://localhost:3552"
|
||||
api_key: "arc_..."
|
||||
|
||||
@@ -16,6 +16,7 @@ from state import UPDATES_CACHE, REBOOT_PENDING
|
||||
from services.metrics import summarize
|
||||
from services.audit import read_audit_tail
|
||||
from services.incidents import read_recent, incidents_path
|
||||
from services.external_checks import format_report
|
||||
|
||||
|
||||
@dp.message(F.text == "💽 Disks")
|
||||
@@ -197,6 +198,13 @@ async def audit_log(msg: Message):
|
||||
await msg.answer(text, reply_markup=system_logs_kb, parse_mode="Markdown")
|
||||
|
||||
|
||||
@dp.message(F.text == "🌍 External")
async def external_checks(msg: Message):
    """Reply to the "🌍 External" button with a fresh external-checks report.

    Non-admin senders are ignored. ``format_report`` is a plain synchronous
    function, so it is executed in a worker thread to keep the event loop
    free while it runs.
    """
    if not is_admin_msg(msg):
        return

    # Local import: only this handler needs it in this part of the file.
    import asyncio

    # NOTE(review): the report is now built off the loop thread, so it may
    # overlap with other callers of the same checks — confirm that is fine.
    report = await asyncio.to_thread(format_report, cfg)
    await msg.answer(report, reply_markup=system_logs_kb)
|
||||
|
||||
|
||||
@dp.message(F.text == "📣 Incidents")
|
||||
async def incidents(msg: Message):
|
||||
if not is_admin_msg(msg):
|
||||
|
||||
@@ -84,6 +84,7 @@ system_logs_kb = ReplyKeyboardMarkup(
|
||||
[KeyboardButton(text="🧾 Audit"), KeyboardButton(text="📣 Incidents")],
|
||||
[KeyboardButton(text="🧰 Processes"), KeyboardButton(text="🔒 SSL")],
|
||||
[KeyboardButton(text="🌐 URLs"), KeyboardButton(text="🔑 SSH log")],
|
||||
[KeyboardButton(text="🌍 External")],
|
||||
[KeyboardButton(text="⬅️ System")],
|
||||
],
|
||||
resize_keyboard=True,
|
||||
|
||||
3
main.py
3
main.py
@@ -10,6 +10,7 @@ from services.queue import worker as queue_worker
|
||||
from services.notify import notify
|
||||
from services.audit import AuditMiddleware, audit_start
|
||||
from services.ssl_alerts import monitor_ssl
|
||||
from services.external_checks import monitor_external
|
||||
import state
|
||||
import handlers.menu
|
||||
import handlers.status
|
||||
@@ -45,6 +46,8 @@ async def main():
|
||||
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
||||
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
||||
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
||||
if cfg.get("external_checks", {}).get("enabled", True):
|
||||
asyncio.create_task(monitor_external(cfg))
|
||||
state.METRICS_STORE = MetricsStore()
|
||||
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
||||
asyncio.create_task(queue_worker())
|
||||
|
||||
143
services/external_checks.py
Normal file
143
services/external_checks.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
|
||||
def _state_path(cfg: dict[str, Any]) -> str:
    """Return the path of the JSON file that stores the check counters.

    Reads ``external_checks.state_path`` from *cfg*. When the section or the
    key is absent, ``/var/server-bot/external_checks.json`` is returned.
    """
    section = cfg.get("external_checks", {})
    fallback = "/var/server-bot/external_checks.json"
    return section.get("state_path", fallback)
|
||||
|
||||
|
||||
def _load_state(cfg: dict[str, Any]) -> dict[str, Any]:
    """Load the persisted check counters, falling back to an empty state.

    The state is a JSON object with a ``services`` mapping plus the integer
    counters ``total_checks`` and ``ok_checks``. A missing, unreadable or
    malformed file yields a fresh empty state instead of an error, and
    malformed individual fields are reset, so callers can always treat the
    result as a well-formed state dict.

    Args:
        cfg: Bot configuration; the file location comes from ``_state_path``.

    Returns:
        The loaded (and sanitised) state, or a new empty state.
    """
    empty: dict[str, Any] = {"services": {}, "total_checks": 0, "ok_checks": 0}

    try:
        with open(_state_path(cfg), "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: invalid JSON or bad
        # UTF-8 (JSONDecodeError and UnicodeDecodeError both derive from it).
        return empty

    # Valid JSON is not necessarily an object (e.g. `[]` or `null`).
    if not isinstance(loaded, dict):
        return empty

    if not isinstance(loaded.get("services"), dict):
        loaded["services"] = {}
    for counter in ("total_checks", "ok_checks"):
        if not isinstance(loaded.get(counter), int):
            loaded[counter] = 0
    return loaded
|
||||
|
||||
|
||||
def _save_state(cfg: dict[str, Any], state: dict[str, Any]) -> None:
    """Persist *state* as pretty-printed UTF-8 JSON at the configured path.

    The parent directory is created when the path has one. The data is first
    written to a sibling ``<path>.tmp`` file and then moved over the target
    with ``os.replace``, so a failure while writing does not leave a
    truncated state file behind.

    Args:
        cfg: Bot configuration; the file location comes from ``_state_path``.
        state: JSON-serialisable state to store.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError: If *state* contains values ``json`` cannot serialise.
    """
    path = _state_path(cfg)

    # A bare file name has an empty dirname, and os.makedirs("") raises.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(state, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    except Exception:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def _check_http(url: str, timeout: int) -> tuple[bool, str]:
    """Fetch *url* and report whether it answered with a status below 400.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(ok, detail)``. ``detail`` is ``"HTTP <status>"`` when a response
        (including an HTTP error response) was received, otherwise the text
        of the failure. The function never raises for ``Exception``
        subclasses; every failure is reported as ``(False, detail)``.
    """
    try:
        # Building the Request is inside the try because it raises
        # ValueError for a malformed URL (e.g. one without a scheme).
        req = Request(url, headers={"User-Agent": "tg-admin-bot"})
        with urlopen(req, timeout=timeout) as resp:
            status = int(resp.status)
    except HTTPError as e:
        # Must precede URLError: HTTPError is a subclass of it.
        return False, f"HTTP {int(e.code)}"
    except URLError as e:
        return False, str(e.reason)
    except Exception as e:
        return False, str(e)
    return status < 400, f"HTTP {status}"
|
||||
|
||||
|
||||
def _check_tcp(host: str, port: int, timeout: int) -> tuple[bool, str]:
    """Try to open a TCP connection to ``host:port``.

    Returns ``(True, "TCP ok")`` when the connection is established (it is
    closed again immediately), otherwise ``(False, <error text>)``.
    """
    address = (host, port)
    try:
        with socket.create_connection(address, timeout=timeout):
            pass
    except Exception as exc:
        return False, str(exc)
    return True, "TCP ok"
|
||||
|
||||
|
||||
def _check_ping(host: str, timeout: int) -> tuple[bool, str]:
    """Check *host* for the ``ping`` service type.

    The host name is resolved first; a successful resolution is reported as
    ``(True, "DNS ok")``. Only when resolution fails is a TCP connection to
    port 80 attempted, and its result returned.

    NOTE(review): no ICMP echo is sent, so a host that resolves but is down
    is still reported as OK — confirm this is the intended meaning of "ping".
    """
    try:
        socket.gethostbyname(host)
    except Exception:
        resolved = False
    else:
        resolved = True

    if resolved:
        return True, "DNS ok"
    return _check_tcp(host, 80, timeout)
|
||||
|
||||
|
||||
def _probe_service(entry: dict[str, Any], timeout: int) -> tuple[bool, str]:
    """Run the single check described by *entry* and return ``(ok, detail)``."""
    check_type = entry.get("type", "http")

    if check_type == "http":
        url = entry.get("url")
        if url:
            return _check_http(url, timeout)
    elif check_type == "tcp":
        host = entry.get("host")
        try:
            port = int(entry.get("port", 0))
        except (TypeError, ValueError):
            # One bad entry must not abort the checks of the other services.
            return False, "invalid port"
        if host and port:
            return _check_tcp(host, port, timeout)
    elif check_type == "ping":
        host = entry.get("host")
        if host:
            return _check_ping(host, timeout)

    # Unknown type or a required field is missing.
    return False, "n/a"


def run_checks(cfg: dict[str, Any]) -> dict[str, Any]:
    """Run every configured external check once and update the counters.

    For each entry of ``external_checks.services`` the matching check is
    executed, and both the per-service counters (``ok`` / ``total`` under
    ``state["services"][name]``) and the global counters (``ok_checks`` /
    ``total_checks``) are incremented. The updated state is then saved.

    Args:
        cfg: Bot configuration. ``external_checks.timeout_sec`` (default 5)
            is the per-check timeout in seconds.

    Returns:
        ``{"results": [...], "state": state}`` where each result is
        ``{"name": str, "ok": bool, "detail": str}``, in configuration order.
    """
    # `or` also covers keys that are present but empty (None) in the config.
    checks_cfg = cfg.get("external_checks") or {}
    services = checks_cfg.get("services") or []
    timeout = int(checks_cfg.get("timeout_sec", 5))

    state = _load_state(cfg)
    services_state = state.setdefault("services", {})

    results = []
    for entry in services:
        name = entry.get("name") or "unknown"
        ok, detail = _probe_service(entry, timeout)

        # .get() tolerates persisted entries that lack one of the counters.
        service_state = services_state.setdefault(name, {"ok": 0, "total": 0})
        service_state["total"] = service_state.get("total", 0) + 1
        state["total_checks"] = state.get("total_checks", 0) + 1
        if ok:
            service_state["ok"] = service_state.get("ok", 0) + 1
            state["ok_checks"] = state.get("ok_checks", 0) + 1

        results.append({"name": name, "ok": ok, "detail": detail})

    _save_state(cfg, state)
    return {"results": results, "state": state}
|
||||
|
||||
|
||||
def format_report(cfg: dict[str, Any]) -> str:
    """Run the external checks and render the result as a text report.

    When no services are configured a short notice is returned and nothing
    is checked. Otherwise the report lists one line per service (green or
    red marker, name, detail), the global uptime percentage computed from
    the accumulated ``ok_checks`` / ``total_checks`` counters, and the
    current UTC time.
    """
    if not cfg.get("external_checks", {}).get("services", []):
        return "🌍 External checks\n\nℹ️ No services configured"

    outcome = run_checks(cfg)
    counters = outcome["state"]

    # Guard against division by zero when no checks have been counted.
    checks_done = counters.get("total_checks", 0) or 1
    uptime = 100.0 * counters.get("ok_checks", 0) / checks_done

    service_lines = [
        f"{'🟢' if result['ok'] else '🔴'} {result['name']}: {result['detail']}"
        for result in outcome["results"]
    ]
    now_utc = datetime.now(timezone.utc)

    return "\n".join(
        [
            "🌍 External checks",
            "",
            *service_lines,
            "",
            f"📈 Uptime (global): {uptime:.2f}%",
            f"🕒 {now_utc:%Y-%m-%d %H:%M UTC}",
        ]
    )
|
||||
|
||||
|
||||
async def monitor_external(cfg: dict[str, Any]):
    """Run the external checks periodically until the task is cancelled.

    Returns immediately when ``external_checks.enabled`` is false. Otherwise
    ``run_checks`` is executed every ``external_checks.interval_sec`` seconds
    (default 300, minimum 1).

    ``run_checks`` is synchronous, so it is run in a worker thread to keep
    the event loop free. A failing run is logged and the loop continues, so
    one error does not end monitoring for good.

    Args:
        cfg: Bot configuration.
    """
    # Local import: the logger is only needed on the failure path below.
    import logging

    checks_cfg = cfg.get("external_checks", {})
    if not checks_cfg.get("enabled", True):
        return

    # Clamp so a zero or negative value cannot turn the loop into a busy spin.
    interval = max(1, int(checks_cfg.get("interval_sec", 300)))

    while True:
        try:
            # NOTE(review): runs off the loop thread, so it may overlap with
            # other run_checks callers; the state update is read-modify-write
            # without a lock — confirm a rare lost increment is acceptable.
            await asyncio.to_thread(run_checks, cfg)
        except Exception:
            # CancelledError is a BaseException, so cancellation still works.
            logging.getLogger(__name__).exception("external checks run failed")
        await asyncio.sleep(interval)
|
||||
Reference in New Issue
Block a user