Add external checks with uptime
This commit is contained in:
13
CONFIG.en.md
13
CONFIG.en.md
@@ -43,6 +43,19 @@ This project uses `config.yaml`. Start from `config.example.yaml`.
|
|||||||
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
- `rotate_when` (string): Rotation schedule for `TimedRotatingFileHandler`. Example `W0` for weekly on Monday.
|
||||||
- `backup_count` (int): How many rotated files to keep.
|
- `backup_count` (int): How many rotated files to keep.
|
||||||
|
|
||||||
|
## external_checks
|
||||||
|
|
||||||
|
- `enabled` (bool): Enable background checks.
|
||||||
|
- `state_path` (string): State file for uptime, default `/var/server-bot/external_checks.json`.
|
||||||
|
- `timeout_sec` (int): Check timeout in seconds.
|
||||||
|
- `interval_sec` (int): Background check interval in seconds.
|
||||||
|
- `services` (list): List of checks.
|
||||||
|
- `name` (string): Service name.
|
||||||
|
- `type` (string): Check type: `http`, `tcp`, or `ping`. Defaults to `http` when omitted.
|
||||||
|
- `url` (string): URL for `http`.
|
||||||
|
- `host` (string): Host for `tcp`/`ping`.
|
||||||
|
- `port` (int): Port for `tcp`.
|
||||||
|
|
||||||
## arcane
|
## arcane
|
||||||
|
|
||||||
- `base_url` (string): Arcane API base url.
|
- `base_url` (string): Arcane API base url.
|
||||||
|
|||||||
13
CONFIG.md
13
CONFIG.md
@@ -43,6 +43,19 @@
|
|||||||
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
- `rotate_when` (string): режим ротации (`TimedRotatingFileHandler`), например `W0`.
|
||||||
- `backup_count` (int): сколько файлов хранить.
|
- `backup_count` (int): сколько файлов хранить.
|
||||||
|
|
||||||
|
## external_checks
|
||||||
|
|
||||||
|
- `enabled` (bool): включить фоновые проверки.
|
||||||
|
- `state_path` (string): файл состояния для аптайма, по умолчанию `/var/server-bot/external_checks.json`.
|
||||||
|
- `timeout_sec` (int): таймаут проверки в секундах.
|
||||||
|
- `interval_sec` (int): интервал фоновых проверок в секундах.
|
||||||
|
- `services` (list): список проверок.
|
||||||
|
- `name` (string): название сервиса.
|
||||||
|
- `type` (string): тип проверки: `http`, `tcp` или `ping`; по умолчанию `http`.
|
||||||
|
- `url` (string): URL для `http`.
|
||||||
|
- `host` (string): хост для `tcp`/`ping`.
|
||||||
|
- `port` (int): порт для `tcp`.
|
||||||
|
|
||||||
## arcane
|
## arcane
|
||||||
|
|
||||||
- `base_url` (string): base url API Arcane.
|
- `base_url` (string): base url API Arcane.
|
||||||
|
|||||||
@@ -35,6 +35,20 @@ incidents:
|
|||||||
rotate_when: "W0"
|
rotate_when: "W0"
|
||||||
backup_count: 8
|
backup_count: 8
|
||||||
|
|
||||||
|
external_checks:
|
||||||
|
enabled: true
|
||||||
|
state_path: "/var/server-bot/external_checks.json"
|
||||||
|
timeout_sec: 5
|
||||||
|
interval_sec: 300
|
||||||
|
services:
|
||||||
|
- name: "example-site"
|
||||||
|
type: "http"
|
||||||
|
url: "https://example.com"
|
||||||
|
- name: "example-ssh"
|
||||||
|
type: "tcp"
|
||||||
|
host: "example.com"
|
||||||
|
port: 22
|
||||||
|
|
||||||
arcane:
|
arcane:
|
||||||
base_url: "http://localhost:3552"
|
base_url: "http://localhost:3552"
|
||||||
api_key: "arc_..."
|
api_key: "arc_..."
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from state import UPDATES_CACHE, REBOOT_PENDING
|
|||||||
from services.metrics import summarize
|
from services.metrics import summarize
|
||||||
from services.audit import read_audit_tail
|
from services.audit import read_audit_tail
|
||||||
from services.incidents import read_recent, incidents_path
|
from services.incidents import read_recent, incidents_path
|
||||||
|
from services.external_checks import format_report
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "💽 Disks")
|
@dp.message(F.text == "💽 Disks")
|
||||||
@@ -197,6 +198,13 @@ async def audit_log(msg: Message):
|
|||||||
await msg.answer(text, reply_markup=system_logs_kb, parse_mode="Markdown")
|
await msg.answer(text, reply_markup=system_logs_kb, parse_mode="Markdown")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(F.text == "🌍 External")
async def external_checks(msg: Message):
    """Reply to the "🌍 External" button with the external-checks report.

    Non-admin senders are ignored silently. Building the report runs the
    probes synchronously (HTTP/TCP requests and a state-file write), so it is
    executed in a worker thread to keep the bot's event loop responsive while
    slow or timing-out services are being checked.
    """
    import asyncio

    if not is_admin_msg(msg):
        return
    # format_report() blocks for up to (timeout * number of services) seconds.
    report = await asyncio.to_thread(format_report, cfg)
    await msg.answer(report, reply_markup=system_logs_kb)
|
||||||
|
|
||||||
|
|
||||||
@dp.message(F.text == "📣 Incidents")
|
@dp.message(F.text == "📣 Incidents")
|
||||||
async def incidents(msg: Message):
|
async def incidents(msg: Message):
|
||||||
if not is_admin_msg(msg):
|
if not is_admin_msg(msg):
|
||||||
|
|||||||
@@ -84,6 +84,7 @@ system_logs_kb = ReplyKeyboardMarkup(
|
|||||||
[KeyboardButton(text="🧾 Audit"), KeyboardButton(text="📣 Incidents")],
|
[KeyboardButton(text="🧾 Audit"), KeyboardButton(text="📣 Incidents")],
|
||||||
[KeyboardButton(text="🧰 Processes"), KeyboardButton(text="🔒 SSL")],
|
[KeyboardButton(text="🧰 Processes"), KeyboardButton(text="🔒 SSL")],
|
||||||
[KeyboardButton(text="🌐 URLs"), KeyboardButton(text="🔑 SSH log")],
|
[KeyboardButton(text="🌐 URLs"), KeyboardButton(text="🔑 SSH log")],
|
||||||
|
[KeyboardButton(text="🌍 External")],
|
||||||
[KeyboardButton(text="⬅️ System")],
|
[KeyboardButton(text="⬅️ System")],
|
||||||
],
|
],
|
||||||
resize_keyboard=True,
|
resize_keyboard=True,
|
||||||
|
|||||||
3
main.py
3
main.py
@@ -10,6 +10,7 @@ from services.queue import worker as queue_worker
|
|||||||
from services.notify import notify
|
from services.notify import notify
|
||||||
from services.audit import AuditMiddleware, audit_start
|
from services.audit import AuditMiddleware, audit_start
|
||||||
from services.ssl_alerts import monitor_ssl
|
from services.ssl_alerts import monitor_ssl
|
||||||
|
from services.external_checks import monitor_external
|
||||||
import state
|
import state
|
||||||
import handlers.menu
|
import handlers.menu
|
||||||
import handlers.status
|
import handlers.status
|
||||||
@@ -45,6 +46,8 @@ async def main():
|
|||||||
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_smart(cfg, notify, bot, ADMIN_ID))
|
||||||
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
if cfg.get("npmplus", {}).get("alerts", {}).get("enabled", True):
|
||||||
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
asyncio.create_task(monitor_ssl(cfg, notify, bot, ADMIN_ID))
|
||||||
|
if cfg.get("external_checks", {}).get("enabled", True):
|
||||||
|
asyncio.create_task(monitor_external(cfg))
|
||||||
state.METRICS_STORE = MetricsStore()
|
state.METRICS_STORE = MetricsStore()
|
||||||
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
asyncio.create_task(start_sampler(state.METRICS_STORE, interval=5))
|
||||||
asyncio.create_task(queue_worker())
|
asyncio.create_task(queue_worker())
|
||||||
|
|||||||
143
services/external_checks.py
Normal file
143
services/external_checks.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
from urllib.error import HTTPError, URLError
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
|
def _state_path(cfg: dict[str, Any]) -> str:
    """Return the path of the JSON file holding the uptime counters.

    Reads ``external_checks.state_path`` from *cfg*. A missing section or key,
    as well as an explicit ``null``/empty value for either of them, falls back
    to the default location instead of raising or returning ``None``.
    """
    checks_cfg = cfg.get("external_checks") or {}
    return checks_cfg.get("state_path") or "/var/server-bot/external_checks.json"
|
||||||
|
|
||||||
|
|
||||||
|
def _load_state(cfg: dict[str, Any]) -> dict[str, Any]:
    """Load the persisted uptime counters.

    Returns a dict with at least the keys ``services`` (a dict),
    ``total_checks`` and ``ok_checks`` (ints). A missing, unreadable or
    malformed state file yields a fresh empty state instead of raising, so a
    damaged file never stops the checks from running.
    """
    try:
        # EAFP: open directly instead of exists()+open(), which is racy.
        with open(_state_path(cfg), "r", encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or bad
        # encoding (JSONDecodeError and UnicodeDecodeError both derive from it).
        state = None

    # Valid JSON that is not an object (e.g. a list) cannot hold the counters.
    if not isinstance(state, dict):
        return {"services": {}, "total_checks": 0, "ok_checks": 0}

    # Repair individual fields so callers can increment them safely.
    if not isinstance(state.get("services"), dict):
        state["services"] = {}
    for key in ("total_checks", "ok_checks"):
        value = state.get(key)
        if isinstance(value, bool) or not isinstance(value, int):
            state[key] = 0
    return state
|
||||||
|
|
||||||
|
|
||||||
|
def _save_state(cfg: dict[str, Any], state: dict[str, Any]) -> None:
    """Persist *state* as JSON at the configured state path.

    The parent directory is created when needed. The data is written to a
    sibling temporary file and then moved into place, so an interrupted write
    cannot leave a truncated state file behind.

    Raises:
        OSError: if the directory or file cannot be created or written.
    """
    path = _state_path(cfg)
    directory = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") would raise.
    if directory:
        os.makedirs(directory, exist_ok=True)

    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(state, f, ensure_ascii=False, indent=2)
    # os.replace swaps the file in a single step on the same filesystem.
    os.replace(tmp_path, path)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_http(url: str, timeout: int) -> tuple[bool, str]:
    """Probe *url* with an HTTP GET and report whether it answered successfully.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(ok, detail)`` where *ok* is True for a final status below 400 and
        *detail* is ``"HTTP <status>"`` or a short error description. This
        function never raises: every failure is reported through the tuple.
    """
    try:
        # Request() validates the URL and raises ValueError for e.g. a missing
        # scheme, so it must be inside the try to honour the no-raise contract.
        req = Request(url, headers={"User-Agent": "tg-admin-bot"})
        with urlopen(req, timeout=timeout) as resp:
            status = int(resp.status)
    except HTTPError as e:
        # HTTPError must be handled before URLError, its base class.
        return False, f"HTTP {int(e.code)}"
    except URLError as e:
        return False, str(e.reason)
    except Exception as e:
        # Best-effort boundary: timeouts, malformed URLs, TLS errors, etc.
        return False, str(e)
    return status < 400, f"HTTP {status}"
|
||||||
|
|
||||||
|
|
||||||
|
def _check_tcp(host: str, port: int, timeout: int) -> tuple[bool, str]:
    """Try to open a TCP connection to ``host:port``.

    Returns ``(True, "TCP ok")`` when the connection is established within
    *timeout* seconds, otherwise ``(False, <error text>)``. The socket is
    closed again immediately; no data is exchanged.
    """
    target = (host, port)
    try:
        with socket.create_connection(target, timeout=timeout):
            outcome = (True, "TCP ok")
    except Exception as exc:
        outcome = (False, str(exc))
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
def _check_ping(host: str, timeout: int) -> tuple[bool, str]:
    """Check whether *host* answers a single ICMP echo request.

    The system ``ping`` utility is used when it is installed. Resolving the
    name alone is not treated as proof of reachability when ``ping`` can run:
    an IP literal always "resolves", so a DNS-only probe would report such a
    host as up even when it is down.

    When ``ping`` is missing or cannot run (for example no permission to send
    ICMP), the function degrades to the legacy best-effort probe: a successful
    name resolution, or failing that a TCP connection to port 80.

    Args:
        host: Host name or IP address to probe.
        timeout: Time to wait for a reply, in seconds.

    Returns:
        ``(ok, detail)``; never raises.
    """
    import shutil
    import subprocess

    # A leading "-" would be parsed by ping as an option, not as a host.
    if not host or host.startswith("-"):
        return False, "invalid host"

    ping_bin = shutil.which("ping")
    if ping_bin is not None:
        wait = max(1, int(timeout))
        try:
            proc = subprocess.run(
                [ping_bin, "-c", "1", "-W", str(wait), host],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=wait + 1,  # hard stop in case ping itself hangs
                check=False,
            )
        except subprocess.TimeoutExpired:
            return False, "ping timeout"
        except OSError:
            proc = None  # could not execute ping; use the fallback below
        if proc is not None:
            if proc.returncode == 0:
                return True, "ping ok"
            # NOTE(review): assumes Linux iputils semantics, where exit
            # status 1 means "no reply" and other codes mean ping could not
            # run properly — confirm on the target platform.
            if proc.returncode == 1:
                return False, "no reply"

    # Legacy fallback when ICMP is not available.
    try:
        socket.gethostbyname(host)
        return True, "DNS ok (ping unavailable)"
    except (OSError, UnicodeError):
        pass
    return _check_tcp(host, 80, timeout)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_single_check(entry: dict[str, Any], timeout: int) -> tuple[bool, str]:
    """Dispatch one service entry to the probe matching its ``type``.

    Returns ``(False, "n/a")`` for an unknown type or an entry lacking the
    fields its type needs, and ``(False, "bad config: ...")`` when a field has
    an unusable value, so one broken entry never aborts the whole run.
    """
    check_type = entry.get("type", "http")
    try:
        if check_type == "http":
            url = entry.get("url")
            if url:
                return _check_http(url, timeout)
        elif check_type == "tcp":
            host = entry.get("host")
            # "port:" left empty in YAML arrives as None; treat it as unset.
            port = int(entry.get("port") or 0)
            if host and port:
                return _check_tcp(host, port, timeout)
        elif check_type == "ping":
            host = entry.get("host")
            if host:
                return _check_ping(host, timeout)
    except (TypeError, ValueError) as e:
        return False, f"bad config: {e}"
    return False, "n/a"


def run_checks(cfg: dict[str, Any]) -> dict[str, Any]:
    """Run every configured external check once and update the uptime state.

    For each entry in ``external_checks.services`` the matching probe is
    executed, the per-service and global counters are incremented, and the
    updated state is written back to the state file.

    Args:
        cfg: Full bot configuration.

    Returns:
        ``{"results": [{"name", "ok", "detail"}, ...], "state": <state dict>}``
        with results in configuration order.

    Raises:
        OSError: if the state file cannot be written.
    """
    # ``or`` guards against sections that are present but null in the YAML.
    checks_cfg = cfg.get("external_checks") or {}
    services = checks_cfg.get("services") or []
    timeout_raw = checks_cfg.get("timeout_sec")
    timeout = 5 if timeout_raw is None else int(timeout_raw)

    state = _load_state(cfg)
    services_state = state.setdefault("services", {})

    results = []
    for entry in services:
        if not isinstance(entry, dict):
            # Not a mapping (e.g. a bare string): there is nothing to probe.
            continue
        # JSON object keys are always strings; normalising here keeps the
        # in-memory key identical to the one read back from the state file.
        name = str(entry.get("name") or "unknown")
        ok, detail = _run_single_check(entry, timeout)

        service_state = services_state.get(name)
        if not isinstance(service_state, dict):
            service_state = services_state[name] = {"ok": 0, "total": 0}
        service_state["total"] = service_state.get("total", 0) + 1
        state["total_checks"] = state.get("total_checks", 0) + 1
        if ok:
            service_state["ok"] = service_state.get("ok", 0) + 1
            state["ok_checks"] = state.get("ok_checks", 0) + 1

        results.append({"name": name, "ok": ok, "detail": detail})

    _save_state(cfg, state)
    return {"results": results, "state": state}
|
||||||
|
|
||||||
|
|
||||||
|
def format_report(cfg: dict[str, Any]) -> str:
    """Run the external checks and render them as a chat message.

    When no services are configured, a short notice is returned and nothing is
    probed. Otherwise all checks are executed via ``run_checks`` (which also
    updates the persisted counters) and the message lists one status line per
    service, the global uptime percentage and a UTC timestamp.
    """
    configured = cfg.get("external_checks", {}).get("services", [])
    if not configured:
        return "🌍 External checks\n\nℹ️ No services configured"

    outcome = run_checks(cfg)
    status_rows = [
        f"{'🟢' if item['ok'] else '🔴'} {item['name']}: {item['detail']}"
        for item in outcome["results"]
    ]

    counters = outcome["state"]
    # Guard the division: a zero/missing total is treated as one check.
    checks_done = counters.get("total_checks", 0) or 1
    uptime = 100.0 * counters.get("ok_checks", 0) / checks_done

    stamp = f"🕒 {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}"
    return "\n".join(
        [
            "🌍 External checks",
            "",
            *status_rows,
            "",
            f"📈 Uptime (global): {uptime:.2f}%",
            stamp,
        ]
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def monitor_external(cfg: dict[str, Any]):
    """Background task: run the external checks periodically, forever.

    Returns immediately when ``external_checks.enabled`` is false. Otherwise
    ``run_checks`` is executed every ``interval_sec`` seconds (default 300,
    minimum 1) until the task is cancelled.

    The checks perform blocking network and file I/O, so each run is executed
    in a worker thread to keep the event loop free. A failing run is logged
    and the loop continues; previously any exception ended the task for good.
    """
    import logging

    checks_cfg = cfg.get("external_checks") or {}
    if not checks_cfg.get("enabled", True):
        return
    # A null/zero/negative interval would turn the loop into a busy spin.
    interval = max(1, int(checks_cfg.get("interval_sec") or 300))
    log = logging.getLogger(__name__)

    while True:
        try:
            await asyncio.to_thread(run_checks, cfg)
        except Exception:
            # CancelledError is not an Exception subclass, so task
            # cancellation still propagates normally.
            log.exception("external checks run failed")
        await asyncio.sleep(interval)
|
||||||
Reference in New Issue
Block a user