# File: tg-admin-bot/services/health.py
# (Listing metadata from the source viewer: 103 lines, 3.3 KiB, Python.)

import os
import ssl
import subprocess
from http.client import HTTPException
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

import psutil

from app import RESTIC_ENV
from services.system import worst_disk_usage
def _containers_from_cfg(cfg) -> dict:
return cfg.get("docker", {}).get("containers", {})
def _request_status(url: str, verify_tls: bool) -> int | None:
context = None
if not verify_tls:
context = ssl._create_unverified_context() # nosec - config-controlled
req = Request(url, headers={"User-Agent": "tg-admin-bot"})
try:
with urlopen(req, timeout=8, context=context) as resp:
return int(resp.status)
except HTTPError as e:
return int(e.code)
except URLError:
return None
def _npm_api_base(cfg) -> str | None:
npm_cfg = cfg.get("npmplus", {})
base = (npm_cfg.get("base_url") or "").rstrip("/")
if not base:
return None
if not base.endswith("/api"):
base = f"{base}/api"
return base
def health(cfg, container_map: dict | None = None) -> str:
    """Build a multi-line health report.

    The report checks, in order: the restic backup repository, each
    Docker container, the NPMplus API, the Gitea API, disk usage and the
    1-minute load average.

    Args:
        cfg: Configuration mapping. ``cfg["thresholds"]["load_warn"]`` is
            always read and ``cfg["thresholds"]["disk_warn"]`` is read
            when disk usage is available. The ``docker``, ``npmplus`` and
            ``gitea`` sections are optional.
        container_map: Optional alias -> container-name mapping. When not
            ``None`` it is used instead of the containers from *cfg*.

    Returns:
        The report as one newline-joined string, one status line per check.

    Raises:
        KeyError: If a required ``thresholds`` entry is missing from *cfg*.
    """
    lines = ["🩺 Health check\n"]
    lines.append(_backup_repo_line())

    containers = container_map if container_map is not None else _containers_from_cfg(cfg)
    for alias, real in containers.items():
        if _container_running(real):
            lines.append(f"🟢 {alias} OK")
        else:
            lines.append(f"🔴 {alias} down")

    npm_cfg = cfg.get("npmplus") or {}
    npm_base = _npm_api_base(cfg)
    if npm_base:
        npm_status = _request_status(npm_base, npm_cfg.get("verify_tls", True))
        lines.append(_api_status_line("NPMplus", npm_status))

    g_cfg = cfg.get("gitea") or {}
    g_base = (g_cfg.get("base_url") or "").rstrip("/")
    if g_base:
        g_status = None
        for path in ("/api/healthz", "/api/v1/healthz"):
            g_status = _request_status(f"{g_base}{path}", g_cfg.get("verify_tls", True))
            # 404/405 means this path is not offered; try the next one.
            # Anything else (200, another code, or None) is final. If every
            # path answers 404/405 the last code is reported, so a server
            # that responded is not mislabelled as unreachable.
            if g_status not in (404, 405):
                break
        lines.append(_api_status_line("Gitea", g_status))

    usage, mount = worst_disk_usage()
    if usage is None:
        lines.append("⚠️ Disk n/a")
    elif usage > cfg["thresholds"]["disk_warn"]:
        lines.append(f"🟡 Disk {usage}% ({mount})")
    else:
        lines.append(f"🟢 Disk {usage}% ({mount})")

    load = psutil.getloadavg()[0]
    lines.append(f"{'🟢' if load < cfg['thresholds']['load_warn'] else '🟡'} Load {load}")
    return "\n".join(lines)


def _backup_repo_line() -> str:
    """Return the status line for the ``restic snapshots`` probe."""
    try:
        env = os.environ.copy()
        env.update(RESTIC_ENV)
        subprocess.check_output(["restic", "snapshots"], timeout=10, env=env)
    except Exception:
        # Deliberately broad: this is a best-effort probe, and any failure
        # (missing binary, non-zero exit, timeout, bad env) means the
        # repository could not be reached.
        return "🔴 Backup repo unreachable"
    return "🟢 Backup repo reachable"


def _container_running(name) -> bool:
    """Return True when ``docker inspect`` reports the state ``running``."""
    try:
        # An argv list (no shell) keeps the container name from being
        # interpreted by a shell; the timeout stops a hung docker daemon
        # from blocking the whole report.
        proc = subprocess.run(
            ["docker", "inspect", "-f", "{{.State.Status}}", str(name)],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, subprocess.SubprocessError):
        return False
    return proc.stdout.strip() == "running"


def _api_status_line(label: str, status: int | None) -> str:
    """Format the report line for an HTTP probe result of service *label*."""
    if status == 200:
        return f"🟢 {label} API OK"
    if status is None:
        return f"🔴 {label} API unreachable"
    return f"🟡 {label} API HTTP {status}"