| 1 | import { NextResponse } from "next/server"; |
| 2 | |
/** Health-probe result for a single upstream service. */
interface ServiceCheck {
  name: string;
  // "operational" also covers HTTP 401 (service alive, probe unauthenticated) — see checkService.
  status: "operational" | "degraded" | "down";
  // Round-trip time in ms; null when the request never completed.
  latency: number | null;
  // Failure context, e.g. "HTTP 503" or a connection-error message.
  detail?: string;
}
| 9 | |
/** Per-container resource snapshot derived from the Docker Engine stats API. */
interface ContainerStats {
  // Container name with the leading "/", "grove-" prefix, and "-1" suffix stripped.
  name: string;
  // Raw Docker state string (e.g. "running") — passed through unmodified.
  status: string;
  cpu_percent: number;
  mem_usage_mb: number;
  mem_limit_mb: number;
  mem_percent: number;
  // Cumulative network bytes received/sent, converted to MB.
  net_rx_mb: number;
  net_tx_mb: number;
  // Human-readable age since container creation, via formatUptime.
  uptime: string;
}
| 21 | |
/** Host-level metrics gathered from /proc and `df` (Linux-oriented). */
interface SystemMetrics {
  // Rough estimate derived from 1-minute load average / core count, capped at 100.
  cpu_percent: number;
  mem_total_mb: number;
  mem_used_mb: number;
  mem_percent: number;
  disk_total_gb: number;
  disk_used_gb: number;
  disk_percent: number;
  // [1m, 5m, 15m] load averages from /proc/loadavg.
  load_avg: number[];
  // Human-readable host uptime, via formatUptime.
  uptime: string;
}
| 33 | |
| 34 | async function checkService( |
| 35 | name: string, |
| 36 | url: string, |
| 37 | timeout = 5000 |
| 38 | ): Promise<ServiceCheck> { |
| 39 | const start = Date.now(); |
| 40 | try { |
| 41 | const controller = new AbortController(); |
| 42 | const timer = setTimeout(() => controller.abort(), timeout); |
| 43 | const res = await fetch(url, { signal: controller.signal, cache: "no-store" }); |
| 44 | clearTimeout(timer); |
| 45 | const latency = Date.now() - start; |
| 46 | if (res.ok || res.status === 401) { |
| 47 | return { name, status: "operational", latency }; |
| 48 | } |
| 49 | return { name, status: "degraded", latency, detail: `HTTP ${res.status}` }; |
| 50 | } catch (err: unknown) { |
| 51 | return { |
| 52 | name, |
| 53 | status: "down", |
| 54 | latency: null, |
| 55 | detail: err instanceof Error ? err.message : "Connection failed", |
| 56 | }; |
| 57 | } |
| 58 | } |
| 59 | |
| 60 | async function getDockerStats(): Promise<ContainerStats[]> { |
| 61 | try { |
| 62 | // Docker socket — works when running inside Docker on the host |
| 63 | const controller = new AbortController(); |
| 64 | const timer = setTimeout(() => controller.abort(), 5000); |
| 65 | const res = await fetch("http://localhost:2375/containers/json?all=false", { |
| 66 | signal: controller.signal, |
| 67 | cache: "no-store", |
| 68 | }); |
| 69 | clearTimeout(timer); |
| 70 | if (!res.ok) return []; |
| 71 | |
| 72 | const containers: any[] = await res.json(); |
| 73 | const stats: ContainerStats[] = []; |
| 74 | |
| 75 | for (const c of containers) { |
| 76 | const name = (c.Names?.[0] ?? "").replace(/^\//, "").replace(/^grove-/, "").replace(/-1$/, ""); |
| 77 | const upSince = c.Created ? new Date(c.Created * 1000) : null; |
| 78 | const uptimeMs = upSince ? Date.now() - upSince.getTime() : 0; |
| 79 | |
| 80 | try { |
| 81 | const sRes = await fetch(`http://localhost:2375/containers/${c.Id}/stats?stream=false`, { |
| 82 | cache: "no-store", |
| 83 | }); |
| 84 | if (sRes.ok) { |
| 85 | const s = await sRes.json(); |
| 86 | const cpuDelta = s.cpu_stats.cpu_usage.total_usage - s.precpu_stats.cpu_usage.total_usage; |
| 87 | const sysDelta = s.cpu_stats.system_cpu_usage - s.precpu_stats.system_cpu_usage; |
| 88 | const cpuCount = s.cpu_stats.online_cpus || 1; |
| 89 | const cpuPercent = sysDelta > 0 ? (cpuDelta / sysDelta) * cpuCount * 100 : 0; |
| 90 | |
| 91 | const memUsage = s.memory_stats.usage - (s.memory_stats.stats?.cache || 0); |
| 92 | const memLimit = s.memory_stats.limit; |
| 93 | |
| 94 | const netRx = Object.values(s.networks || {}).reduce((a: number, n: any) => a + (n.rx_bytes || 0), 0); |
| 95 | const netTx = Object.values(s.networks || {}).reduce((a: number, n: any) => a + (n.tx_bytes || 0), 0); |
| 96 | |
| 97 | stats.push({ |
| 98 | name, |
| 99 | status: c.State, |
| 100 | cpu_percent: Math.round(cpuPercent * 100) / 100, |
| 101 | mem_usage_mb: Math.round(memUsage / 1024 / 1024), |
| 102 | mem_limit_mb: Math.round(memLimit / 1024 / 1024), |
| 103 | mem_percent: Math.round((memUsage / memLimit) * 10000) / 100, |
| 104 | net_rx_mb: Math.round(netRx / 1024 / 1024 * 100) / 100, |
| 105 | net_tx_mb: Math.round(netTx / 1024 / 1024 * 100) / 100, |
| 106 | uptime: formatUptime(uptimeMs), |
| 107 | }); |
| 108 | continue; |
| 109 | } |
| 110 | } catch {} |
| 111 | |
| 112 | stats.push({ |
| 113 | name, |
| 114 | status: c.State, |
| 115 | cpu_percent: 0, |
| 116 | mem_usage_mb: 0, |
| 117 | mem_limit_mb: 0, |
| 118 | mem_percent: 0, |
| 119 | net_rx_mb: 0, |
| 120 | net_tx_mb: 0, |
| 121 | uptime: formatUptime(uptimeMs), |
| 122 | }); |
| 123 | } |
| 124 | |
| 125 | return stats; |
| 126 | } catch { |
| 127 | return []; |
| 128 | } |
| 129 | } |
| 130 | |
| 131 | async function getSystemMetrics(): Promise<SystemMetrics | null> { |
| 132 | try { |
| 133 | // Use /proc on Linux hosts |
| 134 | const [memInfo, loadAvg, uptime, diskStat] = await Promise.allSettled([ |
| 135 | fetchFile("/proc/meminfo"), |
| 136 | fetchFile("/proc/loadavg"), |
| 137 | fetchFile("/proc/uptime"), |
| 138 | fetchFile("/proc/diskstats"), |
| 139 | ]); |
| 140 | |
| 141 | let cpuPercent = 0; |
| 142 | let memTotal = 0; |
| 143 | let memUsed = 0; |
| 144 | let memPercent = 0; |
| 145 | let loads: number[] = []; |
| 146 | let uptimeStr = ""; |
| 147 | |
| 148 | if (memInfo.status === "fulfilled" && memInfo.value) { |
| 149 | const lines = memInfo.value.split("\n"); |
| 150 | const get = (key: string) => { |
| 151 | const line = lines.find((l: string) => l.startsWith(key)); |
| 152 | return line ? parseInt(line.split(/\s+/)[1]) : 0; |
| 153 | }; |
| 154 | memTotal = Math.round(get("MemTotal:") / 1024); |
| 155 | const memFree = get("MemFree:"); |
| 156 | const buffers = get("Buffers:"); |
| 157 | const cached = get("Cached:"); |
| 158 | const available = get("MemAvailable:"); |
| 159 | memUsed = Math.round((get("MemTotal:") - available) / 1024); |
| 160 | memPercent = Math.round((memUsed / memTotal) * 100); |
| 161 | } |
| 162 | |
| 163 | if (loadAvg.status === "fulfilled" && loadAvg.value) { |
| 164 | const parts = loadAvg.value.trim().split(/\s+/); |
| 165 | loads = parts.slice(0, 3).map(Number); |
| 166 | // rough CPU% from 1-min load avg |
| 167 | const { cpus } = await import("os"); |
| 168 | const numCpus = cpus().length; |
| 169 | cpuPercent = Math.min(100, Math.round((loads[0] / numCpus) * 100)); |
| 170 | } |
| 171 | |
| 172 | if (uptime.status === "fulfilled" && uptime.value) { |
| 173 | const secs = parseFloat(uptime.value.split(" ")[0]); |
| 174 | uptimeStr = formatUptime(secs * 1000); |
| 175 | } |
| 176 | |
| 177 | // Disk usage via statfs-like approach (use os module) |
| 178 | let diskTotal = 0; |
| 179 | let diskUsed = 0; |
| 180 | let diskPercent = 0; |
| 181 | try { |
| 182 | const { execSync } = await import("child_process"); |
| 183 | const df = execSync("df -BG / 2>/dev/null || df -g / 2>/dev/null", { encoding: "utf-8" }); |
| 184 | const line = df.trim().split("\n")[1]; |
| 185 | if (line) { |
| 186 | const parts = line.split(/\s+/); |
| 187 | diskTotal = parseInt(parts[1]); |
| 188 | diskUsed = parseInt(parts[2]); |
| 189 | diskPercent = Math.round((diskUsed / diskTotal) * 100); |
| 190 | } |
| 191 | } catch {} |
| 192 | |
| 193 | return { |
| 194 | cpu_percent: cpuPercent, |
| 195 | mem_total_mb: memTotal, |
| 196 | mem_used_mb: memUsed, |
| 197 | mem_percent: memPercent, |
| 198 | disk_total_gb: diskTotal, |
| 199 | disk_used_gb: diskUsed, |
| 200 | disk_percent: diskPercent, |
| 201 | load_avg: loads, |
| 202 | uptime: uptimeStr, |
| 203 | }; |
| 204 | } catch { |
| 205 | return null; |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | async function fetchFile(path: string): Promise<string | null> { |
| 210 | try { |
| 211 | const { readFileSync } = await import("fs"); |
| 212 | return readFileSync(path, "utf-8"); |
| 213 | } catch { |
| 214 | return null; |
| 215 | } |
| 216 | } |
| 217 | |
| 218 | function formatUptime(ms: number): string { |
| 219 | const secs = Math.floor(ms / 1000); |
| 220 | const days = Math.floor(secs / 86400); |
| 221 | const hours = Math.floor((secs % 86400) / 3600); |
| 222 | const mins = Math.floor((secs % 3600) / 60); |
| 223 | if (days > 0) return `${days}d ${hours}h`; |
| 224 | if (hours > 0) return `${hours}h ${mins}m`; |
| 225 | return `${mins}m`; |
| 226 | } |
| 227 | |
| 228 | export async function GET() { |
| 229 | const [checks, containers, system] = await Promise.all([ |
| 230 | Promise.all([ |
| 231 | checkService("Web", "http://grove-web:3000/"), |
| 232 | checkService("API", "http://grove-api:4000/api/health"), |
| 233 | checkService("Hub API", "http://hub-api:4000/api/auth/me"), |
| 234 | checkService("EdenAPI", "http://mononoke-slapi:8443/health_check"), |
| 235 | checkService("Git", "http://mononoke-git:8080/health_check"), |
| 236 | checkService("Bridge", "http://grove-bridge:8443/health_check"), |
| 237 | ]), |
| 238 | getDockerStats(), |
| 239 | getSystemMetrics(), |
| 240 | ]); |
| 241 | |
| 242 | const overall = checks.every((c) => c.status === "operational") |
| 243 | ? "operational" |
| 244 | : checks.some((c) => c.status === "down") |
| 245 | ? "major_outage" |
| 246 | : "degraded"; |
| 247 | |
| 248 | return NextResponse.json({ |
| 249 | status: overall, |
| 250 | services: checks, |
| 251 | containers, |
| 252 | system, |
| 253 | checked_at: new Date().toISOString(), |
| 254 | }); |
| 255 | } |
| 256 | |