// Next.js API route: aggregated status endpoint — probes internal services,
// collects Docker container stats, and samples host-level system metrics.
1import { NextResponse } from "next/server";
2
3interface ServiceCheck {
4 name: string;
5 status: "operational" | "degraded" | "down";
6 latency: number | null;
7 detail?: string;
8}
9
10interface ContainerStats {
11 name: string;
12 status: string;
13 cpu_percent: number;
14 mem_usage_mb: number;
15 mem_limit_mb: number;
16 mem_percent: number;
17 net_rx_mb: number;
18 net_tx_mb: number;
19 uptime: string;
20}
21
22interface SystemMetrics {
23 cpu_percent: number;
24 mem_total_mb: number;
25 mem_used_mb: number;
26 mem_percent: number;
27 disk_total_gb: number;
28 disk_used_gb: number;
29 disk_percent: number;
30 load_avg: number[];
31 uptime: string;
32}
33
34async function checkService(
35 name: string,
36 url: string,
37 timeout = 5000
38): Promise<ServiceCheck> {
39 const start = Date.now();
40 try {
41 const controller = new AbortController();
42 const timer = setTimeout(() => controller.abort(), timeout);
43 const res = await fetch(url, { signal: controller.signal, cache: "no-store" });
44 clearTimeout(timer);
45 const latency = Date.now() - start;
46 if (res.ok || res.status === 401) {
47 return { name, status: "operational", latency };
48 }
49 return { name, status: "degraded", latency, detail: `HTTP ${res.status}` };
50 } catch (err: unknown) {
51 return {
52 name,
53 status: "down",
54 latency: null,
55 detail: err instanceof Error ? err.message : "Connection failed",
56 };
57 }
58}
59
60async function getDockerStats(): Promise<ContainerStats[]> {
61 try {
62 // Docker socket — works when running inside Docker on the host
63 const controller = new AbortController();
64 const timer = setTimeout(() => controller.abort(), 5000);
65 const res = await fetch("http://localhost:2375/containers/json?all=false", {
66 signal: controller.signal,
67 cache: "no-store",
68 });
69 clearTimeout(timer);
70 if (!res.ok) return [];
71
72 const containers: any[] = await res.json();
73 const stats: ContainerStats[] = [];
74
75 for (const c of containers) {
76 const name = (c.Names?.[0] ?? "").replace(/^\//, "").replace(/^grove-/, "").replace(/-1$/, "");
77 const upSince = c.Created ? new Date(c.Created * 1000) : null;
78 const uptimeMs = upSince ? Date.now() - upSince.getTime() : 0;
79
80 try {
81 const sRes = await fetch(`http://localhost:2375/containers/${c.Id}/stats?stream=false`, {
82 cache: "no-store",
83 });
84 if (sRes.ok) {
85 const s = await sRes.json();
86 const cpuDelta = s.cpu_stats.cpu_usage.total_usage - s.precpu_stats.cpu_usage.total_usage;
87 const sysDelta = s.cpu_stats.system_cpu_usage - s.precpu_stats.system_cpu_usage;
88 const cpuCount = s.cpu_stats.online_cpus || 1;
89 const cpuPercent = sysDelta > 0 ? (cpuDelta / sysDelta) * cpuCount * 100 : 0;
90
91 const memUsage = s.memory_stats.usage - (s.memory_stats.stats?.cache || 0);
92 const memLimit = s.memory_stats.limit;
93
94 const netRx = Object.values(s.networks || {}).reduce((a: number, n: any) => a + (n.rx_bytes || 0), 0);
95 const netTx = Object.values(s.networks || {}).reduce((a: number, n: any) => a + (n.tx_bytes || 0), 0);
96
97 stats.push({
98 name,
99 status: c.State,
100 cpu_percent: Math.round(cpuPercent * 100) / 100,
101 mem_usage_mb: Math.round(memUsage / 1024 / 1024),
102 mem_limit_mb: Math.round(memLimit / 1024 / 1024),
103 mem_percent: Math.round((memUsage / memLimit) * 10000) / 100,
104 net_rx_mb: Math.round(netRx / 1024 / 1024 * 100) / 100,
105 net_tx_mb: Math.round(netTx / 1024 / 1024 * 100) / 100,
106 uptime: formatUptime(uptimeMs),
107 });
108 continue;
109 }
110 } catch {}
111
112 stats.push({
113 name,
114 status: c.State,
115 cpu_percent: 0,
116 mem_usage_mb: 0,
117 mem_limit_mb: 0,
118 mem_percent: 0,
119 net_rx_mb: 0,
120 net_tx_mb: 0,
121 uptime: formatUptime(uptimeMs),
122 });
123 }
124
125 return stats;
126 } catch {
127 return [];
128 }
129}
130
131async function getSystemMetrics(): Promise<SystemMetrics | null> {
132 try {
133 // Use /proc on Linux hosts
134 const [memInfo, loadAvg, uptime, diskStat] = await Promise.allSettled([
135 fetchFile("/proc/meminfo"),
136 fetchFile("/proc/loadavg"),
137 fetchFile("/proc/uptime"),
138 fetchFile("/proc/diskstats"),
139 ]);
140
141 let cpuPercent = 0;
142 let memTotal = 0;
143 let memUsed = 0;
144 let memPercent = 0;
145 let loads: number[] = [];
146 let uptimeStr = "";
147
148 if (memInfo.status === "fulfilled" && memInfo.value) {
149 const lines = memInfo.value.split("\n");
150 const get = (key: string) => {
151 const line = lines.find((l: string) => l.startsWith(key));
152 return line ? parseInt(line.split(/\s+/)[1]) : 0;
153 };
154 memTotal = Math.round(get("MemTotal:") / 1024);
155 const memFree = get("MemFree:");
156 const buffers = get("Buffers:");
157 const cached = get("Cached:");
158 const available = get("MemAvailable:");
159 memUsed = Math.round((get("MemTotal:") - available) / 1024);
160 memPercent = Math.round((memUsed / memTotal) * 100);
161 }
162
163 if (loadAvg.status === "fulfilled" && loadAvg.value) {
164 const parts = loadAvg.value.trim().split(/\s+/);
165 loads = parts.slice(0, 3).map(Number);
166 // rough CPU% from 1-min load avg
167 const { cpus } = await import("os");
168 const numCpus = cpus().length;
169 cpuPercent = Math.min(100, Math.round((loads[0] / numCpus) * 100));
170 }
171
172 if (uptime.status === "fulfilled" && uptime.value) {
173 const secs = parseFloat(uptime.value.split(" ")[0]);
174 uptimeStr = formatUptime(secs * 1000);
175 }
176
177 // Disk usage via statfs-like approach (use os module)
178 let diskTotal = 0;
179 let diskUsed = 0;
180 let diskPercent = 0;
181 try {
182 const { execSync } = await import("child_process");
183 const df = execSync("df -BG / 2>/dev/null || df -g / 2>/dev/null", { encoding: "utf-8" });
184 const line = df.trim().split("\n")[1];
185 if (line) {
186 const parts = line.split(/\s+/);
187 diskTotal = parseInt(parts[1]);
188 diskUsed = parseInt(parts[2]);
189 diskPercent = Math.round((diskUsed / diskTotal) * 100);
190 }
191 } catch {}
192
193 return {
194 cpu_percent: cpuPercent,
195 mem_total_mb: memTotal,
196 mem_used_mb: memUsed,
197 mem_percent: memPercent,
198 disk_total_gb: diskTotal,
199 disk_used_gb: diskUsed,
200 disk_percent: diskPercent,
201 load_avg: loads,
202 uptime: uptimeStr,
203 };
204 } catch {
205 return null;
206 }
207}
208
209async function fetchFile(path: string): Promise<string | null> {
210 try {
211 const { readFileSync } = await import("fs");
212 return readFileSync(path, "utf-8");
213 } catch {
214 return null;
215 }
216}
217
218function formatUptime(ms: number): string {
219 const secs = Math.floor(ms / 1000);
220 const days = Math.floor(secs / 86400);
221 const hours = Math.floor((secs % 86400) / 3600);
222 const mins = Math.floor((secs % 3600) / 60);
223 if (days > 0) return `${days}d ${hours}h`;
224 if (hours > 0) return `${hours}h ${mins}m`;
225 return `${mins}m`;
226}
227
228export async function GET() {
229 const [checks, containers, system] = await Promise.all([
230 Promise.all([
231 checkService("Web", "http://grove-web:3000/"),
232 checkService("API", "http://grove-api:4000/api/health"),
233 checkService("Hub API", "http://hub-api:4000/api/auth/me"),
234 checkService("EdenAPI", "http://mononoke-slapi:8443/health_check"),
235 checkService("Git", "http://mononoke-git:8080/health_check"),
236 checkService("Bridge", "http://grove-bridge:8443/health_check"),
237 ]),
238 getDockerStats(),
239 getSystemMetrics(),
240 ]);
241
242 const overall = checks.every((c) => c.status === "operational")
243 ? "operational"
244 : checks.some((c) => c.status === "down")
245 ? "major_outage"
246 : "degraded";
247
248 return NextResponse.json({
249 status: overall,
250 services: checks,
251 containers,
252 system,
253 checked_at: new Date().toISOString(),
254 });
255}
256