// Next.js API route: aggregated status endpoint — probes internal services,
// collects Docker container stats, and samples host-level system metrics.
1import { NextResponse } from "next/server";
2
3interface ServiceCheck {
4 name: string;
5 status: "operational" | "degraded" | "down";
6 latency: number | null;
7 detail?: string;
8}
9
10interface ContainerStats {
11 name: string;
12 status: string;
13 cpu_percent: number;
14 mem_usage_mb: number;
15 mem_limit_mb: number;
16 mem_percent: number;
17 net_rx_mb: number;
18 net_tx_mb: number;
19 uptime: string;
20}
21
22interface SystemMetrics {
23 cpu_percent: number;
24 mem_total_mb: number;
25 mem_used_mb: number;
26 mem_percent: number;
27 disk_total_gb: number;
28 disk_used_gb: number;
29 disk_percent: number;
30 load_avg: number[];
31 uptime: string;
32}
33
34async function checkService(
35 name: string,
36 url: string,
37 timeout = 5000
38): Promise<ServiceCheck> {
39 const start = Date.now();
40 try {
41 const controller = new AbortController();
42 const timer = setTimeout(() => controller.abort(), timeout);
43 const res = await fetch(url, { signal: controller.signal, cache: "no-store" });
44 clearTimeout(timer);
45 const latency = Date.now() - start;
46 if (res.ok || res.status === 401) {
47 return { name, status: "operational", latency };
48 }
49 return { name, status: "degraded", latency, detail: `HTTP ${res.status}` };
50 } catch (err: unknown) {
51 return {
52 name,
53 status: "down",
54 latency: null,
55 detail: err instanceof Error ? err.message : "Connection failed",
56 };
57 }
58}
59
60async function getDockerStats(): Promise<ContainerStats[]> {
61 try {
62 // Docker socket — works when running inside Docker on the host
63 const controller = new AbortController();
64 const timer = setTimeout(() => controller.abort(), 5000);
65 const res = await fetch("http://localhost:2375/containers/json?all=false", {
66 signal: controller.signal,
67 cache: "no-store",
68 });
69 clearTimeout(timer);
70 if (!res.ok) return [];
71
72 const containers: any[] = await res.json();
73 const stats: ContainerStats[] = [];
74
75 for (const c of containers) {
76 const name = (c.Names?.[0] ?? "").replace(/^\//, "").replace(/^grove-/, "").replace(/-1$/, "");
77 const upSince = c.Created ? new Date(c.Created * 1000) : null;
78 const uptimeMs = upSince ? Date.now() - upSince.getTime() : 0;
79
80 try {
81 const sRes = await fetch(`http://localhost:2375/containers/${c.Id}/stats?stream=false`, {
82 cache: "no-store",
83 });
84 if (sRes.ok) {
85 const s = await sRes.json();
86 const cpuDelta = s.cpu_stats.cpu_usage.total_usage - s.precpu_stats.cpu_usage.total_usage;
87 const sysDelta = s.cpu_stats.system_cpu_usage - s.precpu_stats.system_cpu_usage;
88 const cpuCount = s.cpu_stats.online_cpus || 1;
89 const cpuPercent = sysDelta > 0 ? (cpuDelta / sysDelta) * cpuCount * 100 : 0;
90
91 const memUsage = s.memory_stats.usage - (s.memory_stats.stats?.cache || 0);
92 const memLimit = s.memory_stats.limit;
93
94 const netRx = Object.values(s.networks || {}).reduce((a: number, n: any) => a + (n.rx_bytes || 0), 0);
95 const netTx = Object.values(s.networks || {}).reduce((a: number, n: any) => a + (n.tx_bytes || 0), 0);
96
97 stats.push({
98 name,
99 status: c.State,
100 cpu_percent: Math.round(cpuPercent * 100) / 100,
101 mem_usage_mb: Math.round(memUsage / 1024 / 1024),
102 mem_limit_mb: Math.round(memLimit / 1024 / 1024),
103 mem_percent: Math.round((memUsage / memLimit) * 10000) / 100,
104 net_rx_mb: Math.round(netRx / 1024 / 1024 * 100) / 100,
105 net_tx_mb: Math.round(netTx / 1024 / 1024 * 100) / 100,
106 uptime: formatUptime(uptimeMs),
107 });
108 continue;
109 }
110 } catch {}
111
112 stats.push({
113 name,
114 status: c.State,
115 cpu_percent: 0,
116 mem_usage_mb: 0,
117 mem_limit_mb: 0,
118 mem_percent: 0,
119 net_rx_mb: 0,
120 net_tx_mb: 0,
121 uptime: formatUptime(uptimeMs),
122 });
123 }
124
125 return stats;
126 } catch {
127 return [];
128 }
129}
130
131async function getSystemMetrics(): Promise<SystemMetrics | null> {
132 try {
133 // Use /proc on Linux hosts
134 const [memInfo, loadAvg, uptime, diskStat] = await Promise.allSettled([
135 fetchFile("/proc/meminfo"),
136 fetchFile("/proc/loadavg"),
137 fetchFile("/proc/uptime"),
138 fetchFile("/proc/diskstats"),
139 ]);
140
141 let cpuPercent = 0;
142 let memTotal = 0;
143 let memUsed = 0;
144 let memPercent = 0;
145 let loads: number[] = [];
146 let uptimeStr = "";
147
148 if (memInfo.status === "fulfilled" && memInfo.value) {
149 const lines = memInfo.value.split("\n");
150 const get = (key: string) => {
151 const line = lines.find((l: string) => l.startsWith(key));
152 return line ? parseInt(line.split(/\s+/)[1]) : 0;
153 };
154 memTotal = Math.round(get("MemTotal:") / 1024);
155 const memFree = get("MemFree:");
156 const buffers = get("Buffers:");
157 const cached = get("Cached:");
158 const available = get("MemAvailable:");
159 memUsed = Math.round((get("MemTotal:") - available) / 1024);
160 memPercent = Math.round((memUsed / memTotal) * 100);
161 }
162
163 if (loadAvg.status === "fulfilled" && loadAvg.value) {
164 const parts = loadAvg.value.trim().split(/\s+/);
165 loads = parts.slice(0, 3).map(Number);
166 // rough CPU% from 1-min load avg
167 const { cpus } = await import("os");
168 const numCpus = cpus().length;
169 cpuPercent = Math.min(100, Math.round((loads[0] / numCpus) * 100));
170 }
171
172 if (uptime.status === "fulfilled" && uptime.value) {
173 const secs = parseFloat(uptime.value.split(" ")[0]);
174 uptimeStr = formatUptime(secs * 1000);
175 }
176
177 // Disk usage via statfs-like approach (use os module)
178 let diskTotal = 0;
179 let diskUsed = 0;
180 let diskPercent = 0;
181 try {
182 const { execSync } = await import("child_process");
183 const df = execSync("df -BG / 2>/dev/null || df -g / 2>/dev/null", { encoding: "utf-8" });
184 const line = df.trim().split("\n")[1];
185 if (line) {
186 const parts = line.split(/\s+/);
187 diskTotal = parseInt(parts[1]);
188 diskUsed = parseInt(parts[2]);
189 diskPercent = Math.round((diskUsed / diskTotal) * 100);
190 }
191 } catch {}
192
193 return {
194 cpu_percent: cpuPercent,
195 mem_total_mb: memTotal,
196 mem_used_mb: memUsed,
197 mem_percent: memPercent,
198 disk_total_gb: diskTotal,
199 disk_used_gb: diskUsed,
200 disk_percent: diskPercent,
201 load_avg: loads,
202 uptime: uptimeStr,
203 };
204 } catch {
205 return null;
206 }
207}
208
209async function fetchFile(path: string): Promise<string | null> {
210 try {
211 const { readFileSync } = await import("fs");
212 return readFileSync(path, "utf-8");
213 } catch {
214 return null;
215 }
216}
217
218function formatUptime(ms: number): string {
219 const secs = Math.floor(ms / 1000);
220 const days = Math.floor(secs / 86400);
221 const hours = Math.floor((secs % 86400) / 3600);
222 const mins = Math.floor((secs % 3600) / 60);
223 if (days > 0) return `${days}d ${hours}h`;
224 if (hours > 0) return `${hours}h ${mins}m`;
225 return `${mins}m`;
226}
227
228export async function GET() {
229 const [checks, containers, system] = await Promise.all([
230 Promise.all([
231 checkService("Web", "http://grove-web:3000/"),
232 checkService("API", "http://grove-api:4000/api/health"),
233 checkService("Hub API", "http://hub-api:4000/api/auth/me"),
234 checkService("EdenAPI", "http://mononoke-slapi:8443/health_check"),
235 checkService("Git", "http://mononoke-git:8080/health_check"),
236 checkService("Bridge", "http://grove-bridge:8443/health_check"),
237 ]),
238 getDockerStats(),
239 getSystemMetrics(),
240 ]);
241
242 const overall = checks.every((c) => c.status === "operational")
243 ? "operational"
244 : checks.some((c) => c.status === "down")
245 ? "major_outage"
246 : "degraded";
247
248 return NextResponse.json({
249 status: overall,
250 services: checks,
251 containers,
252 system,
253 checked_at: new Date().toISOString(),
254 });
255}
256