Skip to content

Commit 7e0123f

Browse files
Анатолий ИвановDimaris-nsk
authored andcommitted
fix(broker-lifecycle): reject crashed brokers via PID-alive probe
isBrokerEndpointReady() only pings the socket for 150ms. If the broker process has crashed but its socket file lingers (unix domain) or the listener drops without the probe noticing, the existing session is trusted and reused — but every downstream task disconnects mid-turn because the transport subsystem behind the socket is actually gone. Add isPidAlive() check consulted before trusting the socket ping. If the PID is dead, tear down and respawn. Safety around the teardown's SIGTERM: - verifyBrokerPid() cross-checks session.pid against the on-disk pid-file contents AND the live process command line via `ps` (POSIX) before returning true. - Windows intentionally returns false — tasklist exposes image name but not command line, and matching node.exe alone is too weak to rule out recycled-PID foreign processes. Windows rotation still cleans socket/pidfile; detached old broker eventually exits on its own since no new client reaches it. - If verifyBrokerPid() returns false (e.g. stale pid-file, PID gone, ps lookup fails), killProcess falls back to null — no signal, only file cleanup, same as trunk behavior. Age-based rotation for healthy-but-degrading brokers was considered and dropped in this revision: rotating a still-serving broker can interrupt a concurrent client's in-flight turn. A proper fix needs an active health probe (e.g. lightweight RPC round-trip) or graceful drain. Out of scope for this PR; filed as a follow-up.
1 parent 807e03a commit 7e0123f

1 file changed

Lines changed: 92 additions & 3 deletions

File tree

plugins/codex/scripts/lib/broker-lifecycle.mjs

Lines changed: 92 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import net from "node:net";
33
import os from "node:os";
44
import path from "node:path";
55
import process from "node:process";
6-
import { spawn } from "node:child_process";
6+
import { spawn, execFileSync } from "node:child_process";
77
import { fileURLToPath } from "node:url";
88
import { createBrokerEndpoint, parseBrokerEndpoint } from "./broker-endpoint.mjs";
99
import { resolveStateDir } from "./state.mjs";
@@ -110,20 +110,109 @@ async function isBrokerEndpointReady(endpoint) {
110110
}
111111
}
112112

113+
// Observed failure: the broker process has crashed but its endpoint socket
114+
// file lingers (unix domain) or listener drops without `waitForBrokerEndpoint`
115+
// noticing in the 150ms probe window. `isBrokerEndpointReady` passes, the
116+
// caller trusts the existing session, and every downstream task disconnects
117+
// mid-turn. Add a PID-alive probe so we catch this class up-front and force
118+
// a fresh broker before trusting the socket.
119+
//
120+
// Age-based rotation was considered to cover slow-degradation (broker alive
121+
// but serving unreliably). Dropped in this revision — rotating a healthy
122+
// broker while it may be mid-turn for a concurrent client can interrupt that
123+
// turn. Proper fix for slow degradation needs a real health probe (e.g.
124+
// lightweight RPC round-trip) or graceful drain, which are out of scope
125+
// for this PR. Left as a follow-up.
126+
127+
function isPidAlive(pid) {
128+
if (!Number.isFinite(pid) || pid <= 0) return false;
129+
try {
130+
// signal 0 only checks existence; no actual signal delivered
131+
process.kill(pid, 0);
132+
return true;
133+
} catch {
134+
return false;
135+
}
136+
}
137+
138+
function isSessionStale(session) {
139+
if (!session) return true;
140+
// PID check — covers crashed-broker case
141+
if (session.pid != null && !isPidAlive(session.pid)) return true;
142+
return false;
143+
}
144+
145+
// Default kill for stale-rotation teardown. Without this, rotating a still-alive
146+
// broker only removes its socket/pid files — the detached process keeps running
147+
// and leaks host resources. SIGTERM is best-effort; ignore missing-process errors.
148+
function defaultKillProcess(pid) {
149+
try {
150+
process.kill(pid, "SIGTERM");
151+
} catch {
152+
// process already gone — fine
153+
}
154+
}
155+
156+
// Cross-check session.pid against the actual running process before signaling.
157+
//
158+
// pid-file alone is insufficient: app-server-broker.mjs only removes it in the
159+
// clean shutdown() path — if the broker crashed ungracefully the pidfile
160+
// lingers, and when the OS recycles that PID to an unrelated process the file
161+
// contents will spuriously match. On POSIX we also check `ps` command line to
162+
// confirm the broker script name is present. On Windows we intentionally do
163+
// not send SIGTERM: `tasklist` exposes image name but not full command line
164+
// via the public CLI, and matching on image-name (`node.exe`) alone is too
165+
// weak to rule out recycled-PID foreign processes. Windows rotation will
166+
// still clean up socket/pidfile — detached old broker eventually exits on
167+
// its own since no new client will reach it.
168+
function verifyBrokerPid(session) {
169+
if (!session || !Number.isFinite(session.pid) || !session.pidFile) return false;
170+
if (process.platform === "win32") return false;
171+
try {
172+
if (!fs.existsSync(session.pidFile)) return false;
173+
const content = fs.readFileSync(session.pidFile, "utf8").trim();
174+
if (Number(content) !== session.pid) return false;
175+
// POSIX: ps exposes the full command, so match instance-specific args.
176+
// Script name alone is too broad — a recycled PID belonging to a foreign
177+
// broker instance (different workspace) would also contain
178+
// "app-server-broker.mjs" and cause a cross-session kill. We also require
179+
// the session's unique --pid-file and --endpoint paths to appear in the
180+
// live command line. Those paths are per-session (see spawnBrokerProcess —
181+
// both are derived from createBrokerSessionDir()), so a recycled PID on
182+
// an unrelated broker cannot match.
183+
const cmd = execFileSync("ps", ["-p", String(session.pid), "-o", "command="], {
184+
encoding: "utf8",
185+
timeout: 1000
186+
});
187+
if (!cmd.includes("app-server-broker.mjs")) return false;
188+
if (!cmd.includes(`--pid-file ${session.pidFile}`)) return false;
189+
if (session.endpoint && !cmd.includes(`--endpoint ${session.endpoint}`)) return false;
190+
return true;
191+
} catch {
192+
// Any lookup failure (process gone, permission denied, timeout) → skip
193+
// the kill. Safe default; socket/pidfile cleanup still proceeds.
194+
return false;
195+
}
196+
}
197+
113198
export async function ensureBrokerSession(cwd, options = {}) {
114199
const existing = loadBrokerSession(cwd);
115-
if (existing && (await isBrokerEndpointReady(existing.endpoint))) {
200+
if (existing && !isSessionStale(existing) && (await isBrokerEndpointReady(existing.endpoint))) {
116201
return existing;
117202
}
118203

119204
if (existing) {
205+
// Only send SIGTERM when the pid-file on disk still maps to session.pid —
206+
// otherwise the PID may have been recycled by the OS to an unrelated
207+
// process and signaling it would kill something we don't own.
208+
const killAllowed = verifyBrokerPid(existing);
120209
teardownBrokerSession({
121210
endpoint: existing.endpoint ?? null,
122211
pidFile: existing.pidFile ?? null,
123212
logFile: existing.logFile ?? null,
124213
sessionDir: existing.sessionDir ?? null,
125214
pid: existing.pid ?? null,
126-
killProcess: options.killProcess ?? null
215+
killProcess: options.killProcess ?? (killAllowed ? defaultKillProcess : null)
127216
});
128217
clearBrokerSession(cwd);
129218
}

0 commit comments

Comments
 (0)