Skip to content

Commit 6ce0540

Browse files
Анатолий ИвановАнатолий Иванов
authored andcommitted
fix(broker-lifecycle): reject stale sessions via PID-alive + age checks
isBrokerEndpointReady() only does a 150ms socket ping. If the broker's underlying codex app-server subprocess is in a bad state — observed after multi-day uptime — the socket still accepts connections, so the existing session is trusted and reused, but every task disconnects mid-turn because the transport subsystem behind the socket is broken. Two complementary guards added in isSessionStale(): 1. PID-alive probe: process.kill(session.pid, 0) detects a crashed broker whose socket file may linger. Covers the acute crash case. 2. Age-based rotation: compare Date.now() against session.startedAt (new field, captured when the broker spawns). Default threshold 6h, overridable via CODEX_COMPANION_BROKER_MAX_AGE_HOURS env var. Covers the slow-degradation case where neither PID nor socket ping surface the problem. Gate added to ensureBrokerSession() before trusting the socket ping: existing session must (a) exist, (b) not be stale, (c) pass the socket ping. If any check fails, the existing session is torn down and a fresh broker is spawned — the same path already used for missing or unreachable brokers. Observed in the wild: broker process with 3d+11h uptime caused 100% task disconnect rate for every /codex:rescue invocation. Restarting the broker (deleting broker.json and letting the next task spawn fresh) fully restored task reliability until the next degradation.
1 parent 807e03a commit 6ce0540

1 file changed

Lines changed: 56 additions & 3 deletions

File tree

plugins/codex/scripts/lib/broker-lifecycle.mjs

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,58 @@ async function isBrokerEndpointReady(endpoint) {
110110
}
111111
}
112112

113+
// Observed failure: after days of broker uptime the socket still accepts
114+
// connections (passes isBrokerEndpointReady), but the underlying `codex
115+
// app-server` subprocess is in a bad state and drops transport mid-turn
116+
// on every task. Socket ping alone does not detect this. Add two guards:
117+
// 1. PID-alive probe — detects a crashed broker before trusting the socket.
118+
// 2. Age-based rotation — forces a fresh broker past a threshold; default
119+
// 6h, overridable via CODEX_COMPANION_BROKER_MAX_AGE_HOURS.
120+
function getBrokerMaxAgeMs() {
121+
const hours = Number(process.env.CODEX_COMPANION_BROKER_MAX_AGE_HOURS);
122+
const safe = Number.isFinite(hours) && hours > 0 ? hours : 6;
123+
return safe * 60 * 60 * 1000;
124+
}
125+
126+
function isPidAlive(pid) {
127+
if (!Number.isFinite(pid) || pid <= 0) return false;
128+
try {
129+
// signal 0 only checks existence; no actual signal delivered
130+
process.kill(pid, 0);
131+
return true;
132+
} catch {
133+
return false;
134+
}
135+
}
136+
137+
function isSessionStale(session) {
138+
if (!session) return true;
139+
// PID check — covers crashed-broker case
140+
if (session.pid != null && !isPidAlive(session.pid)) return true;
141+
// Legacy rotation — sessions persisted before startedAt was introduced
142+
// skip age validation otherwise; force one-time rotation so the fix
143+
// applies to pre-upgrade broker.json entries.
144+
if (!session.startedAt) return true;
145+
// Age check — covers slow-degradation case
146+
const age = Date.now() - session.startedAt;
147+
if (age > getBrokerMaxAgeMs()) return true;
148+
return false;
149+
}
150+
151+
// Default kill for stale-rotation teardown. Without this, rotating a still-alive
152+
// broker only removes its socket/pid files — the detached process keeps running
153+
// and leaks host resources. SIGTERM is best-effort; ignore missing-process errors.
154+
function defaultKillProcess(pid) {
155+
try {
156+
process.kill(pid, "SIGTERM");
157+
} catch {
158+
// process already gone — fine
159+
}
160+
}
161+
113162
export async function ensureBrokerSession(cwd, options = {}) {
114163
const existing = loadBrokerSession(cwd);
115-
if (existing && (await isBrokerEndpointReady(existing.endpoint))) {
164+
if (existing && !isSessionStale(existing) && (await isBrokerEndpointReady(existing.endpoint))) {
116165
return existing;
117166
}
118167

@@ -123,7 +172,10 @@ export async function ensureBrokerSession(cwd, options = {}) {
123172
logFile: existing.logFile ?? null,
124173
sessionDir: existing.sessionDir ?? null,
125174
pid: existing.pid ?? null,
126-
killProcess: options.killProcess ?? null
175+
// Use defaultKillProcess so stale (but PID-alive) brokers actually
176+
// terminate during rotation — otherwise they leak as orphaned
177+
// processes while only their socket/pidfile are cleaned up.
178+
killProcess: options.killProcess ?? defaultKillProcess
127179
});
128180
clearBrokerSession(cwd);
129181
}
@@ -164,7 +216,8 @@ export async function ensureBrokerSession(cwd, options = {}) {
164216
pidFile,
165217
logFile,
166218
sessionDir,
167-
pid: child.pid ?? null
219+
pid: child.pid ?? null,
220+
startedAt: Date.now() // used by isSessionStale() for age-based rotation
168221
};
169222
saveBrokerSession(cwd, session);
170223
return session;

0 commit comments

Comments
 (0)