Skip to content

Commit 2edcedf

Browse files
committed
backend/execute-code: fixing on-exit/finish handling if process is killed
1 parent a61782f commit 2edcedf

File tree

4 files changed

+79
-67
lines changed

4 files changed

+79
-67
lines changed

src/packages/backend/execute-code.ts

+21-11
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ async function executeCodeNoAggregate(
228228
const pid: number | undefined = doSpawn(
229229
{ ...opts, origCommand, job_id, job_config },
230230
async (err, result) => {
231+
log.debug("async/doSpawn returned", { err, result });
231232
try {
232233
const info: Omit<
233234
ExecuteCodeOutputAsync,
@@ -282,17 +283,18 @@ function sumChildren(
282283
procs: Processes,
283284
children: { [pid: number]: number[] },
284285
pid: number,
285-
): { rss: number; pct_cpu: number; cpu_secs: number } {
286+
): { rss: number; pct_cpu: number; cpu_secs: number } | null {
286287
const proc = procs[`${pid}`];
287288
if (proc == null) {
288289
log.debug(`sumChildren: no process ${pid} in proc`);
289-
return { rss: 0, pct_cpu: 0, cpu_secs: 0 };
290+
return null;
290291
}
291292
let rss = proc.stat.mem.rss;
292293
let pct_cpu = proc.cpu.pct;
293294
let cpu_secs = proc.cpu.secs;
294295
for (const ch of children[pid] ?? []) {
295296
const sc = sumChildren(procs, children, ch);
297+
if (sc == null) return null;
296298
rss += sc.rss;
297299
pct_cpu += sc.pct_cpu;
298300
cpu_secs += sc.cpu_secs;
@@ -358,6 +360,7 @@ function doSpawn(
358360
if (callback_done) return;
359361

360362
while (true) {
363+
if (callback_done) return;
361364
const { procs } = await monitor.processes(Date.now());
362365
// reconstruct process tree
363366
const children: { [pid: number]: number[] } = {};
@@ -367,8 +370,13 @@ function doSpawn(
367370
children[ppid].push(pid);
368371
}
369372
// we only consider those, which are the pid itself or one of its children
370-
const { rss, pct_cpu, cpu_secs } = sumChildren(procs, children, pid);
371-
373+
const sc = sumChildren(procs, children, pid);
374+
if (sc == null) {
375+
// If the process by PID is no longer known, either the process was killed or there are too many running.
376+
// in any case, stop monitoring and do not update any data.
377+
return;
378+
}
379+
const { rss, pct_cpu, cpu_secs } = sc;
372380
// ?? fallback, in case the cache "forgot" about it
373381
const obj = asyncCache.get(job_id) ?? job_config;
374382
obj.pid = pid;
@@ -388,8 +396,6 @@ function doSpawn(
388396
const next_s = Math.max(1, Math.floor(elapsed_s / 6));
389397
const wait_s = Math.min(next_s, MONITOR_INTERVAL_S);
390398
await new Promise((done) => setTimeout(done, wait_s * 1000));
391-
392-
if (callback_done) return;
393399
}
394400
}
395401

@@ -426,7 +432,9 @@ function doSpawn(
426432
const obj = asyncCache.get(job_id) ?? opts.job_config;
427433
if (obj != null) {
428434
if (aspect === "pid") {
429-
if (typeof data === "number") obj.pid = data;
435+
if (typeof data === "number") {
436+
obj.pid = data;
437+
}
430438
} else if (typeof data === "string") {
431439
obj[aspect] = data;
432440
}
@@ -470,7 +478,7 @@ function doSpawn(
470478
});
471479

472480
child.on("exit", (code) => {
473-
exit_code = code != null ? code : undefined;
481+
exit_code = code ?? 0;
474482
finish();
475483
});
476484

@@ -495,8 +503,8 @@ function doSpawn(
495503

496504
const finish = (err?) => {
497505
if (!killed && (!stdout_is_done || !stderr_is_done || exit_code == null)) {
498-
// it wasn't killed and one of stdout, stderr, and exit_code hasn't been
499-
// set, so we let the rest of them get set before actually finishing up.
506+
// it wasn't killed and none of stdout, stderr, and exit_code hasn't been set.
507+
// so we let the rest of them get set before actually finishing up.
500508
return;
501509
}
502510
if (callback_done) {
@@ -510,6 +518,7 @@ function doSpawn(
510518
clearTimeout(timer);
511519
timer = undefined;
512520
}
521+
513522
if (opts.verbose && log.isEnabled("debug")) {
514523
log.debug(
515524
"finished exec of",
@@ -524,6 +533,7 @@ function doSpawn(
524533
exit_code,
525534
});
526535
}
536+
527537
if (err) {
528538
cb(err);
529539
} else if (opts.err_on_exit && exit_code != 0) {
@@ -588,7 +598,7 @@ function doSpawn(
588598
} catch (err) {
589599
// Exceptions can happen, which left uncaught messes up calling code big time.
590600
if (opts.verbose) {
591-
log.debug("r.kill raised an exception", err);
601+
log.debug("process.kill raised an exception", err);
592602
}
593603
}
594604
finish(`killed command '${opts.command} ${opts.args?.join(" ")}'`);

src/packages/frontend/frame-editors/latex-editor/actions.ts

+14-32
Original file line numberDiff line numberDiff line change
@@ -709,39 +709,40 @@ export class Actions extends BaseActions<LatexEditorState> {
709709
await this.build_action("clean");
710710
}
711711

712-
async kill(name: BuildSpecName, job: ExecOutput): Promise<void> {
713-
if (job.type !== "async") return;
712+
private async kill(job: ExecOutput): Promise<ExecOutput> {
713+
if (job.type !== "async") return job;
714714
const { pid, status } = job;
715715
if (status === "running" && typeof pid === "number") {
716716
try {
717717
// console.log("LatexEditor/actions/kill: killing", pid);
718718
await exec({
719719
project_id: this.project_id,
720-
command: "kill",
721-
args: [`${pid}`],
720+
// negative PID, to kill the entire process group
721+
command: `kill -9 -${pid}`,
722+
// bash: kill + array does not work. IDK why.
723+
bash: true,
724+
err_on_exit: false,
722725
});
723726
} catch (err) {
724727
// likely "No such process", we just ignore it
725-
// console.info("LatexEditor/actions/kill:", err);
728+
console.info("LatexEditor/actions/kill:", err);
726729
} finally {
727-
// we keep the data for debugging, but set its state to completed
730+
// set this build log to be no longer running
728731
job.status = "completed";
729-
this.set_build_logs({ [name]: job });
730732
}
731733
}
734+
return job;
732735
}
733736

734737
// This stops all known jobs with a status "running" and resets the state.
735738
async stop_build(_id?: string) {
736-
const job_infos = this.store.get("build_logs"); // ("job_infos");
737-
if (job_infos) {
738-
for (const [name, job] of job_infos) {
739-
await this.kill(name, job.toJS());
739+
const build_logs = this.store.get("build_logs");
740+
if (build_logs) {
741+
for (const [name, job] of build_logs) {
742+
this.set_build_logs({ [name]: await this.kill(job.toJS()) });
740743
}
741744
}
742745

743-
// this must run in any case, we want a clean empty map!
744-
//this.setState({ job_infos: Map() });
745746
this.set_status("");
746747
this.is_building = false;
747748
}
@@ -1344,25 +1345,6 @@ export class Actions extends BaseActions<LatexEditorState> {
13441345
});
13451346
}
13461347

1347-
// set_job_infos(obj: {
1348-
// [K in keyof IBuildSpecs]?: ExecuteCodeOutputAsync;
1349-
// }): void {
1350-
// let job_infos: JobInfos = this.store.get("job_infos") ?? Map();
1351-
// let k: BuildSpecName;
1352-
// for (k in obj) {
1353-
// const v: ExecuteCodeOutputAsync | undefined = obj[k];
1354-
// if (v) {
1355-
// job_infos = job_infos.set(
1356-
// k as any,
1357-
// fromJS(v) as any as TypedMap<ExecOutput>,
1358-
// );
1359-
// } else {
1360-
// job_infos = job_infos.delete(k);
1361-
// }
1362-
// }
1363-
// this.setState({ job_infos });
1364-
// }
1365-
13661348
private set_build_logs(obj: { [K in keyof IBuildSpecs]?: BuildLog }): void {
13671349
let build_logs: BuildLogs = this.store.get("build_logs") ?? Map();
13681350
let k: BuildSpecName;

src/packages/frontend/frame-editors/latex-editor/build.tsx

+35-18
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ import Stopwatch from "@cocalc/frontend/editors/stopwatch/stopwatch";
1818
import { webapp_client } from "@cocalc/frontend/webapp-client";
1919
import { path_split, tail } from "@cocalc/util/misc";
2020
import { COLORS } from "@cocalc/util/theme";
21+
import {
22+
ExecuteCodeOutput,
23+
ExecuteCodeOutputAsync,
24+
} from "@cocalc/util/types/execute-code";
2125
import { Actions } from "./actions";
2226
import { BuildCommand } from "./build-command";
2327
import { use_build_logs } from "./hooks";
@@ -95,6 +99,20 @@ export const Build: React.FC<Props> = React.memo((props) => {
9599
});
96100
}
97101

102+
function getMaxMem(stats?: ExecuteCodeOutputAsync["stats"]): string {
103+
if (Array.isArray(stats) && stats.length > 0) {
104+
const max_mem = stats.reduce((cur, val) => {
105+
return val.mem_rss > cur ? val.mem_rss : cur;
106+
}, 0);
107+
// if there is no data (too many processes, etc.) then it is 0.
108+
// That information is misleading and we ignore it
109+
if (max_mem > 0) {
110+
return ` Memory usage: ${max_mem.toFixed(0)} MB.`;
111+
}
112+
}
113+
return "";
114+
}
115+
98116
function render_log(stage: BuildSpecName): AntdTabItem | undefined {
99117
if (build_logs == null) return;
100118
const x: BuildLog | undefined = build_logs.get(stage)?.toJS();
@@ -112,16 +130,7 @@ export const Build: React.FC<Props> = React.memo((props) => {
112130
if (typeof elapsed_s === "number" && elapsed_s > 0) {
113131
job_info_str = `Build time: ${elapsed_s.toFixed(1)} seconds.`;
114132
}
115-
if (Array.isArray(stats) && stats.length > 0) {
116-
const max_mem = stats.reduce((cur, val) => {
117-
return val.mem_rss > cur ? val.mem_rss : cur;
118-
}, 0);
119-
// if there is no data (too many processes, etc.) then it is 0.
120-
// That information is misleading and we ignore it
121-
if (max_mem > 0) {
122-
job_info_str += ` Memory usage: ${max_mem.toFixed(0)} MB.`;
123-
}
124-
}
133+
job_info_str += getMaxMem(stats);
125134
}
126135
const title = BUILD_SPECS[stage].label;
127136
// highlights tab, if there is at least one parsed error
@@ -191,17 +200,18 @@ export const Build: React.FC<Props> = React.memo((props) => {
191200
);
192201
}
193202

194-
// usually, one job is running at a time – only render this if there is no status
203+
// usually, one job is running at a time
195204
function render_jobs(): Rendered {
196205
if (!build_logs) return;
197206
const infos: JSX.Element[] = [];
198207
let isLongRunning = false;
199208
let logTail = "";
200-
build_logs.forEach((info, key) => {
201-
if (!info || info.get("status") !== "running") return;
202-
const stats_str = "";
203-
const start = info.get("start");
204-
logTail = tail(info.get("stdout") ?? "" + info.get("stderr") ?? "", 6);
209+
build_logs.forEach((infoI, key) => {
210+
const info: ExecuteCodeOutput = infoI?.toJS();
211+
if (!info || info.type !== "async" || info.status !== "running") return;
212+
const stats_str = getMaxMem(info.stats);
213+
const start = info.start;
214+
logTail = tail(info.stdout ?? "" + info.stderr ?? "", 6);
205215
isLongRunning ||=
206216
typeof start === "number" &&
207217
webapp_client.server_time() - start > WARN_LONG_RUNNING_S * 1000;
@@ -254,10 +264,17 @@ export const Build: React.FC<Props> = React.memo((props) => {
254264
</Flex>
255265
</div>
256266
<div style={logStyle}>
257-
<span style={{ fontWeight: "bold" }}>
267+
<div
268+
style={{
269+
fontWeight: "bold",
270+
whiteSpace: "nowrap",
271+
overflow: "hidden",
272+
textOverflow: "ellipsis",
273+
}}
274+
>
258275
{status}
259276
{"\n"}
260-
</span>
277+
</div>
261278
{logTail}
262279
</div>
263280
</>

src/packages/frontend/frame-editors/latex-editor/util.ts

+9-6
Original file line numberDiff line numberDiff line change
@@ -76,25 +76,27 @@ export async function gatherJobInfo(
7676
job_info: ExecuteCodeOutputAsync,
7777
set_job_info: (info: ExecuteCodeOutputAsync) => void,
7878
): Promise<void> {
79+
await new Promise((done) => setTimeout(done, 100));
7980
let wait_s = 1;
8081
try {
8182
while (true) {
82-
await new Promise((done) => setTimeout(done, 1000 * wait_s));
8383
const update = await exec({
8484
project_id,
8585
async_get: job_info.job_id,
8686
async_stats: true,
8787
});
88-
if (update.type === "blocking") {
88+
if (update.type !== "async") {
8989
console.warn("Wrong type returned. The project is too old!");
9090
return;
9191
}
9292
if (update.status === "running") {
93+
console.log("gatherJobInfo", update);
9394
set_job_info(update);
9495
} else {
9596
return;
9697
}
97-
wait_s = Math.min(10, wait_s + 1);
98+
await new Promise((done) => setTimeout(done, 1000 * wait_s));
99+
wait_s = Math.min(5, wait_s + 1);
98100
}
99101
} catch {
100102
return;
@@ -134,19 +136,20 @@ export async function runJob(opts: RunJobOpts): Promise<ExecOutput> {
134136
const job_info = await exec(job);
135137

136138
if (job_info.type !== "async") {
137-
// this is not an async job. This could happen for old projects.
139+
// this is not an async job. This happens with "old" projects, not knowing about async_call.
138140
return job_info;
139141
}
140142

141143
if (typeof job_info.pid !== "number") {
142144
throw new Error("Unable to spawn compile job.");
143145
}
144146

145-
set_job_info(job_info);
147+
// this runs async, until the job is no longer "running"
146148
gatherJobInfo(project_id, job_info, set_job_info);
147149

148150
while (true) {
149151
try {
152+
// This also returns the result, if the job has already completed.
150153
const output = await exec({
151154
project_id,
152155
async_get: job_info.job_id,
@@ -160,7 +163,7 @@ export async function runJob(opts: RunJobOpts): Promise<ExecOutput> {
160163
return output;
161164
} catch (err) {
162165
if (err === TIMEOUT_CALLING_PROJECT) {
163-
// this will be fine, hopefully. We continue trying to get a reply.
166+
// This will eventually be fine, hopefully. We continue trying to get a reply.
164167
await new Promise((done) => setTimeout(done, 100));
165168
} else {
166169
throw new Error(

0 commit comments

Comments
 (0)