Skip to content

Commit c7d7697

Browse files
machshev authored and rswarbrick committed
[dvsim] more robust local launcher process management
Signed-off-by: James McCorrie <[email protected]>
1 parent 79fb85d commit c7d7697

File tree

3 files changed

+84
-52
lines changed

3 files changed

+84
-52
lines changed

util/dvsim/Launcher.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import re
1010
import sys
1111
from pathlib import Path
12+
from typing import Union
1213

1314
from utils import VERBOSE, clean_odirs, mk_symlink, rm_path
1415

@@ -18,6 +19,11 @@ def __init__(self, msg) -> None:
1819
self.msg = msg
1920

2021

22+
class LauncherBusy(Exception):
23+
def __init__(self, msg):
24+
self.msg = msg
25+
26+
2127
class ErrorMessage(
2228
collections.namedtuple(
2329
"ErrorMessage",
@@ -219,7 +225,7 @@ def launch(self) -> None:
219225
self._pre_launch()
220226
self._do_launch()
221227

222-
def poll(self) -> None:
228+
def poll(self) -> Union[str, None]:
223229
"""Poll the launched job for completion.
224230
225231
Invokes _check_status() and _post_finish() when the job completes.
@@ -285,10 +291,11 @@ def _find_patterns(patterns, line):
285291
if chk_failed and _find_patterns(self.deploy.fail_patterns, line):
286292
# If failed, then nothing else to do. Just return.
287293
# Provide some extra lines for context.
294+
end = cnt + 5
288295
return "F", ErrorMessage(
289296
line_number=cnt + 1,
290297
message=line.strip(),
291-
context=lines[cnt : cnt + 5],
298+
context=lines[cnt:end],
292299
)
293300

294301
if chk_passed:

util/dvsim/LocalLauncher.py

Lines changed: 56 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,16 @@
11
# Copyright lowRISC contributors (OpenTitan project).
22
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
33
# SPDX-License-Identifier: Apache-2.0
4+
"""Launcher implementation to run jobs as subprocesses on the local machine."""
45

56
import datetime
67
import os
78
import shlex
89
import subprocess
10+
from pathlib import Path
11+
from typing import Union
912

10-
from Launcher import ErrorMessage, Launcher, LauncherError
13+
from Launcher import ErrorMessage, Launcher, LauncherBusy, LauncherError
1114

1215

1316
class LocalLauncher(Launcher):
@@ -18,7 +21,8 @@ def __init__(self, deploy):
1821
super().__init__(deploy)
1922

2023
# Popen object when launching the job.
21-
self.process = None
24+
self._process = None
25+
self._log_file = None
2226

2327
def _do_launch(self) -> None:
2428
# Update the shell's env vars with self.exports. Values in exports must
@@ -37,34 +41,37 @@ def _do_launch(self) -> None:
3741
self._dump_env_vars(exports)
3842

3943
if not self.deploy.sim_cfg.interactive:
44+
log_path = Path(self.deploy.get_log_path())
45+
timeout_mins = self.deploy.get_timeout_mins()
46+
47+
self.timeout_secs = timeout_mins * 60 if timeout_mins else None
48+
4049
try:
41-
f = open(
42-
self.deploy.get_log_path(),
50+
self._log_file = log_path.open(
4351
"w",
4452
encoding="UTF-8",
4553
errors="surrogateescape",
4654
)
47-
f.write("[Executing]:\n{}\n\n".format(self.deploy.cmd))
48-
f.flush()
49-
timeout_mins = self.deploy.get_timeout_mins()
50-
if timeout_mins:
51-
self.timeout_secs = timeout_mins * 60
52-
else:
53-
self.timeout_secs = None
54-
self.process = subprocess.Popen(
55+
self._log_file.write(f"[Executing]:\n{self.deploy.cmd}\n\n")
56+
self._log_file.flush()
57+
58+
self._process = subprocess.Popen(
5559
shlex.split(self.deploy.cmd),
5660
bufsize=4096,
5761
universal_newlines=True,
58-
stdout=f,
59-
stderr=f,
62+
stdout=self._log_file,
63+
stderr=self._log_file,
6064
env=exports,
6165
)
66+
67+
except BlockingIOError as e:
68+
raise LauncherBusy(f"Failed to launch job: {e}") from e
69+
6270
except subprocess.SubprocessError as e:
63-
raise LauncherError(
64-
"IO Error: {}\nSee {}".format(e, self.deploy.get_log_path())
65-
)
71+
raise LauncherError(f"IO Error: {e}\nSee {log_path}") from e
72+
6673
finally:
67-
self._close_process()
74+
self._close_job_log_file()
6875
else:
6976
# Interactive: Set RUN_INTERACTIVE to 1
7077
exports["RUN_INTERACTIVE"] = "1"
@@ -73,7 +80,7 @@ def _do_launch(self) -> None:
7380
# no timeout and blocking op as user controls the flow
7481
print("Interactive mode is not supported yet.")
7582
print(f"Cmd : {self.deploy.cmd}")
76-
self.process = subprocess.Popen(
83+
self._process = subprocess.Popen(
7784
shlex.split(self.deploy.cmd),
7885
stdin=None,
7986
stdout=None,
@@ -84,12 +91,12 @@ def _do_launch(self) -> None:
8491
)
8592

8693
# Wait until the process exit
87-
self.process.wait()
94+
self._process.wait()
8895

8996
self._link_odir("D")
9097

91-
def poll(self):
92-
"""Check status of the running process
98+
def poll(self) -> Union[str, None]:
99+
"""Check status of the running process.
93100
94101
This returns 'D', 'P', 'F', or 'K'. If 'D', the job is still running.
95102
If 'P', the job finished successfully. If 'F', the job finished with
@@ -98,20 +105,20 @@ def poll(self):
98105
This function must only be called after running self.dispatch_cmd() and
99106
must not be called again once it has returned 'P' or 'F'.
100107
"""
108+
if self._process is None:
109+
return "E"
101110

102-
assert self.process is not None
103111
elapsed_time = datetime.datetime.now() - self.start_time
104112
self.job_runtime_secs = elapsed_time.total_seconds()
105-
if self.process.poll() is None:
113+
if self._process.poll() is None:
106114
if (
107-
self.timeout_secs and
108-
(self.job_runtime_secs > self.timeout_secs) and not
109-
(self.deploy.gui)
115+
self.timeout_secs
116+
and (self.job_runtime_secs > self.timeout_secs) # noqa: W503
117+
and not (self.deploy.gui) # noqa: W503
110118
):
111119
self._kill()
112-
timeout_message = (
113-
f"Job timed out after {self.deploy.get_timeout_mins()} minutes"
114-
)
120+
timeout_mins = self.deploy.get_timeout_mins()
121+
timeout_message = f"Job timed out after {timeout_mins} minutes"
115122
self._post_finish(
116123
"K",
117124
ErrorMessage(
@@ -124,44 +131,46 @@ def poll(self):
124131

125132
return "D"
126133

127-
self.exit_code = self.process.returncode
134+
self.exit_code = self._process.returncode
128135
status, err_msg = self._check_status()
129136
self._post_finish(status, err_msg)
137+
130138
return self.status
131139

132-
def _kill(self):
140+
def _kill(self) -> None:
133141
"""Kill the running process.
134142
135143
Try to kill the running process. Send SIGTERM first, wait a bit,
136144
and then send SIGKILL if it didn't work.
137145
"""
138-
assert self.process is not None
139-
self.process.terminate()
146+
if self._process is None:
147+
# process already dead or didn't start
148+
return
149+
150+
self._process.terminate()
140151
try:
141-
self.process.wait(timeout=2)
152+
self._process.wait(timeout=2)
142153
except subprocess.TimeoutExpired:
143-
self.process.kill()
154+
self._process.kill()
144155

145-
def kill(self):
156+
def kill(self) -> None:
146157
"""Kill the running process.
147158
148159
This must be called between dispatching and reaping the process (the
149160
same window as poll()).
150-
151161
"""
152162
self._kill()
153163
self._post_finish(
154-
"K", ErrorMessage(line_number=None, message="Job killed!", context=[])
164+
"K",
165+
ErrorMessage(line_number=None, message="Job killed!", context=[]),
155166
)
156167

157-
def _post_finish(self, status, err_msg):
158-
self._close_process()
159-
self.process = None
168+
def _post_finish(self, status: str, err_msg: Union[ErrorMessage, None]) -> None:
169+
self._close_job_log_file()
170+
self._process = None
160171
super()._post_finish(status, err_msg)
161172

162-
def _close_process(self):
173+
def _close_job_log_file(self) -> None:
163174
"""Close the file descriptors associated with the process."""
164-
165-
assert self.process
166-
if self.process.stdout:
167-
self.process.stdout.close()
175+
if self._log_file:
176+
self._log_file.close()

util/dvsim/Scheduler.py

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
import threading
77
from signal import SIGINT, SIGTERM, signal
88

9-
from Launcher import LauncherError
9+
from Launcher import LauncherBusy, LauncherError
1010
from StatusPrinter import get_status_printer
1111
from Timer import Timer
1212
from utils import VERBOSE
@@ -376,10 +376,11 @@ def _poll(self, hms):
376376
status = item.launcher.poll()
377377
level = VERBOSE
378378

379-
assert status in ["D", "P", "F", "K"]
379+
assert status in ["D", "P", "F", "E", "K"]
380380
if status == "D":
381381
continue
382-
elif status == "P":
382+
383+
if status == "P":
383384
self._passed[target].add(item)
384385
elif status == "F":
385386
self._failed[target].add(item)
@@ -481,10 +482,25 @@ def _dispatch(self, hms):
481482
for item in to_dispatch:
482483
try:
483484
item.launcher.launch()
485+
484486
except LauncherError as err:
485487
log.exception(err.msg)
486488
self._kill_item(item)
487489

490+
except LauncherBusy as err:
491+
log.error("Launcher busy: %s", err)
492+
493+
self._queued[target].push(item)
494+
495+
log.log(
496+
VERBOSE,
497+
"[%s]: [%s]: [reqeued]: %s",
498+
hms,
499+
target,
500+
item.full_name,
501+
)
502+
continue
503+
488504
self._running[target].append(item)
489505
self.item_to_status[item] = "D"
490506

0 commit comments

Comments
 (0)