Skip to content

Commit 6c0a996

Browse files
committed
safer walltime limits in trainings
1 parent e0f62a6 commit 6c0a996

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

psiflow/models/_mace.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ def train(
162162
) -> str:
163163
import yaml
164164

165+
actual_walltime = int(0.9 * walltime) # reserve 10 % for safe shutdown
165166
mace_config["train_file"] = inputs[1].filepath
166167
mace_config["valid_file"] = inputs[2].filepath
167168
config_str = yaml.dump(dict(mace_config))
@@ -172,7 +173,7 @@ def train(
172173
command_tmp,
173174
command_cd,
174175
command_write,
175-
"timeout -s 15 {}s psiflow-train-mace".format(max(walltime - 15, 0)),
176+
"timeout -s 15 {}s psiflow-train-mace".format(actual_walltime),
176177
"--config config.yaml",
177178
"--model {} || true;".format(inputs[0].filepath),
178179
"ls *;",

psiflow/models/_nequip.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ def train(
249249
) -> str:
250250
import yaml
251251

252+
actual_walltime = int(0.9 * walltime) # reserve 10 % for safe shutdown
252253
nequip_config["dataset_file_name"] = inputs[1].filepath
253254
nequip_config["validation_dataset"] = "ase"
254255
nequip_config["validation_dataset_file_name"] = inputs[2].filepath
@@ -262,7 +263,7 @@ def train(
262263
command_cd,
263264
command_env,
264265
command_write,
265-
"timeout -s 15 {}s".format(max(walltime - 15, 0)), # 15 s slack
266+
"timeout -s 15 {}s".format(actual_walltime), # 10 % slack (see actual_walltime)
266267
"psiflow-train-nequip",
267268
"--config config.yaml",
268269
"--model {} || true;".format(inputs[0].filepath),

0 commit comments

Comments
 (0)