From 51a06f2b301c9a7e0644b081a146e6d3f69da135 Mon Sep 17 00:00:00 2001 From: Andy Wagner Date: Mon, 7 Apr 2025 11:45:14 -0700 Subject: [PATCH] Remove crash from AIPM in torchX at end (#1042) Summary: AIPM is crashing when closing the application. This resolves the crash by no longer killing the scheduler's apps in close(). Differential Revision: D72580570 --- torchx/schedulers/local_scheduler.py | 4 --- .../schedulers/test/local_scheduler_test.py | 27 ------------------- 2 files changed, 31 deletions(-) diff --git a/torchx/schedulers/local_scheduler.py b/torchx/schedulers/local_scheduler.py index 9250ee72a..9c06e30d0 100644 --- a/torchx/schedulers/local_scheduler.py +++ b/torchx/schedulers/local_scheduler.py @@ -1109,10 +1109,6 @@ def _cancel_existing(self, app_id: str) -> None: local_app.state = AppState.CANCELLED def close(self) -> None: - # terminate all apps - for app_id, app in self._apps.items(): - log.debug(f"Terminating app: {app_id}") - app.kill() # delete logdir if torchx created a log dir if self._base_log_dir and self._created_tmp_log_dir: shutil.rmtree(self._base_log_dir, ignore_errors=True) diff --git a/torchx/schedulers/test/local_scheduler_test.py b/torchx/schedulers/test/local_scheduler_test.py index 8dae7ecba..be308a868 100644 --- a/torchx/schedulers/test/local_scheduler_test.py +++ b/torchx/schedulers/test/local_scheduler_test.py @@ -1112,33 +1112,6 @@ def test_get_cuda_devices_not_set(self, _: MagicMock) -> None: self.assertFalse(ENV_CUDA_VISIBLE_DEVICES in role_params[2].env) self.assertFalse(ENV_CUDA_VISIBLE_DEVICES in role_params[3].env) - def test_no_orphan_process_function(self) -> None: - self._test_orphan_workflow() - - def _test_orphan_workflow(self) -> None: - mp_queue = mp.Queue() - child_nproc = 2 - - proc = mp.Process( - target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc) - ) - proc.start() - total_processes = child_nproc + 1 - pids = [] - for _ in range(total_processes): - pids.append(mp_queue.get(timeout=5)) - parent_pid = pids[0] -
child_pids = pids[1:] - - os.kill(parent_pid, signal.SIGTERM) - # Wait to give time for signal handlers to finish work - time.sleep(5) - for child_pid in child_pids: - # Killing parent should kill all children, we expect that each call to - # os.kill would raise OSError - with self.assertRaises(OSError): - os.kill(child_pid, 0) - class JoinPATHTest(unittest.TestCase): def test_join_PATH(self) -> None: