Commit b80ecc4

Revert "Fix poision child process issue when call getAccelerator() (pytorch#144368)"
This reverts commit 2583d83. Reverted pytorch#144368 on behalf of https://github.com/clee2000 because it broke internal tests (D68023262), probably the same problem as noted in the issue mentioned in that PR ([comment](pytorch#144368 (comment)))
1 parent db2a309 commit b80ecc4

7 files changed: +29 −47 lines changed

aten/src/ATen/Context.h

+24-31
@@ -484,22 +484,8 @@ inline DeprecatedTypeProperties& MPS(ScalarType s) {
       Backend::MPS, s);
 }
 
-// Note [at::hasXXX() vs. at::globalContext().hasXXX()]
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//
-// The purpose of `at::hasXXX()` is to check if device XXX is available at
-// runtime. In contrast, `at::globalContext().hasXXX()` determines whether
-// support for device XXX was included in the PyTorch build (enabled at compile
-// time) or if a device XXX extension has already been registered with PyTorch.
-//
-// `at::globalContext().hasXXX()` is often used in functions like
-// `getAccelerator()` instead of `at::hasXXX()` to avoid initializing the
-// runtime for device XXX (which can poison child processes while detecting the
-// current accelerator).
-
 inline bool hasCUDA() {
-  return globalContext().hasCUDA() &&
-      (detail::getCUDAHooks().deviceCount() > 0);
+  return globalContext().hasCUDA();
 }
 
 inline bool hasMTIA() {
@@ -527,7 +513,7 @@ inline bool hasMAIA() {
 }
 
 inline bool hasXPU() {
-  return globalContext().hasXPU() && (detail::getXPUHooks().deviceCount() > 0);
+  return globalContext().hasXPU();
 }
 
 inline bool hasHPU() {
@@ -585,24 +571,31 @@ inline void manual_seed(uint64_t seed) {
     std::lock_guard<std::mutex> lock(gen.mutex());
     gen.set_current_seed(seed);
   }
-
-  for (const auto i : c10::irange(detail::getCUDAHooks().deviceCount())) {
-    auto cuda_gen = globalContext().defaultGenerator(
-        Device(at::kCUDA, static_cast<c10::DeviceIndex>(i)));
-    {
-      // See Note [Acquire lock when using random generators]
-      std::lock_guard<std::mutex> lock(cuda_gen.mutex());
-      cuda_gen.set_current_seed(seed);
+  // NB: Sometimes we build with CUDA, but we don't have any GPUs
+  // available. In that case, we must not seed CUDA; it will fail!
+  const auto cuda_num_gpus = detail::getCUDAHooks().deviceCount();
+  if (hasCUDA() && cuda_num_gpus > 0) {
+    for (const auto i : c10::irange(cuda_num_gpus)) {
+      auto cuda_gen = globalContext().defaultGenerator(
+          Device(at::kCUDA, static_cast<c10::DeviceIndex>(i)));
+      {
+        // See Note [Acquire lock when using random generators]
+        std::lock_guard<std::mutex> lock(cuda_gen.mutex());
+        cuda_gen.set_current_seed(seed);
+      }
     }
   }
 
-  for (const auto i : c10::irange(detail::getXPUHooks().deviceCount())) {
-    auto xpu_gen = globalContext().defaultGenerator(
-        Device(at::kXPU, static_cast<c10::DeviceIndex>(i)));
-    {
-      // See Note [Acquire lock when using random generators]
-      std::lock_guard<std::mutex> lock(xpu_gen.mutex());
-      xpu_gen.set_current_seed(seed);
+  const auto xpu_num_gpus = detail::getXPUHooks().deviceCount();
+  if (hasXPU() && xpu_num_gpus) {
+    for (const auto i : c10::irange(xpu_num_gpus)) {
+      auto xpu_gen = globalContext().defaultGenerator(
+          Device(at::kXPU, static_cast<c10::DeviceIndex>(i)));
+      {
+        // See Note [Acquire lock when using random generators]
+        std::lock_guard<std::mutex> lock(xpu_gen.mutex());
+        xpu_gen.set_current_seed(seed);
+      }
     }
   }
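Reading note (not part of the diff): after this revert, at::hasCUDA() and at::hasXPU() once again report runtime availability (built with support and at least one device visible), while at::globalContext().hasCUDA() only reports whether CUDA support was compiled in or registered, which is the distinction the removed note described. A minimal C++ sketch of the difference, assuming a standard libtorch setup; output depends on the build and machine:

#include <ATen/Context.h>
#include <iostream>

int main() {
  // Build-time / registration check: true whenever PyTorch was built with CUDA,
  // even on a machine with no usable GPU.
  std::cout << "built with CUDA: " << at::globalContext().hasCUDA() << '\n';

  // Runtime check (post-revert semantics): additionally requires deviceCount() > 0,
  // which may initialize the CUDA runtime in this process.
  std::cout << "CUDA usable now: " << at::hasCUDA() << '\n';
  return 0;
}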

aten/src/ATen/DeviceAccelerator.cpp

+1-1
@@ -6,7 +6,7 @@ namespace at::accelerator {
 
 std::optional<c10::DeviceType> getAccelerator(bool checked) {
 #define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
-  if (at::globalContext().has##device_name()) { \
+  if (at::has##device_name()) { \
     device_type = k##device_name; \
     TORCH_CHECK( \
         !is_accelerator_detected, \
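For orientation (illustration only, not the actual source): with the post-revert definition, expanding DETECT_AND_ASSIGN_ACCELERATOR(CUDA) produces code roughly like the sketch below, so accelerator detection goes through at::hasCUDA() and therefore through a runtime device-count query. The function name and the check message are placeholders; the real getAccelerator() repeats this step for every supported backend and also honors the checked argument.

#include <ATen/Context.h>
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <optional>

// Hypothetical, simplified expansion of a single detection step after this revert.
std::optional<c10::DeviceType> detect_cuda_sketch() {
  std::optional<c10::DeviceType> device_type;
  bool is_accelerator_detected = false;
  if (at::hasCUDA()) { // post-revert: runtime check, may initialize the CUDA runtime
    device_type = at::kCUDA;
    TORCH_CHECK(!is_accelerator_detected, "more than one accelerator detected"); // placeholder message
    is_accelerator_detected = true;
  }
  return device_type;
}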

aten/src/ATen/DeviceAccelerator.h

+1-1
@@ -18,7 +18,7 @@ namespace at::accelerator {
 // CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
 
 // Ensures that only one accelerator is available (at
-// *compile time* if possible) and return it.
+// compile time if possible) and return it.
 // When checked is true, the returned optional always has a value.
 TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
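For orientation (not part of the diff), a typical call into this API looks roughly like the following usage sketch, based only on the declaration shown above:

#include <ATen/DeviceAccelerator.h>
#include <iostream>
#include <optional>

int main() {
  // checked=false: the result may be empty when no accelerator is present;
  // checked=true would instead assert that one was found.
  std::optional<c10::DeviceType> acc = at::accelerator::getAccelerator(/*checked=*/false);
  if (acc.has_value()) {
    std::cout << "detected accelerator: " << *acc << '\n';
  } else {
    std::cout << "no accelerator detected" << '\n';
  }
  return 0;
}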

aten/src/ATen/cuda/detail/CUDAHooks.cpp

+1-4
@@ -148,10 +148,7 @@ bool CUDAHooks::isPinnedPtr(const void* data) const {
 }
 
 bool CUDAHooks::hasCUDA() const {
-  // This function determines if CUDA is built into PyTorch. It helps avoid
-  // initializing the CUDA runtime (which can poison child processes) while
-  // detecting the current accelerator.
-  return true;
+  return at::cuda::is_available();
 }
 
 bool CUDAHooks::hasMAGMA() const {
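Background for the comment removed above (illustration only, not PyTorch test code): the "poison child processes" concern is that answering at::hasCUDA() via at::cuda::is_available() has to count devices, which can initialize the CUDA runtime in the calling process; a process forked afterwards inherits that state, and its own CUDA use is then unreliable. A minimal sketch of the failure mode, assuming a CUDA build and POSIX fork():

#include <ATen/Context.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <iostream>

int main() {
  // Post-revert, this call may initialize the CUDA runtime in the parent.
  std::cout << "parent: CUDA usable = " << at::hasCUDA() << '\n';

  pid_t pid = fork();
  if (pid == 0) {
    // The forked child inherits the parent's initialized CUDA state; CUDA calls
    // here typically fail (the poisoned-child-process issue the reverted PR addressed).
    std::cout << "child: CUDA usable = " << at::hasCUDA() << '\n';
    _exit(0);
  }
  waitpid(pid, nullptr, 0);
  return 0;
}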

test/cpp/jit/test_misc.cpp

+1-1
@@ -2392,7 +2392,7 @@ TEST(FuturesTest, Basic) {
 // Sparse CUDA tensor test
 TEST(FutureTest, SparseTensor) {
   // Skip test if CUDA is not available.
-  bool has_cuda = at::hasCUDA();
+  bool has_cuda = at::globalContext().hasCUDA();
   if (!has_cuda) {
     LOG(INFO) << "CUDA not available, skipping test";
   }

test/test_cuda.py

+1-8
@@ -3345,14 +3345,7 @@ def check_output(script: str) -> str:
         VISIBLE_DEVICES = (
             "HIP_VISIBLE_DEVICES" if TEST_WITH_ROCM else "CUDA_VISIBLE_DEVICES"
         )
-        test_script = f"""\
-import os
-import torch
-os.environ['{VISIBLE_DEVICES}']='32'
-
-torch.device(0) # see https://github.com/pytorch/pytorch/issues/144152
-print(torch.cuda.device_count())
-"""
+        test_script = f"import os; import torch;os.environ['{VISIBLE_DEVICES}']='32';print(torch.cuda.device_count())"
         rc = check_output(test_script)
         self.assertEqual(rc, "0")
         if not TEST_WITH_ROCM:

test/test_xpu.py

-1
@@ -192,7 +192,6 @@ def test_multi_process(model, input):
     torch.nn.ReLU(),
     torch.nn.MaxPool2d(2, 2),
 )
-torch.device(0) # see https://github.com/pytorch/pytorch/issues/144152
 test_multi_process(model, input)
 test_multi_process(model, input)
 print(torch.xpu.device_count())
