[UR][CUDA] Fix prefetch/advise early exit (#18395)

npmiller · web-flow · commit a5c7d88947ff · 2025-05-12T13:23:44.000+01:00
When we exit early we still need to return a proper event.

This fixes segfaults in the UR CTS. The HIP versions of these entry
points were already handling this properly, so remove the known failures
for both the HIP and CUDA adapters.
diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp
@@ -1603,24 +1603,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
   ur_device_handle_t Device = hQueue->getDevice();
 
-  // Certain cuda devices and Windows do not have support for some Unified
-  // Memory features. cuMemPrefetchAsync requires concurrent memory access
-  // for managed memory. Therefore, ignore prefetch hint if concurrent managed
-  // memory access is not available.
-  if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
-    UR_LOG(WARN, "Prefetch hint ignored as device does not support "
-                 "concurrent managed access.");
-    return UR_RESULT_SUCCESS;
-  }
-
-  unsigned int IsManaged;
-  UR_CHECK_ERROR(cuPointerGetAttribute(
-      &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem));
-  if (!IsManaged) {
-    UR_LOG(WARN, "Prefetch hint ignored as prefetch only works with USM.");
-    return UR_RESULT_SUCCESS;
-  }
-
   ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
 
@@ -1635,12 +1617,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
               UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream));
       UR_CHECK_ERROR(EventPtr->start());
     }
+
+    // Ensure we release the event even on early exit
+    OnScopeExit ReleaseEvent([&]() {
+      if (phEvent) {
+        UR_CHECK_ERROR(EventPtr->record());
+        *phEvent = EventPtr.release();
+      }
+    });
+
+    // Certain cuda devices and Windows do not have support for some Unified
+    // Memory features. cuMemPrefetchAsync requires concurrent memory access
+    // for managed memory. Therefore, ignore prefetch hint if concurrent managed
+    // memory access is not available.
+    if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      UR_LOG(WARN, "Prefetch hint ignored as device does not support "
+                   "concurrent managed access.");
+      return UR_RESULT_SUCCESS;
+    }
+
+    unsigned int IsManaged;
+    UR_CHECK_ERROR(cuPointerGetAttribute(
+        &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem));
+    if (!IsManaged) {
+      UR_LOG(WARN, "Prefetch hint ignored as prefetch only works with USM.");
+      return UR_RESULT_SUCCESS;
+    }
+
     UR_CHECK_ERROR(
         cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream));
-    if (phEvent) {
-      UR_CHECK_ERROR(EventPtr->record());
-      *phEvent = EventPtr.release();
-    }
   } catch (ur_result_t Err) {
     Result = Err;
   }
@@ -1656,37 +1661,6 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
       &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
 
-  // Certain cuda devices and Windows do not have support for some Unified
-  // Memory features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION and
-  // to cuMemAdvise on a GPU device requires the GPU device to report a non-zero
-  // value for CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therfore, ignore
-  // memory advise if concurrent managed memory access is not available.
-  if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) ||
-      (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) ||
-      (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) ||
-      (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) ||
-      (advice & UR_USM_ADVICE_FLAG_DEFAULT)) {
-    ur_device_handle_t Device = hQueue->getDevice();
-    if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
-      UR_LOG(WARN, "Mem advise ignored as device does not support "
-                   "concurrent managed access.");
-      return UR_RESULT_SUCCESS;
-    }
-
-    // TODO: If ptr points to valid system-allocated pageable memory we should
-    // check that the device also has the
-    // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
-  }
-
-  unsigned int IsManaged;
-  UR_CHECK_ERROR(cuPointerGetAttribute(
-      &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem));
-  if (!IsManaged) {
-    UR_LOG(WARN,
-           "Memory advice ignored as memory advices only works with USM.");
-    return UR_RESULT_SUCCESS;
-  }
-
   ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
 
@@ -1700,6 +1674,47 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
       UR_CHECK_ERROR(EventPtr->start());
     }
 
+    // Ensure we release the event even on early exit
+    OnScopeExit ReleaseEvent([&]() {
+      if (phEvent) {
+        UR_CHECK_ERROR(EventPtr->record());
+        *phEvent = EventPtr.release();
+      }
+    });
+
+    // Certain cuda devices and Windows do not have support for some Unified
+    // Memory features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION and
+    // to cuMemAdvise on a GPU device requires the GPU device to report a
+    // non-zero value for CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+    // Therefore, ignore memory advise if concurrent managed memory access is
+    // not available.
+    if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) ||
+        (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) ||
+        (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) ||
+        (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) ||
+        (advice & UR_USM_ADVICE_FLAG_DEFAULT)) {
+      ur_device_handle_t Device = hQueue->getDevice();
+      if (!getAttribute(Device,
+                        CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+        UR_LOG(WARN, "Mem advise ignored as device does not support "
+                     "concurrent managed access.");
+        return UR_RESULT_SUCCESS;
+      }
+
+      // TODO: If ptr points to valid system-allocated pageable memory we should
+      // check that the device also has the
+      // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
+    }
+
+    unsigned int IsManaged;
+    UR_CHECK_ERROR(cuPointerGetAttribute(
+        &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem));
+    if (!IsManaged) {
+      UR_LOG(WARN,
+             "Memory advice ignored as memory advices only works with USM.");
+      return UR_RESULT_SUCCESS;
+    }
+
     if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
                                  CU_MEM_ADVISE_UNSET_READ_MOSTLY,
@@ -1714,11 +1729,6 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
       Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice,
                               hQueue->getDevice()->get());
     }
-
-    if (phEvent) {
-      UR_CHECK_ERROR(EventPtr->record());
-      *phEvent = EventPtr.release();
-    }
   } catch (ur_result_t err) {
     Result = err;
   } catch (...) {
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueUSMAdvise.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueUSMAdvise.cpp
@@ -25,8 +25,6 @@ UUR_DEVICE_TEST_SUITE_WITH_PARAM(
     uur::deviceTestWithParamPrinter<ur_usm_advice_flag_t>);
 
 TEST_P(urEnqueueUSMAdviseWithParamTest, Success) {
-  UUR_KNOWN_FAILURE_ON(uur::HIP{}, uur::CUDA{});
-
   ur_event_handle_t advise_event = nullptr;
   ur_result_t result = urEnqueueUSMAdvise(queue, ptr, allocation_size,
                                           getParam(), &advise_event);
@@ -54,8 +52,6 @@ struct urEnqueueUSMAdviseTest : uur::urUSMDeviceAllocTest {
 UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueUSMAdviseTest);
 
 TEST_P(urEnqueueUSMAdviseTest, MultipleParamsSuccess) {
-  UUR_KNOWN_FAILURE_ON(uur::HIP{}, uur::CUDA{});
-
   ur_result_t result = urEnqueueUSMAdvise(queue, ptr, allocation_size,
                                           UR_USM_ADVICE_FLAG_SET_READ_MOSTLY |
                                               UR_USM_ADVICE_FLAG_BIAS_CACHED,
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp
@@ -24,11 +24,6 @@ UUR_DEVICE_TEST_SUITE_WITH_PARAM(
 
 TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) {
   UUR_KNOWN_FAILURE_ON(
-      // HIP and CUDA return UR_RESULT_ERROR_ADAPTER_SPECIFIC to issue a
-      // warning about the hint being unsupported. The same applies for
-      // subsequent fails in this file.
-      // TODO: codify this in the spec and account for it in the CTS.
-      uur::HIP{}, uur::CUDA{},
       // The setup for the parent fixture does a urQueueFlush, which isn't
       // supported by native cpu. Again same goes for subsequent fails in
       // this file.
@@ -53,7 +48,7 @@ TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) {
  * executing.
  */
 TEST_P(urEnqueueUSMPrefetchWithParamTest, CheckWaitEvent) {
-  UUR_KNOWN_FAILURE_ON(uur::HIP{}, uur::CUDA{}, uur::NativeCPU{});
+  UUR_KNOWN_FAILURE_ON(uur::NativeCPU{});
 
   ur_queue_handle_t fill_queue;
   ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &fill_queue));