Use caller CUDA stream for D2H and H2D copies (pytorch#20498)

Conarnar · facebook-github-bot · commit 32968c0d46cb · 2026-06-27T03:23:53.000-07:00
Summary:

CudaAllocator memory copies now support async copy on a caller-provided CUDA stream. When a caller stream is available (via `getCallerStream()`), `copy_host_to_device` and `copy_device_to_host` use `cudaMemcpyAsync`. When no caller stream is set, the synchronous `cudaMemcpy` path is used as before.

Additionally:
- Added null pointer and zero-byte validation — null `dst`/`src` return `Error::InvalidArgument` instead of aborting in `cudaMemcpy`, and zero-byte copies return `Error::Ok` early.
- Assert single-GPU case (index 0 or -1) until multi-GPU stream validation is added.
- Wired `//executorch/extension/cuda:caller_stream` dependency in TARGETS.
- Added `extension_cuda` dependencies to CMakeLists.txt.
- Added `test_cuda_allocator` with coverage for sync/async paths and error handling.

Reviewed By: Gasoonjia

Differential Revision: D109590531
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
@@ -213,6 +213,48 @@ jobs:
         pip install gguf
         python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ --ignore=examples/models/gemma4_31b/tests/test_mlx_pipeline.py -v -o "addopts="
 
+  unittest-cuda-runtime:
+    name: unittest-cuda-runtime
+    needs: [changed-files, run-decision]
+    if: |
+      contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+      contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
+      needs.run-decision.outputs.is-full-run == 'true'
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "13.0"
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        # Install executorch in editable mode so custom op libs land in-tree
+        bash ./install_executorch.sh
+
+        # The Triton-compiled .so files in the CUDA backend require GLIBCXX_3.4.29
+        # which the default system libstdc++ doesn't have. Install a newer one.
+        conda install -y -c conda-forge 'libstdcxx-ng>=12'
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+
+        # Build ExecuTorch with CUDA support and tests enabled
+        cmake --preset llm-release-cuda -DEXECUTORCH_BUILD_TESTS=ON
+        cmake --build cmake-out --target install -j
+
+        # Build and run CUDA runtime C++ unit tests
+        cmake --build cmake-out --target test_cuda_allocator test_cuda_mutable_state -j
+        ctest --test-dir cmake-out -R test_cuda_allocator --output-on-failure -V
+        ctest --test-dir cmake-out -R test_cuda_mutable_state --output-on-failure -V
+
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available).
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
@@ -160,7 +160,7 @@ endif()
 if(_cuda_is_msvc_toolchain)
   target_link_libraries(
     aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand
-                            ${CMAKE_DL_LIBS}
+                            extension_cuda ${CMAKE_DL_LIBS}
   )
   # Link object library directly so symbols are pulled exactly once while
   # avoiding duplicate static/object inclusion and interface leakage.
@@ -169,8 +169,13 @@ else()
   target_link_libraries(
     aoti_cuda_shims
     PRIVATE cuda_platform
-    PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
-           CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
+    PUBLIC -Wl,--whole-archive
+           aoti_common_shims_slim
+           -Wl,--no-whole-archive
+           CUDA::cudart
+           CUDA::curand
+           extension_cuda
+           ${CMAKE_DL_LIBS}
   )
 endif()
 
@@ -243,6 +248,12 @@ install(
 if(BUILD_TESTING)
   include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
+  et_cxx_test(
+    test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
+    aoti_cuda_backend
+  )
+  target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)
+
   et_cxx_test(
     test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
     EXTRA_LIBS aoti_cuda_backend
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -92,6 +92,7 @@ runtime.cxx_library(
         "//executorch/runtime/core:device_allocator",
     ],
     deps = [
+        "//executorch/extension/cuda:caller_stream",
         "//executorch/runtime/platform:platform",
     ],
     nvcc_flags = get_nvcc_arch_args() + [
@@ -163,3 +164,20 @@ cpp_unittest(
         platform = "gpu-remote-execution",
     ),
 )
+
+cpp_unittest(
+    name = "test_cuda_allocator",
+    srcs = ["test/test_cuda_allocator.cpp"],
+    deps = [
+        ":cuda_allocator",
+        "//executorch/extension/cuda:caller_stream",
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/platform:platform",
+    ],
+    external_deps = [("cuda", None, "cuda-lazy")],
+    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+    keep_gpu_sections = True,
+    remote_execution = re_test_utils.remote_execution(
+        platform = "gpu-remote-execution",
+    ),
+)
diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp
@@ -10,6 +10,7 @@
 
 #include <cuda_runtime.h>
 
+#include <executorch/extension/cuda/caller_stream.h>
 #include <executorch/runtime/platform/log.h>
 
 namespace executorch::backends::cuda {
@@ -19,6 +20,85 @@ using executorch::runtime::Result;
 using executorch::runtime::etensor::DeviceIndex;
 using executorch::runtime::etensor::DeviceType;
 
+namespace {
+
+Error copy_impl(
+    void* dst,
+    const void* src,
+    size_t nbytes,
+    DeviceIndex index,
+    cudaMemcpyKind kind) {
+  ET_CHECK_OR_RETURN_ERROR(
+      kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToHost,
+      InvalidArgument,
+      "CudaAllocator::copy_impl: unsupported cudaMemcpyKind %d",
+      static_cast<int>(kind));
+  const char* method = kind == cudaMemcpyHostToDevice
+      ? "CudaAllocator::copy_host_to_device"
+      : "CudaAllocator::copy_device_to_host";
+  ET_CHECK_OR_RETURN_ERROR(
+      dst != nullptr, InvalidArgument, "%s: dst is null", method);
+  ET_CHECK_OR_RETURN_ERROR(
+      src != nullptr, InvalidArgument, "%s: src is null", method);
+  ET_CHECK_OR_RETURN_ERROR(
+      index >= -1,
+      InvalidArgument,
+      "%s: invalid device index %d (must be >= -1)",
+      method,
+      static_cast<int>(index));
+  const auto caller_stream = executorch::extension::cuda::getCallerStream();
+  if (caller_stream) {
+    // TODO: validate caller stream device matches index.
+    // For now assert index is -1 or 0.
+    ET_CHECK_OR_RETURN_ERROR(
+        index == -1 || index == 0,
+        InvalidArgument,
+        "%s: with caller stream, only supports device 0 or -1 (current), got %d",
+        method,
+        static_cast<int>(index));
+  }
+  if (nbytes == 0) {
+    return Error::Ok;
+  }
+
+  int prev_device = 0;
+  cudaError_t prev_device_err = cudaSuccess;
+
+  if (index >= 0) {
+    prev_device_err = cudaGetDevice(&prev_device);
+    if (prev_device_err == cudaSuccess) {
+      (void)cudaSetDevice(index);
+    }
+  }
+  cudaError_t err = cudaSuccess;
+  if (caller_stream) {
+    err = cudaMemcpyAsync(dst, src, nbytes, kind, *caller_stream);
+    if (err == cudaSuccess && kind == cudaMemcpyDeviceToHost) {
+      err = cudaStreamSynchronize(*caller_stream);
+    }
+  } else {
+    err = cudaMemcpy(dst, src, nbytes, kind);
+  }
+
+  if (index >= 0 && prev_device_err == cudaSuccess) {
+    (void)cudaSetDevice(prev_device);
+  }
+
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaMemcpy %s failed: %s (%zu bytes, device %d)",
+        kind == cudaMemcpyHostToDevice ? "H2D" : "D2H",
+        cudaGetErrorString(err),
+        nbytes,
+        static_cast<int>(index));
+    return Error::Internal;
+  }
+  return Error::Ok;
+}
+
+} // namespace
+
 Result<void*>
 CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) {
   // index == -1 means "use the current CUDA device"; any value < -1 is invalid.
@@ -124,72 +204,20 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
   }
 }
 
-// TODO(gasoonjia): Add support for async copy
 Error CudaAllocator::copy_host_to_device(
     void* dst,
     const void* src,
     size_t nbytes,
     DeviceIndex index) {
-  int prev_device = 0;
-  cudaError_t prev_device_err = cudaSuccess;
-
-  if (index >= 0) {
-    prev_device_err = cudaGetDevice(&prev_device);
-    if (prev_device_err == cudaSuccess) {
-      cudaSetDevice(index);
-    }
-  }
-
-  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
-
-  if (index >= 0 && prev_device_err == cudaSuccess) {
-    cudaSetDevice(prev_device);
-  }
-
-  if (err != cudaSuccess) {
-    ET_LOG(
-        Error,
-        "cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
-        cudaGetErrorString(err),
-        nbytes,
-        static_cast<int>(index));
-    return Error::Internal;
-  }
-  return Error::Ok;
+  return copy_impl(dst, src, nbytes, index, cudaMemcpyHostToDevice);
 }
 
-// TODO(gasoonjia): Add support for async copy
 Error CudaAllocator::copy_device_to_host(
     void* dst,
     const void* src,
     size_t nbytes,
     DeviceIndex index) {
-  int prev_device = 0;
-  cudaError_t prev_device_err = cudaSuccess;
-
-  if (index >= 0) {
-    prev_device_err = cudaGetDevice(&prev_device);
-    if (prev_device_err == cudaSuccess) {
-      cudaSetDevice(index);
-    }
-  }
-
-  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
-
-  if (index >= 0 && prev_device_err == cudaSuccess) {
-    cudaSetDevice(prev_device);
-  }
-
-  if (err != cudaSuccess) {
-    ET_LOG(
-        Error,
-        "cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
-        cudaGetErrorString(err),
-        nbytes,
-        static_cast<int>(index));
-    return Error::Internal;
-  }
-  return Error::Ok;
+  return copy_impl(dst, src, nbytes, index, cudaMemcpyDeviceToHost);
 }
 
 DeviceType CudaAllocator::device_type() const {
diff --git a/backends/cuda/runtime/test/test_cuda_allocator.cpp b/backends/cuda/runtime/test/test_cuda_allocator.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+#include <vector>
+
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
+#include <executorch/extension/cuda/caller_stream.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/platform.h>
+
+using executorch::backends::cuda::CudaAllocator;
+using executorch::runtime::Error;
+
+class CudaAllocatorTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
+
+    int device_count = 0;
+    cudaError_t err = cudaGetDeviceCount(&device_count);
+    if (err != cudaSuccess || device_count == 0) {
+      GTEST_SKIP() << "CUDA not available";
+    }
+  }
+};
+
+TEST_F(CudaAllocatorTest, CopyRoundtrip) {
+  CudaAllocator& a = CudaAllocator::instance();
+  constexpr size_t N = 1024;
+  auto res = a.allocate(N, 0);
+  ASSERT_TRUE(res.ok());
+  void* dptr = res.get();
+
+  std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
+  ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
+  EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
+  EXPECT_EQ(h_src, h_dst);
+
+  a.deallocate(dptr, 0);
+}
+
+TEST_F(CudaAllocatorTest, CopyRoundtripWithCallerStream) {
+  int device = 0;
+  ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
+  ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
+  // TODO: validate caller stream device matches index once CallerStreamGuard
+  // exposes device. For now assert single-GPU case.
+  cudaStream_t s;
+  ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
+  {
+    executorch::extension::cuda::CallerStreamGuard g(s);
+
+    CudaAllocator& a = CudaAllocator::instance();
+    auto res = a.allocate(256, 0);
+    ASSERT_TRUE(res.ok());
+    void* d = res.get();
+    std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
+    ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
+    EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
+    EXPECT_EQ(h_src, h_dst);
+    EXPECT_EQ(cudaStreamSynchronize(s), cudaSuccess);
+
+    a.deallocate(d, 0);
+  }
+  ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
+}
+
+TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  // null dst should fail gracefully not CHECK abort
+  std::vector<uint8_t> h(8, 1);
+  Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null dst, got "
+      << static_cast<uint32_t>(e);
+}
+
+TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  void* dummy_dst = reinterpret_cast<void*>(0x1);
+  Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null src, got "
+      << static_cast<uint32_t>(e);
+}
+
+TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  void* dummy_src = reinterpret_cast<void*>(0x1);
+  Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null dst, got "
+      << static_cast<uint32_t>(e);
+}
+
+TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  std::vector<uint8_t> h(8, 1);
+  // null src should fail gracefully not CHECK abort
+  Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null src, got "
+      << static_cast<uint32_t>(e);
+}
diff --git a/extension/cuda/caller_stream.h b/extension/cuda/caller_stream.h