Skip to content

Commit 32968c0

Browse files
Conarnarfacebook-github-bot
authored andcommitted
Use caller CUDA stream for D2H and H2D copies (pytorch#20498)
Summary: CudaAllocator memory copies now support async copy on a caller-provided CUDA stream. When a caller stream is available (via `getCallerStream()`), `copy_host_to_device` and `copy_device_to_host` use `cudaMemcpyAsync`. When no caller stream is set, the synchronous `cudaMemcpy` path is used as before. Additionally: - Added null pointer and zero-byte validation — null `dst`/`src` return `Error::InvalidArgument` instead of aborting in `cudaMemcpy`, and zero-byte copies return `Error::Ok` early. - Assert single-GPU case (index 0 or -1) until multi-GPU stream validation is added. - Wired `//executorch/extension/cuda:caller_stream` dependency in TARGETS. - Added `extension_cuda` dependencies to CMakeLists.txt. - Added `test_cuda_allocator` with coverage for sync/async paths and error handling. Reviewed By: Gasoonjia Differential Revision: D109590531
1 parent 9e23c9c commit 32968c0

6 files changed

Lines changed: 273 additions & 57 deletions

File tree

.github/workflows/cuda.yml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,48 @@ jobs:
213213
pip install gguf
214214
python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ --ignore=examples/models/gemma4_31b/tests/test_mlx_pipeline.py -v -o "addopts="
215215
216+
unittest-cuda-runtime:
217+
name: unittest-cuda-runtime
218+
needs: [changed-files, run-decision]
219+
if: |
220+
contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
221+
contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
222+
contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
223+
contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
224+
contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
225+
contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
226+
needs.run-decision.outputs.is-full-run == 'true'
227+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
228+
permissions:
229+
id-token: write
230+
contents: read
231+
with:
232+
timeout: 90
233+
runner: linux.g5.4xlarge.nvidia.gpu
234+
gpu-arch-type: cuda
235+
gpu-arch-version: "13.0"
236+
use-custom-docker-registry: false
237+
submodules: recursive
238+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
239+
script: |
240+
set -eux
241+
# Install executorch in editable mode so custom op libs land in-tree
242+
bash ./install_executorch.sh
243+
244+
# The Triton-compiled .so files in the CUDA backend require GLIBCXX_3.4.29
245+
# which the default system libstdc++ doesn't have. Install a newer one.
246+
conda install -y -c conda-forge 'libstdcxx-ng>=12'
247+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
248+
249+
# Build ExecuTorch with CUDA support and tests enabled
250+
cmake --preset llm-release-cuda -DEXECUTORCH_BUILD_TESTS=ON
251+
cmake --build cmake-out --target install -j
252+
253+
# Build and run CUDA runtime C++ unit tests
254+
cmake --build cmake-out --target test_cuda_allocator test_cuda_mutable_state -j
255+
ctest --test-dir cmake-out -R test_cuda_allocator --output-on-failure -V
256+
ctest --test-dir cmake-out -R test_cuda_mutable_state --output-on-failure -V
257+
216258
export-model-cuda-artifact:
217259
name: export-model-cuda-artifact
218260
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available).

backends/cuda/CMakeLists.txt

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ endif()
160160
if(_cuda_is_msvc_toolchain)
161161
target_link_libraries(
162162
aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand
163-
${CMAKE_DL_LIBS}
163+
extension_cuda ${CMAKE_DL_LIBS}
164164
)
165165
# Link object library directly so symbols are pulled exactly once while
166166
# avoiding duplicate static/object inclusion and interface leakage.
@@ -169,8 +169,13 @@ else()
169169
target_link_libraries(
170170
aoti_cuda_shims
171171
PRIVATE cuda_platform
172-
PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
173-
CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
172+
PUBLIC -Wl,--whole-archive
173+
aoti_common_shims_slim
174+
-Wl,--no-whole-archive
175+
CUDA::cudart
176+
CUDA::curand
177+
extension_cuda
178+
${CMAKE_DL_LIBS}
174179
)
175180
endif()
176181

@@ -243,6 +248,12 @@ install(
243248
if(BUILD_TESTING)
244249
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
245250

251+
et_cxx_test(
252+
test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
253+
aoti_cuda_backend
254+
)
255+
target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)
256+
246257
et_cxx_test(
247258
test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
248259
EXTRA_LIBS aoti_cuda_backend

backends/cuda/runtime/TARGETS

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ runtime.cxx_library(
9292
"//executorch/runtime/core:device_allocator",
9393
],
9494
deps = [
95+
"//executorch/extension/cuda:caller_stream",
9596
"//executorch/runtime/platform:platform",
9697
],
9798
nvcc_flags = get_nvcc_arch_args() + [
@@ -163,3 +164,20 @@ cpp_unittest(
163164
platform = "gpu-remote-execution",
164165
),
165166
)
167+
168+
cpp_unittest(
169+
name = "test_cuda_allocator",
170+
srcs = ["test/test_cuda_allocator.cpp"],
171+
deps = [
172+
":cuda_allocator",
173+
"//executorch/extension/cuda:caller_stream",
174+
"//executorch/runtime/core:core",
175+
"//executorch/runtime/platform:platform",
176+
],
177+
external_deps = [("cuda", None, "cuda-lazy")],
178+
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
179+
keep_gpu_sections = True,
180+
remote_execution = re_test_utils.remote_execution(
181+
platform = "gpu-remote-execution",
182+
),
183+
)

backends/cuda/runtime/cuda_allocator.cpp

Lines changed: 82 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <cuda_runtime.h>
1212

13+
#include <executorch/extension/cuda/caller_stream.h>
1314
#include <executorch/runtime/platform/log.h>
1415

1516
namespace executorch::backends::cuda {
@@ -19,6 +20,85 @@ using executorch::runtime::Result;
1920
using executorch::runtime::etensor::DeviceIndex;
2021
using executorch::runtime::etensor::DeviceType;
2122

23+
namespace {
24+
25+
Error copy_impl(
26+
void* dst,
27+
const void* src,
28+
size_t nbytes,
29+
DeviceIndex index,
30+
cudaMemcpyKind kind) {
31+
ET_CHECK_OR_RETURN_ERROR(
32+
kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToHost,
33+
InvalidArgument,
34+
"CudaAllocator::copy_impl: unsupported cudaMemcpyKind %d",
35+
static_cast<int>(kind));
36+
const char* method = kind == cudaMemcpyHostToDevice
37+
? "CudaAllocator::copy_host_to_device"
38+
: "CudaAllocator::copy_device_to_host";
39+
ET_CHECK_OR_RETURN_ERROR(
40+
dst != nullptr, InvalidArgument, "%s: dst is null", method);
41+
ET_CHECK_OR_RETURN_ERROR(
42+
src != nullptr, InvalidArgument, "%s: src is null", method);
43+
ET_CHECK_OR_RETURN_ERROR(
44+
index >= -1,
45+
InvalidArgument,
46+
"%s: invalid device index %d (must be >= -1)",
47+
method,
48+
static_cast<int>(index));
49+
const auto caller_stream = executorch::extension::cuda::getCallerStream();
50+
if (caller_stream) {
51+
// TODO: validate caller stream device matches index.
52+
// For now assert index is -1 or 0.
53+
ET_CHECK_OR_RETURN_ERROR(
54+
index == -1 || index == 0,
55+
InvalidArgument,
56+
"%s: with caller stream, only supports device 0 or -1 (current), got %d",
57+
method,
58+
static_cast<int>(index));
59+
}
60+
if (nbytes == 0) {
61+
return Error::Ok;
62+
}
63+
64+
int prev_device = 0;
65+
cudaError_t prev_device_err = cudaSuccess;
66+
67+
if (index >= 0) {
68+
prev_device_err = cudaGetDevice(&prev_device);
69+
if (prev_device_err == cudaSuccess) {
70+
(void)cudaSetDevice(index);
71+
}
72+
}
73+
cudaError_t err = cudaSuccess;
74+
if (caller_stream) {
75+
err = cudaMemcpyAsync(dst, src, nbytes, kind, *caller_stream);
76+
if (err == cudaSuccess && kind == cudaMemcpyDeviceToHost) {
77+
err = cudaStreamSynchronize(*caller_stream);
78+
}
79+
} else {
80+
err = cudaMemcpy(dst, src, nbytes, kind);
81+
}
82+
83+
if (index >= 0 && prev_device_err == cudaSuccess) {
84+
(void)cudaSetDevice(prev_device);
85+
}
86+
87+
if (err != cudaSuccess) {
88+
ET_LOG(
89+
Error,
90+
"cudaMemcpy %s failed: %s (%zu bytes, device %d)",
91+
kind == cudaMemcpyHostToDevice ? "H2D" : "D2H",
92+
cudaGetErrorString(err),
93+
nbytes,
94+
static_cast<int>(index));
95+
return Error::Internal;
96+
}
97+
return Error::Ok;
98+
}
99+
100+
} // namespace
101+
22102
Result<void*>
23103
CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) {
24104
// index == -1 means "use the current CUDA device"; any value < -1 is invalid.
@@ -124,72 +204,20 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
124204
}
125205
}
126206

127-
// TODO(gasoonjia): Add support for async copy
128207
Error CudaAllocator::copy_host_to_device(
129208
void* dst,
130209
const void* src,
131210
size_t nbytes,
132211
DeviceIndex index) {
133-
int prev_device = 0;
134-
cudaError_t prev_device_err = cudaSuccess;
135-
136-
if (index >= 0) {
137-
prev_device_err = cudaGetDevice(&prev_device);
138-
if (prev_device_err == cudaSuccess) {
139-
cudaSetDevice(index);
140-
}
141-
}
142-
143-
cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
144-
145-
if (index >= 0 && prev_device_err == cudaSuccess) {
146-
cudaSetDevice(prev_device);
147-
}
148-
149-
if (err != cudaSuccess) {
150-
ET_LOG(
151-
Error,
152-
"cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
153-
cudaGetErrorString(err),
154-
nbytes,
155-
static_cast<int>(index));
156-
return Error::Internal;
157-
}
158-
return Error::Ok;
212+
return copy_impl(dst, src, nbytes, index, cudaMemcpyHostToDevice);
159213
}
160214

161-
// TODO(gasoonjia): Add support for async copy
162215
Error CudaAllocator::copy_device_to_host(
163216
void* dst,
164217
const void* src,
165218
size_t nbytes,
166219
DeviceIndex index) {
167-
int prev_device = 0;
168-
cudaError_t prev_device_err = cudaSuccess;
169-
170-
if (index >= 0) {
171-
prev_device_err = cudaGetDevice(&prev_device);
172-
if (prev_device_err == cudaSuccess) {
173-
cudaSetDevice(index);
174-
}
175-
}
176-
177-
cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
178-
179-
if (index >= 0 && prev_device_err == cudaSuccess) {
180-
cudaSetDevice(prev_device);
181-
}
182-
183-
if (err != cudaSuccess) {
184-
ET_LOG(
185-
Error,
186-
"cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
187-
cudaGetErrorString(err),
188-
nbytes,
189-
static_cast<int>(index));
190-
return Error::Internal;
191-
}
192-
return Error::Ok;
220+
return copy_impl(dst, src, nbytes, index, cudaMemcpyDeviceToHost);
193221
}
194222

195223
DeviceType CudaAllocator::device_type() const {
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <cuda_runtime.h>
12+
13+
#include <cstdint>
14+
#include <vector>
15+
16+
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
17+
#include <executorch/extension/cuda/caller_stream.h>
18+
#include <executorch/runtime/core/error.h>
19+
#include <executorch/runtime/platform/platform.h>
20+
21+
using executorch::backends::cuda::CudaAllocator;
22+
using executorch::runtime::Error;
23+
24+
class CudaAllocatorTest : public testing::Test {
25+
protected:
26+
void SetUp() override {
27+
et_pal_init();
28+
29+
int device_count = 0;
30+
cudaError_t err = cudaGetDeviceCount(&device_count);
31+
if (err != cudaSuccess || device_count == 0) {
32+
GTEST_SKIP() << "CUDA not available";
33+
}
34+
}
35+
};
36+
37+
TEST_F(CudaAllocatorTest, CopyRoundtrip) {
38+
CudaAllocator& a = CudaAllocator::instance();
39+
constexpr size_t N = 1024;
40+
auto res = a.allocate(N, 0);
41+
ASSERT_TRUE(res.ok());
42+
void* dptr = res.get();
43+
44+
std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
45+
ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
46+
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
47+
EXPECT_EQ(h_src, h_dst);
48+
49+
a.deallocate(dptr, 0);
50+
}
51+
52+
TEST_F(CudaAllocatorTest, CopyRoundtripWithCallerStream) {
53+
int device = 0;
54+
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
55+
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
56+
// TODO: validate caller stream device matches index once CallerStreamGuard
57+
// exposes device. For now assert single-GPU case.
58+
cudaStream_t s;
59+
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
60+
{
61+
executorch::extension::cuda::CallerStreamGuard g(s);
62+
63+
CudaAllocator& a = CudaAllocator::instance();
64+
auto res = a.allocate(256, 0);
65+
ASSERT_TRUE(res.ok());
66+
void* d = res.get();
67+
std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
68+
ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
69+
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
70+
EXPECT_EQ(h_src, h_dst);
71+
EXPECT_EQ(cudaStreamSynchronize(s), cudaSuccess);
72+
73+
a.deallocate(d, 0);
74+
}
75+
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
76+
}
77+
78+
TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
79+
CudaAllocator& a = CudaAllocator::instance();
80+
// null dst should fail gracefully not CHECK abort
81+
std::vector<uint8_t> h(8, 1);
82+
Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
83+
EXPECT_EQ(e, Error::InvalidArgument)
84+
<< "expected InvalidArgument for null dst, got "
85+
<< static_cast<uint32_t>(e);
86+
}
87+
88+
TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
89+
CudaAllocator& a = CudaAllocator::instance();
90+
void* dummy_dst = reinterpret_cast<void*>(0x1);
91+
Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
92+
EXPECT_EQ(e, Error::InvalidArgument)
93+
<< "expected InvalidArgument for null src, got "
94+
<< static_cast<uint32_t>(e);
95+
}
96+
97+
TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
98+
CudaAllocator& a = CudaAllocator::instance();
99+
void* dummy_src = reinterpret_cast<void*>(0x1);
100+
Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
101+
EXPECT_EQ(e, Error::InvalidArgument)
102+
<< "expected InvalidArgument for null dst, got "
103+
<< static_cast<uint32_t>(e);
104+
}
105+
106+
TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
107+
CudaAllocator& a = CudaAllocator::instance();
108+
std::vector<uint8_t> h(8, 1);
109+
// null src should fail gracefully not CHECK abort
110+
Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
111+
EXPECT_EQ(e, Error::InvalidArgument)
112+
<< "expected InvalidArgument for null src, got "
113+
<< static_cast<uint32_t>(e);
114+
}

0 commit comments

Comments
 (0)