
Commit 8c890ba

addcuda2
1 parent ebaa3de commit 8c890ba

8 files changed: +89 -21 lines


slides/moderncuda/README.md

+4 -1

@@ -1,6 +1,9 @@
 # CUDA Programming in Modern C++
 
-References: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+References:
+
+- https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+- https://www.cs.sfu.ca/~ashriram/Courses/CS431/assets/lectures/Part8/GPU1.pdf
 
 ## Setting up the CUDA development environment
 

slides/moderncuda/better_cuda.cuh → slides/moderncuda/cudapp.cuh

+40 -12

@@ -3,6 +3,7 @@
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
+#include <cstdarg>
 #include <cuda_runtime.h>
 #include <memory>
 #include <new>
@@ -11,7 +12,7 @@
 #include <utility>
 #include <vector>
 
-namespace cupp {
+namespace cudapp {
 
 std::error_category const &cudaErrorCategory() noexcept {
     static struct : std::error_category {
@@ -40,7 +41,7 @@ void throwCudaError(cudaError_t err, char const *file, int line) {
     do { \
         cudaError_t err = (expr); \
         if (err != cudaSuccess) [[unlikely]] { \
-            ::cupp::throwCudaError(err, __FILE__, __LINE__); \
+            ::cudapp::throwCudaError(err, __FILE__, __LINE__); \
         } \
     } while (0)
 
@@ -265,10 +266,22 @@
         }
     };
 
-    void synchronize() const {
+    void join() const {
         CHECK_CUDA(cudaEventSynchronize(*this));
     }
 
+    bool joinReady() const {
+        cudaError_t res = cudaEventQuery(*this);
+        if (res == cudaSuccess) {
+            return true;
+        }
+        if (res == cudaErrorNotReady) {
+            return false;
+        }
+        CHECK_CUDA(res);
+        return false;
+    }
+
     float elapsedMillis(CudaEvent const &event) const {
         float result;
         CHECK_CUDA(cudaEventElapsedTime(&result, *this, event));
@@ -315,10 +328,6 @@
         return CudaStream(nullptr);
     }
 
-    void synchronize() const {
-        CHECK_CUDA(cudaStreamSynchronize(*this));
-    }
-
     void copy(void *dst, void *src, size_t size, cudaMemcpyKind kind) const {
         CHECK_CUDA(cudaMemcpyAsync(dst, src, size, kind, *this));
     }
@@ -348,23 +357,27 @@
         CHECK_CUDA(cudaStreamWaitEvent(*this, event, flags));
     }
 
-    void asyncWait(cudaStreamCallback_t callback, void *userData) const {
+    void join() const {
+        CHECK_CUDA(cudaStreamSynchronize(*this));
+    }
+
+    void joinAsync(cudaStreamCallback_t callback, void *userData) const {
         CHECK_CUDA(cudaStreamAddCallback(*this, callback, userData, 0));
     }
 
     template <class Func>
-    void asyncWait(Func &&func) const {
+    void joinAsync(Func &&func) const {
         auto userData = std::make_unique<Func>();
         cudaStreamCallback_t callback = [](cudaStream_t stream,
                                            cudaError_t status, void *userData) {
             std::unique_ptr<Func> func(static_cast<Func *>(userData));
             (*func)(stream, status);
         };
-        asyncWait(callback, userData.get());
+        joinAsync(callback, userData.get());
         userData.release();
     }
 
-    bool pollWait() {
+    bool joinReady() const {
         cudaError_t res = cudaStreamQuery(*this);
         if (res == cudaSuccess) {
             return true;
@@ -418,7 +431,7 @@ struct CudaAllocator : private Arena {
         if (res == cudaErrorMemoryAllocation) [[unlikely]] {
            throw std::bad_alloc();
        }
-        CHECK_CUDA(("Arena::doMalloc", res));
+        CHECK_CUDA(res /* Arena::doMalloc */);
         return static_cast<T *>(ptr);
     }
 
@@ -459,6 +472,21 @@ struct CudaAllocator : private Arena {
 template <class T>
 using CudaVector = std::vector<T, CudaAllocator<T>>;
 
+#if defined(__clang__) && defined(__CUDACC__) && defined(__GLIBCXX__)
+__host__ __device__ static void printf(const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+#if __CUDA_ARCH__
+    ::vprintf(fmt, (const char *)args);
+#else
+    ::vprintf(fmt, args);
+#endif
+    va_end(args);
+}
+#else
+using ::printf;
+#endif
+
 // #if __cpp_lib_memory_resource
 // template <class Arena>
 // struct CudaResource : std::pmr::memory_resource, private Arena {
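A minimal host-side sketch of the renamed synchronization API, using only members visible in this diff (CudaStream::nullStream(), join(), joinReady(), and the raw-callback joinAsync overload); drain_default_stream is a hypothetical helper, and the capture-less lambda is assumed to convert to cudaStreamCallback_t as it does on Linux toolchains:

#include <cstdio>
#include "cudapp.cuh"

void drain_default_stream() {
    auto stream = cudapp::CudaStream::nullStream();

    // Register a host callback that fires once all prior work on the stream completes.
    stream.joinAsync([](cudaStream_t, cudaError_t status, void *) {
        std::printf("stream drained, status = %d\n", (int)status);
    }, nullptr);

    // Non-blocking poll (wraps cudaStreamQuery)...
    while (!stream.joinReady()) {
        // ...do other host work here...
    }

    // ...or block until the stream is empty (wraps cudaStreamSynchronize).
    stream.join();
}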

slides/moderncuda/example.cu → slides/moderncuda/example/example.cu

+1 -2

@@ -5,8 +5,7 @@
 }
 
 __device__ void device_func() {
-    auto t = cooperative_groups::this_thread();
-    t.size();
+    std::sin(1);
 }
 
 __host__ __device__ void host_device_func() {
Two further files renamed without changes.

slides/moderncuda/main.cu

+33

@@ -0,0 +1,33 @@
+#include <cuda_runtime.h>
+#include <nvfunctional>
+#include "cudapp.cuh"
+
+using namespace cudapp;
+
+extern "C" __global__ void kernel(int x) {
+    printf("kernel argument x = %d\n", x);
+    printf("thread index (%d, %d)\n", blockIdx.x, threadIdx.x);
+}
+
+int main() {
+    int x = 42;
+    kernel<<<3, 4, 0, 0>>>(x);
+
+    void *args[] = {&x};
+    CHECK_CUDA(cudaLaunchKernel((const void *)kernel, dim3(3), dim3(4), args, 0, 0));
+
+    cudaLaunchConfig_t cfg{};
+    cfg.blockDim = dim3(3);
+    cfg.gridDim = dim3(4);
+    cfg.dynamicSmemBytes = 0;
+    cfg.stream = 0;
+    cfg.attrs = nullptr;
+    cfg.numAttrs = 0;
+    CHECK_CUDA(cudaLaunchKernelEx(&cfg, kernel, x));
+
+    const char *name;
+    CHECK_CUDA(cudaFuncGetName(&name, (const void *)kernel));
+
+    CudaStream::nullStream().join();
+    return 0;
+}
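The new main.cu launches the same kernel three ways: the <<<grid, block>>> triple-chevron syntax, cudaLaunchKernel with an array of pointers to the arguments, and cudaLaunchKernelEx with an explicit cudaLaunchConfig_t. A hypothetical extension of the args-array form to a kernel with more than one parameter (kernel2 and launch_kernel2 are made up for illustration; the array must hold one pointer per argument, in declaration order):

#include <cuda_runtime.h>
#include "cudapp.cuh"  // assumed available for CHECK_CUDA

extern "C" __global__ void kernel2(int x, float y) {
    printf("x = %d, y = %f\n", x, y);
}

void launch_kernel2(int x, float y) {
    // One pointer per kernel parameter, in declaration order.
    void *args[] = {&x, &y};
    CHECK_CUDA(cudaLaunchKernel((const void *)kernel2, dim3(3), dim3(4), args,
                                /*sharedMem=*/0, /*stream=*/0));
}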

slides/moderncuda/tinybench.cpp

+1 -6

@@ -1,8 +1,3 @@
+#define TINYBENCH_IMPL_MAIN
 #define TINYBENCH_IMPL
 #include "tinybench.hpp"
-
-[[gnu::weak]] int main() {
-    std::unique_ptr<tinybench::Reporter> rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()}));
-    rep->run_all();
-    return 0;
-}

slides/moderncuda/tinybench.hpp

+10

@@ -885,3 +885,13 @@ Reporter *makeMultipleReporter(std::vector<Reporter *> const &reporters) {
 
 }
 #endif
+
+#ifdef TINYBENCH_IMPL_MAIN
+#include <memory>
+
+[[gnu::weak]] int main() {
+    std::unique_ptr<tinybench::Reporter> rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()}));
+    rep->run_all();
+    return 0;
+}
+#endif
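With the [[gnu::weak]] main() moved into tinybench.hpp behind TINYBENCH_IMPL_MAIN, the benchmark binary gets a default entry point only when no other translation unit supplies one. A hypothetical override using only the reporter API visible in this hunk (a strong main() in any other TU takes precedence over the weak default on GCC/Clang):

// custom_main.cpp -- a strong main() that overrides the weak default from tinybench.hpp.
#include <memory>
#include "tinybench.hpp"

int main() {
    // Same shape as the default: run every registered benchmark through the console reporter;
    // extra reporters or filtering could be added here.
    std::unique_ptr<tinybench::Reporter> rep(
        tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()}));
    rep->run_all();
    return 0;
}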
