
Commit 7e0cdf8

add cuda codes
1 parent 49a2c1b commit 7e0cdf8

16 files changed: +3657 −28 lines

cudaguide/1_moderncuda/README.md

17 additions & 9 deletions

@@ -320,13 +320,23 @@ Functions in CUDA fall into three major categories:

 However, each process on the CPU calls the `main` entry point only once, whereas a `__global__` function on the GPU is invoked n times (with n specified by the host function). All n launched `__global__` invocations run in parallel with one another, and each thread's entry point is the `__global__` function, which is why one "grid" contains many "threads".

+Prefixing a function definition with one of these keywords changes the function's type.
+
+The `__host__` qualifier may be omitted: a function with no prefix is `__host__` by default. This is why plain C++ code can be embedded seamlessly in CUDA code; C++ functions are treated as `__host__` by default and compiled for the CPU side.
+
+Here is an example:
+
 ```cuda
 #include <cuda_runtime.h>

-/*__host__*/ void host_func() {
+/*__host__*/ void host_func() { // compile a CPU version of this function only
+    printf("hello from host!\n");
+    std::cout << "cout from host!" << std::endl;
 }

-__device__ void device_func() {
+__device__ void device_func() { // compile a GPU version of this function only
+    printf("hello from device!\n"); // OK! CUDA provides a special device-side overload of printf that can be called from the GPU
+    std::cout << "cout from host!" << std::endl; // compile error: CUDA provides no device-side version of C++'s std::cout, so it cannot be called on the GPU
 }

 __host__ __device__ void host_device_func() {
@@ -374,7 +384,9 @@ int main() {
     // kernel<<<blockDim, gridDim>>>(...)
     kernel<<<3, 4>>>();

-    // Force synchronization: wait for all previously launched kernels to finish
+    // CUDA kernel launches are asynchronous, similar in effect to std::thread(kernel).detach(),
+    // so we must force synchronization and wait for all previously launched kernels to finish before exiting;
+    // otherwise the CPU program may exit before the kernel even gets to run, and its printf output is never received
     cudaDeviceSynchronize();

     return 0;
@@ -398,7 +410,7 @@ int main() {
 | Launching a kernel `kernel<<<3, 4>>>()` | `std::async` or `std::thread` |
 | `cudaDeviceSynchronize` | `future.wait()` or `thread.join()` |

-### The CUDA framework 小彭老师 has prepared for you
+### 小彭老师 modernizes CUDA

 ```cuda
 #include <cuda_runtime.h>
@@ -433,18 +445,14 @@ int main() {
     cfg.numAttrs = 0;
     CHECK_CUDA(cudaLaunchKernelEx(&cfg, kernel, x));

-    const char *name;
-    CHECK_CUDA(cudaFuncGetName(&name, kernel));
-    printf("kernel name: %s\n", name);
-
     // 1. Force synchronization: wait for all previously launched kernels to finish
     CHECK_CUDA(cudaDeviceSynchronize());

     // 2. Synchronize only stream 0 (the null stream)
     CHECK_CUDA(cudaStreamSynchronize(0));

     // 3. Synchronize only stream 0, but through 小彭老师's modern CUDA framework
-    CudaStream::nullStream().join();
+    CudaStream::defaultStream().join();

     return 0;
 }
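For reference, the point made by the added README paragraphs can be condensed into a minimal, self-contained sketch (illustrative only, not part of this commit): each qualifier selects which side the function is compiled for, and a `<<<...>>>` launch returns immediately, so the host must synchronize before exiting.

```cuda
#include <cuda_runtime.h>
#include <cstdio>

__device__ void device_func() {          // compiled for the GPU only
    printf("hello from device!\n");
}

__host__ __device__ void both_func() {   // compiled twice: a CPU version and a GPU version
    printf("hello from host and device!\n");
}

__global__ void kernel() {               // entry point of each GPU thread
    device_func();
    both_func();
}

int main() {                             // no prefix, so __host__ by default: plain CPU code
    kernel<<<3, 4>>>();                  // asynchronous, like std::thread(kernel).detach()
    both_func();                         // the CPU version runs while the GPU works
    cudaDeviceSynchronize();             // wait here, or the process may exit before the kernel prints
    return 0;
}
```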

cudaguide/1_moderncuda/cudapp.cuh

41 additions & 13 deletions

@@ -1,12 +1,13 @@
 #pragma once

-#include "debug.hpp"
 #include <cuda_runtime.h>
+#include <nvfunctional>
 #include <version>
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstdarg>
+#include <cstdarg>
 #include <memory>
 #include <new>
 #include <string>
@@ -187,6 +188,8 @@ private:

 public:
     CudaMemPool(std::nullptr_t) noexcept {}
+    CudaMemPool(CudaMemPool &&) = default;
+    CudaMemPool &operator=(CudaMemPool &&) = default;

     struct Builder {
     private:
@@ -259,12 +262,17 @@ private:

 public:
     CudaEvent(std::nullptr_t) noexcept {}
+    CudaEvent(CudaEvent &&) = default;
+    CudaEvent &operator=(CudaEvent &&) = default;

     struct Builder {
     private:
         int flags = cudaEventDefault;

     public:
+        Builder() = default;
+        explicit Builder(int flags) noexcept : flags(flags) {}
+
         Builder &withBlockingSync(bool blockingSync = true) noexcept {
             if (blockingSync) {
                 flags |= cudaEventBlockingSync;
@@ -303,24 +311,28 @@ public:
         CHECK_CUDA(cudaEventSynchronize(*this));
     }

-    bool joinReady() const {
+    bool poll() const {
         cudaError_t res = cudaEventQuery(*this);
         if (res == cudaSuccess) {
             return true;
         }
         if (res == cudaErrorNotReady) {
             return false;
         }
-        CHECK_CUDA(res);
+        CHECK_CUDA(res /* cudaEventQuery */);
         return false;
     }

     float elapsedMillis(CudaEvent const &event) const {
         float result;
-        CHECK_CUDA(cudaEventElapsedTime(&result, *this, event));
+        CHECK_CUDA(cudaEventElapsedTime(&result, event, *this));
         return result;
     }

+    float operator-(CudaEvent const &event) const {
+        return elapsedMillis(event);
+    }
+
     ~CudaEvent() {
         if (*this) {
             CHECK_CUDA(cudaEventDestroy(*this));
@@ -335,12 +347,17 @@ private:

 public:
     CudaStream(std::nullptr_t) noexcept {}
+    CudaStream(CudaStream &&) = default;
+    CudaStream &operator=(CudaStream &&) = default;

     struct Builder {
     private:
         int flags = cudaStreamDefault;

     public:
+        Builder() = default;
+        explicit Builder(int flags) noexcept : flags(flags) {}
+
         Builder &withNonBlocking(bool nonBlocking = true) noexcept {
             if (nonBlocking) {
                 flags |= cudaStreamNonBlocking;
@@ -357,10 +374,14 @@ public:
         }
     };

-    static CudaStream nullStream() noexcept {
+    static CudaStream defaultStream() noexcept {
         return CudaStream(nullptr);
     }

+    static CudaStream perThreadStream() noexcept {
+        return CudaStream(cudaStreamPerThread);
+    }
+
     void copy(void *dst, void *src, size_t size, cudaMemcpyKind kind) const {
         CHECK_CUDA(cudaMemcpyAsync(dst, src, size, kind, *this));
     }
@@ -381,11 +402,17 @@ public:
         copy(dst, src, size, cudaMemcpyHostToHost);
     }

-    void record(CudaEvent const &event) const {
+    void recordEvent(CudaEvent const &event) const {
         CHECK_CUDA(cudaEventRecord(event, *this));
     }

-    void wait(CudaEvent const &event,
+    CudaEvent recordEvent() const {
+        CudaEvent event = CudaEvent::Builder().build();
+        recordEvent(event);
+        return event;
+    }
+
+    void waitEvent(CudaEvent const &event,
               unsigned int flags = cudaEventWaitDefault) const {
         CHECK_CUDA(cudaStreamWaitEvent(*this, event, flags));
     }
@@ -403,22 +430,23 @@ public:
         auto userData = std::make_unique<Func>();
         cudaStreamCallback_t callback = [](cudaStream_t stream,
                                            cudaError_t status, void *userData) {
+            CHECK_CUDA(status /* joinAsync cudaStreamCallback */);
             std::unique_ptr<Func> func(static_cast<Func *>(userData));
-            (*func)(stream, status);
+            (*func)();
         };
         joinAsync(callback, userData.get());
         userData.release();
     }

-    bool joinReady() const {
+    bool poll() const {
         cudaError_t res = cudaStreamQuery(*this);
         if (res == cudaSuccess) {
             return true;
         }
         if (res == cudaErrorNotReady) {
             return false;
         }
-        CHECK_CUDA(res);
+        CHECK_CUDA(res /* cudaStreamQuery */);
         return false;
     }

@@ -428,7 +456,7 @@ public:
     }

     ~CudaStream() {
-        if (*this) {
+        if (*this && *this != cudaStreamPerThread) {
             CHECK_CUDA(cudaStreamDestroy(*this));
         }
     }
@@ -522,8 +550,8 @@ struct CudaAllocator : private Arena {
     };
 };

-template <class T>
-using CudaVector = std::vector<T, CudaAllocator<T>>;
+template <class T, class Arena = CudaManagedArena>
+using CudaVector = std::vector<T, CudaAllocator<T, Arena>>;

 #if defined(__clang__) && defined(__CUDACC__) && defined(__GLIBCXX__)
 __host__ __device__ static void printf(const char *fmt, ...) {
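To see how the renamed API reads after this change, here is a usage sketch (illustrative, not part of the commit). It only uses names visible in this diff — defaultStream(), the new recordEvent() overload, poll(), join(), and the operator- shorthand for elapsedMillis() — and assumes the cudapp.cuh from this commit is on the include path:

```cuda
#include <cuda_runtime.h>
#include <cstdio>
#include "cudapp.cuh"

__global__ void kernel() {
    printf("block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main() {
    CudaStream stream = CudaStream::defaultStream();  // wraps stream 0; its destructor skips cudaStreamDestroy

    CudaEvent start = stream.recordEvent();  // new overload: builds and records an event in one call
    kernel<<<4, 3>>>();                      // launched on the default stream, i.e. the wrapped stream
    CudaEvent stop = stream.recordEvent();

    while (!stream.poll()) {                 // poll() (formerly joinReady()): non-blocking completion check
        // the host can do other work here while the GPU runs
    }
    stream.join();                           // blocking alternative; a no-op here since poll() already returned true

    printf("kernel took %.3f ms\n", stop - start);  // operator- forwards to elapsedMillis()
    return 0;
}
```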

cudaguide/1_moderncuda/main.cu

7 additions & 6 deletions

@@ -22,26 +22,27 @@ int main() {

     // 3. cudaLaunchKernelEx
     cudaLaunchConfig_t cfg{};
-    cfg.blockDim = dim3(3);
-    cfg.gridDim = dim3(4);
-    cfg.dynamicSmemBytes = 0;
-    cfg.stream = 0;
+    cfg.blockDim = dim3(3); // the range over which threadIdx varies
+    cfg.gridDim = dim3(4); // the range over which blockIdx varies
+    cfg.dynamicSmemBytes = 0; // shared-memory size (not used yet)
+    cfg.stream = 0; // launch on stream 0 (the default stream)
     cfg.attrs = nullptr;
     cfg.numAttrs = 0;
     CHECK_CUDA(cudaLaunchKernelEx(&cfg, kernel, x));

+    // the kernel's name can also be queried
     const char *name;
     CHECK_CUDA(cudaFuncGetName(&name, kernel));
     printf("kernel name: %s\n", name);

     // 1. Force synchronization: wait for all previously launched kernels to finish
     CHECK_CUDA(cudaDeviceSynchronize());

-    // 2. Synchronize only stream 0 (the null stream)
+    // 2. Synchronize only stream 0 (the default stream): wait for all kernels previously launched on stream 0 to finish
     CHECK_CUDA(cudaStreamSynchronize(0));

     // 3. Synchronize only stream 0, but through 小彭老师's modern CUDA framework
-    CudaStream::nullStream().join();
+    CudaStream::defaultStream().join();

     return 0;
 }
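As a side note on the new comments: cfg.blockDim bounds threadIdx and cfg.gridDim bounds blockIdx, and the whole cudaLaunchKernelEx configuration corresponds to the classic execution configuration <<<gridDim, blockDim, dynamicSmemBytes, stream>>> (grid dimension first). A small illustrative sketch, not part of the commit:

```cuda
#include <cuda_runtime.h>
#include <cstdio>

__global__ void kernel(int x) {
    // With cfg.gridDim = dim3(4) and cfg.blockDim = dim3(3):
    //   blockIdx.x varies over 0..3 and threadIdx.x varies over 0..2,
    //   so 4 * 3 = 12 threads run, each with a unique global index.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    printf("global thread %d received x = %d\n", i, x);
}

int main() {
    int x = 42;
    // Equivalent to cudaLaunchKernelEx with the cfg fields shown above:
    kernel<<<dim3(4), dim3(3), /*dynamicSmemBytes=*/0, /*stream=*/0>>>(x);
    cudaDeviceSynchronize();
    return 0;
}
```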

cudaguide/2_cudastream/CMakeLists.txt

33 additions & 0 deletions

@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.12)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CUDA_STANDARD 20)
+set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --expt-extended-lambda")
+
+if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    # try to set CMAKE_CUDA_ARCHITECTURES to native card version
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+        set(CMAKE_CUDA_ARCHITECTURES native)
+    else()
+        message(STATUS "Running __nvcc_device_query for CMAKE_CUDA_ARCHITECTURES")
+        execute_process(COMMAND "__nvcc_device_query"
+            RESULT_VARIABLE NVCC_DEVICE_QUERY_RESULT
+            OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES
+            OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if (NVCC_DEVICE_QUERY_RESULT EQUAL 0)
+            message(STATUS "CMAKE_CUDA_ARCHITECTURES set to ${CMAKE_CUDA_ARCHITECTURES}")
+        else()
+            message(WARNING "CMAKE_CUDA_ARCHITECTURES not defined and __nvcc_device_query failed.")
+            unset(CMAKE_CUDA_ARCHITECTURES)
+        endif()
+    endif()
+endif()
+
+project(cudaradixsort LANGUAGES CXX CUDA)
+
+file(GLOB sources "*.cpp" "*.cu")
+add_executable(${PROJECT_NAME} ${sources})
+target_link_libraries(${PROJECT_NAME} PRIVATE cusparse cublas)
+find_package(fmt REQUIRED)
+target_link_libraries(${PROJECT_NAME} PRIVATE fmt::fmt)
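The two nvcc flags added to CMAKE_CUDA_FLAGS matter for the code in this repo: --expt-extended-lambda allows __device__ lambdas to be written in host code, and --expt-relaxed-constexpr lets device code call constexpr host functions. A minimal illustrative sketch of what the first flag unlocks (not part of the commit):

```cuda
#include <cuda_runtime.h>
#include <cstdio>

// A kernel template that invokes an arbitrary callable once per thread.
template <class Func>
__global__ void for_each_thread(Func func) {
    func(blockIdx.x * blockDim.x + threadIdx.x);
}

int main() {
    // A __device__ lambda in host code is only accepted with --expt-extended-lambda
    // (set in CMAKE_CUDA_FLAGS above).
    for_each_thread<<<4, 3>>>([] __device__ (int i) {
        printf("extended lambda running as thread %d\n", i);
    });
    cudaDeviceSynchronize();
    return 0;
}
```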
