cuda_common.hpp
#ifndef CUDA_COMMON_H
#define CUDA_COMMON_H
#include <cuda_runtime.h>
#include <fmt/core.h>
#include <fmt/os.h>
#include <lodepng.h>
#include <algorithm>
#include <argparse/argparse.hpp>
#include <cinttypes>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
#include "common.hpp"
// #if __has_include(<hdf5.h>)
// #define HAS_HDF5
// namespace _hdf5 {
// #include <hdf5.h>
// }
// #endif
class cuda_error : public std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
inline auto cuda_error_string(cudaError_t err) {
const char *err_name = cudaGetErrorName(err);
const char *err_str = cudaGetErrorString(err);
return fmt::format("{}: {}", std::string{err_name}, std::string{err_str});
}
inline auto _cuda_check_error(cudaError_t err, const char *file, int line_num) {
if (err != cudaSuccess) {
throw cuda_error(
fmt::format("{}:{}: {}", file, line_num, cuda_error_string(err)));
}
}
#define CUDA_CHECK(x) _cuda_check_error((x), __FILE__, __LINE__)
/// Raise an exception if CUDA is in an error state, with the name and description
inline auto cuda_throw_error() -> void {
auto err = cudaGetLastError();
if (err != cudaSuccess) {
throw cuda_error(cuda_error_string(err));
}
}
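/*
* Usage sketch (illustrative only; `d_buffer`, `size` and `my_kernel` are
* hypothetical names, not part of this header). CUDA_CHECK wraps calls that
* return a cudaError_t, while cuda_throw_error() surfaces sticky errors such
* as failed kernel launches:
*
*     CUDA_CHECK(cudaMemset(d_buffer, 0, size));
*     my_kernel<<<blocks, threads>>>(d_buffer);
*     cuda_throw_error();
*/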
class CUDAArgumentParser;
struct CUDAArguments {
public:
bool verbose = false;
std::string file;
int device_index = 0;
std::optional<size_t> image_number;
cudaDeviceProp device;
private:
friend class CUDAArgumentParser;
};
class CUDAArgumentParser : public argparse::ArgumentParser {
public:
CUDAArgumentParser(std::string version = "0.1.0")
: ArgumentParser("", version, argparse::default_arguments::help) {
this->add_argument("--version")
.help("print version information and exits")
.action([=](const auto & /*unused*/) {
fmt::print("{}\n", version);
std::exit(0);
})
.default_value(false)
.implicit_value(true)
.nargs(0);
this->add_argument("-v", "--verbose")
.help("Verbose output")
.implicit_value(false)
.action([&](const std::string &value) { _arguments.verbose = true; });
this->add_argument("-d", "--device")
.help("Index of the CUDA device device to target.")
.default_value(0)
.metavar("INDEX")
.action([&](const std::string &value) {
_arguments.device_index = std::stoi(value);
return _arguments.device_index;
});
this->add_argument("--image")
.help("Single image number to analyse, if not all")
.metavar("NUM")
.action([&](const std::string &value) {
_arguments.image_number = std::stoi(value);
return _arguments.image_number;
});
this->add_argument("--list-devices")
.help("List the order of CUDA devices, then quit.")
.implicit_value(false)
.action([](const std::string &value) {
int deviceCount;
if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
fmt::print("\033[1;31mError: Could not get GPU count ({})\033[0m\n",
cudaGetErrorString(cudaGetLastError()));
std::exit(1);
}
fmt::print("System GPUs:\n");
for (int device = 0; device < deviceCount; ++device) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device);
fmt::print(" {:2d}: {} (PCI {}:{}:{}, CUDA {}.{})\n",
device,
bold("{}", deviceProp.name),
deviceProp.pciDomainID,
deviceProp.pciBusID,
deviceProp.pciDeviceID,
deviceProp.major,
deviceProp.minor);
}
std::exit(0);
});
}
auto parse_args(int argc, char **argv) -> CUDAArguments {
// Convert these to std::string
std::vector<std::string> args{argv, argv + argc};
// Look for a "common.args" file in the current folder. If
// present, add each line as an argument.
std::filesystem::path argfile{"common.args"};
if (std::filesystem::exists(argfile)) {
fmt::print("File {} exists, loading default args:\n",
bold(argfile.string()));
std::fstream f{argfile};
std::string arg;
while (std::getline(f, arg)) {
// Skip arguments that are already present on the command line
if (std::find(args.begin(), args.end(), arg) != args.end()) {
continue;
}
if (arg.size() > 0) {
fmt::print(" {}\n", arg);
args.push_back(arg);
}
}
}
try {
ArgumentParser::parse_args(args);
} catch (std::runtime_error &e) {
fmt::print("{}: {}\n{}\n",
bold(red("Error")),
red(e.what()),
ArgumentParser::usage());
std::exit(1);
}
if (cudaSetDevice(_arguments.device_index) != cudaSuccess) {
fmt::print(
"\033[1;31m{}\033[0m\033[31m: Could not select device ({})\033[0m\n",
"Error",
cuda_error_string(cudaGetLastError()));
std::exit(1);
}
if (cudaGetDeviceProperties(&_arguments.device, _arguments.device_index)
!= cudaSuccess) {
fmt::print(fmt::runtime(red("{}: Could not inspect GPU ({})\n",
bold("Error"),
cuda_error_string(cudaGetLastError()))));
std::exit(1);
}
fmt::print("Using {} (CUDA {}.{})\n\n",
bold(_arguments.device.name),
_arguments.device.major,
_arguments.device.minor);
#ifdef HAS_HDF5
// If we activated h5read, then handle hdf5 verbosity
if (_activated_h5read && !_arguments.verbose) {
_hdf5::H5Eset_auto((_hdf5::hid_t)0, NULL, NULL);
}
#endif
return _arguments;
}
void add_h5read_arguments() {
bool implicit_sample = std::getenv("H5READ_IMPLICIT_SAMPLE") != NULL;
auto &group = add_mutually_exclusive_group(!implicit_sample);
group.add_argument("--sample")
.help(
"Don't load a data file, instead use generated test data. If "
"H5READ_IMPLICIT_SAMPLE is set, then this is assumed, if a file is not "
"provided.")
.implicit_value(true);
group.add_argument("file")
.metavar("FILE.nxs")
.help("Path to the Nexus file to parse")
.action([&](const std::string &value) { _arguments.file = value; });
_activated_h5read = true;
}
private:
CUDAArguments _arguments{};
bool _activated_h5read = false;
};
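/*
* A minimal sketch of driving the parser from main(); `do_processing` is a
* hypothetical stand-in for the caller's own work:
*
*     int main(int argc, char **argv) {
*         CUDAArgumentParser parser;
*         parser.add_h5read_arguments();
*         auto args = parser.parse_args(argc, argv);
*         if (args.verbose) fmt::print("Reading {}\n", args.file);
*         do_processing(args);
*     }
*/
/// Allocate device memory with cudaMalloc, owned by a unique_ptr that calls
/// cudaFree on destruction. Pass an unbounded array type (e.g. T = float[])
/// to allocate an array of num_items elements.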
template <typename T>
auto make_cuda_malloc(size_t num_items = 1) {
using Tb = typename std::remove_extent<T>::type;
Tb *obj = nullptr;
auto err = cudaMalloc(&obj, sizeof(Tb) * num_items);
if (err != cudaSuccess || obj == nullptr) {
throw cuda_error(
fmt::format("Error in make_cuda_malloc: {}", cuda_error_string(err)));
}
auto deleter = [](Tb *ptr) { cudaFree(ptr); };
return std::unique_ptr<T, decltype(deleter)>{obj, deleter};
}
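/*
* Usage sketch (`width` and `height` are hypothetical dimensions):
*
*     auto d_image = make_cuda_malloc<float[]>(width * height);
*     CUDA_CHECK(cudaMemset(d_image.get(), 0, width * height * sizeof(float)));
*/
/// Allocate unified (managed) memory with cudaMallocManaged, accessible from
/// both host and device, owned by a unique_ptr that calls cudaFree.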
template <typename T>
auto make_cuda_managed_malloc(size_t num_items) {
using Tb = typename std::remove_extent<T>::type;
Tb *obj = nullptr;
auto err = cudaMallocManaged(&obj, sizeof(Tb) * num_items);
if (err != cudaSuccess || obj == nullptr) {
throw cuda_error(
fmt::format("Error in make_cuda_managed_malloc: {}", cuda_error_string(err)));
}
auto deleter = [](Tb *ptr) { cudaFree(ptr); };
return std::unique_ptr<T, decltype(deleter)>{obj, deleter};
}
/// Allocate memory using cudaMallocHost
template <typename T>
auto make_cuda_pinned_malloc(size_t num_items = 1) {
using Tb = typename std::remove_extent<T>::type;
Tb *obj = nullptr;
auto err = cudaMallocHost(&obj, sizeof(Tb) * num_items);
if (err != cudaSuccess || obj == nullptr) {
throw cuda_error(
fmt::format("Error in make_cuda_pinned_malloc: {}", cuda_error_string(err)));
}
auto deleter = [](Tb *ptr) { cudaFreeHost(ptr); };
return std::shared_ptr<T[]>{obj, deleter};
}
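/// Allocate a pitched 2D device buffer with cudaMallocPitch. Returns a pair of
/// {shared_ptr to the allocation, pitch in elements (not bytes)}.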
template <typename T>
auto make_cuda_pitched_malloc(size_t width, size_t height) {
static_assert(!std::is_unbounded_array_v<T>,
"Pass the element type; make_cuda_pitched_malloc always returns a T[] pointer");
size_t pitch = 0;
T *obj = nullptr;
auto err = cudaMallocPitch(&obj, &pitch, width * sizeof(T), height);
if (err != cudaSuccess || obj == nullptr) {
throw cuda_error(
fmt::format("Error in make_cuda_pitched_malloc: {}", cuda_error_string(err)));
}
auto deleter = [](T *ptr) { cudaFree(ptr); };
return std::make_pair(std::shared_ptr<T[]>(obj, deleter), pitch / sizeof(T));
}
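/*
* Usage sketch combining the pinned and pitched helpers (`width` and `height`
* are hypothetical image dimensions). Note that the returned pitch is in
* elements, so it is scaled back to bytes for cudaMemcpy2D:
*
*     auto host = make_cuda_pinned_malloc<uint16_t>(width * height);
*     auto [dev, dev_pitch] = make_cuda_pitched_malloc<uint16_t>(width, height);
*     CUDA_CHECK(cudaMemcpy2D(dev.get(), dev_pitch * sizeof(uint16_t),
*                             host.get(), width * sizeof(uint16_t),
*                             width * sizeof(uint16_t), height,
*                             cudaMemcpyHostToDevice));
*/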
/**
* @brief Owning wrapper around a pitched 2D memory buffer on the GPU.
* @param data The shared pointer to the allocated device memory.
* @param width The width of the buffer, in elements.
* @param height The height of the buffer, in elements.
* @param pitch The pitch of the buffer, in elements.
*/
template <typename T>
struct PitchedMalloc {
public:
using value_type = T;
PitchedMalloc(std::shared_ptr<T[]> data, size_t width, size_t height, size_t pitch)
: _data(data), width(width), height(height), pitch(pitch) {}
PitchedMalloc(size_t width, size_t height) : width(width), height(height) {
auto [alloc, alloc_pitch] = make_cuda_pitched_malloc<T>(width, height);
_data = alloc;
pitch = alloc_pitch;
}
auto get() {
return _data.get();
}
auto size_bytes() const -> size_t {
return pitch * height * sizeof(T);
}
auto pitch_bytes() const -> size_t {
return pitch * sizeof(T);
}
std::shared_ptr<T[]> _data;
size_t width;
size_t height;
size_t pitch;
};
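/*
* Usage sketch: PitchedMalloc bundles a pitched allocation with its geometry,
* so the element pitch can be passed straight to a kernel (`my_kernel`,
* `blocks` and `threads` are hypothetical):
*
*     PitchedMalloc<float> image(width, height);
*     my_kernel<<<blocks, threads>>>(image.get(), image.pitch, width, height);
*/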
class CudaStream {
cudaStream_t _stream;
public:
CudaStream() {
if (cudaStreamCreate(&_stream) != cudaSuccess) {
cuda_throw_error();
}
}
~CudaStream() {
cudaStreamDestroy(_stream);
}
operator cudaStream_t() const {
return _stream;
}
};
class CudaEvent {
cudaEvent_t event;
public:
CudaEvent() {
if (cudaEventCreate(&event) != cudaSuccess) {
cuda_throw_error();
}
}
CudaEvent(cudaEvent_t event) : event(event) {}
~CudaEvent() {
cudaEventDestroy(event);
}
void record(cudaStream_t stream = 0) {
if (cudaEventRecord(event, stream) != cudaSuccess) {
cuda_throw_error();
}
}
/// Elapsed Event time, in milliseconds
float elapsed_time(CudaEvent &since) {
float elapsed_time = 0.0f;
if (cudaEventElapsedTime(&elapsed_time, since.event, event) != cudaSuccess) {
cuda_throw_error();
}
return elapsed_time;
}
void synchronize() {
if (cudaEventSynchronize(event) != cudaSuccess) {
cuda_throw_error();
}
}
};
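/*
* Timing sketch using the RAII stream/event wrappers (`my_kernel`, `blocks`,
* `threads` and `d_data` are hypothetical; the events are recorded on the
* same stream as the work):
*
*     CudaStream stream;
*     CudaEvent start, stop;
*     start.record(stream);
*     my_kernel<<<blocks, threads, 0, stream>>>(d_data);
*     stop.record(stream);
*     stop.synchronize();
*     fmt::print("Kernel took {:.2f} ms\n", stop.elapsed_time(start));
*/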
/**
* @brief Save a 2D device array to a PNG file.
*
* @tparam PixelType The data type of the pixels (e.g., uint8_t).
* @tparam TransformFunc A callable object or lambda that performs the pixel transformation.
* @param device_ptr Pointer to the device array.
* @param device_pitch The pitch (width in bytes) of the device array.
* @param width The width of the image in pixels.
* @param height The height of the image in pixels.
* @param stream The CUDA stream to use for asynchronous copy and synchronization.
* @param output_filename The name of the output PNG file.
* @param transform_func The pixel transformation function (e.g., to invert pixel values).
*/
template <typename PixelType, typename TransformFunc>
void save_device_data_to_png(PixelType *device_ptr,
size_t device_pitch,
int width,
int height,
cudaStream_t stream,
const std::string &output_filename,
TransformFunc transform_func) {
// Allocate host vector to hold the copied data
std::vector<PixelType> host_data(width * height);
// Copy data from device to host asynchronously
cudaMemcpy2DAsync(host_data.data(),
width * sizeof(PixelType), // Host pitch (bytes)
device_ptr,
device_pitch, // Device pitch (bytes)
width * sizeof(PixelType), // Width (bytes)
height,
cudaMemcpyDeviceToHost,
stream);
// Synchronize the stream to ensure the copy is complete
cudaStreamSynchronize(stream);
// Apply the transformation function to each pixel
for (auto &pixel : host_data) {
pixel = transform_func(pixel);
}
// Encode and save the image as a PNG file
lodepng::encode(fmt::format("{}.png", output_filename),
reinterpret_cast<const unsigned char *>(host_data.data()),
width,
height,
LCT_GREY);
}
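/*
* Usage sketch: write a thresholded mask image, assuming `d_mask` is a pitched
* uint8_t device buffer with pitch `d_pitch_bytes` (both names hypothetical):
*
*     save_device_data_to_png(d_mask, d_pitch_bytes, width, height, stream,
*                             "strong_pixels",
*                             [](uint8_t px) -> uint8_t { return px ? 255 : 0; });
*/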
/**
* @brief Save the coordinates of pixels that satisfy a condition to a text file.
*
* @tparam PixelType The data type of the pixels (e.g., uint8_t).
* @tparam ConditionFunc A callable object or lambda that determines which pixels should be logged.
* @param device_ptr Pointer to the device array.
* @param device_pitch The pitch (width in bytes) of the device array.
* @param width The width of the image in pixels.
* @param height The height of the image in pixels.
* @param stream The CUDA stream to use for asynchronous copy and synchronization.
* @param output_filename The name of the output text file.
* @param condition_func The condition function that returns true for pixels to be logged.
*/
template <typename PixelType, typename ConditionFunc>
void save_device_data_to_txt(PixelType *device_ptr,
size_t device_pitch,
int width,
int height,
cudaStream_t stream,
const std::string &output_filename,
ConditionFunc condition_func) {
// Allocate host vector to hold the copied data
std::vector<PixelType> host_data(width * height);
// Copy data from device to host asynchronously
cudaMemcpy2DAsync(host_data.data(),
width * sizeof(PixelType), // Host pitch (bytes)
device_ptr,
device_pitch, // Device pitch (bytes)
width * sizeof(PixelType), // Width (bytes)
height,
cudaMemcpyDeviceToHost,
stream);
// Synchronize the stream to ensure the copy is complete
cudaStreamSynchronize(stream);
// Open an output file for the coordinates
auto out = fmt::output_file(fmt::format("{}.txt", output_filename));
// Write the coordinates of the pixels that satisfy the condition
for (int y = 0, k = 0; y < height; ++y) {
for (int x = 0; x < width; ++x, ++k) {
if (condition_func(host_data[k])) {
out.print("{}, {}, {}\n", x, y, host_data[k]);
}
}
}
}
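/*
* Usage sketch: log the coordinates of every non-zero pixel, assuming the same
* hypothetical `d_mask` buffer as above:
*
*     save_device_data_to_txt(d_mask, d_pitch_bytes, width, height, stream,
*                             "strong_pixels",
*                             [](uint8_t px) { return px != 0; });
*/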
#endif