-
Notifications
You must be signed in to change notification settings - Fork 769
/
Copy pathcommon.cpp
173 lines (149 loc) · 5.06 KB
/
common.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
//===--------- common.cpp - CUDA Adapter ----------------------------------===//
//
// Copyright (C) 2023 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "common.hpp"
#include "logger/ur_logger.hpp"
#include <cuda.h>
#include <nvml.h>
#include <sstream>
// Translates a CUDA driver status code into the corresponding UR result code.
//
// Only the codes listed in the table below have a dedicated translation; every
// other value is reported as UR_RESULT_ERROR_UNKNOWN.
ur_result_t mapErrorUR(CUresult Result) {
  struct Mapping {
    CUresult Cuda;
    ur_result_t UR;
  };
  static constexpr Mapping Table[] = {
      {CUDA_SUCCESS, UR_RESULT_SUCCESS},
      {CUDA_ERROR_NOT_PERMITTED, UR_RESULT_ERROR_INVALID_OPERATION},
      {CUDA_ERROR_INVALID_CONTEXT, UR_RESULT_ERROR_INVALID_CONTEXT},
      {CUDA_ERROR_INVALID_DEVICE, UR_RESULT_ERROR_INVALID_DEVICE},
      {CUDA_ERROR_INVALID_VALUE, UR_RESULT_ERROR_INVALID_VALUE},
      {CUDA_ERROR_OUT_OF_MEMORY, UR_RESULT_ERROR_OUT_OF_HOST_MEMORY},
      {CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, UR_RESULT_ERROR_OUT_OF_RESOURCES},
  };

  for (const Mapping &Entry : Table) {
    if (Entry.Cuda == Result) {
      return Entry.UR;
    }
  }
  return UR_RESULT_ERROR_UNKNOWN;
}
// Translates an NVML status code into the corresponding UR result code.
//
// Only the codes listed in the table below have a dedicated translation; every
// other value is reported as UR_RESULT_ERROR_UNKNOWN.
ur_result_t mapErrorUR(nvmlReturn_t Result) {
  struct Mapping {
    nvmlReturn_t Nvml;
    ur_result_t UR;
  };
  static constexpr Mapping Table[] = {
      {NVML_SUCCESS, UR_RESULT_SUCCESS},
      {NVML_ERROR_NOT_SUPPORTED, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION},
      {NVML_ERROR_GPU_IS_LOST, UR_RESULT_ERROR_DEVICE_LOST},
      {NVML_ERROR_MEMORY, UR_RESULT_ERROR_OUT_OF_HOST_MEMORY},
      {NVML_ERROR_INSUFFICIENT_RESOURCES, UR_RESULT_ERROR_OUT_OF_RESOURCES},
  };

  for (const Mapping &Entry : Table) {
    if (Entry.Nvml == Result) {
      return Entry.UR;
    }
  }
  return UR_RESULT_ERROR_UNKNOWN;
}
// Checks the status returned by a CUDA driver call.
//
// Returns without doing anything when Result is CUDA_SUCCESS or
// CUDA_ERROR_DEINITIALIZED. For any other code the failure is logged together
// with the call site (Function, File, Line). The process is then aborted if
// either the PI_CUDA_ABORT or UR_CUDA_ABORT environment variable is set;
// otherwise the UR result produced by mapErrorUR(Result) is thrown.
void checkErrorUR(CUresult Result, const char *Function, int Line,
                  const char *File) {
  if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) {
    return;
  }

  const char *ErrorString = nullptr;
  const char *ErrorName = nullptr;
  cuGetErrorName(Result, &ErrorName);
  cuGetErrorString(Result, &ErrorString);
  // The lookups can leave the pointers null (e.g. for a code the driver does
  // not recognise). Inserting a null const char* into a stream is undefined
  // behaviour, so substitute a placeholder before formatting.
  if (ErrorName == nullptr) {
    ErrorName = "<unknown>";
  }
  if (ErrorString == nullptr) {
    ErrorString = "<unknown>";
  }

  std::stringstream SS;
  SS << "\nUR CUDA ERROR:"
     << "\n\tValue: " << Result << "\n\tName: " << ErrorName
     << "\n\tDescription: " << ErrorString << "\n\tFunction: " << Function
     << "\n\tSource Location: " << File << ":" << Line << "\n";
  UR_LOG(ERR, "{}", SS.str());

  if (std::getenv("PI_CUDA_ABORT") != nullptr ||
      std::getenv("UR_CUDA_ABORT") != nullptr) {
    std::abort();
  }

  throw mapErrorUR(Result);
}
// Checks the status returned by an NVML call.
//
// NVML_SUCCESS is a no-op. Any other code is logged together with the call
// site (Function, File, Line). The process is then aborted if either the
// PI_CUDA_ABORT or UR_CUDA_ABORT environment variable is set; otherwise the UR
// result produced by mapErrorUR(Result) is thrown.
void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
                  const char *File) {
  if (Result == NVML_SUCCESS) {
    return;
  }

  const char *Description = nvmlErrorString(Result);

  std::stringstream Report;
  Report << "\nUR NVML ERROR:";
  Report << "\n\tValue: " << Result;
  Report << "\n\tDescription: " << Description;
  Report << "\n\tFunction: " << Function;
  Report << "\n\tSource Location: " << File << ":" << Line << "\n";
  UR_LOG(ERR, "{}", Report.str());

  const bool AbortRequested = std::getenv("PI_CUDA_ABORT") != nullptr ||
                              std::getenv("UR_CUDA_ABORT") != nullptr;
  if (AbortRequested) {
    std::abort();
  }

  throw mapErrorUR(Result);
}
// Checks a UR result code.
//
// UR_RESULT_SUCCESS is a no-op. Any other code is logged together with the
// call site (Function, File, Line). The process is then aborted if either the
// PI_CUDA_ABORT or UR_CUDA_ABORT environment variable is set; otherwise Result
// itself is thrown.
void checkErrorUR(ur_result_t Result, const char *Function, int Line,
                  const char *File) {
  if (Result == UR_RESULT_SUCCESS) {
    return;
  }

  std::stringstream SS;
  SS << "\nUR ERROR:"
     << "\n\tValue: " << Result << "\n\tFunction: " << Function
     << "\n\tSource Location: " << File << ":" << Line << "\n";
  UR_LOG(ERR, "{}", SS.str());

  // Honour both abort switches, matching the CUresult and nvmlReturn_t
  // overloads of checkErrorUR.
  if (std::getenv("PI_CUDA_ABORT") != nullptr ||
      std::getenv("UR_CUDA_ABORT") != nullptr) {
    std::abort();
  }

  throw Result;
}
std::string getCudaVersionString() {
int driver_version = 0;
cuDriverGetVersion(&driver_version);
// The version is returned as (1000 major + 10 minor).
std::stringstream stream;
stream << "CUDA " << driver_version / 1000 << "."
<< driver_version % 1000 / 10;
return stream.str();
}
// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR
//
// Both variables are thread_local, so every thread has its own copy. They are
// written together by setErrorMessage().
//
// Result code stored alongside the message; starts out as UR_RESULT_SUCCESS.
thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS;
// Fixed-size, zero-initialised buffer holding the message text.
thread_local char ErrorMessage[MaxMessageSize]{};
// Stores an error message and its result code in the calling thread's
// ErrorMessage / ErrorMessageCode variables.
//
// pMessage must be shorter than MaxMessageSize; this is only checked by an
// assert, so a longer message is silently truncated when asserts are disabled.
[[maybe_unused]] void setErrorMessage(const char *pMessage,
                                      ur_result_t ErrorCode) {
  assert(strlen(pMessage) < MaxMessageSize);

  // Never write the final byte of the buffer: it was zero-initialised and
  // therefore keeps the stored string null terminated.
  constexpr auto CopyLimit = MaxMessageSize - 1;
  strncpy(ErrorMessage, pMessage, CopyLimit);

  ErrorMessageCode = ErrorCode;
}
// Records a CUDA error as the adapter-specific error message.
//
// The message has the form "<error name>\n<error description>" and is stored
// via setErrorMessage() with UR_RESULT_ERROR_ADAPTER_SPECIFIC as its code.
void setPluginSpecificMessage(CUresult cu_res) {
  const char *error_string = nullptr;
  const char *error_name = nullptr;
  cuGetErrorName(cu_res, &error_name);
  cuGetErrorString(cu_res, &error_string);

  // The lookups can leave the pointers null (e.g. for a code the driver does
  // not recognise); fall back to a placeholder rather than dereferencing them.
  std::string message = error_name != nullptr ? error_name : "<unknown>";
  message += '\n';
  message += error_string != nullptr ? error_string : "<unknown>";

  // setErrorMessage copies the text, so the temporary string can go out of
  // scope right after the call.
  setErrorMessage(message.c_str(), UR_RESULT_ERROR_ADAPTER_SPECIFIC);
}
namespace umf {
// Converts a provider's native error code into a UR result code.
//
// Only the provider named "CUDA" is handled: its code is cast to CUresult and
// passed through mapErrorUR. Any other provider name yields
// UR_RESULT_ERROR_UNKNOWN.
ur_result_t getProviderNativeError(const char *providerName, int32_t error) {
  const bool IsCudaProvider = strcmp(providerName, "CUDA") == 0;
  return IsCudaProvider ? mapErrorUR(static_cast<CUresult>(error))
                        : UR_RESULT_ERROR_UNKNOWN;
}
} // namespace umf