Skip to content

Commit cf26de2

Browse files
authored
Merge pull request #940 from Naghasan/victor/kernel-fusion-amd
[UR][HIP] Enable kernel finalization using comgr
2 parents 3a3aae3 + 2fd9dea commit cf26de2

File tree

7 files changed

+256
-2
lines changed

7 files changed

+256
-2
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ List of options provided by CMake:
132132
| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
133133
| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
134134
| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
135+
| UR_ENABLE_COMGR | Enable comgr lib usage | ON/OFF | OFF |
135136

136137
### Additional make targets
137138

source/adapters/hip/CMakeLists.txt

+13-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include")
1818
set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include")
1919

2020
# Set HIP lib dir
21-
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib")
21+
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/lib")
2222

2323
# Check if HIP library path exists (AMD platform only)
2424
if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
@@ -99,6 +99,18 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
9999
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
100100
)
101101

102+
if(UR_ENABLE_COMGR)
    # Import the ROCm code-object manager (COMgr) as a shared library so the
    # adapter can finalize relocatable kernels at run time.
    add_library(amd_comgr SHARED IMPORTED GLOBAL)
    set_target_properties(
        amd_comgr PROPERTIES
        IMPORTED_LOCATION "${UR_HIP_LIB_DIR}/libamd_comgr.so"
        INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
        INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
    )
    # Attach COMgr to the adapter target built by this file (the same target
    # the target_link_libraries() call below uses) instead of a hard-coded
    # target name that may not exist in this build.
    target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr)
    # SYCL_ENABLE_KERNEL_FUSION guards every COMgr-dependent code path in the
    # adapter sources.
    target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION)
endif()
113+
102114
target_link_libraries(${TARGET_NAME} PRIVATE
103115
${PROJECT_NAME}::headers
104116
${PROJECT_NAME}::common

source/adapters/hip/common.cpp

+63
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,23 @@
1111

1212
#include <sstream>
1313

14+
#ifdef SYCL_ENABLE_KERNEL_FUSION
15+
/// Translates a COMgr status code into the corresponding Unified Runtime
/// result code.
///
/// \param Result status returned by an amd_comgr_* call.
/// \return UR_RESULT_SUCCESS for AMD_COMGR_STATUS_SUCCESS, the matching
///         INVALID_ARGUMENT / OUT_OF_RESOURCES code for those two statuses,
///         and UR_RESULT_ERROR_UNKNOWN for everything else.
ur_result_t mapErrorUR(amd_comgr_status_t Result) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return UR_RESULT_SUCCESS;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) {
    return UR_RESULT_ERROR_INVALID_ARGUMENT;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES) {
    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
  }
  // AMD_COMGR_STATUS_ERROR and any status not listed above share the same
  // generic result.
  return UR_RESULT_ERROR_UNKNOWN;
}
29+
#endif
30+
1431
ur_result_t mapErrorUR(hipError_t Result) {
1532
switch (Result) {
1633
case hipSuccess:
@@ -30,6 +47,52 @@ ur_result_t mapErrorUR(hipError_t Result) {
3047
}
3148
}
3249

50+
#ifdef SYCL_ENABLE_KERNEL_FUSION
51+
/// Checks the status of a COMgr call and, on failure, reports it and throws.
///
/// On AMD_COMGR_STATUS_SUCCESS the function returns immediately. Otherwise it
///  1. prints a diagnostic to std::cerr unless SYCL_PI_SUPPRESS_ERROR_MESSAGE
///     or UR_SUPPRESS_ERROR_MESSAGE is set in the environment,
///  2. calls std::abort() if PI_HIP_ABORT or UR_HIP_ABORT is set,
///  3. throws the ur_result_t obtained from mapErrorUR(Result).
///
/// \param Result   status returned by the amd_comgr_* call.
/// \param Function text identifying the failing call, printed verbatim.
/// \param Line     source line of the call site.
/// \param File     source file of the call site.
/// \throws ur_result_t for every non-success status.
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
                  const char *File) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return;
  }

  // The PI and UR spellings are alternatives (as in the abort check below):
  // setting either one silences the message, so print only when both are
  // absent.
  if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr &&
      std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) {
    // Non-null defaults: streaming a null `const char *` into an ostream is
    // undefined behaviour, and statuses outside the switch would hit it.
    const char *ErrorString = "No description available";
    const char *ErrorName = "Unknown COMgr status";
    switch (Result) {
    case AMD_COMGR_STATUS_ERROR:
      ErrorName = "AMD_COMGR_STATUS_ERROR";
      ErrorString = "Generic error";
      break;
    case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT:
      ErrorName = "AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT";
      ErrorString =
          "One of the actual arguments does not meet a precondition stated in "
          "the documentation of the corresponding formal argument.";
      break;
    case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES:
      ErrorName = "AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES";
      ErrorString = "Failed to allocate the necessary resources";
      break;
    default:
      break;
    }
    std::cerr << "\nUR HIP ERROR:"
              << "\n\tValue:           " << Result
              << "\n\tName:            " << ErrorName
              << "\n\tDescription:     " << ErrorString
              << "\n\tFunction:        " << Function
              << "\n\tSource Location: " << File << ":" << Line << "\n\n";
  }

  if (std::getenv("PI_HIP_ABORT") != nullptr ||
      std::getenv("UR_HIP_ABORT") != nullptr) {
    std::abort();
  }

  throw mapErrorUR(Result);
}
94+
#endif
95+
3396
void checkErrorUR(hipError_t Result, const char *Function, int Line,
3497
const char *File) {
3598
if (Result == hipSuccess) {

source/adapters/hip/common.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
//===----------------------------------------------------------------------===//
1010
#pragma once
1111

12+
#ifdef SYCL_ENABLE_KERNEL_FUSION
13+
#include <amd_comgr/amd_comgr.h>
14+
#endif
1215
#include <hip/hip_runtime.h>
1316
#include <ur/ur.hpp>
1417

@@ -69,6 +72,10 @@ typedef hipArray *hipCUarray;
6972

7073
ur_result_t mapErrorUR(hipError_t Result);
7174

75+
#ifdef SYCL_ENABLE_KERNEL_FUSION
// COMgr overloads; only available when kernel fusion support is compiled in.
// mapErrorUR is declared here alongside checkErrorUR so that the definition in
// common.cpp has a matching prototype, like the hipError_t overload.
ur_result_t mapErrorUR(amd_comgr_status_t Result);
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
                  const char *File);
#endif
7279
void checkErrorUR(hipError_t Result, const char *Function, int Line,
7380
const char *File);
7481
void checkErrorUR(ur_result_t Result, const char *Function, int Line,

source/adapters/hip/program.cpp

+164-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,70 @@
1010

1111
#include "program.hpp"
1212

13+
#ifdef SYCL_ENABLE_KERNEL_FUSION
14+
#include <amd_comgr/amd_comgr.h>
15+
namespace {
16+
/// Scope guard that calls `Release(Obj)` when it is destroyed.
///
/// \tparam ReleaseType type of the release function pointer.
/// \tparam Release     function invoked on the handle in the destructor; its
///                     return value is ignored.
/// \tparam T           handle type being guarded.
///
/// The guard is the sole owner of the release obligation, so it is neither
/// copyable nor movable: a copy would invoke `Release` twice on one handle.
template <typename ReleaseType, ReleaseType Release, typename T>
struct COMgrObjCleanUp {
  explicit COMgrObjCleanUp(T Obj) : Obj{Obj} {}
  ~COMgrObjCleanUp() { Release(Obj); }

  COMgrObjCleanUp(const COMgrObjCleanUp &) = delete;
  COMgrObjCleanUp &operator=(const COMgrObjCleanUp &) = delete;

  T Obj;
};
22+
23+
using COMgrDataTCleanUp =
24+
COMgrObjCleanUp<decltype(&amd_comgr_release_data), &amd_comgr_release_data,
25+
amd_comgr_data_t>;
26+
using COMgrDataSetTCleanUp =
27+
COMgrObjCleanUp<decltype(&amd_comgr_destroy_data_set),
28+
&amd_comgr_destroy_data_set, amd_comgr_data_set_t>;
29+
using COMgrActionInfoCleanUp =
30+
COMgrObjCleanUp<decltype(&amd_comgr_destroy_action_info),
31+
&amd_comgr_destroy_action_info, amd_comgr_action_info_t>;
32+
33+
void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
34+
size_t MaxLogSize) {
35+
size_t count = 0;
36+
amd_comgr_status_t status = amd_comgr_action_data_count(
37+
BuildDataSet, AMD_COMGR_DATA_KIND_LOG, &count);
38+
39+
if (status != AMD_COMGR_STATUS_SUCCESS || count == 0) {
40+
std::strcpy(BuildLog, "extracting build log failed (no log).");
41+
return;
42+
}
43+
44+
amd_comgr_data_t LogBinaryData;
45+
46+
if (amd_comgr_action_data_get_data(BuildDataSet, AMD_COMGR_DATA_KIND_LOG, 0,
47+
&LogBinaryData) !=
48+
AMD_COMGR_STATUS_SUCCESS) {
49+
std::strcpy(BuildLog, "extracting build log failed (no data).");
50+
return;
51+
}
52+
COMgrDataTCleanUp LogDataCleanup{LogBinaryData};
53+
54+
size_t binarySize = 0;
55+
if (amd_comgr_get_data(LogBinaryData, &binarySize, NULL) !=
56+
AMD_COMGR_STATUS_SUCCESS) {
57+
std::strcpy(BuildLog, "extracting build log failed (no log size).");
58+
return;
59+
}
60+
61+
if (binarySize == 0) {
62+
std::strcpy(BuildLog, "no log.");
63+
return;
64+
}
65+
66+
size_t bufSize = binarySize < MaxLogSize ? binarySize : MaxLogSize;
67+
68+
if (amd_comgr_get_data(LogBinaryData, &bufSize, BuildLog) !=
69+
AMD_COMGR_STATUS_SUCCESS) {
70+
std::strcpy(BuildLog, "extracting build log failed (cannot copy log).");
71+
return;
72+
}
73+
}
74+
} // namespace
75+
#endif
76+
1377
ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
1478
: Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{
1579
Ctxt} {
@@ -18,6 +82,22 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
1882

1983
ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); }
2084

85+
/// Scans the supplied program metadata and records the entries this adapter
/// acts on.
///
/// Currently only the "needs finalization" tag is handled: its 32-bit value
/// is stored in IsRelocatable. All other entries are ignored.
///
/// \param Metadata array of metadata entries.
/// \param Length   number of entries in \p Metadata.
/// \return always UR_RESULT_SUCCESS.
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
                                  size_t Length) {
  const ur_program_metadata_t *const End = Metadata + Length;
  for (const ur_program_metadata_t *Entry = Metadata; Entry != End; ++Entry) {
    const std::string EntryName{Entry->pName};

    if (EntryName.compare(__SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION) !=
        0) {
      continue;
    }

    // The finalization flag is carried as a 32-bit integer.
    assert(Entry->type == UR_PROGRAM_METADATA_TYPE_UINT32);
    IsRelocatable = Entry->value.data32 != 0;
  }
  return UR_RESULT_SUCCESS;
}
100+
21101
ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
22102
// Do not re-set program binary data which has already been set as that will
23103
// delete the old binary data.
@@ -28,7 +108,80 @@ ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
28108
return UR_RESULT_SUCCESS;
29109
}
30110

111+
/// Links the relocatable object currently referenced by Binary into an
/// executable using COMgr.
///
/// On success the executable bytes are stored in ExecutableCache, and Binary /
/// BinarySizeInBytes are repointed at that cache. If the link action itself
/// fails, the COMgr log is copied into ErrorLog and
/// UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE is returned. Every other COMgr call
/// is checked through UR_CHECK_ERROR.
///
/// Without SYCL_ENABLE_KERNEL_FUSION the function asserts and returns
/// UR_RESULT_ERROR_UNKNOWN.
ur_result_t ur_program_handle_t_::finalizeRelocatable() {
#ifdef SYCL_ENABLE_KERNEL_FUSION
  assert(IsRelocatable && "Not a relocatable input");

  // Input data set that will hold the relocatable object.
  amd_comgr_data_set_t LinkInputs;
  UR_CHECK_ERROR(amd_comgr_create_data_set(&LinkInputs));
  COMgrDataSetTCleanUp LinkInputsGuard{LinkInputs};

  // Wrap the program binary in a COMgr data object and add it to the set.
  amd_comgr_data_t ObjectData;
  UR_CHECK_ERROR(
      amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &ObjectData));
  COMgrDataTCleanUp ObjectDataGuard{ObjectData};
  UR_CHECK_ERROR(amd_comgr_set_data(ObjectData, BinarySizeInBytes, Binary));
  UR_CHECK_ERROR(amd_comgr_set_data_name(ObjectData, "jit_obj.o"));
  UR_CHECK_ERROR(amd_comgr_data_set_add(LinkInputs, ObjectData));

  // Describe the link action: target ISA of the context's device, with
  // logging turned on so a failure can be reported.
  amd_comgr_action_info_t LinkAction;
  UR_CHECK_ERROR(amd_comgr_create_action_info(&LinkAction));
  COMgrActionInfoCleanUp LinkActionGuard{LinkAction};

  hipDeviceProp_t DeviceProps;
  detail::ur::assertion(
      hipGetDeviceProperties(&DeviceProps, Context->getDevice()->get()) ==
      hipSuccess);
  const std::string IsaName =
      std::string{"amdgcn-amd-amdhsa--"} + DeviceProps.gcnArchName;
  UR_CHECK_ERROR(
      amd_comgr_action_info_set_isa_name(LinkAction, IsaName.c_str()));
  UR_CHECK_ERROR(amd_comgr_action_info_set_logging(LinkAction, true));

  // Output data set receiving the executable (and the log).
  amd_comgr_data_set_t LinkOutputs;
  UR_CHECK_ERROR(amd_comgr_create_data_set(&LinkOutputs));
  COMgrDataSetTCleanUp LinkOutputsGuard{LinkOutputs};

  const amd_comgr_status_t LinkStatus =
      amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
                          LinkAction, LinkInputs, LinkOutputs);
  if (LinkStatus != AMD_COMGR_STATUS_SUCCESS) {
    getCoMgrBuildLog(LinkOutputs, ErrorLog, MAX_LOG_SIZE);
    return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
  }

  amd_comgr_data_t ExecutableData;
  UR_CHECK_ERROR(amd_comgr_action_data_get_data(
      LinkOutputs, AMD_COMGR_DATA_KIND_EXECUTABLE, 0, &ExecutableData));
  {
    COMgrDataTCleanUp ExecutableDataGuard{ExecutableData};

    // Query the size first (null destination), then copy into the cache.
    size_t ExecutableSize = 0;
    UR_CHECK_ERROR(amd_comgr_get_data(ExecutableData, &ExecutableSize, NULL));

    ExecutableCache.resize(ExecutableSize);

    UR_CHECK_ERROR(amd_comgr_get_data(ExecutableData, &ExecutableSize,
                                      ExecutableCache.data()));
  }

  // From here on the program binary is the linked executable.
  Binary = ExecutableCache.data();
  BinarySizeInBytes = ExecutableCache.size();
  return UR_RESULT_SUCCESS;
#else
  assert(false && "Relocation only available with fusion");
  return UR_RESULT_ERROR_UNKNOWN;
#endif
}
175+
31176
ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
177+
if (IsRelocatable) {
178+
if (finalizeRelocatable() != UR_RESULT_SUCCESS) {
179+
BuildStatus = UR_PROGRAM_BUILD_STATUS_ERROR;
180+
return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
181+
}
182+
IsRelocatable = false;
183+
}
184+
32185
if (BuildOptions) {
33186
this->BuildOptions = BuildOptions;
34187
}
@@ -246,7 +399,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
246399
/// Note: Only supports one device
247400
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
248401
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
249-
const uint8_t *pBinary, const ur_program_properties_t *,
402+
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
250403
ur_program_handle_t *phProgram) {
251404
UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY);
252405
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
@@ -259,6 +412,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
259412

260413
// TODO: Set metadata here and use reqd_work_group_size information.
261414
// See urProgramCreateWithBinary in CUDA adapter.
415+
if (pProperties) {
416+
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
417+
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
418+
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
419+
return UR_RESULT_ERROR_INVALID_SIZE;
420+
}
421+
Result =
422+
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
423+
}
424+
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
262425

263426
auto pBinary_string = reinterpret_cast<const char *>(pBinary);
264427
if (size == 0) {

source/adapters/hip/program.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ struct ur_program_handle_t_ {
2323
size_t BinarySizeInBytes;
2424
std::atomic_uint32_t RefCount;
2525
ur_context_handle_t Context;
26+
std::string ExecutableCache;
27+
28+
// Metadata
29+
bool IsRelocatable = false;
2630

2731
constexpr static size_t MAX_LOG_SIZE = 8192u;
2832

@@ -33,9 +37,12 @@ struct ur_program_handle_t_ {
3337
ur_program_handle_t_(ur_context_handle_t Ctxt);
3438
~ur_program_handle_t_();
3539

40+
ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);
41+
3642
ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes);
3743

3844
ur_result_t buildProgram(const char *BuildOptions);
45+
ur_result_t finalizeRelocatable();
3946
ur_context_handle_t getContext() const { return Context; };
4047

4148
native_type get() const noexcept { return Module; };

source/ur/ur.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER =
4949
#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \
5050
"@reqd_work_group_size"
5151
#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
52+
#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization"
5253

5354
// Terminates the process with a catastrophic error message.
5455
[[noreturn]] inline void die(const char *Message) {

0 commit comments

Comments
 (0)