Skip to content

Commit 2fd9dea

Browse files
committed
[HIP] Enable kernel finalization using comgr
For kernel fusion support for hip, we need to finalize the kernels using comgr. The patch finalizes tagged binaries during buildProgram before handing it over to the hip runtime. Signed-off-by: Victor Lomuller <[email protected]>
1 parent b38855e commit 2fd9dea

File tree

7 files changed

+256
-2
lines changed

7 files changed

+256
-2
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ List of options provided by CMake:
132132
| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
133133
| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
134134
| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
135+
| UR_ENABLE_COMGR | Enable comgr lib usage (required for HIP kernel finalization) | ON/OFF | OFF |
135136

136137
### Additional make targets
137138

source/adapters/hip/CMakeLists.txt

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include")
1818
set(UR_HIP_HSA_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/hsa/include")
1919

2020
# Set HIP lib dir
21-
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/hip/lib")
21+
set(UR_HIP_LIB_DIR "${UR_HIP_ROCM_DIR}/lib")
2222

2323
# Check if HIP library path exists (AMD platform only)
2424
if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
@@ -99,6 +99,18 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD")
9999
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
100100
)
101101

102+
# Optional COMgr support: import libamd_comgr from the ROCm lib directory and
# define SYCL_ENABLE_KERNEL_FUSION so the adapter compiles its COMgr-based
# finalization code.
if(UR_ENABLE_COMGR)
  add_library(amd_comgr SHARED IMPORTED GLOBAL)
  set_target_properties(
    amd_comgr PROPERTIES
      IMPORTED_LOCATION                    "${UR_HIP_LIB_DIR}/libamd_comgr.so"
      INTERFACE_INCLUDE_DIRECTORIES        "${HIP_HEADERS}"
      INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}"
  )
  # Attach to the adapter target that the rest of this file configures via
  # ${TARGET_NAME}, rather than a hard-coded target name.
  target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr)
  target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION)
endif(UR_ENABLE_COMGR)
113+
102114
target_link_libraries(${TARGET_NAME} PRIVATE
103115
${PROJECT_NAME}::headers
104116
${PROJECT_NAME}::common

source/adapters/hip/common.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,23 @@
1111

1212
#include <sstream>
1313

14+
#ifdef SYCL_ENABLE_KERNEL_FUSION
15+
/// Translates a COMgr status code into a UR result code.
///
/// \param Result Status returned by an amd_comgr_* call.
/// \return UR_RESULT_SUCCESS for AMD_COMGR_STATUS_SUCCESS, the matching UR
///         error for invalid-argument and out-of-resources, and
///         UR_RESULT_ERROR_UNKNOWN for every other status.
ur_result_t mapErrorUR(amd_comgr_status_t Result) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return UR_RESULT_SUCCESS;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) {
    return UR_RESULT_ERROR_INVALID_ARGUMENT;
  }
  if (Result == AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES) {
    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
  }
  // AMD_COMGR_STATUS_ERROR and any status not listed above.
  return UR_RESULT_ERROR_UNKNOWN;
}
29+
#endif
30+
1431
ur_result_t mapErrorUR(hipError_t Result) {
1532
switch (Result) {
1633
case hipSuccess:
@@ -30,6 +47,52 @@ ur_result_t mapErrorUR(hipError_t Result) {
3047
}
3148
}
3249

50+
#ifdef SYCL_ENABLE_KERNEL_FUSION
51+
/// Checks a COMgr status and reports/raises on failure.
///
/// On AMD_COMGR_STATUS_SUCCESS this returns immediately. Otherwise it prints
/// a diagnostic to std::cerr (unless suppressed), aborts the process if
/// PI_HIP_ABORT or UR_HIP_ABORT is set, and finally throws the mapped
/// ur_result_t.
///
/// \param Result   Status returned by an amd_comgr_* call.
/// \param Function Text identifying the failing call, printed verbatim.
/// \param Line     Source line of the call site.
/// \param File     Source file of the call site.
/// \throws ur_result_t obtained from mapErrorUR(Result) on any failure.
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
                  const char *File) {
  if (Result == AMD_COMGR_STATUS_SUCCESS) {
    return;
  }

  // Setting either variable is enough to silence the diagnostic.
  const bool SuppressMessage =
      std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") != nullptr ||
      std::getenv("UR_SUPPRESS_ERROR_MESSAGE") != nullptr;

  if (!SuppressMessage) {
    // Non-null placeholders: streaming a null `const char *` is undefined
    // behaviour, and the switch below does not cover every possible status.
    const char *ErrorString = "No description available";
    const char *ErrorName = "Unknown COMgr status";
    switch (Result) {
    case AMD_COMGR_STATUS_ERROR:
      ErrorName = "AMD_COMGR_STATUS_ERROR";
      ErrorString = "Generic error";
      break;
    case AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT:
      ErrorName = "AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT";
      ErrorString =
          "One of the actual arguments does not meet a precondition stated in "
          "the documentation of the corresponding formal argument.";
      break;
    case AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES:
      ErrorName = "AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES";
      ErrorString = "Failed to allocate the necessary resources";
      break;
    default:
      break;
    }
    std::cerr << "\nUR HIP ERROR:"
              << "\n\tValue:           " << Result
              << "\n\tName:            " << ErrorName
              << "\n\tDescription:     " << ErrorString
              << "\n\tFunction:        " << Function
              << "\n\tSource Location: " << File << ":" << Line << "\n\n";
  }

  if (std::getenv("PI_HIP_ABORT") != nullptr ||
      std::getenv("UR_HIP_ABORT") != nullptr) {
    std::abort();
  }

  throw mapErrorUR(Result);
}
94+
#endif
95+
3396
void checkErrorUR(hipError_t Result, const char *Function, int Line,
3497
const char *File) {
3598
if (Result == hipSuccess) {

source/adapters/hip/common.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
//===----------------------------------------------------------------------===//
1010
#pragma once
1111

12+
#ifdef SYCL_ENABLE_KERNEL_FUSION
13+
#include <amd_comgr/amd_comgr.h>
14+
#endif
1215
#include <hip/hip_runtime.h>
1316
#include <ur/ur.hpp>
1417

@@ -69,6 +72,10 @@ typedef hipArray *hipCUarray;
6972

7073
ur_result_t mapErrorUR(hipError_t Result);
7174

75+
#ifdef SYCL_ENABLE_KERNEL_FUSION
76+
void checkErrorUR(amd_comgr_status_t Result, const char *Function, int Line,
77+
const char *File);
78+
#endif
7279
void checkErrorUR(hipError_t Result, const char *Function, int Line,
7380
const char *File);
7481
void checkErrorUR(ur_result_t Result, const char *Function, int Line,

source/adapters/hip/program.cpp

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,70 @@
1010

1111
#include "program.hpp"
1212

13+
#ifdef SYCL_ENABLE_KERNEL_FUSION
14+
#include <amd_comgr/amd_comgr.h>
15+
namespace {
16+
/// RAII guard that invokes a release function on a handle when it goes out
/// of scope.
///
/// \tparam ReleaseType Type of the release function pointer.
/// \tparam Release     Function called with the stored handle on destruction;
///                     its return value is ignored.
/// \tparam T           Handle type being guarded.
///
/// The guard is non-copyable: a copy would hold the same handle and release
/// it a second time.
template <typename ReleaseType, ReleaseType Release, typename T>
struct COMgrObjCleanUp {
  explicit COMgrObjCleanUp(T Obj) : Obj{Obj} {}
  ~COMgrObjCleanUp() { Release(Obj); }

  COMgrObjCleanUp(const COMgrObjCleanUp &) = delete;
  COMgrObjCleanUp &operator=(const COMgrObjCleanUp &) = delete;

  T Obj;
};
22+
23+
using COMgrDataTCleanUp =
24+
COMgrObjCleanUp<decltype(&amd_comgr_release_data), &amd_comgr_release_data,
25+
amd_comgr_data_t>;
26+
using COMgrDataSetTCleanUp =
27+
COMgrObjCleanUp<decltype(&amd_comgr_destroy_data_set),
28+
&amd_comgr_destroy_data_set, amd_comgr_data_set_t>;
29+
using COMgrActionInfoCleanUp =
30+
COMgrObjCleanUp<decltype(&amd_comgr_destroy_action_info),
31+
&amd_comgr_destroy_action_info, amd_comgr_action_info_t>;
32+
33+
void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
34+
size_t MaxLogSize) {
35+
size_t count = 0;
36+
amd_comgr_status_t status = amd_comgr_action_data_count(
37+
BuildDataSet, AMD_COMGR_DATA_KIND_LOG, &count);
38+
39+
if (status != AMD_COMGR_STATUS_SUCCESS || count == 0) {
40+
std::strcpy(BuildLog, "extracting build log failed (no log).");
41+
return;
42+
}
43+
44+
amd_comgr_data_t LogBinaryData;
45+
46+
if (amd_comgr_action_data_get_data(BuildDataSet, AMD_COMGR_DATA_KIND_LOG, 0,
47+
&LogBinaryData) !=
48+
AMD_COMGR_STATUS_SUCCESS) {
49+
std::strcpy(BuildLog, "extracting build log failed (no data).");
50+
return;
51+
}
52+
COMgrDataTCleanUp LogDataCleanup{LogBinaryData};
53+
54+
size_t binarySize = 0;
55+
if (amd_comgr_get_data(LogBinaryData, &binarySize, NULL) !=
56+
AMD_COMGR_STATUS_SUCCESS) {
57+
std::strcpy(BuildLog, "extracting build log failed (no log size).");
58+
return;
59+
}
60+
61+
if (binarySize == 0) {
62+
std::strcpy(BuildLog, "no log.");
63+
return;
64+
}
65+
66+
size_t bufSize = binarySize < MaxLogSize ? binarySize : MaxLogSize;
67+
68+
if (amd_comgr_get_data(LogBinaryData, &bufSize, BuildLog) !=
69+
AMD_COMGR_STATUS_SUCCESS) {
70+
std::strcpy(BuildLog, "extracting build log failed (cannot copy log).");
71+
return;
72+
}
73+
}
74+
} // namespace
75+
#endif
76+
1377
ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
1478
: Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, Context{
1579
Ctxt} {
@@ -18,6 +82,22 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
1882

1983
// Calls urContextRelease on the stored Context when the program is destroyed.
// NOTE(review): presumably pairs with a retain in the constructor — confirm.
ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); }
2084

85+
/// Scans program metadata and records the entries this adapter understands.
///
/// Only the "needs finalization" tag is handled here: its 32-bit value is
/// stored in IsRelocatable. All other entries are ignored.
///
/// \param Metadata Array of \p Length metadata entries.
/// \param Length   Number of entries in \p Metadata.
/// \return Always UR_RESULT_SUCCESS.
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
                                  size_t Length) {
  const ur_program_metadata_t *const End = Metadata + Length;
  for (const ur_program_metadata_t *Entry = Metadata; Entry != End; ++Entry) {
    const std::string EntryName{Entry->pName};
    if (EntryName != __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION) {
      continue;
    }

    assert(Entry->type == UR_PROGRAM_METADATA_TYPE_UINT32);
    IsRelocatable = Entry->value.data32;
  }
  return UR_RESULT_SUCCESS;
}
100+
21101
ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
22102
// Do not re-set program binary data which has already been set as that will
23103
// delete the old binary data.
@@ -28,7 +108,80 @@ ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) {
28108
return UR_RESULT_SUCCESS;
29109
}
30110

111+
/// Links the program's relocatable binary into an executable using COMgr.
///
/// On success, Binary and BinarySizeInBytes are redirected to the linked
/// executable, which is stored in ExecutableCache. If the link action fails,
/// the COMgr log is copied into ErrorLog.
///
/// \return UR_RESULT_SUCCESS on success,
///         UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE if the link action fails,
///         UR_RESULT_ERROR_UNKNOWN when built without
///         SYCL_ENABLE_KERNEL_FUSION.
ur_result_t ur_program_handle_t_::finalizeRelocatable() {
#ifdef SYCL_ENABLE_KERNEL_FUSION
  assert(IsRelocatable && "Not a relocatable input");

  // Input data set holding the single relocatable object.
  amd_comgr_data_set_t InputSet;
  UR_CHECK_ERROR(amd_comgr_create_data_set(&InputSet));
  COMgrDataSetTCleanUp InputSetGuard{InputSet};

  amd_comgr_data_t ObjectData;
  UR_CHECK_ERROR(
      amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &ObjectData));
  // Guard releases the data object on every exit path from here on.
  COMgrDataTCleanUp ObjectDataGuard{ObjectData};
  UR_CHECK_ERROR(amd_comgr_set_data(ObjectData, BinarySizeInBytes, Binary));
  UR_CHECK_ERROR(amd_comgr_set_data_name(ObjectData, "jit_obj.o"));
  UR_CHECK_ERROR(amd_comgr_data_set_add(InputSet, ObjectData));

  // Action description: target ISA plus logging.
  amd_comgr_action_info_t LinkAction;
  UR_CHECK_ERROR(amd_comgr_create_action_info(&LinkAction));
  COMgrActionInfoCleanUp LinkActionGuard{LinkAction};

  // ISA name is the fixed triple prefix followed by the device's arch name.
  std::string IsaName{"amdgcn-amd-amdhsa--"};
  hipDeviceProp_t DeviceProps;
  detail::ur::assertion(
      hipGetDeviceProperties(&DeviceProps, Context->getDevice()->get()) ==
      hipSuccess);
  IsaName.append(DeviceProps.gcnArchName);
  UR_CHECK_ERROR(amd_comgr_action_info_set_isa_name(LinkAction, IsaName.c_str()));

  UR_CHECK_ERROR(amd_comgr_action_info_set_logging(LinkAction, true));

  amd_comgr_data_set_t OutputSet;
  UR_CHECK_ERROR(amd_comgr_create_data_set(&OutputSet));
  COMgrDataSetTCleanUp OutputSetGuard{OutputSet};

  const amd_comgr_status_t LinkStatus =
      amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
                          LinkAction, InputSet, OutputSet);
  if (LinkStatus != AMD_COMGR_STATUS_SUCCESS) {
    getCoMgrBuildLog(OutputSet, ErrorLog, MAX_LOG_SIZE);
    return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
  }

  amd_comgr_data_t ExecutableData;
  UR_CHECK_ERROR(amd_comgr_action_data_get_data(
      OutputSet, AMD_COMGR_DATA_KIND_EXECUTABLE, 0, &ExecutableData));
  {
    COMgrDataTCleanUp ExecutableDataGuard{ExecutableData};

    // Query the size first, then copy the bytes into the cache.
    size_t ExecutableSize = 0;
    UR_CHECK_ERROR(
        amd_comgr_get_data(ExecutableData, &ExecutableSize, nullptr));

    ExecutableCache.resize(ExecutableSize);

    UR_CHECK_ERROR(amd_comgr_get_data(ExecutableData, &ExecutableSize,
                                      ExecutableCache.data()));
  }

  // From now on the program's binary is the linked executable.
  Binary = ExecutableCache.data();
  BinarySizeInBytes = ExecutableCache.size();
  return UR_RESULT_SUCCESS;
#else
  assert(false && "Relocation only available with fusion");
  return UR_RESULT_ERROR_UNKNOWN;
#endif
}
175+
31176
ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
177+
if (IsRelocatable) {
178+
if (finalizeRelocatable() != UR_RESULT_SUCCESS) {
179+
BuildStatus = UR_PROGRAM_BUILD_STATUS_ERROR;
180+
return UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE;
181+
}
182+
IsRelocatable = false;
183+
}
184+
32185
if (BuildOptions) {
33186
this->BuildOptions = BuildOptions;
34187
}
@@ -246,7 +399,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
246399
/// Note: Only supports one device
247400
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
248401
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
249-
const uint8_t *pBinary, const ur_program_properties_t *,
402+
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
250403
ur_program_handle_t *phProgram) {
251404
UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY);
252405
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
@@ -259,6 +412,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
259412

260413
// TODO: Set metadata here and use reqd_work_group_size information.
261414
// See urProgramCreateWithBinary in CUDA adapter.
415+
if (pProperties) {
416+
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
417+
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
418+
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
419+
return UR_RESULT_ERROR_INVALID_SIZE;
420+
}
421+
Result =
422+
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
423+
}
424+
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
262425

263426
auto pBinary_string = reinterpret_cast<const char *>(pBinary);
264427
if (size == 0) {

source/adapters/hip/program.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ struct ur_program_handle_t_ {
2323
size_t BinarySizeInBytes;
2424
std::atomic_uint32_t RefCount;
2525
ur_context_handle_t Context;
26+
std::string ExecutableCache;
27+
28+
// Metadata
29+
bool IsRelocatable = false;
2630

2731
constexpr static size_t MAX_LOG_SIZE = 8192u;
2832

@@ -33,9 +37,12 @@ struct ur_program_handle_t_ {
3337
ur_program_handle_t_(ur_context_handle_t Ctxt);
3438
~ur_program_handle_t_();
3539

40+
ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);
41+
3642
ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes);
3743

3844
ur_result_t buildProgram(const char *BuildOptions);
45+
ur_result_t finalizeRelocatable();
3946
ur_context_handle_t getContext() const { return Context; };
4047

4148
native_type get() const noexcept { return Module; };

source/ur/ur.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER =
4949
#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \
5050
"@reqd_work_group_size"
5151
#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
52+
#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization"
5253

5354
// Terminates the process with a catastrophic error message.
5455
[[noreturn]] inline void die(const char *Message) {

0 commit comments

Comments
 (0)