Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions source/adapters/hip/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(VendorId);
}
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
int ComputeUnits = 0;
UR_CHECK_ERROR(hipDeviceGetAttribute(
&ComputeUnits, hipDeviceAttributeMultiprocessorCount, hDevice->get()));
detail::ur::assertion(ComputeUnits >= 0);
return ReturnValue(static_cast<uint32_t>(ComputeUnits));
return ReturnValue(hDevice->getNumComputeUnits());
}
case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
return ReturnValue(MaxWorkItemDimensions);
Expand Down
6 changes: 6 additions & 0 deletions source/adapters/hip/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct ur_device_handle_t_ {
ur_platform_handle_t Platform;
hipEvent_t EvBase; // HIP event used as base counter
uint32_t DeviceIndex;
uint32_t NumComputeUnits{0};

int MaxWorkGroupSize{0};
int MaxBlockDimX{0};
Expand All @@ -41,6 +42,9 @@ struct ur_device_handle_t_ {
: HIPDevice(HipDevice), RefCount{1}, Platform(Platform), EvBase(EvBase),
DeviceIndex(DeviceIndex) {

UR_CHECK_ERROR(hipDeviceGetAttribute(
reinterpret_cast<int *>(&NumComputeUnits),
hipDeviceAttributeMultiprocessorCount, HIPDevice));
UR_CHECK_ERROR(hipDeviceGetAttribute(
&MaxWorkGroupSize, hipDeviceAttributeMaxThreadsPerBlock, HIPDevice));
UR_CHECK_ERROR(hipDeviceGetAttribute(
Expand Down Expand Up @@ -85,6 +89,8 @@ struct ur_device_handle_t_ {

int getManagedMemSupport() const noexcept { return ManagedMemSupport; };

uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };

int getConcurrentManagedAccess() const noexcept {
return ConcurrentManagedAccess;
};
Expand Down
58 changes: 45 additions & 13 deletions source/adapters/hip/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,24 +161,56 @@ urKernelRelease(ur_kernel_handle_t hKernel) {
return UR_RESULT_SUCCESS;
}

// TODO(ur): Not implemented on hip atm. Also, need to add tests for this
// feature.
UR_APIEXPORT ur_result_t UR_APICALL
urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
uint32_t *pGroupCountRet) {
std::ignore = hKernel;
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL);

std::ignore = hDevice;
std::ignore = workDim;
std::ignore = pLocalWorkSize;
std::ignore = dynamicSharedMemorySize;
std::ignore = pGroupCountRet;
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;

size_t localWorkSize = pLocalWorkSize[0];
localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1);
localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1);

// We need to set the active current device for this kernel explicitly here,
// because the occupancy querying API does not take device parameter.
ur_device_handle_t Device = hKernel->getProgram()->getDevice();
ScopedDevice Active(Device);
try {
// We need to calculate max num of work-groups using per-device semantics.

int MaxNumActiveGroupsPerCU{0};
UR_CHECK_ERROR(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize,
dynamicSharedMemorySize));
detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);
// Handle the case where we can't have all SMs active with at least 1 group
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think AMD calls these WGPs (Work-Group Processors). SM is for Nvidia architectures.

// per SM. In that case, the device is still able to run 1 work-group, hence
// we will manually check if it is possible with the available HW resources.
if (MaxNumActiveGroupsPerCU == 0) {
size_t MaxWorkGroupSize{};
urKernelGetGroupInfo(
hKernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE,
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr);
size_t MaxLocalSizeBytes{};
urDeviceGetInfo(Device, UR_DEVICE_INFO_LOCAL_MEM_SIZE,
sizeof(MaxLocalSizeBytes), &MaxLocalSizeBytes, nullptr);
if (localWorkSize > MaxWorkGroupSize ||
dynamicSharedMemorySize > MaxLocalSizeBytes)
*pGroupCountRet = 0;
else
*pGroupCountRet = 1;
} else {
// Multiply by the number of SMs (CUs = compute units) on the device in
// order to retreive the total number of groups/blocks that can be
// launched.
*pGroupCountRet = Device->getNumComputeUnits() * MaxNumActiveGroupsPerCU;
}
} catch (ur_result_t Err) {
return Err;
}
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
Expand Down