Commit f82027a

gpu - update gen non-tensor block strategy
1 parent 9123fb0 commit f82027a

2 files changed: +19 −4 lines changed

backends/cuda-gen/ceed-cuda-gen-operator.c (+9 −2)

@@ -282,8 +282,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
   int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};

-  CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
-                                     cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  } else {
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+
+    grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+    block[2] = elems_per_block;
+  }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);

   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
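For non-tensor bases the CUDA backend now sizes the launch directly instead of calling BlockGridCalculate: it packs as many elements as fit in roughly 512 threads into the z block dimension, clamps to the device's maxThreadsDim[2] limit, and rounds the grid up with a ceiling division. A minimal standalone sketch of that arithmetic, with representative values standing in for the operator and device queries (the CeedIntMin/CeedIntMax stand-ins and the main() driver are illustrative, not libCEED code):

  #include <stdio.h>

  typedef int CeedInt;

  /* Stand-ins for libCEED's integer min/max helpers */
  static CeedInt CeedIntMin(CeedInt a, CeedInt b) { return a < b ? a : b; }
  static CeedInt CeedIntMax(CeedInt a, CeedInt b) { return a > b ? a : b; }

  int main(void) {
    const CeedInt thread_1d        = 35;   /* max(P_1d, Q_1d) for the operator; assumed value */
    const CeedInt max_threads_dim2 = 64;   /* device_prop.maxThreadsDim[2]; typical CUDA z limit */
    const CeedInt num_elem         = 1000; /* assumed element count */

    /* Target ~512 threads per block: at least 1 element, at most the z-dimension limit */
    CeedInt elems_per_block = CeedIntMin(max_threads_dim2, CeedIntMax(512 / thread_1d, 1));
    /* Ceiling division: one extra block when num_elem is not an exact multiple */
    CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

    printf("block = (%d, 1, %d), grid = %d\n", thread_1d, elems_per_block, grid);
    return 0;
  }

Since block[1] stays 1 for non-tensor bases, the total block size is thread_1d * elems_per_block, which this heuristic keeps near 512 threads whenever thread_1d itself is under 512.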

backends/hip-gen/ceed-hip-gen-operator.c (+10 −2)

@@ -203,10 +203,18 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   const CeedInt Q_1d      = data->Q_1d;
   const CeedInt P_1d      = data->max_P_1d;
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  CeedInt block_sizes[3];

   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+  } else {
+    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+
+    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+    block_sizes[2]  = elems_per_block;
+  }
   if (dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
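The hip-gen branch applies the same idea with a ~256-thread budget (four 64-wide wavefronts on AMD's GCN/CDNA parts): default to 64 elements per block, shrink when thread_1d would push past 256 threads, and clamp to at least one element so high-order cases with thread_1d > 256 still launch. A sketch of just this heuristic, with a hypothetical helper name and driver:

  #include <stdio.h>

  typedef int CeedInt;

  /* Hypothetical wrapper around the heuristic in the diff above */
  static CeedInt NonTensorElemsPerBlock(CeedInt thread_1d) {
    /* Prefer 64 elements per block; if that exceeds ~256 threads, fit to 256 */
    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;

    /* 256 / thread_1d rounds to 0 when thread_1d > 256; never use 0 elements */
    return elems_per_block > 0 ? elems_per_block : 1;
  }

  int main(void) {
    const CeedInt num_elem = 1000; /* assumed element count */

    for (CeedInt thread_1d = 2; thread_1d <= 512; thread_1d *= 4) {
      CeedInt elems_per_block = NonTensorElemsPerBlock(thread_1d);
      /* Same rounded-up grid the unchanged launch code below the hunk computes */
      CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);

      printf("thread_1d = %3d -> elems_per_block = %2d, grid = %4d\n", thread_1d, elems_per_block, grid);
    }
    return 0;
  }

Unlike the CUDA side, this version never queries device limits; 64 elements along z stays within HIP's usual z-dimension maximum, and the final clamp covers the degenerate high-order case.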
