@@ -1361,49 +1361,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
1361
1361
return UR_RESULT_ERROR_INVALID_VALUE;
1362
1362
}
1363
1363
1364
- auto KernelCommandHandle = static_cast <kernel_command_handle *>(hCommand);
1364
+ try {
1365
+ auto KernelCommandHandle = static_cast <kernel_command_handle *>(hCommand);
1365
1366
1366
- UR_CHECK_ERROR (validateCommandDesc (KernelCommandHandle, pUpdateKernelLaunch));
1367
- UR_CHECK_ERROR (
1368
- updateKernelArguments (CommandBuffer->Device , pUpdateKernelLaunch));
1369
- UR_CHECK_ERROR (updateCommand (KernelCommandHandle, pUpdateKernelLaunch));
1370
-
1371
- // If no work-size is provided make sure we pass nullptr to setKernelParams so
1372
- // it can guess the local work size.
1373
- const bool ProvidedLocalSize = !KernelCommandHandle->isNullLocalSize ();
1374
- size_t *LocalWorkSize =
1375
- ProvidedLocalSize ? KernelCommandHandle->LocalWorkSize : nullptr ;
1376
-
1377
- // Set the number of threads per block to the number of threads per warp
1378
- // by default unless user has provided a better number.
1379
- size_t ThreadsPerBlock[3 ] = {32u , 1u , 1u };
1380
- size_t BlocksPerGrid[3 ] = {1u , 1u , 1u };
1381
- CUfunction CuFunc = KernelCommandHandle->Kernel ->get ();
1382
- auto Result = setKernelParams (
1383
- CommandBuffer->Context , CommandBuffer->Device ,
1384
- KernelCommandHandle->WorkDim , KernelCommandHandle->GlobalWorkOffset ,
1385
- KernelCommandHandle->GlobalWorkSize , LocalWorkSize,
1386
- KernelCommandHandle->Kernel , CuFunc, ThreadsPerBlock, BlocksPerGrid);
1387
- if (Result != UR_RESULT_SUCCESS) {
1388
- return Result;
1389
- }
1367
+ UR_CHECK_ERROR (
1368
+ validateCommandDesc (KernelCommandHandle, pUpdateKernelLaunch));
1369
+ UR_CHECK_ERROR (
1370
+ updateKernelArguments (CommandBuffer->Device , pUpdateKernelLaunch));
1371
+ UR_CHECK_ERROR (updateCommand (KernelCommandHandle, pUpdateKernelLaunch));
1390
1372
1391
- CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params ;
1373
+ // If no work-size is provided make sure we pass nullptr to setKernelParams
1374
+ // so it can guess the local work size.
1375
+ const bool ProvidedLocalSize = !KernelCommandHandle->isNullLocalSize ();
1376
+ size_t *LocalWorkSize =
1377
+ ProvidedLocalSize ? KernelCommandHandle->LocalWorkSize : nullptr ;
1392
1378
1393
- Params.func = CuFunc;
1394
- Params.gridDimX = BlocksPerGrid[0 ];
1395
- Params.gridDimY = BlocksPerGrid[1 ];
1396
- Params.gridDimZ = BlocksPerGrid[2 ];
1397
- Params.blockDimX = ThreadsPerBlock[0 ];
1398
- Params.blockDimY = ThreadsPerBlock[1 ];
1399
- Params.blockDimZ = ThreadsPerBlock[2 ];
1400
- Params.sharedMemBytes = KernelCommandHandle->Kernel ->getLocalSize ();
1401
- Params.kernelParams =
1402
- const_cast <void **>(KernelCommandHandle->Kernel ->getArgIndices ().data ());
1379
+ // Set the number of threads per block to the number of threads per warp
1380
+ // by default unless user has provided a better number.
1381
+ size_t ThreadsPerBlock[3 ] = {32u , 1u , 1u };
1382
+ size_t BlocksPerGrid[3 ] = {1u , 1u , 1u };
1383
+ CUfunction CuFunc = KernelCommandHandle->Kernel ->get ();
1384
+ auto Result = setKernelParams (
1385
+ CommandBuffer->Context , CommandBuffer->Device ,
1386
+ KernelCommandHandle->WorkDim , KernelCommandHandle->GlobalWorkOffset ,
1387
+ KernelCommandHandle->GlobalWorkSize , LocalWorkSize,
1388
+ KernelCommandHandle->Kernel , CuFunc, ThreadsPerBlock, BlocksPerGrid);
1389
+ if (Result != UR_RESULT_SUCCESS) {
1390
+ return Result;
1391
+ }
1403
1392
1404
- CUgraphNode Node = KernelCommandHandle->Node ;
1405
- CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec ;
1406
- UR_CHECK_ERROR (cuGraphExecKernelNodeSetParams (CudaGraphExec, Node, &Params));
1393
+ CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params ;
1394
+
1395
+ Params.func = CuFunc;
1396
+ Params.gridDimX = BlocksPerGrid[0 ];
1397
+ Params.gridDimY = BlocksPerGrid[1 ];
1398
+ Params.gridDimZ = BlocksPerGrid[2 ];
1399
+ Params.blockDimX = ThreadsPerBlock[0 ];
1400
+ Params.blockDimY = ThreadsPerBlock[1 ];
1401
+ Params.blockDimZ = ThreadsPerBlock[2 ];
1402
+ Params.sharedMemBytes = KernelCommandHandle->Kernel ->getLocalSize ();
1403
+ Params.kernelParams = const_cast <void **>(
1404
+ KernelCommandHandle->Kernel ->getArgIndices ().data ());
1405
+
1406
+ CUgraphNode Node = KernelCommandHandle->Node ;
1407
+ CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec ;
1408
+ UR_CHECK_ERROR (
1409
+ cuGraphExecKernelNodeSetParams (CudaGraphExec, Node, &Params));
1410
+ } catch (ur_result_t Err) {
1411
+ return Err;
1412
+ }
1407
1413
return UR_RESULT_SUCCESS;
1408
1414
}
1409
1415
0 commit comments