1 file changed: +6 −1 lines changed

```diff
@@ -1484,14 +1484,19 @@ static void ggml_cuda_op_mul_mat(
             const size_t nbytes_data    = ggml_nbytes(src0);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+            // TODO: remove this for MUSA once the Guilty Lockup issue is resolved
+#ifndef GGML_USE_MUSA
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
+#else // GGML_USE_MUSA
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+#endif // !GGML_USE_MUSA
         }
 
         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
             const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
-            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
         }
 
         if (src1_on_device && src1_is_contiguous) {
```
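The key detail in the MUSA branch is that only the trailing padding region (the bytes after `nbytes_data`) is zeroed, rather than the whole allocation. A minimal standalone sketch of that padding arithmetic is below; the value of `MATRIX_ROW_PADDING` and the `row_size_bytes` helper are illustrative assumptions, not the real `ggml_row_size` implementation.

```cpp
// Minimal sketch of the padding computation used in the diff above.
// Assumptions: MATRIX_ROW_PADDING = 512 (typical for the CUDA backend) and a
// hypothetical 1-byte-per-element row-size helper standing in for ggml_row_size().
#include <cstddef>
#include <cstdio>

constexpr size_t MATRIX_ROW_PADDING = 512; // assumed value, for illustration only

// Stand-in for ggml_row_size(type, n): real quantized types pack differently.
static size_t row_size_bytes(size_t n_elements) {
    return n_elements; // purely illustrative: 1 byte per element
}

int main() {
    const size_t ne00 = 1000; // example row length in elements

    if (ne00 % MATRIX_ROW_PADDING != 0) {
        // Same expression as in the patch: the row is padded up to the next
        // multiple of MATRIX_ROW_PADDING, and only those trailing bytes need
        // to be cleared (the memset starts at src0_dd + nbytes_data).
        const size_t nbytes_padding =
            row_size_bytes(MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
        printf("padding bytes to clear: %zu\n", nbytes_padding);
    }
    return 0;
}
```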