Commit 7a6cef6

Format *.cu and *.cuh with lintrunner (#25189)
### Description

Use lintrunner to format *.cu and *.cuh files.

### Motivation and Context

Some CUDA code is not formatted; this change makes the style consistent.
1 parent 505b135 commit 7a6cef6

90 files changed: 3,006 additions, 3,081 deletions

.lintrunner.toml

Lines changed: 3 additions & 0 deletions
@@ -98,12 +98,15 @@ include_patterns = [
     '**/*.cc',
     '**/*.hpp',
     '**/*.cpp',
+    '**/*.cuh',
+    '**/*.cu',
     '**/*.m',
     '**/*.mm',
 ]
 exclude_patterns = [
     'java/**',  # FIXME: Enable clang-format for java
     'onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/**',  # Contains data chunks
+    'onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/*.generated.cu',  # Generated code
     'onnxruntime/core/flatbuffers/schema/*.fbs.h',  # Generated code
     'onnxruntime/test/flatbuffers/*.fbs.h',  # Generated code
     'onnxruntime/core/graph/contrib_ops/quantization_defs.cc',

cmake/utils/detect_cuda_arch.cu

Lines changed: 22 additions & 28 deletions
@@ -4,36 +4,30 @@
 #include <iostream>
 #include <vector>

-int main(int argc, char* argv[])
-{
-    int n_devices = 0;
-    int rc = cudaGetDeviceCount(&n_devices);
-    if (rc != cudaSuccess)
-    {
-        cudaError_t error = cudaGetLastError();
-        std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
-        return rc;
-    }
+int main(int argc, char* argv[]) {
+  int n_devices = 0;
+  int rc = cudaGetDeviceCount(&n_devices);
+  if (rc != cudaSuccess) {
+    cudaError_t error = cudaGetLastError();
+    std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+    return rc;
+  }

-    std::vector<std::pair<int, int>> arch(n_devices);
-    for (int cd = 0; cd < n_devices; ++cd)
-    {
-        cudaDeviceProp dev;
-        int rc = cudaGetDeviceProperties(&dev, cd);
-        if (rc != cudaSuccess)
-        {
-            cudaError_t error = cudaGetLastError();
-            std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
-            return rc;
-        }
-        else
-        {
-            arch[cd] = {dev.major, dev.minor};
-        }
+  std::vector<std::pair<int, int>> arch(n_devices);
+  for (int cd = 0; cd < n_devices; ++cd) {
+    cudaDeviceProp dev;
+    int rc = cudaGetDeviceProperties(&dev, cd);
+    if (rc != cudaSuccess) {
+      cudaError_t error = cudaGetLastError();
+      std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+      return rc;
+    } else {
+      arch[cd] = {dev.major, dev.minor};
     }
+  }

-    std::pair<int, int> best_cc = *std::max_element(begin(arch), end(arch));
-    std::cout << best_cc.first << best_cc.second;
+  std::pair<int, int> best_cc = *std::max_element(begin(arch), end(arch));
+  std::cout << best_cc.first << best_cc.second;

-    return 0;
+  return 0;
 }
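
Not part of the commit, but for context: this helper prints the highest compute capability it finds (for example 86 on an SM 8.6 GPU) so the build can choose a matching CUDA architecture. The sketch below performs the same query with the error handling folded into a macro; the file name, the CUDA_CHECK helper, and the sm_XY output format are illustrative choices, not code from this repository.

// detect_sm.cu -- illustrative sketch, not part of the commit.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(expr)                                                  \
  do {                                                                    \
    cudaError_t err_ = (expr);                                            \
    if (err_ != cudaSuccess) {                                            \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
      std::exit(EXIT_FAILURE);                                            \
    }                                                                     \
  } while (0)

int main() {
  int n_devices = 0;
  CUDA_CHECK(cudaGetDeviceCount(&n_devices));
  for (int cd = 0; cd < n_devices; ++cd) {
    cudaDeviceProp dev;
    CUDA_CHECK(cudaGetDeviceProperties(&dev, cd));
    // Prints e.g. "device 0: sm_86" for a compute capability 8.6 GPU.
    std::printf("device %d: sm_%d%d\n", cd, dev.major, dev.minor);
  }
  return 0;
}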

onnxruntime/contrib_ops/cuda/activation/activations_impl.cu

Lines changed: 4 additions & 4 deletions
@@ -59,10 +59,10 @@ struct OP_QuickGelu : public CtxQuickGelu {
 #define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \
   template void Impl_##name<T>(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);

-#define SPECIALIZED_UNARY_ACTIVATIONL_HFD(name) \
-  SPECIALIZED_UNARY_ACTIVATION_IMPL(name, half) \
-  SPECIALIZED_UNARY_ACTIVATION_IMPL(name, float) \
-  SPECIALIZED_UNARY_ACTIVATION_IMPL(name, double) \
+#define SPECIALIZED_UNARY_ACTIVATIONL_HFD(name)    \
+  SPECIALIZED_UNARY_ACTIVATION_IMPL(name, half)    \
+  SPECIALIZED_UNARY_ACTIVATION_IMPL(name, float)   \
+  SPECIALIZED_UNARY_ACTIVATION_IMPL(name, double)  \
   SPECIALIZED_UNARY_ACTIVATION_IMPL(name, BFloat16)

 #define UNARY_ACTIVATION_OP_NAME(name) \
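
Not part of the commit, but for context: this hunk only realigns the line-continuation backslashes in macros that stamp out explicit instantiations of a templated launcher for each element type. A minimal sketch of that instantiation-macro pattern follows; the kernel, launcher, and macro names are hypothetical, not the onnxruntime definitions.

// activation_macro_sketch.cu -- illustrative sketch, not the onnxruntime code.
#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
__global__ void ScaleKernel(const T* input, T* output, T factor, size_t count) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < count) output[i] = input[i] * factor;
}

// Templated host launcher; the kernel stays private to this .cu file.
template <typename T>
void Impl_Scale(cudaStream_t stream, const T* input, T* output, T factor, size_t count) {
  if (count == 0) return;
  constexpr int kThreads = 256;
  const int blocks = static_cast<int>((count + kThreads - 1) / kThreads);
  ScaleKernel<T><<<blocks, kThreads, 0, stream>>>(input, output, factor, count);
}

// One macro invocation per element type; the trailing backslashes are what
// the formatter realigns in the real file.
#define SPECIALIZED_SCALE_IMPL(T) \
  template void Impl_Scale<T>(cudaStream_t, const T*, T*, T, size_t);

SPECIALIZED_SCALE_IMPL(float)
SPECIALIZED_SCALE_IMPL(double)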

onnxruntime/contrib_ops/cuda/bert/attention_impl.cu

Lines changed: 81 additions & 86 deletions
@@ -358,42 +358,42 @@ Status LeanAttention(
   constexpr bool is_bf16 = false;

   ORT_RETURN_IF_ERROR(onnxruntime::lean::mha_fwd_kvcache(
-    device_prop, stream,
-    data.q,
-    data.k, // k_cache
-    data.v, // v_cache
-    nullptr, // new_k (we have appended new_k to k_cache)
-    nullptr, // new_v (we have appended new_v to k_cache)
-    data.output,
-    reinterpret_cast<void*>(data.softmax_lse),
-    nullptr, // seqlens_k
-    nullptr, // cos_cache
-    nullptr, // sin_cache
-    nullptr, // block_table
-    parameters.batch_size,
-    parameters.num_heads,
-    parameters.num_heads, // num_heads_k
-    parameters.head_size,
-    parameters.sequence_length, // seqlen_q
-    parameters.total_sequence_length, // seqlen_k
-    0, // seqlen_k_new
-    0, // rotary_dim
-    scale, // softmax_scale
-    parameters.is_unidirectional,
-    is_bf16,
-    false, // past_bsnh
-    data.num_splits,
-    data.grid_dim_z,
-    data.max_tiles_per_tb,
-    data.high_load_tbs,
-    data.tiles_per_head,
-    reinterpret_cast<void*>(data.softmax_lse_accum),
-    reinterpret_cast<void*>(data.out_accum),
-    data.lean_sync_flag,
-    -1, // local_window_size
-    false, // is_rotary_interleaved
-    false // is_packed_qkv
-    ));
+      device_prop, stream,
+      data.q,
+      data.k,  // k_cache
+      data.v,  // v_cache
+      nullptr,  // new_k (we have appended new_k to k_cache)
+      nullptr,  // new_v (we have appended new_v to k_cache)
+      data.output,
+      reinterpret_cast<void*>(data.softmax_lse),
+      nullptr,  // seqlens_k
+      nullptr,  // cos_cache
+      nullptr,  // sin_cache
+      nullptr,  // block_table
+      parameters.batch_size,
+      parameters.num_heads,
+      parameters.num_heads,  // num_heads_k
+      parameters.head_size,
+      parameters.sequence_length,  // seqlen_q
+      parameters.total_sequence_length,  // seqlen_k
+      0,  // seqlen_k_new
+      0,  // rotary_dim
+      scale,  // softmax_scale
+      parameters.is_unidirectional,
+      is_bf16,
+      false,  // past_bsnh
+      data.num_splits,
+      data.grid_dim_z,
+      data.max_tiles_per_tb,
+      data.high_load_tbs,
+      data.tiles_per_head,
+      reinterpret_cast<void*>(data.softmax_lse_accum),
+      reinterpret_cast<void*>(data.out_accum),
+      data.lean_sync_flag,
+      -1,  // local_window_size
+      false,  // is_rotary_interleaved
+      false  // is_packed_qkv
+      ));

   return Status::OK();
 }
@@ -414,8 +414,6 @@ Status LeanAttention(
 }
 #endif

-
-
 template <typename T>
 Status CudnnFlashAttention(
     cudnnHandle_t cudnn_handle,
@@ -439,21 +437,21 @@ Status CudnnFlashAttention(
       data.k,
       data.v,
       attention_bias,
-      nullptr, // (optional) mask_sequence_lengths_q
-      mask_sequence_lengths_kv, // (optional) mask_sequence_lengths_kv
+      nullptr,  // (optional) mask_sequence_lengths_q
+      mask_sequence_lengths_kv,  // (optional) mask_sequence_lengths_kv
       parameters.batch_size,
-      parameters.num_heads, // num_heads_q,
-      parameters.num_heads, // num_heads_kv,
-      parameters.head_size, // head_size_qk
-      parameters.v_head_size, // head_size_v
-      parameters.sequence_length, // sequence_length_q
-      parameters.total_sequence_length, // sequence_length_kv
-      scale, // scaling factor applied prior softmax
-      parameters.is_unidirectional, // causal
-      is_bf16, // True if bfloat16, otherwise float16
-      parameters.broadcast_attn_bias_dim_0, // broadcast attention bias dimension 0 or not
-      parameters.broadcast_attn_bias_dim_1, // broadcast attention bias dimension 1 or not
-      0, // sliding window length. 0 means no sliding window.
+      parameters.num_heads,  // num_heads_q,
+      parameters.num_heads,  // num_heads_kv,
+      parameters.head_size,  // head_size_qk
+      parameters.v_head_size,  // head_size_v
+      parameters.sequence_length,  // sequence_length_q
+      parameters.total_sequence_length,  // sequence_length_kv
+      scale,  // scaling factor applied prior softmax
+      parameters.is_unidirectional,  // causal
+      is_bf16,  // True if bfloat16, otherwise float16
+      parameters.broadcast_attn_bias_dim_0,  // broadcast attention bias dimension 0 or not
+      parameters.broadcast_attn_bias_dim_1,  // broadcast attention bias dimension 1 or not
+      0,  // sliding window length. 0 means no sliding window.
       data.qkv_format,
       cudnn_handle,
       ort_stream,
@@ -540,10 +538,9 @@ Status EfficientAttention(

 template <typename T, typename QK>
 Status LaunchDecoderMaskedMultiHeadAttention(
-  const DecoderMaskedMultiHeadAttentionParameters& parameters,
-  cudaStream_t stream,
-  const int head_size) {
-
+    const DecoderMaskedMultiHeadAttentionParameters& parameters,
+    cudaStream_t stream,
+    const int head_size) {
   DUMP_STRING_INIT();
   DUMP_STRING("DMMHA parameters...");
   DUMP_STRING("is_mha = ", (parameters.is_mha == true));
@@ -763,7 +760,7 @@ Status UnfusedAttention(
   if (nullptr != data.output_qk) {
     int64_t qk_size = (int64_t)batch_size * num_heads * sequence_length * total_sequence_length;
     ORT_RETURN_IF_ERROR(
-      (CopyQK<T, QK>(stream, static_cast<int>(qk_size), data.scratch, reinterpret_cast<QK*>(data.output_qk))));
+        (CopyQK<T, QK>(stream, static_cast<int>(qk_size), data.scratch, reinterpret_cast<QK*>(data.output_qk))));
   }
   ORT_RETURN_IF_ERROR(
       ComputeSoftmax<T>(
@@ -802,7 +799,7 @@ Status ConcatPastToPresent(int batch_size, int num_heads, int qk_head_size, int
   // past_v (BxNxPxH) + v (BxNxLxH) => present_v (BxNxTxH)
   // When there is past state, the head size for Q/K/V shall be same: H == H_v.

-  if (nullptr != data.present) { // Attention op
+  if (nullptr != data.present) {  // Attention op
     assert(data.qkv_format == AttentionQkvFormat::Q_K_V_BNSH ||
            data.qkv_format == AttentionQkvFormat::Q_K_V_BNSH_QKV_BS3NH);

@@ -811,12 +808,10 @@ Status ConcatPastToPresent(int batch_size, int num_heads, int qk_head_size, int
         stream, total_sequence_length, sequence_length, batch_size, qk_head_size, num_heads,
         max_threads_per_block, 2, data.past, data.k, data.present));

-
-
     // Update pointers to present_k and present_v.
     data.k = data.present;
     data.v = data.present + batch_size * num_heads * total_sequence_length * qk_head_size;
-  } else { // MultiHeadAttention op
+  } else {  // MultiHeadAttention op
     if (nullptr != data.present_key) {
       ORT_ENFORCE(data.qkv_format == AttentionQkvFormat::Q_K_V_BNSH ||
                   data.qkv_format == AttentionQkvFormat::Q_K_V_BSNH_BNSH_BNSH);
@@ -826,16 +821,16 @@ Status ConcatPastToPresent(int batch_size, int num_heads, int qk_head_size, int

       ORT_RETURN_IF_ERROR(
           LaunchConcatTensorToTensor(stream, total_sequence_length, sequence_length,
-                                    batch_size, qk_head_size, num_heads,
-                                    max_threads_per_block, 1, data.past_key, data.k, data.present_key));
+                                     batch_size, qk_head_size, num_heads,
+                                     max_threads_per_block, 1, data.past_key, data.k, data.present_key));
       ORT_RETURN_IF_ERROR(
           LaunchConcatTensorToTensor(stream, total_sequence_length, sequence_length,
-                                    batch_size, v_head_size, num_heads,
-                                    max_threads_per_block, 1, data.past_value, data.v, data.present_value));
+                                     batch_size, v_head_size, num_heads,
+                                     max_threads_per_block, 1, data.past_value, data.v, data.present_value));
       // Update pointers to present_k and present_v.
       data.k = data.present_key;
       data.v = data.present_value;
-    } else { // nullptr == data.past_key && nullptr != data.present_key
+    } else {  // nullptr == data.past_key && nullptr != data.present_key
       if (data.k != data.present_key) {
         int64_t k_size = (int64_t)batch_size * num_heads * total_sequence_length * qk_head_size;
         cudaMemcpyAsync(data.present_key, data.k, k_size * sizeof(T), cudaMemcpyDeviceToDevice, stream);
@@ -889,7 +884,7 @@ Status PastPresentBufferShare(int batch_size, int num_heads, int qk_head_size, i
     return Status::OK();
   }

-  if (combined_key_value) { // Attention op
+  if (combined_key_value) {  // Attention op
     assert(data.gemm_buffer != nullptr);

     if (data.present != data.past) {
@@ -924,9 +919,9 @@ Status PastPresentBufferShare(int batch_size, int num_heads, int qk_head_size, i
     constexpr bool is_past_kv_bnsh_format = true;
     constexpr bool is_new_kv_bnsh_format = true;
     ORT_RETURN_IF_ERROR(LaunchConcatKVInPlace(
-      batch_size, num_heads, qk_head_size, parameters.max_sequence_length,
-      data.seqlens_k_total, nullptr, parameters.sequence_length, data.k, data.v, data.present_key, data.present_value,
-      is_past_kv_bnsh_format, is_new_kv_bnsh_format, stream, max_threads_per_block));
+        batch_size, num_heads, qk_head_size, parameters.max_sequence_length,
+        data.seqlens_k_total, nullptr, parameters.sequence_length, data.k, data.v, data.present_key, data.present_value,
+        is_past_kv_bnsh_format, is_new_kv_bnsh_format, stream, max_threads_per_block));

     data.k = data.present_key;
     data.v = data.present_value;
@@ -981,13 +976,13 @@ Status QkvToContext(

   if (!parameters.past_present_share_buffer) {
     ORT_RETURN_IF_ERROR(ConcatPastToPresent<T>(batch_size, num_heads, qk_head_size, v_head_size,
-                                              sequence_length, total_sequence_length,
-                                              stream, max_threads_per_block, data));
+                                               sequence_length, total_sequence_length,
+                                               stream, max_threads_per_block, data));

   } else {  // past_present_share_buffer
     ORT_RETURN_IF_ERROR(PastPresentBufferShare<T>(batch_size, num_heads, qk_head_size, v_head_size,
-                                                 sequence_length, fused_runner,
-                                                 parameters, data, stream, max_threads_per_block));
+                                                  sequence_length, fused_runner,
+                                                  parameters, data, stream, max_threads_per_block));
   }

   // Q, K and V are ready now
@@ -1078,24 +1073,24 @@ template Status QkvToContext<half, float>(
     AttentionData<half>& data);

 template Status LaunchDecoderMaskedMultiHeadAttention<float, float>(
-  const DecoderMaskedMultiHeadAttentionParameters& parameters,
-  cudaStream_t stream,
-  const int head_size);
+    const DecoderMaskedMultiHeadAttentionParameters& parameters,
+    cudaStream_t stream,
+    const int head_size);

 template Status LaunchDecoderMaskedMultiHeadAttention<float, half>(
-  const DecoderMaskedMultiHeadAttentionParameters& parameters,
-  cudaStream_t stream,
-  const int head_size);
+    const DecoderMaskedMultiHeadAttentionParameters& parameters,
+    cudaStream_t stream,
+    const int head_size);

 template Status LaunchDecoderMaskedMultiHeadAttention<uint16_t, float>(
-  const DecoderMaskedMultiHeadAttentionParameters& parameters,
-  cudaStream_t stream,
-  const int head_size);
+    const DecoderMaskedMultiHeadAttentionParameters& parameters,
+    cudaStream_t stream,
+    const int head_size);

 template Status LaunchDecoderMaskedMultiHeadAttention<uint16_t, half>(
-  const DecoderMaskedMultiHeadAttentionParameters& parameters,
-  cudaStream_t stream,
-  const int head_size);
+    const DecoderMaskedMultiHeadAttentionParameters& parameters,
+    cudaStream_t stream,
+    const int head_size);

 }  // namespace cuda
 }  // namespace contrib
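
Not part of the commit, but for context: the last hunk only re-indents the explicit template instantiations that close this file. A minimal sketch of that convention follows, with hypothetical names; the point is that the specializations are compiled in this .cu translation unit, so callers elsewhere can link against them through a plain declaration in a header.

// explicit_instantiation_sketch.cu -- illustrative sketch, not code from this file.
#include <cuda_runtime.h>
#include <cstddef>

template <typename T, typename QK>
__global__ void CastCopyKernel(const T* src, QK* dst, size_t count) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < count) dst[i] = static_cast<QK>(src[i]);
}

// Templated launcher defined in this .cu file.
template <typename T, typename QK>
void LaunchCastCopy(cudaStream_t stream, const T* src, QK* dst, size_t count) {
  if (count == 0) return;
  constexpr int kThreads = 256;
  const int blocks = static_cast<int>((count + kThreads - 1) / kThreads);
  CastCopyKernel<T, QK><<<blocks, kThreads, 0, stream>>>(src, dst, count);
}

// Explicit instantiations: the compiler emits these specializations here, so
// other translation units only need a matching declaration.
template void LaunchCastCopy<float, float>(cudaStream_t, const float*, float*, size_t);
template void LaunchCastCopy<float, double>(cudaStream_t, const float*, double*, size_t);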

onnxruntime/contrib_ops/cuda/bert/attention_kv_cache.cu

Lines changed: 0 additions & 1 deletion
@@ -197,7 +197,6 @@ Status LaunchConcatTensorToTensor(cudaStream_t stream,
   return CUDA_CALL(cudaGetLastError());
 }

-
 #ifndef USE_ROCM  // exclude the following from hipify since they are not used in ROCM EP

 // ----------------------------------------------------------------------------------
