@@ -3150,7 +3150,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 }
 
 // rope == RoPE == rotary positional embedding
-static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
+static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
+                                const float p_delta, const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
 
     if (col >= ncols) {
@@ -3160,7 +3161,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
     const int i = row*ncols + col;
 
-    const float theta = p*powf(theta_scale, col/2);
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
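For reference, here is a minimal CPU sketch of the math each thread performs after this change (illustrative only; rope_pair_ref and its layout are assumptions, not code from the commit). The single scalar position p is replaced by a per-row position p0 + p_delta * (row/p_delta_rows), where row/p_delta_rows is integer division:

#include <math.h>

// CPU reference for the rotation applied to one column pair
// (a sketch under the assumptions above, not part of the commit).
static void rope_pair_ref(const float * x, float * dst, const int ncols,
                          const int row, const int col,
                          const float p0, const float p_delta,
                          const int p_delta_rows, const float theta_scale) {
    const int i = row*ncols + col;

    // per-row position instead of a single scalar p; row/p_delta_rows is integer division
    const float theta     = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + 1];

    // 2D rotation of the (x0, x1) pair by theta
    dst[i + 0] = x0*cos_theta - x1*sin_theta;
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
}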
@@ -3764,12 +3765,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
-static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
+static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
+    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
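A quick note on the launch geometry, assuming CUDA_ROPE_BLOCK_SIZE is 256 as defined near the top of this file: each block has 2*256 = 512 threads, each thread rotates one column pair, and grid dimension y spans the rows. For ncols = 4096 that gives num_blocks_x = (4096 + 511)/512 = 8, i.e. a (8, nrows, 1) grid, with the col >= ncols guard in rope_f32 discarding any threads past the end of a row.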
@@ -4465,6 +4467,7 @@ inline void ggml_cuda_op_rope(
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
     const int n_past = ((int32_t *) dst->op_params)[0];
@@ -4478,17 +4481,18 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
-    bool is_glm = mode & 4;
+    const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
+        const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
     } else {
-        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     }
 
     (void) src1;
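The point of the new parameters is to let a single flattened launch cover rows from several i02 slices: the kernel recovers the slice index from the flattened row as row / ne01 and rebuilds the old per-slice position from the base p0. Below is a small standalone check of that equivalence for the (mode & 1) == 0 path (an illustrative sketch with made-up values, not code from the commit; with freq_scale != 1 the two expressions can differ by float rounding):

#include <assert.h>

int main(void) {
    const int   n_past     = 32;    // illustrative values
    const int   ne01       = 4;     // rows per i02 slice (src0->ne[1])
    const float freq_scale = 1.0f;

    for (int row = 0; row < 3*ne01; ++row) {
        const int   i02   = row / ne01;                       // slice the row belongs to
        const float p_old = (n_past + i02) * freq_scale;      // old code: one p per slice
        const float p0    = n_past * freq_scale;              // new code: base position
        const float p_new = p0 + freq_scale * (row / ne01);   // reconstruction in rope_f32
        assert(p_old == p_new);
    }
    return 0;
}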
@@ -5103,7 +5107,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
+
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const bool is_glm = mode & 4;
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
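For context on the last hunk: bit 2 of the mode word (mode & 4) selects the GLM RoPE variant, and rope_glm_f32_cuda still takes a single scalar position, so the flattening flag passed to ggml_cuda_op is now enabled only on the non-GLM path, replacing the earlier blanket FIXME.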