Skip to content

Commit 31facce

Browse files
Merge pull request #114 from menloresearch/update-dev-from-master-2025-06-04-00-09
Sync master with upstream release b5581
2 parents a0efdc7 + 71e74a3 commit 31facce

File tree

8 files changed

+1163
-3
lines changed

8 files changed

+1163
-3
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,12 @@ set(GGML_OPENCL_KERNELS
9595
sub
9696
sum_rows
9797
transpose
98+
concat
99+
tsembd
100+
upscale
101+
tanh
102+
pad
103+
repeat
98104
)
99105

100106
foreach (K ${GGML_OPENCL_KERNELS})

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 747 additions & 3 deletions
Large diffs are not rendered by default.
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
kernel void kernel_concat_f32_contiguous(
2+
global const char * p_src0, ulong off_src0,
3+
global const char * p_src1, ulong off_src1,
4+
global char * p_dst, ulong off_dst,
5+
int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
6+
int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
7+
int d_ne0, int d_ne1, int d_ne2, // dst->ne[0..2] for the slice
8+
int dim
9+
) {
10+
global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
11+
global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
12+
global float * dst = (global float*)((global char*)p_dst + off_dst);
13+
14+
int i0 = get_global_id(0); // Index along dst's 0th dimension
15+
int i1 = get_global_id(1); // Index along dst's 1st dimension
16+
int i2 = get_global_id(2); // Index along dst's 2nd dimension
17+
18+
if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
19+
return;
20+
}
21+
22+
ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
23+
ulong src_idx;
24+
25+
if (dim == 0) {
26+
if (i0 < d_ne00) { // Data from src0
27+
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
28+
dst[dst_idx] = src0[src_idx];
29+
} else { // Data from src1
30+
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
31+
dst[dst_idx] = src1[src_idx];
32+
}
33+
} else if (dim == 1) {
34+
if (i1 < d_ne01) { // Data from src0
35+
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
36+
dst[dst_idx] = src0[src_idx];
37+
} else { // Data from src1
38+
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
39+
dst[dst_idx] = src1[src_idx];
40+
}
41+
} else if (dim == 2) {
42+
if (i2 < d_ne02) { // Data from src0
43+
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
44+
dst[dst_idx] = src0[src_idx];
45+
} else { // Data from src1
46+
47+
src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
48+
dst[dst_idx] = src1[src_idx];
49+
}
50+
}
51+
}
52+
53+
kernel void kernel_concat_f32_non_contiguous(
54+
global const char * p_src0, ulong off_src0,
55+
global const char * p_src1, ulong off_src1,
56+
global char * p_dst, ulong off_dst,
57+
58+
long ne00, long ne01, long ne02, long ne03,
59+
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
60+
61+
ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1
62+
63+
long d_ne0, long d_ne1, long d_ne2, long d_ne3,
64+
ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
65+
int dim
66+
) {
67+
global const char * src0_base = p_src0 + off_src0;
68+
global const char * src1_base = p_src1 + off_src1;
69+
global char * dst_base = p_dst + off_dst;
70+
71+
long current_i1 = get_global_id(0); // Index for dst_dim_1
72+
long current_i2 = get_global_id(1); // Index for dst_dim_2
73+
long current_i3 = get_global_id(2); // Index for dst_dim_3
74+
75+
if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
76+
return;
77+
}
78+
79+
global const float * x_val_ptr;
80+
global float * y_val_ptr;
81+
82+
for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
83+
bool use_src0;
84+
long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;
85+
86+
if (dim == 0) {
87+
use_src0 = (current_i0 < ne00);
88+
if (!use_src0) { s_i0 = current_i0 - ne00; }
89+
} else if (dim == 1) {
90+
use_src0 = (current_i1 < ne01);
91+
if (!use_src0) { s_i1 = current_i1 - ne01; }
92+
} else if (dim == 2) {
93+
use_src0 = (current_i2 < ne02);
94+
if (!use_src0) { s_i2 = current_i2 - ne02; }
95+
} else { // dim == 3
96+
use_src0 = (current_i3 < ne03);
97+
if (!use_src0) { s_i3 = current_i3 - ne03; }
98+
}
99+
100+
if (use_src0) {
101+
x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
102+
} else {
103+
x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
104+
}
105+
106+
y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
107+
*y_val_ptr = *x_val_ptr;
108+
}
109+
}

ggml/src/ggml-opencl/kernels/pad.cl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
kernel void kernel_pad(
2+
global const void * src0_ptr,
3+
ulong src0_offset,
4+
global void * dst_ptr,
5+
ulong dst_offset,
6+
int s_ne0, int s_ne1, int s_ne2,
7+
int d_ne0, int d_ne1, int d_ne2
8+
) {
9+
global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset);
10+
global float * dst = (global float *)((global char *)dst_ptr + dst_offset);
11+
12+
int nidx = get_global_id(0);
13+
int idx_d1 = get_group_id(1);
14+
int idx_d2 = get_group_id(2);
15+
16+
if (nidx >= d_ne0) {
17+
return;
18+
}
19+
20+
int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1;
21+
22+
bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2);
23+
24+
if (in_src_bounds) {
25+
int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1;
26+
dst[dst_el_offset] = src0[src_el_offset];
27+
} else {
28+
dst[dst_el_offset] = 0.0f;
29+
}
30+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
kernel void kernel_repeat(
2+
global const char * src0_data_in,
3+
global char * dst_data_in,
4+
ulong src0_offset,
5+
ulong dst_offset,
6+
int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
7+
ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
8+
int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
9+
ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
10+
) {
11+
global const char * src0_data = src0_data_in + src0_offset;
12+
global char * dst_data = dst_data_in + dst_offset;
13+
14+
const int d3 = get_global_id(2);
15+
const int d2 = get_global_id(1);
16+
const int d1 = get_global_id(0);
17+
18+
if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
19+
return;
20+
}
21+
22+
const int s3 = d3 % src0_ne3;
23+
const int s2 = d2 % src0_ne2;
24+
const int s1 = d1 % src0_ne1;
25+
26+
const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
27+
global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;
28+
29+
for (int d0 = 0; d0 < dst_ne0; ++d0) {
30+
// Determine source index for dimension 0 based on tiling/broadcasting.
31+
const int s0 = d0 % src0_ne0;
32+
33+
const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
34+
global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0;
35+
for (int k = 0; k < src0_nb0; ++k) {
36+
current_dst_el_ptr[k] = current_src_el_ptr[k];
37+
}
38+
}
39+
}

ggml/src/ggml-opencl/kernels/tanh.cl

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
#ifdef cl_intel_required_subgroup_size
4+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
5+
#define INTEL_GPU 1
6+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
7+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
8+
#elif defined(cl_qcom_reqd_sub_group_size)
9+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
10+
#define ADRENO_GPU 1
11+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
12+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
13+
#endif
14+
15+
kernel void kernel_tanh_f32_nd(
16+
global void * p_src0_base, ulong off_src0_abs,
17+
global void * p_dst_base, ulong off_dst_abs,
18+
int ne00, int ne01, int ne02, int ne03,
19+
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
20+
int ne10, int ne11, int ne12, int ne13,
21+
ulong nb10, ulong nb11, ulong nb12, ulong nb13
22+
) {
23+
int i0 = get_global_id(0);
24+
int i1 = get_global_id(1);
25+
int i2 = get_global_id(2);
26+
27+
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
28+
for (int i3 = 0; i3 < ne13; ++i3) {
29+
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
30+
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
31+
32+
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
33+
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
34+
35+
*dst_val_ptr = tanh(*src_val_ptr);
36+
}
37+
}
38+
}
39+
40+
kernel void kernel_tanh_f16_nd(
41+
global void * p_src0_base, ulong off_src0_abs,
42+
global void * p_dst_base, ulong off_dst_abs,
43+
int ne00, int ne01, int ne02, int ne03,
44+
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
45+
int ne10, int ne11, int ne12, int ne13,
46+
ulong nb10, ulong nb11, ulong nb12, ulong nb13
47+
) {
48+
int i0 = get_global_id(0);
49+
int i1 = get_global_id(1);
50+
int i2 = get_global_id(2);
51+
52+
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
53+
for (int i3 = 0; i3 < ne13; ++i3) {
54+
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
55+
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
56+
57+
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
58+
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
59+
60+
*dst_val_ptr = tanh(*src_val_ptr);
61+
}
62+
}
63+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
kernel void kernel_timestep_embedding(
2+
global const void * p_timesteps,
3+
ulong off_timesteps,
4+
global void * p_dst,
5+
ulong off_dst,
6+
int dst_nb1_bytes,
7+
int logical_dim,
8+
int max_period
9+
) {
10+
int local_i;
11+
int local_j;
12+
int local_half_dim;
13+
float local_timestep_val;
14+
float local_freq;
15+
float local_arg;
16+
global float * local_embed_data_ptr;
17+
global const float * local_timesteps_input_ptr;
18+
global float * local_dst_output_base_ptr;
19+
20+
local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps);
21+
local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst);
22+
23+
local_i = get_global_id(1);
24+
local_j = get_global_id(0);
25+
26+
local_half_dim = logical_dim / 2;
27+
local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);
28+
29+
if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
30+
local_embed_data_ptr[logical_dim] = 0.0f;
31+
}
32+
33+
if (local_j >= local_half_dim) {
34+
return;
35+
}
36+
37+
local_timestep_val = local_timesteps_input_ptr[local_i];
38+
39+
if (local_half_dim == 0) {
40+
local_freq = 1.0f;
41+
} else {
42+
local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim);
43+
}
44+
45+
local_arg = local_timestep_val * local_freq;
46+
local_embed_data_ptr[local_j] = cos(local_arg);
47+
local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg);
48+
}

0 commit comments

Comments
 (0)