Skip to content

SYCL: Fix im2col #12910

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4018,8 +4018,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16;
return true;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_POOL_2D:
Expand Down
149 changes: 82 additions & 67 deletions ggml/src/ggml-sycl/im2col.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,110 +12,125 @@

#include "im2col.hpp"

#include <sycl/sycl.hpp>
#include <type_traits> // For std::is_same_v

#include "ggml.h"

template <typename T>
static void im2col_kernel(
const float *x, T *dst, int64_t batch_offset, int64_t offset_delta,
int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
int64_t pelements, int64_t CHW, int s0, int s1, int p0, int p1, int d0, int d1,
const sycl::nd_item<3> &item_ct1) {
static void im2col_kernel(const float * x, T * dst, int64_t batch_offset, int64_t offset_delta, int64_t IC, int64_t IW,
int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
int s0, int s1, int p0, int p1, int d0, int d1, const sycl::nd_item<3> & item_ct1) {
const int64_t work_group_size = item_ct1.get_local_range(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
const int64_t global_id = item_ct1.get_local_id(2) + (work_group_size * item_ct1.get_group(2));

// make each work-item deal with more elements since sycl global range can not exceed max int
for (int64_t i = global_id; i < pelements; i += work_group_size * item_ct1.get_group_range(2)) {

for (int64_t i = global_id; i < pelements; i += (work_group_size * item_ct1.get_group_range(2))) {
const int64_t ksize = OW * (KH > 1 ? KW : 1);
const int64_t kx = i / ksize;
const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW;

const int64_t oh = item_ct1.get_group(1);
const int64_t batch = item_ct1.get_group(0) / IC;
const int64_t ic = item_ct1.get_group(0) % IC;

const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1;

const int64_t offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);

if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] =
sycl::vec<float, 1>(0.0f)
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
} else {
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
dst[offset_dst] =
sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
const int64_t kx = i / ksize;
const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW;

const int64_t oh = item_ct1.get_group(1);
const int64_t batch = item_ct1.get_group(0) / IC;
const int64_t ic = item_ct1.get_group(0) % IC;

const int64_t iiw = (ix * s0) + (kx * d0) - p0;
const int64_t iih = (oh * s1) + (ky * d1) - p1;

const int64_t offset_dst = (((batch * OH + oh) * OW + ix) * CHW) + (ic * (KW * KH) + ky * KW + kx);

const int64_t offset_src_base = (ic * offset_delta) + (batch * batch_offset);
const int64_t offset_src = offset_src_base + (iih * IW) + iiw;

const bool out_of_bounds = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW);
const float src_val = out_of_bounds ? 0.0f : x[offset_src];

if constexpr (std::is_same_v<T, sycl::half>) {
dst[offset_dst] = sycl::half(src_val);
} else if constexpr (std::is_same_v<T, float>) {
dst[offset_dst] = src_val;
}
}
}

template <typename T>
static void im2col_sycl(
const float *x, T *dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0, int s1, int p0, int p1, int d0, int d1,
queue_ptr stream) {
static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0, int s1, int p0, int p1, int d0, int d1, queue_ptr stream) {
const int64_t parallel_elements = OW * KW * KH;
const int64_t num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
const int64_t num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;

// decrease global range when it exceeds the max int
int64_t local_size = downsample_sycl_global_range(batch * IC * OH * num_blocks, SYCL_IM2COL_BLOCK_SIZE);

sycl::range<3> block_nums(batch * IC, OH, num_blocks);
sycl::range<3> local_range(1, 1, local_size);

{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});

stream->parallel_for(
sycl::nd_range<3>(block_nums * local_range, local_range),
[=](sycl::nd_item<3> item_ct1) {
im2col_kernel(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH,
parallel_elements, (IC * KH * KW), s0, s1, p0,
p1, d0, d1, item_ct1);
});
const int64_t CHW = IC * KH * KW;

stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
im2col_kernel<T>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1,
p0, p1, d0, d1, item_ct1);
});
}

static void im2col_sycl_f16(const float * x, sycl::half * dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH,
int64_t KW, int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset,
int64_t offset_delta, int s0, int s1, int p0, int p1, int d0, int d1, queue_ptr stream) {
if (!stream->get_device().has(sycl::aspect::fp16)) {
throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported),
"Device does not support half precision (fp16) operations!");
}
im2col_sycl_internal<sycl::half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0,
p1, d0, d1, stream);
}

static void im2col_sycl_f32(const float * x, float * dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta, int s0,
int s1, int p0, int p1, int d0, int d1, queue_ptr stream) {
im2col_sycl_internal<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1,
d0, d1, stream);
}

void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];

GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
const int32_t s0 = ((const int32_t *) (dst->op_params))[0];
const int32_t s1 = ((const int32_t *) (dst->op_params))[1];
const int32_t p0 = ((const int32_t *) (dst->op_params))[2];
const int32_t p1 = ((const int32_t *) (dst->op_params))[3];
const int32_t d0 = ((const int32_t *) (dst->op_params))[4];
const int32_t d1 = ((const int32_t *) (dst->op_params))[5];

const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1;

const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0];
const int64_t IW = src1->ne[0];

const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0];
const int64_t KW = src0->ne[0];

const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];
const int64_t OW = dst->ne[1];

const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / sizeof(float);
const int64_t batch = src1->ne[is_2D ? 3 : 2];
const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / sizeof(float);

const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[3];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
queue_ptr stream = ctx.stream();

if (dst->type == GGML_TYPE_F16) {
im2col_sycl((const float *) src1->data, (sycl::half *)dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, ctx.stream());
im2col_sycl_f16((const float *) src1->data, (sycl::half *) dst->data, IW, IH, OW, OH, KW, KH, IC, batch,
batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
} else {
im2col_sycl((const float *) src1->data, (float *)dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, ctx.stream());
im2col_sycl_f32((const float *) src1->data, (float *) dst->data, IW, IH, OW, OH, KW, KH, IC, batch,
batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
}
}
Loading