diff --git a/src/ggml-metal/ggml-metal-device.cpp b/src/ggml-metal/ggml-metal-device.cpp index 5607deaf41..08095dcf06 100644 --- a/src/ggml-metal/ggml-metal-device.cpp +++ b/src/ggml-metal/ggml-metal-device.cpp @@ -1438,6 +1438,30 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d(ggml_met return res; } +ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_CONV_2D); + + GGML_ASSERT(ggml_is_contiguous(op->src[0])); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_conv_2d_%s_%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name); + if (res) { + return res; + } + + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + + return res; +} + ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_UPSCALE); diff --git a/src/ggml-metal/ggml-metal-device.h b/src/ggml-metal/ggml-metal-device.h index cb27dca989..5a8bc0c1cc 100644 --- a/src/ggml-metal/ggml-metal-device.h +++ b/src/ggml-metal/ggml-metal-device.h @@ -133,6 +133,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_me ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op); +ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/src/ggml-metal/ggml-metal-device.m b/src/ggml-metal/ggml-metal-device.m index 606cfd0a5e..5d0d3f0d37 100644 --- a/src/ggml-metal/ggml-metal-device.m +++ b/src/ggml-metal/ggml-metal-device.m @@ -883,6 +883,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return true; case GGML_OP_IM2COL: return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); + case GGML_OP_CONV_2D: + return ggml_is_contiguous(op->src[0]) && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32); case GGML_OP_POOL_1D: return false; case GGML_OP_UPSCALE: diff --git a/src/ggml-metal/ggml-metal-impl.h b/src/ggml-metal/ggml-metal-impl.h index 7a878a657b..6d02befa97 100644 --- a/src/ggml-metal/ggml-metal-impl.h +++ b/src/ggml-metal/ggml-metal-impl.h @@ -528,6 +528,36 @@ typedef struct { uint64_t nb2; } ggml_metal_kargs_conv_transpose_2d; +typedef struct { + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; + int32_t IW; + int32_t IH; + int32_t KW; + int32_t KH; + int32_t IC; + int32_t OC; + int32_t OW; + int32_t OH; + int32_t N; + int32_t s0; + int32_t s1; + int32_t p0; + int32_t p1; + int32_t d0; + int32_t d1; +} ggml_metal_kargs_conv_2d; + typedef struct { uint64_t ofs0; uint64_t ofs1; diff --git a/src/ggml-metal/ggml-metal-ops.cpp b/src/ggml-metal/ggml-metal-ops.cpp index 7a85edbdcd..e42163d9a3 100644 --- a/src/ggml-metal/ggml-metal-ops.cpp +++ b/src/ggml-metal/ggml-metal-ops.cpp @@ -10,6 +10,7 @@ #include #include +#include static ggml_metal_buffer_id ggml_metal_get_buffer_id(const ggml_tensor * t) { if (!t) { @@ -364,6 +365,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_im2col(ctx, idx); } break; + case GGML_OP_CONV_2D: + { + n_fuse = ggml_metal_op_conv_2d(ctx, idx); + } break; case GGML_OP_CONV_TRANSPOSE_1D: { n_fuse = ggml_metal_op_conv_transpose_1d(ctx, idx); @@ -503,10 +508,10 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) { /*.ne1 =*/ ne1, /*.ne2 =*/ ne2, /*.ne3 =*/ ne3, - /*.nb0 =*/ nb0, - /*.nb1 =*/ nb1, - /*.nb2 =*/ nb2, - /*.nb3 =*/ nb3, + /*.nb0 =*/ (uint64_t) nb0, + /*.nb1 =*/ (uint64_t) nb1, + /*.nb2 =*/ (uint64_t) nb2, + /*.nb3 =*/ (uint64_t) nb3, /*.dim =*/ dim, }; @@ -3077,6 +3082,84 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + GGML_TENSOR_LOCALS( int32_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint32_t, nb, op, nb); + + GGML_ASSERT(ggml_is_contiguous(op->src[0])); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t *) op->op_params)[0]; + const int32_t s1 = ((const int32_t *) op->op_params)[1]; + const int32_t p0 = ((const int32_t *) op->op_params)[2]; + const int32_t p1 = ((const int32_t *) op->op_params)[3]; + const int32_t d0 = ((const int32_t *) op->op_params)[4]; + const int32_t d1 = ((const int32_t *) op->op_params)[5]; + + ggml_metal_kargs_conv_2d args = { + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + /*.IW =*/ ne10, + /*.IH =*/ ne11, + /*.KW =*/ ne00, + /*.KH =*/ ne01, + /*.IC =*/ ne02, + /*.OC =*/ ne03, + /*.OW =*/ ne0, + /*.OH =*/ ne1, + /*.N =*/ ne3, + /*.s0 =*/ s0, + /*.s1 =*/ s1, + /*.p0 =*/ p0, + /*.p1 =*/ p1, + /*.d0 =*/ d0, + /*.d1 =*/ d1, + }; + + ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op); + + int nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline); + nth = std::min(nth, 256); + nth = std::max(nth, 1); + + const uint64_t n_out = (uint64_t) ne0 * ne1 * ne2 * ne3; + uint64_t tg64 = (n_out + nth - 1)/nth; + tg64 = std::max(tg64, 1); + tg64 = std::min(tg64, (uint64_t) std::numeric_limits::max()); + const int tg = (int) tg64; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_dispatch_threadgroups(enc, tg, 1, 1, nth, 1, 1); + + return 1; +} + int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/src/ggml-metal/ggml-metal-ops.h b/src/ggml-metal/ggml-metal-ops.h index 0d9cb8af7c..3cf400dc45 100644 --- a/src/ggml-metal/ggml-metal-ops.h +++ b/src/ggml-metal/ggml-metal-ops.h @@ -70,6 +70,7 @@ int ggml_metal_op_group_norm (ggml_metal_op_t ctx, int idx); int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx); int ggml_metal_op_rope (ggml_metal_op_t ctx, int idx); int ggml_metal_op_im2col (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_conv_2d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx); diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index cea535ade7..ded06cd724 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -4146,6 +4146,130 @@ template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; //template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext; //template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext; +template +kernel void kernel_conv_2d( + constant ggml_metal_kargs_conv_2d & args, + device const TK * weights, + device const float * src, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const uint threads_per_tg = ntg.x * ntg.y * ntg.z; + const uint tg_index = (tgpig.z * tgpg.y + tgpig.y) * tgpg.x + tgpig.x; + const uint local_thread = tpitg.z * (ntg.x * ntg.y) + tpitg.y * ntg.x + tpitg.x; + const uint thread_index = tg_index * threads_per_tg + local_thread; + const uint64_t total_threads = (uint64_t) threads_per_tg * tgpg.x * tgpg.y * tgpg.z; + const uint64_t total_outputs = (uint64_t) args.N * args.OC * args.OH * args.OW; + + const ulong stride_w = args.nb10 / sizeof(float); + const ulong stride_h = args.nb11 / sizeof(float); + const ulong stride_c = args.nb12 / sizeof(float); + const ulong stride_n = args.nb13 / sizeof(float); + const ulong dst_stride_w = args.nb0 / sizeof(float); + const ulong dst_stride_h = args.nb1 / sizeof(float); + const ulong dst_stride_c = args.nb2 / sizeof(float); + const ulong dst_stride_n = args.nb3 / sizeof(float); + + const ulong k_stride_w = args.nb00 / sizeof(TK); + const ulong k_stride_h = args.nb01 / sizeof(TK); + const ulong k_stride_c = args.nb02 / sizeof(TK); + const ulong k_stride_o = args.nb03 / sizeof(TK); + + for (uint64_t index = thread_index; index < total_outputs; index += total_threads) { + uint64_t tmp = index; + + const int32_t ow = tmp % args.OW; tmp /= args.OW; + const int32_t oh = tmp % args.OH; tmp /= args.OH; + const int32_t oc = tmp % args.OC; tmp /= args.OC; + const int32_t n = tmp; + + float acc = 0.0f; + + const int32_t base_x = ow*args.s0 - args.p0; + const int32_t base_y = oh*args.s1 - args.p1; + + int32_t ky_start = 0; + if (base_y < 0) { + ky_start = (-base_y + args.d1 - 1)/args.d1; + } + int32_t ky_end = args.KH; + const int32_t y_max = args.IH - 1 - base_y; + if (y_max < 0) { + ky_end = ky_start; + } else if (base_y + (args.KH - 1)*args.d1 >= args.IH) { + ky_end = min(ky_end, y_max/args.d1 + 1); + } + + int32_t kx_start = 0; + if (base_x < 0) { + kx_start = (-base_x + args.d0 - 1)/args.d0; + } + int32_t kx_end = args.KW; + const int32_t x_max = args.IW - 1 - base_x; + if (x_max < 0) { + kx_end = kx_start; + } else if (base_x + (args.KW - 1)*args.d0 >= args.IW) { + kx_end = min(kx_end, x_max/args.d0 + 1); + } + + if (ky_start < ky_end && kx_start < kx_end) { + const device const float * src_n = src + (ulong) n * stride_n; + const device const TK * w_oc = weights + (ulong) oc * k_stride_o; + + for (int32_t ic = 0; ic < args.IC; ++ic) { + const device const float * src_c = src_n + (ulong) ic * stride_c; + const device const TK * w_c = w_oc + (ulong) ic * k_stride_c; + + for (int32_t ky = ky_start; ky < ky_end; ++ky) { + const int32_t iy = base_y + ky*args.d1; + const device const float * src_row = src_c + (ulong) iy * stride_h; + const device const TK * w_row = w_c + (ulong) ky * k_stride_h; + + for (int32_t kx = kx_start; kx < kx_end; ++kx) { + const int32_t ix = base_x + kx*args.d0; + const device const float * src_elem = src_row + (ulong) ix * stride_w; + const device const TK * w_elem = w_row + (ulong) kx * k_stride_w; + + acc += (*src_elem) * (float) (*w_elem); + } + } + } + } + + device float * dst_ptr = dst + + (ulong) n * dst_stride_n + + (ulong) oc * dst_stride_c + + (ulong) oh * dst_stride_h + + (ulong) ow * dst_stride_w; + *dst_ptr = acc; + } +} + +template [[host_name("kernel_conv_2d_f32_f32")]] +kernel void kernel_conv_2d( + constant ggml_metal_kargs_conv_2d & args, + device const float * weights, + device const float * src, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template [[host_name("kernel_conv_2d_f16_f32")]] +kernel void kernel_conv_2d( + constant ggml_metal_kargs_conv_2d & args, + device const half * weights, + device const float * src, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + typedef void (conv_transpose_1d_t)( constant ggml_metal_kargs_conv_transpose_1d & args, device const float * src0, diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 2aa50ebf24..d7377d7d28 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -169,6 +169,10 @@ struct ggml_cgraph * build_graph(const test_model& model) { ggml_set_name(conv2d_res, "conv2d_res"); ggml_build_forward_expand(gf, conv2d_res); + struct ggml_tensor* conv2d_direct_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1); + ggml_set_name(conv2d_direct_res, "conv2d_direct_res"); + ggml_build_forward_expand(gf, conv2d_direct_res); + ggml_free(ctx0); return gf; } @@ -191,45 +195,57 @@ struct ggml_cgraph * compute_graph(const test_model & model, ggml_gallocr_t allo return gf; } -int main(void) -{ - ggml_time_init(); +static bool run_conv2d_case(bool request_gpu) { + const char * label = request_gpu ? "GPU" : "CPU"; + printf("\n=== Running CONV2D test on %s backend ===\n", label); test_model model; - load_model(model, true); + load_model(model, request_gpu); - ggml_gallocr_t allocr = NULL; +#ifdef GGML_USE_METAL + if (request_gpu && !ggml_backend_is_metal(model.backend)) { + fprintf(stderr, "Skipping GPU test: Metal backend unavailable\n"); + ggml_free(model.ctx); + ggml_backend_buffer_free(model.buffer); + ggml_backend_free(model.backend); + return true; + } +#endif - { - allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - //create the worst case graph for memory usage estimation + { struct ggml_cgraph * gf = build_graph(model); - - // compute the required memory ggml_gallocr_reserve(allocr, gf); size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); + fprintf(stderr, "run_conv2d_case(%s): compute buffer size: %.2f MB\n", label, mem_size/1024.0f/1024.0f); } struct ggml_cgraph * gf_res = compute_graph(model, allocr); - struct ggml_tensor * im2col_res = NULL; - struct ggml_tensor * conv2d_res = NULL; + struct ggml_tensor * im2col_res = NULL; + struct ggml_tensor * conv2d_res = NULL; + struct ggml_tensor * conv2d_direct_res = NULL; - for(int i = 0; i < ggml_graph_n_nodes(gf_res); ++i) { - if(strcmp(ggml_get_name(ggml_graph_node(gf_res, i)), "im2col_res") == 0) { + for (int i = 0; i < ggml_graph_n_nodes(gf_res); ++i) { + if (strcmp(ggml_get_name(ggml_graph_node(gf_res, i)), "im2col_res") == 0) { im2col_res = ggml_graph_node(gf_res, i); - } else if(strcmp(ggml_get_name(ggml_graph_node(gf_res, i)), "conv2d_res") == 0) { + } else if (strcmp(ggml_get_name(ggml_graph_node(gf_res, i)), "conv2d_res") == 0) { conv2d_res = ggml_graph_node(gf_res, i); + } else if (strcmp(ggml_get_name(ggml_graph_node(gf_res, i)), "conv2d_direct_res") == 0) { + conv2d_direct_res = ggml_graph_node(gf_res, i); } } + GGML_ASSERT(conv2d_direct_res != NULL); + std::vector im2col_data(ggml_nelements(im2col_res)); std::vector conv2d_data(ggml_nelements(conv2d_res)); + std::vector conv2d_direct_data(ggml_nelements(conv2d_direct_res)); ggml_backend_tensor_get(im2col_res, im2col_data.data(), 0, ggml_nbytes(im2col_res)); ggml_backend_tensor_get(conv2d_res, conv2d_data.data(), 0, ggml_nbytes(conv2d_res)); + ggml_backend_tensor_get(conv2d_direct_res, conv2d_direct_data.data(), 0, ggml_nbytes(conv2d_direct_res)); const int n_conv2d_test = 480; const int n_im2col_test = 4320; @@ -359,33 +375,58 @@ int main(void) 15872, 15872, 15872, 15872, 15872, 0, 0, 0 }; - printf("\nPerforming test:\n"); + printf("\nPerforming test (%s backend):\n", label); - bool passed = true; - for(int i = 0; i < n_conv2d_test; i++) { - if( - im2col_data[i] != expected_im2col[i]) { - passed = false; + bool im2col_ok = true; + for (int i = 0; i < n_conv2d_test; i++) { + if (im2col_data[i] != expected_im2col[i]) { + im2col_ok = false; break; } } - - printf("ggml_im2col (%d): %s\n", (int) ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); - - passed = true; - for(int i = 0; i < n_conv2d_test; i++) { - if(conv2d_data[i] != expected_conv2d[i]) { - passed = false; + printf("ggml_im2col (%d): %s\n", (int) ggml_nelements(im2col_res), + im2col_ok && (ggml_nelements(im2col_res) == n_im2col_test) ? "\033[32mPASSED\033[0m" + : "\033[31mFAILED\033[0m"); + + bool conv_ok = true; + for (int i = 0; i < n_conv2d_test; i++) { + if (conv2d_data[i] != expected_conv2d[i]) { + conv_ok = false; break; } } - - printf("ggml_conv2d (%d): %s\n", (int) ggml_nelements(conv2d_res), passed && (ggml_nelements(conv2d_res) == n_conv2d_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); + printf("ggml_conv2d (%d): %s\n", (int) ggml_nelements(conv2d_res), + conv_ok && (ggml_nelements(conv2d_res) == n_conv2d_test) ? "\033[32mPASSED\033[0m" + : "\033[31mFAILED\033[0m"); + + bool conv_direct_ok = true; + for (int i = 0; i < n_conv2d_test; i++) { + if (conv2d_direct_data[i] != expected_conv2d[i]) { + conv_direct_ok = false; + break; + } + } + printf("ggml_conv2d_direct (%d): %s\n", (int) ggml_nelements(conv2d_direct_res), + conv_direct_ok && (ggml_nelements(conv2d_direct_res) == n_conv2d_test) ? "\033[32mPASSED\033[0m" + : "\033[31mFAILED\033[0m"); ggml_free(model.ctx); - ggml_backend_buffer_free(model.buffer); ggml_backend_free(model.backend); ggml_gallocr_free(allocr); - return 0; + + return im2col_ok && conv_ok && conv_direct_ok; +} + +int main(void) { + ggml_time_init(); + + bool ok_cpu = run_conv2d_case(false); +#ifdef GGML_USE_METAL + bool ok_gpu = run_conv2d_case(true); +#else + bool ok_gpu = true; +#endif + + return (ok_cpu && ok_gpu) ? 0 : 1; }