#include <future>
#include <thread>

+ #if defined(_MSC_VER)
+ #  define NOMINMAX 1
+ #  include <windows.h>
+ #  define YIELD() YieldProcessor()
+ #elif defined(__clang__) || defined(__GNUC__)
+ #  if defined(__x86_64__) || defined(__i386__)
+ #    include <immintrin.h>
+ #    define YIELD() _mm_pause()
+ #  elif defined(__arm__) || defined(__aarch64__)
+ #    if defined(__clang__)
+ #      include <arm_acle.h>
+ #      define YIELD() __yield()
+ #    else
+ #      define YIELD() asm volatile("yield")
+ #    endif
+ #  endif
+ #endif
+
+ #if !defined(YIELD)
+ #define YIELD()
+ #endif
+
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
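The new YIELD() macro above is a portability shim for the CPU's spin-wait hint: YieldProcessor() on MSVC, _mm_pause() on x86 with GCC/Clang, the ARM yield instruction otherwise, and a no-op fallback. As a rough usage sketch (not part of the patch, and assuming the YIELD() definition above is in scope), a bounded spin on an atomic flag could look like this:

    #include <atomic>

    // Illustrative only: poll a flag, issuing the pause/yield hint between checks
    // so the busy loop is cheaper on SMT siblings and easier on power management.
    static void spin_until_set(const std::atomic<bool> & done) {
        while (!done.load(std::memory_order_acquire)) {
            for (int i = 0; i < 16; ++i) {
                YIELD();
            }
        }
    }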
@@ -787,7 +809,8 @@ struct ggml_backend_vk_context {
    ggml_vk_garbage_collector gc;
    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
-     vk::Fence fence;
+     vk::Fence fence, almost_ready_fence;
+     bool almost_ready_fence_pending {};

    vk_buffer buffer_pool[MAX_VK_BUFFERS];
@@ -878,6 +901,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx

static void ggml_backend_vk_free(ggml_backend_t backend);

+ // Wait for ctx->fence to be signaled.
+ static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
+     // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
+     // during this wait.
+     if (ctx->almost_ready_fence_pending) {
+         VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
+         ctx->device->device.resetFences({ ctx->almost_ready_fence });
+         ctx->almost_ready_fence_pending = false;
+     }
+
+     // Spin (w/pause) waiting for the graph to finish executing.
+     vk::Result result;
+     while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
+         if (result != vk::Result::eNotReady) {
+             fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
+             exit(1);
+         }
+         for (uint32_t i = 0; i < 100; ++i) {
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+             YIELD();
+         }
+     }
+     ctx->device->device.resetFences({ ctx->fence });
+ }
+
// variables to track number of compiles in progress
static uint32_t compile_count = 0;
static std::mutex compile_count_mutex;
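The helper above waits in two phases: a blocking waitForFences on almost_ready_fence while the bulk of the graph runs, then a short getFenceStatus polling loop (with pause hints) for the tail, avoiding the wake-up latency of a second blocking wait. A minimal sketch of the same pattern against the plain Vulkan C API, with placeholder handles and no error handling, might read:

    #include <vulkan/vulkan.h>

    // Sketch only: `device`, `almost_ready_fence` and `final_fence` are assumed to be
    // valid handles, and YIELD() is the macro defined near the top of this file.
    static void wait_two_phase(VkDevice device, VkFence almost_ready_fence, VkFence final_fence) {
        // Phase 1: sleep in the driver while most of the submitted work executes.
        vkWaitForFences(device, 1, &almost_ready_fence, VK_TRUE, UINT64_MAX);
        vkResetFences(device, 1, &almost_ready_fence);

        // Phase 2: the remaining work is short, so poll with a pause hint instead of blocking again.
        while (vkGetFenceStatus(device, final_fence) == VK_NOT_READY) {
            YIELD();
        }
        vkResetFences(device, 1, &final_fence);
    }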
@@ -3355,6 +3411,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
    ctx->prealloc_size_split_k = 0;

    ctx->fence = ctx->device->device.createFence({});
+     ctx->almost_ready_fence = ctx->device->device.createFence({});

#ifdef GGML_VULKAN_CHECK_RESULTS
    const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7959,11 +8016,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
    }
}

- static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);

// Returns true if node has enqueued work into the queue, false otherwise
// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
- static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
    if (ggml_is_empty(node) || !node->buffer) {
        return false;
    }
@@ -8335,7 +8392,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

        ctx->compute_ctx.reset();

-         bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+         bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
        if (!ok) {
            if (node->op == GGML_OP_UNARY) {
                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -8349,7 +8406,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    return true;
}

- static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true) {
+ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
    ggml_backend_buffer * buf = nullptr;

    switch (tensor->op) {
@@ -8452,12 +8509,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
        memcpy(cpy.dst, cpy.src, cpy.n);
    }

-     ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+     if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+         ggml_vk_submit(subctx, ctx->almost_ready_fence);
+         ctx->almost_ready_fence_pending = true;
+     } else {
+         ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+     }

    if (use_fence) {
-         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-         ctx->device->device.resetFences({ ctx->fence });
+         ggml_vk_wait_for_fence(ctx);
    }
#ifdef GGML_VULKAN_CHECK_RESULTS
    ggml_vk_check_results_1(tensor);
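For clarity, the fence choice in the submit path above can be restated as a small standalone function (an illustrative paraphrase, not code from the patch): an intermediate submit late in the graph signals almost_ready_fence, the final submit signals the regular fence, and everything else submits without one.

    // Illustrative restatement of the branch above, with plain bools instead of context state.
    enum class FenceChoice { AlmostReady, Final, None };

    static FenceChoice pick_fence(bool use_fence, bool almost_ready, bool almost_ready_pending) {
        if (almost_ready && !almost_ready_pending && !use_fence) {
            return FenceChoice::AlmostReady;  // intermediate submit near the end of the graph
        }
        return use_fence ? FenceChoice::Final : FenceChoice::None;
    }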
@@ -8543,6 +8603,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
    ctx->gc.events.clear();

    ctx->device->device.destroyFence(ctx->fence);
+     ctx->device->device.destroyFence(ctx->almost_ready_fence);
}

static int ggml_vk_get_device_count() {
@@ -8889,8 +8950,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
    }

    ggml_vk_submit(transfer_ctx, ctx->fence);
-     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-     ctx->device->device.resetFences({ ctx->fence });
+     ggml_vk_wait_for_fence(ctx);

    for (auto& cpy : transfer_ctx->out_memcpys) {
        memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8909,7 +8969,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

    uint64_t total_mat_mul_bytes = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+         ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
        if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
            total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
        }
@@ -8951,11 +9011,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
            mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
        }

+         // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
+         bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
        bool submit = (submitted_nodes >= nodes_per_submit) ||
                      (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
-                       (i == last_node);
+                       (i == last_node) ||
+                       (almost_ready && !ctx->almost_ready_fence_pending);

-         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);

        if (enqueued) {
            ++submitted_nodes;
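To make the 20% heuristic above concrete with a hypothetical graph size (not a figure from the patch): for a 300-node graph, cgraph->n_nodes / 5 is 60, so almost_ready first becomes true at node index 241 (300 - 241 = 59 < 60), and the first submit from that point on signals almost_ready_fence.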
@@ -8967,7 +9030,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
#endif
        }

-         if (submit) {
+         if (submit && enqueued) {
            first_node_in_batch = true;
            submitted_nodes = 0;
            mul_mat_bytes = 0;