@@ -558,18 +558,27 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) {
 
     [desc release];
 
+    //GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]);
+
     ggml_metal_heap_reset(heap);
 
     return true;
 }
 
-static id<MTLBuffer> ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size) {
-    const size_t alignment = 1024*1024;
+static id<MTLBuffer> ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, bool no_alloc) {
+    // note: this is probably more than needed, but just in case
+    const size_t alignment = 1024;
 
     const size_t size_aligned = GGML_PAD(size, alignment);
 
+    //GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail);
+
     heap->need += size_aligned;
 
+    if (no_alloc) {
+        return nil;
+    }
+
     if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) {
         heap->fail = 1;
     }
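This hunk turns `ggml_metal_heap_alloc` into a dual-mode helper: with `no_alloc` set it only accounts for the padded size in `heap->need`, otherwise it also tries to carve a buffer out of the `MTLHeap`. A minimal self-contained sketch of that pattern — the `demo_*` names and the `demo_heap` struct are hypothetical stand-ins for the real `ggml_metal_heap_*` code, while the `MTLHeap` calls are the actual Metal API:

```objc
#import <Metal/Metal.h>
#include <stdbool.h>

struct demo_heap {
    id<MTLHeap> obj;   // backing MTLHeap
    size_t      need;  // total bytes requested during the current pass
    int         fail;  // non-zero once a request could not be served
};

// same rounding as ggml's GGML_PAD(x, n)
#define DEMO_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

static id<MTLBuffer> demo_heap_alloc(struct demo_heap * heap, size_t size, bool no_alloc) {
    const size_t alignment    = 1024;
    const size_t size_aligned = DEMO_PAD(size, alignment);

    // account for the request on both the measuring and the real pass
    heap->need += size_aligned;

    if (no_alloc) {
        return nil; // measuring pass: record the requirement, allocate nothing
    }

    if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) {
        heap->fail = 1; // heap too small - the caller is expected to resize and retry
    }

    if (heap->fail) {
        return nil;
    }

    return [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
}
```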
@@ -883,7 +892,7 @@ @implementation GGMLMetalClass
     for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
         ctx->cmd_bufs[i].obj = nil;
 
-        // create 1MB heaps per command buffer
+        // create initial small heaps per command buffer
         // these can be resized during compute when necessary
         ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32);
     }
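`ggml_metal_heap_init` itself is not shown in this diff; the idea is to start each command buffer with a tiny placement heap (32 bytes here) and rely on the resize path once the graph's real requirements are known. A hedged sketch of what creating such a heap looks like with the Metal API — the helper name is hypothetical and the descriptor fields are the standard ones, not necessarily the exact flags the real code uses:

```objc
#import <Metal/Metal.h>

// hypothetical stand-in for the heap creation inside ggml_metal_heap_init
static id<MTLHeap> demo_heap_create(id<MTLDevice> device, size_t size) {
    MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init];

    desc.storageMode = MTLStorageModePrivate; // GPU-only scratch memory
    desc.size        = size;                  // can start tiny; replaced later if needed

    id<MTLHeap> heap = [device newHeapWithDescriptor:desc]; // nil on failure

    [desc release];

    return heap;
}
```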
@@ -1624,17 +1633,19 @@ static bool ggml_metal_encode_node(
                 GGML_ABORT("unsupported op");
             }
 
+    const bool no_alloc = no_compute;
+
+    // heap buffers for temporary data
     id<MTLBuffer> h_src0 = nil;
+
     switch (dst->op) {
         case GGML_OP_SOFT_MAX:
             {
-                h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0));
-                if (!h_src0) {
-                    // GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n",
-                    //     __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]);
+                h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc);
+                if (!no_alloc && !h_src0) {
+                    GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n",
+                        __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail);
                     return false;
-                } else {
-                    //GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0));
                 }
             } break;
         default:
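Worth noting at this call site: on the measuring pass `nil` is the expected return value, which is why the failure check is guarded by `!no_alloc` — only the real encoding pass treats `nil` as an error. A tiny worked example of the accounting each call performs (the tensor size is made up):

```objc
// with the 1024-byte alignment above, a 133000-byte src0 tensor
// contributes 133120 bytes to heap->need on *both* passes
const size_t nbytes       = 133000;                                     // e.g. ggml_nbytes(src0)
const size_t size_aligned = (nbytes + 1024 - 1) & ~(size_t)(1024 - 1);  // -> 133120
```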
@@ -4707,16 +4718,13 @@ static enum ggml_status ggml_metal_graph_compute(
     // number of threads in addition to the main thread
     const int n_cb = ctx->n_cb;
 
-    int n_try = 2;
-
     // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
     // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
     // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
     // each thread creates it's own command buffer and enqueues the ops in parallel
     //
     // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2
 
-    while (n_try-- > 0) {
     @autoreleasepool {
         ctx->gf = gf;
 
@@ -4834,55 +4842,6 @@ static enum ggml_status ggml_metal_graph_compute(
         }
     }
 
-        bool retry = false;
-
-        // check heap statuses
-        for (int i = 0; i <= n_cb; ++i) {
-            struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap;
-
-            const size_t need = heap->need;
-
-            //printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]);
-
-            if (heap->fail == 0) {
-                ggml_metal_heap_reset(ctx->cmd_bufs[i].heap);
-                [heap->obj setPurgeableState:MTLPurgeableStateEmpty];
-
-                continue;
-            }
-
-            if (heap->fail == 2) {
-                GGML_LOG_ERROR("%s: command buffer %d, MTLHeap ran out of buffers, max = %d\n", __func__, i, heap->n);
-                return GGML_STATUS_ALLOC_FAILED;
-            }
-
-            if (heap->fail == 3) {
-                GGML_LOG_ERROR("%s: command buffer %d, MTLHeap failed to allocate buffer, max = %d\n", __func__, i, heap->n);
-                return GGML_STATUS_ALLOC_FAILED;
-            }
-
-            //GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need);
-
-            if (!ggml_metal_heap_resize(heap, need)) {
-                GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need);
-                return GGML_STATUS_ALLOC_FAILED;
-            }
-
-            retry = true;
-        }
-
-        if (!retry) {
-            break;
-        }
-
-        //printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n");
-
-        if (n_try == 0) {
-            GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__);
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-    }
-
     return GGML_STATUS_SUCCESS;
 }
 
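With the measuring pass in place (see the final hunk below), this whole `n_try`/`retry` dance becomes dead code: each command-buffer callback now sizes its heap before encoding, so `ggml_metal_graph_compute` submits once and returns. The resize itself presumably replaces the heap rather than growing it, since an `MTLHeap` has a fixed size; a sketch under that assumption, reusing the hypothetical `demo_*` helpers from the earlier sketches:

```objc
// hypothetical resize-by-replacement; an MTLHeap cannot grow in place,
// so a larger heap is created and the old one released
static bool demo_heap_resize(id<MTLDevice> device, struct demo_heap * heap, size_t size) {
    [heap->obj release];

    heap->obj = demo_heap_create(device, size);
    if (heap->obj == nil) {
        return false;
    }

    // fresh heap: clear the per-pass accounting
    heap->need = 0;
    heap->fail = 0;

    return true;
}
```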
@@ -5257,21 +5216,38 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 
         const bool should_capture = ctx->capture_next_compute;
 
-        bool no_compute = false;
+        ggml_metal_heap_reset(heap);
 
         for (int idx = node_start; idx < node_end; ++idx) {
-            if (should_capture) {
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
-            }
+            ggml_metal_encode_node(backend, idx, encoder, heap, true);
+        }
+
+        bool can_compute = true;
 
-            const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, no_compute);
+        if (heap->need > [heap->obj size]) {
+            const size_t need = heap->need;
 
-            if (should_capture) {
-                [encoder popDebugGroup];
+            if (!ggml_metal_heap_resize(heap, need)) {
+                GGML_LOG_ERROR("%s: failed to resize MTLHeap, need = %zu\n", __func__, need);
+                can_compute = false;
             }
+        }
+
+        if (can_compute) {
+            for (int idx = node_start; idx < node_end; ++idx) {
+                if (should_capture) {
+                    [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
+                }
+
+                const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, false);
 
-            if (!res) {
-                no_compute = true;
+                if (should_capture) {
+                    [encoder popDebugGroup];
+                }
+
+                if (!res) {
+                    break;
+                }
             }
         }
 
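Putting the pieces together, the new per-command-buffer flow is: dry-run encode to measure, resize the heap once if `need` exceeds its capacity, then encode for real. A condensed sketch of that two-pass loop, again with the hypothetical `demo_*` helpers from above and an Objective-C block standing in for `ggml_metal_encode_node`:

```objc
#import <Metal/Metal.h>
#include <stdbool.h>

typedef bool (^encode_fn_t)(int idx, bool no_alloc);

static bool demo_encode_range(id<MTLDevice> device, struct demo_heap * heap,
                              int node_start, int node_end, encode_fn_t encode) {
    // pass 1: dry run - no buffers are created, heap->need accumulates
    heap->need = 0;
    heap->fail = 0;
    for (int idx = node_start; idx < node_end; ++idx) {
        encode(idx, true);
    }

    // grow the heap once if the measured requirement exceeds its capacity
    if (heap->need > [heap->obj size]) {
        if (!demo_heap_resize(device, heap, heap->need)) {
            return false;
        }
    }

    // pass 2: encode for real; the allocations should now fit
    for (int idx = node_start; idx < node_end; ++idx) {
        if (!encode(idx, false)) {
            return false;
        }
    }

    return true;
}
```

One trade-off of this design: every node range is encoded twice, but the measuring pass is cheap (it only pads and sums sizes), and in exchange the heap is sized exactly once per command buffer instead of looping resize-and-resubmit at the graph level.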