Skip to content

Commit 91d5dc5

Browse files
committed
cont: heap allocation now works [no ci]
1 parent cbb617e commit 91d5dc5

File tree

1 file changed

+46
-70
lines changed

1 file changed

+46
-70
lines changed

ggml/src/ggml-metal/ggml-metal.m

+46-70
Original file line numberDiff line numberDiff line change
@@ -558,18 +558,27 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) {
558558

559559
[desc release];
560560

561+
//GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]);
562+
561563
ggml_metal_heap_reset(heap);
562564

563565
return true;
564566
}
565567

566-
static id<MTLBuffer> ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size) {
567-
const size_t alignment = 1024*1024;
568+
static id<MTLBuffer> ggml_metal_heap_alloc(struct ggml_metal_heap * heap, size_t size, bool no_alloc) {
569+
// note: this is probably more than needed, but just in case
570+
const size_t alignment = 1024;
568571

569572
const size_t size_aligned = GGML_PAD(size, alignment);
570573

574+
//GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail);
575+
571576
heap->need += size_aligned;
572577

578+
if (no_alloc) {
579+
return nil;
580+
}
581+
573582
if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment:alignment]) {
574583
heap->fail = 1;
575584
}
@@ -883,7 +892,7 @@ @implementation GGMLMetalClass
883892
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
884893
ctx->cmd_bufs[i].obj = nil;
885894

886-
// create 1MB heaps per command buffer
895+
// create initial small heaps per command buffer
887896
// these can be resized during compute when necessary
888897
ctx->cmd_bufs[i].heap = ggml_metal_heap_init(device, 32);
889898
}
@@ -1624,17 +1633,19 @@ static bool ggml_metal_encode_node(
16241633
GGML_ABORT("unsupported op");
16251634
}
16261635

1636+
const bool no_alloc = no_compute;
1637+
1638+
// heap buffers for temporary data
16271639
id<MTLBuffer> h_src0 = nil;
1640+
16281641
switch (dst->op) {
16291642
case GGML_OP_SOFT_MAX:
16301643
{
1631-
h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0));
1632-
if (!h_src0) {
1633-
//GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n",
1634-
// __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]);
1644+
h_src0 = ggml_metal_heap_alloc(heap, ggml_nbytes(src0), no_alloc);
1645+
if (!no_alloc && !h_src0) {
1646+
GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu, fail = %d\n",
1647+
__func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize], heap->fail);
16351648
return false;
1636-
} else {
1637-
//GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0));
16381649
}
16391650
} break;
16401651
default:
@@ -4707,16 +4718,13 @@ static enum ggml_status ggml_metal_graph_compute(
47074718
// number of threads in addition to the main thread
47084719
const int n_cb = ctx->n_cb;
47094720

4710-
int n_try = 2;
4711-
47124721
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
47134722
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
47144723
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
47154724
// each thread creates it's own command buffer and enqueues the ops in parallel
47164725
//
47174726
// tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2
47184727

4719-
while (n_try-- > 0) {
47204728
@autoreleasepool {
47214729
ctx->gf = gf;
47224730

@@ -4834,55 +4842,6 @@ static enum ggml_status ggml_metal_graph_compute(
48344842
}
48354843
}
48364844

4837-
bool retry = false;
4838-
4839-
// check heap statuses
4840-
for (int i = 0; i <= n_cb; ++i) {
4841-
struct ggml_metal_heap * heap = ctx->cmd_bufs[i].heap;
4842-
4843-
const size_t need = heap->need;
4844-
4845-
//printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]);
4846-
4847-
if (heap->fail == 0) {
4848-
ggml_metal_heap_reset(ctx->cmd_bufs[i].heap);
4849-
[heap->obj setPurgeableState:MTLPurgeableStateEmpty];
4850-
4851-
continue;
4852-
}
4853-
4854-
if (heap->fail == 2) {
4855-
GGML_LOG_ERROR("%s: command buffer %d, MTLHeap ran out of buffers, max = %d\n", __func__, i, heap->n);
4856-
return GGML_STATUS_ALLOC_FAILED;
4857-
}
4858-
4859-
if (heap->fail == 3) {
4860-
GGML_LOG_ERROR("%s: command buffer %d, MTLHeap failed to allocate buffer, max = %d\n", __func__, i, heap->n);
4861-
return GGML_STATUS_ALLOC_FAILED;
4862-
}
4863-
4864-
//GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need);
4865-
4866-
if (!ggml_metal_heap_resize(heap, need)) {
4867-
GGML_LOG_ERROR("%s: failed to increase heap size to %zu\n", __func__, need);
4868-
return GGML_STATUS_ALLOC_FAILED;
4869-
}
4870-
4871-
retry = true;
4872-
}
4873-
4874-
if (!retry) {
4875-
break;
4876-
}
4877-
4878-
//printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n");
4879-
4880-
if (n_try == 0) {
4881-
GGML_LOG_ERROR("%s: failed to allocate heap memory\n", __func__);
4882-
return GGML_STATUS_ALLOC_FAILED;
4883-
}
4884-
}
4885-
48864845
return GGML_STATUS_SUCCESS;
48874846
}
48884847

@@ -5257,21 +5216,38 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
52575216

52585217
const bool should_capture = ctx->capture_next_compute;
52595218

5260-
bool no_compute = false;
5219+
ggml_metal_heap_reset(heap);
52615220

52625221
for (int idx = node_start; idx < node_end; ++idx) {
5263-
if (should_capture) {
5264-
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
5265-
}
5222+
ggml_metal_encode_node(backend, idx, encoder, heap, true);
5223+
}
5224+
5225+
bool can_compute = true;
52665226

5267-
const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, no_compute);
5227+
if (heap->need > [heap->obj size]) {
5228+
const size_t need = heap->need;
52685229

5269-
if (should_capture) {
5270-
[encoder popDebugGroup];
5230+
if (!ggml_metal_heap_resize(heap, need)) {
5231+
GGML_LOG_ERROR("%s: failed to resize MTLHeap, need = %zu\n", __func__, need);
5232+
can_compute = false;
52715233
}
5234+
}
5235+
5236+
if (can_compute) {
5237+
for (int idx = node_start; idx < node_end; ++idx) {
5238+
if (should_capture) {
5239+
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
5240+
}
5241+
5242+
const bool res = ggml_metal_encode_node(backend, idx, encoder, heap, false);
52725243

5273-
if (!res) {
5274-
no_compute = true;
5244+
if (should_capture) {
5245+
[encoder popDebugGroup];
5246+
}
5247+
5248+
if (!res) {
5249+
break;
5250+
}
52755251
}
52765252
}
52775253

0 commit comments

Comments
 (0)