Commit 4d0d647

improve graph splitting, partial fix for --no-kv-offload

1 parent d107459

3 files changed: +127 −41 lines changed

ggml-backend.c (+116 −16)
@@ -737,21 +737,32 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // graph view of this split
     struct ggml_cgraph graph;
 };
 
+// TODO: group all the hash values into a single struct for clarity
+//struct sched_hash_value {
+//    ggml_tallocr_t tallocr;
+//    struct ggml_tensor * copies[GGML_MAX_BACKENDS];
+//};
+
 struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
 
+    // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    ggml_tallocr_t * node_talloc; // [hash_set.size]
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+    // hash values (arrays of [hash_set.size])
+    ggml_tallocr_t * node_talloc;                            // tallocr assigned to each node (indirectly this is the backend)
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
+    // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
+
     struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
     int n_splits;
 
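For orientation: node_talloc and node_copies are parallel value arrays keyed by one shared tensor-pointer hash, so a single slot lookup serves both the backend assignment and the per-backend copies of a node. ggml-backend.c accesses them through helpers along these lines (a paraphrased sketch, not a verbatim excerpt of the file):

    // paraphrased sketch of the lookup pattern used by sched_split_graph():
    // one hash slot per tensor, shared by every per-node value array
    #define hash_id(node)     ggml_hash_find_or_insert(sched->hash_set, node)
    #define node_allocr(node) sched->node_talloc[hash_id(node)]

    // backend assignment of a node (NULL = not assigned yet):
    //     ggml_tallocr_t allocr = node_allocr(node);
    // copy of a node for the backend with priority b:
    //     struct ggml_tensor * cpy = sched->node_copies[hash_id(node)][b];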
@@ -928,6 +939,12 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
     return dup;
 }
 
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 // TODO: merge passes
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -977,42 +994,110 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
             node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
         }
     }
-    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS1
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
+
+    // expand gpu backends (ie non last prio) up and down, ignoring cpu
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+    // pass 2.1 expand gpu up
     {
         ggml_tallocr_t cur_allocr = NULL;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
             ggml_tallocr_t node_allocr = node_allocr(node);
             if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
             } else {
                 node_allocr(node) = cur_allocr;
                 SET_CAUSE(node, "2.cur");
             }
         }
     }
 
-    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    // pass 2.2 expand gpu down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+
+    // pass 2.3 expand rest up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+#ifdef DEBUG_PASS2
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_tallocr_t cur_allocr = node_allocr(node);
+        if (ggml_is_view_op(node->op) && cur_allocr == NULL) {
+            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+            SET_CAUSE(node, "3.vsrc");
+        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr == NULL) {
-                node_allocr(src) = node_allocr;
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    node_allocr(src) = node_allocr(src->view_src);
+                } else {
+                    node_allocr(src) = cur_allocr;
+                }
             }
         }
     }
-    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS3
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
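The intent of passes 2.1–2.3 is easier to see on a toy model. Below is an illustrative sketch, not the scheduler's code: assignments are reduced to ints, -1 means unassigned, and n_backends - 1 is the CPU (lowest-priority) backend; the real passes additionally skip view ops. Pass 2.2 is the same walk in ascending order, and pass 2.3 drops the CPU check so any remaining gaps are filled:

    #include <stdio.h>

    // toy version of pass 2.1: walk the nodes from last to first, propagating
    // the most recently seen GPU assignment into unassigned nodes; a CPU
    // assignment (the last-priority backend) clears the propagation instead
    static void expand_gpu_up(int * assignment, int n_nodes, int n_backends) {
        int cur = -1; // backend currently being propagated, -1 = none
        for (int i = n_nodes - 1; i >= 0; i--) {
            if (assignment[i] != -1) {
                cur = (assignment[i] == n_backends - 1) ? -1 : assignment[i];
            } else {
                assignment[i] = cur; // may stay -1; a later pass fills it
            }
        }
    }

    int main(void) {
        // two backends: 0 = GPU, 1 = CPU; -1 = unassigned
        int a[8] = { -1, 0, -1, -1, 1, -1, 0, -1 };
        expand_gpu_up(a, 8, 2);
        for (int i = 0; i < 8; i++) printf("%d ", a[i]); // 0 0 -1 -1 1 0 0 -1
        printf("\n");
        return 0;
    }

Note how the CPU assignment at index 4 does not spread into its unassigned neighbors: this is what keeps the CPU backend from being used unless weights force it, as the comment in the diff states.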
@@ -1074,7 +1159,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                     sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
                 }
 
-                // create copies
+                // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
                 if (sched->node_copies[id][cur_backend_id] == NULL) {
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1090,8 +1175,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         sched->splits[cur_split].i_end = graph->n_nodes;
         sched->n_splits = cur_split + 1;
     }
-
-    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS4
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
@@ -1101,6 +1187,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         if (node_allocr == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
+        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1112,8 +1203,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                     node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                     j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
+            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+            }
         }
     }
+    fflush(stderr);
 #endif
 
     // create copies of the graph for each split
@@ -1127,6 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            // add a dependency to the input source so that it is not freed before the copy is done
             input_cpy->src[0] = input;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
@@ -1163,19 +1261,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
             if (input->buffer == NULL) {
+                GGML_ASSERT(false);
                 if (input->view_src == NULL) {
                     fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
-                    exit(1);
+                    GGML_ASSERT(false);
                 }
                 // FIXME: may need to use the sched buffer instead
                 ggml_backend_view_init(input->view_src->buffer, input);
             }
             if (input_cpy->buffer == NULL) {
                 fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
-                exit(1);
+                GGML_ASSERT(false);
             }
-            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
-            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
             ggml_backend_tensor_copy(input, input_cpy);
         }
         // ggml_backend_synchronize(split_backend);
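One possible shape for the TODO above is a generation counter per input: the copy is skipped when the destination already holds the source's current generation, so constants like KQ_mask and inp_pos, whose data never changes within a decode, are copied once rather than once per split. A hypothetical sketch; none of these names exist in ggml, only ggml_backend_tensor_copy is a real call:

    // hypothetical sketch of the TODO: skip redundant input copies across splits
    struct input_copy_state {
        int copied_version; // generation of the source data last copied here
    };

    static void copy_input_if_needed(struct ggml_tensor * input, int input_version,
                                     struct ggml_tensor * input_cpy,
                                     struct input_copy_state * state) {
        if (state->copied_version != input_version) {
            ggml_backend_tensor_copy(input, input_cpy); // real ggml call
            state->copied_version = input_version;
        }
    }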
@@ -1301,6 +1400,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 }
 
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized

ggml-cuda.cu (+2 −23)
@@ -9712,6 +9712,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
 
+#ifndef NDEBUG
         assert(node->backend == GGML_BACKEND_GPU);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
@@ -9723,35 +9724,13 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
                 assert(node->src[j]->extra != nullptr);
             }
         }
+#endif
 
         bool ok = ggml_cuda_compute_forward(&params, node);
         if (!ok) {
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
     }
 
     UNUSED(backend);
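The #ifndef NDEBUG guard added above follows the standard C idiom: with NDEBUG defined, each assert() expands to a no-op, but the loop iterating over the sources would still execute in release builds; guarding the whole block removes that cost entirely. A minimal standalone illustration of the idiom:

    #include <assert.h>
    #include <stddef.h>

    static float checked_sum(const float * data, size_t n) {
    #ifndef NDEBUG
        // runs in debug builds only; without the guard this loop would run
        // even in release builds, where assert() compiles to ((void)0)
        for (size_t i = 0; i < n; i++) {
            assert(data[i] == data[i]); // NaN check
        }
    #endif
        float sum = 0.0f;
        for (size_t i = 0; i < n; i++) {
            sum += data[i];
        }
        return sum;
    }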

llama.cpp (+9 −2)
@@ -1730,7 +1730,6 @@ static bool llama_kv_cache_init(
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        // FIXME: buffer type name
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
@@ -2463,9 +2462,9 @@ struct llama_model_loader {
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             if (!cur) {
+                // some tensors may be allocated in a different context
                 continue;
             }
-            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@@ -3734,6 +3733,8 @@ static bool llm_load_tensors(
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
+        // indicate that this buffer contains weights
+        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are always scheduled to the backend that contains the weight
         ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         model.bufs.push_back(buf);
         ctx_bufs.emplace_back(ctx, buf);
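On the scheduler side (not part of this diff), the usage flag lets sched_split_graph pin any op that reads a weight to the backend whose buffer holds that weight, which is what keeps matmuls on the GPU even when neighboring ops would otherwise pull them to the CPU. A hedged sketch of such a check; ggml_backend_buffer_get_usage is an assumed accessor here, the real code may read the buffer's usage field directly:

    // sketch of the consumer side; the usage accessor is an assumption
    static bool tensor_reads_weight(const struct ggml_tensor * node) {
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            const struct ggml_tensor * src = node->src[i];
            if (src != NULL && src->buffer != NULL &&
                ggml_backend_buffer_get_usage(src->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                return true; // schedule this op on the backend that holds src
            }
        }
        return false;
    }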
@@ -4336,6 +4337,12 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
+                // these nodes are added to the graph together so that they are not reordered
+                // by doing so, the number of splits in the graph is reduced
+                ggml_build_forward_expand(gf, Qcur);
+                ggml_build_forward_expand(gf, Kcur);
+                ggml_build_forward_expand(gf, Vcur);
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
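ggml_build_forward_expand appends a tensor's subgraph to gf immediately, so calling it on Qcur, Kcur and Vcur here fixes their relative order before the rope and KV-store ops are built: nodes that would otherwise end up interleaved with ops assigned to another backend now form one contiguous run, which the pass-4 splitter turns into fewer splits. The general pattern, as a sketch with placeholder tensors a/b/c:

    // general pattern (sketch): pin graph order by expanding tensors explicitly;
    // without these calls, a/b/c would be appended wherever they are first used
    ggml_build_forward_expand(gf, a);
    ggml_build_forward_expand(gf, b);
    ggml_build_forward_expand(gf, c);
    struct ggml_tensor * out = ggml_add(ctx0, ggml_mul(ctx0, a, b), c);
    ggml_build_forward_expand(gf, out);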
