@@ -737,21 +737,32 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // graph view of this split
     struct ggml_cgraph graph;
 };
 
+// TODO: group all the hash values into a single struct for clarity
+//struct sched_hash_value {
+//    ggml_tallocr_t tallocr;
+//    struct ggml_tensor * copies[GGML_MAX_BACKENDS];
+//};
+
 struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
 
+    // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    ggml_tallocr_t * node_talloc;                            // [hash_set.size]
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+    // hash values (arrays of [hash_set.size])
+    ggml_tallocr_t * node_talloc;                            // tallocr assigned to each node (indirectly this is the backend)
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
+    // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
+
     struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
     int n_splits;
 
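
The hash_set / node_talloc / node_copies trio above is a parallel-array map: hash_set stores the keys (tensor pointers), and each value array is indexed by the key's slot, which is what the [hash_set.size] annotations mean. A minimal self-contained sketch of that pattern, using hypothetical toy_* names rather than ggml's API:

#include <stdint.h>
#include <stdio.h>

#define TOY_HASH_SIZE 17

struct toy_hash_set {
    const void * keys[TOY_HASH_SIZE];
};

// linear probing; returns the slot for key, inserting the key if absent
static size_t toy_hash_find_or_insert(struct toy_hash_set * set, const void * key) {
    size_t i = (size_t)((uintptr_t)key >> 4) % TOY_HASH_SIZE;
    while (set->keys[i] != NULL && set->keys[i] != key) {
        i = (i + 1) % TOY_HASH_SIZE;
    }
    set->keys[i] = key;
    return i;
}

int main(void) {
    struct toy_hash_set set = {0};
    int values[TOY_HASH_SIZE] = {0}; // parallel value array, like node_talloc

    int a = 0, b = 0; // any distinct addresses serve as keys
    values[toy_hash_find_or_insert(&set, &a)] = 1;
    values[toy_hash_find_or_insert(&set, &b)] = 2;

    // the same key always resolves to the same slot, so lookups are stable
    printf("a -> %d, b -> %d\n",
        values[toy_hash_find_or_insert(&set, &a)],
        values[toy_hash_find_or_insert(&set, &b)]);
    return 0;
}

Grouping the per-node values into a single struct, as the TODO above suggests, would make this key-to-values relationship explicit in the types.
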
@@ -928,6 +939,12 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
     return dup;
 }
 
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 // TODO: merge passes
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
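
The commented-out DEBUG_PASSn defines gate the per-pass logging added below: each fprintf block only compiles in when the corresponding macro is defined, so regular builds pay nothing. A standalone sketch of the same idiom (TOY_DEBUG is an illustrative name, enabled by uncommenting the define or building with a -DTOY_DEBUG compiler flag):

#include <stdio.h>

//#define TOY_DEBUG

int main(void) {
#ifdef TOY_DEBUG
    fprintf(stderr, "debug: extra diagnostics enabled\n");
#endif
    printf("done\n");
    return 0;
}
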
@@ -977,42 +994,110 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
             node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
         }
     }
-    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS1
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
+
+    // expand gpu backends (ie non last prio) up and down, ignoring cpu
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+    // pass 2.1 expand gpu up
     {
         ggml_tallocr_t cur_allocr = NULL;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
             ggml_tallocr_t node_allocr = node_allocr(node);
             if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
             } else {
                 node_allocr(node) = cur_allocr;
                 SET_CAUSE(node, "2.cur");
             }
         }
     }
 
-    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    // pass 2.2 expand gpu down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+
+    // pass 2.3 expand rest up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+#ifdef DEBUG_PASS2
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_tallocr_t cur_allocr = node_allocr(node);
+        if (ggml_is_view_op(node->op) && cur_allocr == NULL) {
+            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+            SET_CAUSE(node, "3.vsrc");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr == NULL) {
-                node_allocr(src) = node_allocr;
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    node_allocr(src) = node_allocr(src->view_src);
+                } else {
+                    node_allocr(src) = cur_allocr;
+                }
             }
         }
     }
-    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS3
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
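
Passes 2.1-2.3 propagate existing assignments across unassigned neighbors, but refuse to propagate the last-priority (CPU) backend; that is what keeps the CPU out of stretches of the graph surrounded by GPU ops. A simplified standalone model of the downward pass, with backends reduced to small ints and -1 meaning unassigned (all names illustrative, view-op skipping omitted):

#include <stdio.h>

#define N_NODES     8
#define CPU_BACKEND 2 // last priority, as in sched->n_backends - 1

static void expand_down(int * allocr, int n) {
    int cur = -1;
    for (int i = 0; i < n; i++) {
        if (allocr[i] != -1) {
            // a CPU assignment resets the propagation instead of spreading
            cur = (allocr[i] == CPU_BACKEND) ? -1 : allocr[i];
        } else {
            allocr[i] = cur;
        }
    }
}

int main(void) {
    //                      gpu0      cpu           gpu1
    int allocr[N_NODES] = { 0,   -1,  CPU_BACKEND,  -1,  1, -1, -1, -1 };
    expand_down(allocr, N_NODES);
    for (int i = 0; i < N_NODES; i++) {
        printf("node %d -> backend %d\n", i, allocr[i]);
    }
    return 0;
}

Node 3 stays unassigned here because the CPU op before it blocked propagation; in the real scheduler the complementary upward passes (2.1 and 2.3) fill such gaps.
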
@@ -1074,7 +1159,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                     sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
                 }
 
-                // create copies
+                // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
                 if (sched->node_copies[id][cur_backend_id] == NULL) {
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1090,8 +1175,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         sched->splits[cur_split].i_end = graph->n_nodes;
         sched->n_splits = cur_split + 1;
     }
-
-    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS4
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
@@ -1101,6 +1187,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         if (node_allocr == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
+        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1112,8 +1203,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                     node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                     j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
+            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+            }
         }
     }
+    fflush(stderr);
 #endif
 
     // create copies of the graph for each split
@@ -1127,6 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            // add a dependency to the input source so that it is not freed before the copy is done
             input_cpy->src[0] = input;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
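
Pointing input_cpy->src[0] at the original input threads an explicit dependency edge into the copied graph, so any topological evaluation computes (and keeps alive) the source before the copy runs. A toy illustration of that ordering effect, not ggml's actual graph machinery:

#include <stdio.h>

struct toy_node {
    const char      * name;
    struct toy_node * src; // single dependency edge, like input_cpy->src[0]
    int               done;
};

static void toy_eval(struct toy_node * n) {
    if (n->done) {
        return;
    }
    if (n->src != NULL) {
        toy_eval(n->src); // the dependency is evaluated first
    }
    printf("eval %s\n", n->name);
    n->done = 1;
}

int main(void) {
    struct toy_node input     = { "input",     NULL,   0 };
    struct toy_node input_cpy = { "input_cpy", &input, 0 };
    toy_eval(&input_cpy); // prints "eval input" before "eval input_cpy"
    return 0;
}
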
@@ -1163,19 +1261,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
             if (input->buffer == NULL) {
+                GGML_ASSERT(false);
                 if (input->view_src == NULL) {
                     fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
-                    exit(1);
+                    GGML_ASSERT(false);
                 }
                 // FIXME: may need to use the sched buffer instead
                 ggml_backend_view_init(input->view_src->buffer, input);
             }
             if (input_cpy->buffer == NULL) {
                 fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
-                exit(1);
+                GGML_ASSERT(false);
             }
-            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
-            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
             ggml_backend_tensor_copy(input, input_cpy);
         }
         // ggml_backend_synchronize(split_backend);
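
The TODO above wants the copy skipped when an input was already transferred in an earlier split and has not changed since. One hypothetical scheme, not what this diff implements: tag each input with a generation counter, remember the last generation copied to the destination, and copy only on mismatch.

#include <stdbool.h>
#include <stdio.h>

struct cached_input {
    int data_generation;   // bumped whenever the source tensor is rewritten
    int copied_generation; // generation currently present in the destination copy
};

static bool needs_copy(struct cached_input * in) {
    if (in->copied_generation == in->data_generation) {
        return false; // e.g. constants like KQ_mask / inp_pos after the first split
    }
    in->copied_generation = in->data_generation;
    return true;
}

int main(void) {
    struct cached_input kq_mask = { /*data_generation=*/1, /*copied_generation=*/0 };
    printf("split 1 copies: %d\n", needs_copy(&kq_mask)); // 1: first transfer
    printf("split 2 copies: %d\n", needs_copy(&kq_mask)); // 0: unchanged, skipped
    return 0;
}
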
@@ -1301,6 +1400,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 }
 
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized