
Commit bd43693 (parent: 6f4f035)
committed by mudler

Sync with master, initial gguf implementation

Signed-off-by: mudler <[email protected]>

File tree: 7 files changed, +42 -92 lines

Makefile (+4 -4)

@@ -36,7 +36,7 @@ endif
 BUILD_TYPE?=
 # keep standard at C11 and C++11
 CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/common -I./common -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =

 # warnings
@@ -176,7 +176,7 @@ $(info )
 # Use this if you want to set the default behavior

 llama.cpp/grammar-parser.o:
-	cd build && cp -rf examples/CMakeFiles/common.dir/grammar-parser.cpp.o ../llama.cpp/grammar-parser.o
+	cd build && cp -rf common/CMakeFiles/common.dir/grammar-parser.cpp.o ../llama.cpp/grammar-parser.o

 llama.cpp/ggml-alloc.o:
 	cd build && cp -rf CMakeFiles/ggml.dir/ggml-alloc.c.o ../llama.cpp/ggml-alloc.o
@@ -201,10 +201,10 @@ llama.cpp/llama.o:
 	cd build && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o

 llama.cpp/common.o:
-	cd build && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
+	cd build && cp -rf common/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o

 binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o
-	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/common binding.cpp -o binding.o -c $(LDFLAGS)

 ## https://github.com/ggerganov/llama.cpp/pull/1902
 prepare:

binding.cpp (+13 -36)

@@ -47,14 +47,10 @@ int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings) {

     int n_past = 0;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');

     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);

     if (embd_inp.size() > 0) {
         if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
@@ -83,7 +79,7 @@ int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tok

     for (int i = 0; i < tokenSize; i++) {
         auto token_str = llama_token_to_str(ctx, tokens[i]);
-        if (token_str == nullptr) {
+        if (token_str.c_str() == "") {
             continue;
         }
         std::vector<std::string> my_vector;
@@ -185,9 +181,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

     std::vector<llama_token> embd_inp;
     if ( !params.prompt.empty() || session_tokens.empty() ) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
         embd_inp = session_tokens;
@@ -255,9 +248,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }
     }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     grammar_parser::parse_state parsed_grammar;
     llama_grammar * grammar = NULL;
     if (!params.grammar.empty()) {
@@ -271,7 +261,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         fprintf(stderr, "\n");

         {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
             if (it != params.logit_bias.end() && it->second == -INFINITY) {
                 fprintf(stderr,
                     "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
@@ -301,7 +291,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
         llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
         llama_reset_timings(ctx);
     }
@@ -475,7 +465,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

             // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
             auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
             llama_sample_repetition_penalty(ctx, &candidates_p,
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -484,7 +474,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                 last_n_repeat, alpha_frequency, alpha_presence);
             if (!penalize_nl) {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
             }
             if (grammar != NULL) {
                 llama_sample_grammar(ctx, &candidates_p, grammar);
@@ -530,7 +520,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
                 // call the token callback, no need to check if one is actually registered, that will
                 // be handled on the Go side.
                 auto token_str = llama_token_to_str(ctx, id);
-                if (!tokenCallback(state_pr, (char*)token_str)) {
+                if (!tokenCallback(state_pr, (char*)token_str.c_str())) {
                     break;
                 }
             } else {
@@ -547,7 +537,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }

         for (auto id : embd) {
-            res += llama_token_to_str(ctx, id);
+            res += llama_token_to_str(ctx, id).c_str();
         }

         // if not currently processing queued inputs;
@@ -576,7 +566,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }

         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx)) {
             break;
         }
     }
@@ -734,7 +724,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
     params->path_prompt_cache = session_file;

     if (ignore_eos) {
-        params->logit_bias[llama_token_eos()] = -INFINITY;
+        params->ignore_eos = true;
    }
    if(antiprompt_count > 0) {
        params->antiprompt = create_vector(antiprompt, antiprompt_count);
@@ -759,8 +749,8 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
    return params;
 }

-void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
-    return load_binding_model(fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale, rms_norm_eps, n_gqa);
+void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
+    return load_binding_model(fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale);
 }

 /*
@@ -778,7 +768,7 @@ struct llama_binding_state {
     llama_model * model;
 };

-void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);

 common.cpp:

@@ -792,7 +782,7 @@ gpt_params* create_gpt_params(const std::string& fname) {
     return lparams;
 }

-void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
    // load the model
    gpt_params * lparams = create_gpt_params(fname);
    llama_model * model;
@@ -807,19 +797,6 @@ void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f
    lparams->n_gpu_layers = n_gpu_layers;
    lparams->use_mmap = mmap;

-   // Keep sane defaults
-   if (n_gqa != 0) {
-       lparams->n_gqa = n_gqa;
-   } else {
-       lparams->n_gqa = 1;
-   }
-
-   if (rms_norm_eps != 0.0f) {
-       lparams->rms_norm_eps = rms_norm_eps;
-   } else {
-       lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-   }
-
    lparams->low_vram = low_vram;
    if (rope_freq_base != 0.0f) {
        lparams->rope_freq_base = rope_freq_base;
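
The binding.cpp hunks above adapt to two API changes that arrived with the gguf sync: the special-token helpers (llama_token_bos, llama_token_eos, llama_token_nl) now take the llama_context, and the common library's llama_token_to_str returns a std::string rather than a raw C string. A minimal sketch of the new calling convention, assuming the gguf-era llama.cpp and common headers (illustration only, not part of this commit):

// Sketch: assumes gguf-era llama.cpp ("llama.h") and its common library ("common.h").
#include "llama.h"
#include "common.h"

#include <cstdio>
#include <string>

static void print_special_tokens(llama_context * ctx) {
    // Special-token ids are now queried through the context.
    const llama_token bos = llama_token_bos(ctx);
    const llama_token eos = llama_token_eos(ctx);
    const llama_token nl  = llama_token_nl(ctx);

    // llama_token_to_str now returns std::string; call .c_str() where a C
    // string is required (as the token callback in binding.cpp does).
    const std::string bos_str = llama_token_to_str(ctx, bos);
    printf("bos=%d (%s) eos=%d nl=%d\n", (int) bos, bos_str.c_str(), (int) eos, (int) nl);
}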

binding.h (+1 -3)

@@ -28,9 +28,7 @@ void* load_model(const char *fname,
     const char *tensorsplit,
     bool numa,
     float rope_freq_base,
-    float rope_freq_scale,
-    float rms_norm_eps,
-    int n_gqa);
+    float rope_freq_scale);

 int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

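With rms_norm_eps and n_gqa dropped from the exported API (gguf model files carry that metadata themselves), load_model is down to fifteen arguments. A hypothetical C++ caller, shown only to illustrate the new argument order; the values are placeholders:

// Illustration only; the argument order follows the updated binding.h declaration.
#include "binding.h"

int main() {
    void * state = load_model(
        "model.gguf",  // fname
        512,           // n_ctx
        0,             // n_seed
        false,         // memory_f16
        false,         // mlock
        false,         // embeddings
        true,          // mmap
        false,         // low_vram
        0,             // n_gpu_layers
        512,           // n_batch
        "",            // maingpu
        "",            // tensorsplit
        false,         // numa
        0.0f,          // rope_freq_base (0 lets load_binding_model pick a default)
        0.0f);         // rope_freq_scale (0 lets load_binding_model pick a default)
    return state == nullptr ? 1 : 0;
}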

llama.cpp (submodule update, +1 -1)

llama.go (+1 -2)

@@ -1,6 +1,6 @@
 package llama

-// #cgo CXXFLAGS: -I${SRCDIR}/llama.cpp/examples -I${SRCDIR}/llama.cpp
+// #cgo CXXFLAGS: -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp
 // #cgo LDFLAGS: -L${SRCDIR}/ -lbinding -lm -lstdc++
 // #cgo darwin LDFLAGS: -framework Accelerate
 // #cgo darwin CXXFLAGS: -std=c++11
@@ -30,7 +30,6 @@ func New(model string, opts ...ModelOption) (*LLama, error) {
         C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
         C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
         C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
-        C.float(mo.RMSNormEPS), C.int(mo.GQA),
     )

     if result == nil {

options.go (-15)

@@ -15,8 +15,6 @@ type ModelOptions struct {
     TensorSplit   string
     FreqRopeBase  float32
     FreqRopeScale float32
-    RMSNormEPS    float32
-    GQA           int
 }

 type PredictOptions struct {
@@ -63,7 +61,6 @@ var DefaultModelOptions ModelOptions = ModelOptions{
     Seed:       0,
     F16Memory:  false,
     MLock:      false,
-    GQA:        1,
     Embeddings: false,
     MMap:       true,
     LowVRAM:    false,
@@ -100,18 +97,6 @@ func SetContext(c int) ModelOption {
     }
 }

-func WithGQA(gqa int) ModelOption {
-    return func(p *ModelOptions) {
-        p.GQA = gqa
-    }
-}
-
-func WithRMSNormEPS(rms float32) ModelOption {
-    return func(p *ModelOptions) {
-        p.RMSNormEPS = rms
-    }
-}
-
 func WithRopeFreqBase(f float32) ModelOption {
     return func(p *ModelOptions) {
         p.FreqRopeBase = f

patches/1902-cuda.patch (+22 -31)

@@ -1,8 +1,8 @@
-diff --git a/examples/common.cpp b/examples/common.cpp
-index bd39d92..17ff47e 100644
---- a/examples/common.cpp
-+++ b/examples/common.cpp
-@@ -701,18 +701,93 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+diff --git a/common/common.cpp b/common/common.cpp
+index d7e1a57..d4db9eb 100644
+--- a/common/common.cpp
++++ b/common/common.cpp
+@@ -678,19 +678,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 return std::make_tuple(nullptr, nullptr);
 }

@@ -19,9 +19,13 @@ index bd39d92..17ff47e 100644
 - }
 - }
 -
-return std::make_tuple(model, lctx);
+if (params.ignore_eos) {
+params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+}
+@@ -765,3 +752,77 @@ std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token
+return std::string(result.data(), result.size());
 }
-+
+
 +
 +gpt_params* create_gpt_params(const std::string& fname) {
 + gpt_params* lparams = new gpt_params;
@@ -33,7 +37,7 @@ index bd39d92..17ff47e 100644
 + return lparams;
 +}
 +
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
 + // load the model
 + gpt_params * lparams = create_gpt_params(fname);
 + llama_model * model;
@@ -48,19 +52,6 @@ index bd39d92..17ff47e 100644
 + lparams->n_gpu_layers = n_gpu_layers;
 + lparams->use_mmap = mmap;
 +
-+ // Keep sane defaults
-+ if (n_gqa != 0) {
-+ lparams->n_gqa = n_gqa;
-+ } else {
-+ lparams->n_gqa = 1;
-+ }
-+
-+ if (rms_norm_eps != 0.0f) {
-+ lparams->rms_norm_eps = rms_norm_eps;
-+ } else {
-+ lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-+ }
-+
 + lparams->low_vram = low_vram;
 + if (rope_freq_base != 0.0f) {
 + lparams->rope_freq_base = rope_freq_base;
@@ -110,19 +101,19 @@ index bd39d92..17ff47e 100644
 + return state;
 +}
 \ No newline at end of file
-diff --git a/examples/common.h b/examples/common.h
-index 375bc0a..7e7f356 100644
---- a/examples/common.h
-+++ b/examples/common.h
-@@ -112,3 +112,10 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+diff --git a/common/common.h b/common/common.h
+index c50a6ed..40c691f 100644
+--- a/common/common.h
++++ b/common/common.h
+@@ -128,3 +128,11 @@ std::string llama_token_to_str(
+std::string llama_token_to_str_bpe(
+const struct llama_context * ctx,
+llama_token token);
++
 +
 +struct llama_binding_state {
 + llama_context * ctx;
 + llama_model * model;
 +};
 +
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
 \ No newline at end of file
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);
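
After this patch, ignore_eos is handled inside llama_init_from_gpt_params (which biases the EOS token via the context), and load_binding_model returns a llama_binding_state bundling the context and the model. A hedged sketch of how a caller might unpack that state, relying only on the declarations the patch appends to common/common.h:

// Sketch: depends on the llama_binding_state and load_binding_model declarations
// that this patch adds to common/common.h.
#include "common.h"
#include "llama.h"

#include <cstdio>

int main() {
    void * result = load_binding_model(
        "model.gguf", /*n_ctx*/ 512, /*n_seed*/ 0, /*memory_f16*/ false, /*mlock*/ false,
        /*embeddings*/ false, /*mmap*/ true, /*low_vram*/ false, /*n_gpu_layers*/ 0,
        /*n_batch*/ 512, /*maingpu*/ "", /*tensorsplit*/ "", /*numa*/ false,
        /*rope_freq_base*/ 0.0f, /*rope_freq_scale*/ 0.0f);
    if (result == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    auto * state = static_cast<llama_binding_state *>(result);
    llama_context * ctx   = state->ctx;   // e.g. for llama_token_eos(ctx)
    llama_model   * model = state->model;
    (void) ctx;
    (void) model;
    return 0;
}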
