@@ -47,14 +47,10 @@ int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings) {
 
     int n_past = 0;
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
 
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
 
     if (embd_inp.size() > 0) {
         if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
@@ -83,7 +79,7 @@ int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tok
 
     for (int i = 0; i < tokenSize; i++) {
         auto token_str = llama_token_to_str(ctx, tokens[i]);
-        if (token_str == nullptr) {
+        if (token_str == "") {
             continue;
         }
         std::vector<std::string> my_vector;
@@ -185,9 +181,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
 
     std::vector<llama_token> embd_inp;
     if (!params.prompt.empty() || session_tokens.empty()) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
         embd_inp = session_tokens;
@@ -255,9 +248,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }
     }
 
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     grammar_parser::parse_state parsed_grammar;
     llama_grammar * grammar = NULL;
     if (!params.grammar.empty()) {
@@ -271,7 +261,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         fprintf(stderr, "\n");
 
         {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
             if (it != params.logit_bias.end() && it->second == -INFINITY) {
                 fprintf(stderr,
                     "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
@@ -301,7 +291,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
 
     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
         llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
         llama_reset_timings(ctx);
     }
@@ -475,7 +465,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
             // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
             auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
             llama_sample_repetition_penalty(ctx, &candidates_p,
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -484,7 +474,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                 last_n_repeat, alpha_frequency, alpha_presence);
             if (!penalize_nl) {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
             }
             if (grammar != NULL) {
                 llama_sample_grammar(ctx, &candidates_p, grammar);
@@ -530,7 +520,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
             // call the token callback, no need to check if one is actually registered, that will
             // be handled on the Go side.
             auto token_str = llama_token_to_str(ctx, id);
-            if (!tokenCallback(state_pr, (char *)token_str)) {
+            if (!tokenCallback(state_pr, (char *)token_str.c_str())) {
                 break;
             }
         } else {
@@ -547,7 +537,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }
 
         for (auto id : embd) {
-            res += llama_token_to_str(ctx, id);
+            res += llama_token_to_str(ctx, id).c_str();
        }
 
         // if not currently processing queued inputs;
@@ -576,7 +566,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx)) {
             break;
         }
     }
@@ -734,7 +724,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
     params->path_prompt_cache = session_file;
 
     if (ignore_eos) {
-        params->logit_bias[llama_token_eos()] = -INFINITY;
+        params->ignore_eos = true;
     }
     if (antiprompt_count > 0) {
         params->antiprompt = create_vector(antiprompt, antiprompt_count);
@@ -759,8 +749,8 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
759
749
return params;
760
750
}
761
751
762
- void * load_model (const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa ) {
763
- return load_binding_model (fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale, rms_norm_eps, n_gqa );
752
+ void * load_model (const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
753
+ return load_binding_model (fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale);
764
754
}
765
755
766
756
/*
@@ -778,7 +768,7 @@ struct llama_binding_state {
     llama_model * model;
 };
 
-void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);
 
 common.cpp:
 
@@ -792,7 +782,7 @@ gpt_params* create_gpt_params(const std::string& fname) {
     return lparams;
 }
 
-void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
     // load the model
     gpt_params * lparams = create_gpt_params(fname);
     llama_model * model;
@@ -807,19 +797,6 @@ void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f
     lparams->n_gpu_layers = n_gpu_layers;
     lparams->use_mmap = mmap;
 
-    // Keep sane defaults
-    if (n_gqa != 0) {
-        lparams->n_gqa = n_gqa;
-    } else {
-        lparams->n_gqa = 1;
-    }
-
-    if (rms_norm_eps != 0.0f) {
-        lparams->rms_norm_eps = rms_norm_eps;
-    } else {
-        lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-    }
-
     lparams->low_vram = low_vram;
     if (rope_freq_base != 0.0f) {
         lparams->rope_freq_base = rope_freq_base;