diff --git a/Makefile b/Makefile
index 1d70387..dcb4a96 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@ endif
 BUILD_TYPE?=
 # keep standard at C11 and C++11
 CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/common -I./common -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =
 
 # warnings
@@ -176,7 +176,7 @@ $(info )
 
 # Use this if you want to set the default behavior
 llama.cpp/grammar-parser.o:
-	cd build && cp -rf examples/CMakeFiles/common.dir/grammar-parser.cpp.o ../llama.cpp/grammar-parser.o
+	cd build && cp -rf common/CMakeFiles/common.dir/grammar-parser.cpp.o ../llama.cpp/grammar-parser.o
 
 llama.cpp/ggml-alloc.o:
 	cd build && cp -rf CMakeFiles/ggml.dir/ggml-alloc.c.o ../llama.cpp/ggml-alloc.o
@@ -201,10 +201,10 @@ llama.cpp/llama.o:
 	cd build && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o
 
 llama.cpp/common.o:
-	cd build && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
+	cd build && cp -rf common/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
 
 binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o
-	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/common binding.cpp -o binding.o -c $(LDFLAGS)
 
 ## https://github.com/ggerganov/llama.cpp/pull/1902
 prepare:
@@ -221,5 +221,5 @@ clean:
 	rm -rf build
 
 test: libbinding.a
-	test -f ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/open-llama-7B-v2-open-instruct-GGML/resolve/main/open-llama-7b-v2-open-instruct.ggmlv3.q2_K.bin -O ggllm-test-model.bin
+	test -f ggllm-test-model.bin || wget -q https://huggingface.co/klosax/openllama-3b-v2-gguf/resolve/main/openllama-3b-v2-q4_0.gguf -O ggllm-test-model.bin
 	C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=ggllm-test-model.bin go test -v ./...
diff --git a/README.md b/README.md
index b63dd4c..7faa9d0 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,12 @@ Check out [this](https://about.sourcegraph.com/blog/go/gophercon-2018-adventures
 
 If you are looking for an high-level OpenAI compatible API, check out [here](https://github.com/go-skynet/llama-cli).
 
+## Attention!
+
+Since https://github.com/go-skynet/go-llama.cpp/pull/180 was merged, go-llama.cpp is no longer compatible with the old `ggml` file format; it only works with the new `gguf` file format. See also the upstream PR: https://github.com/ggerganov/llama.cpp/pull/2398.
+
+If you still need the old `ggml` format, use the https://github.com/go-skynet/go-llama.cpp/releases/tag/pre-gguf tag.
+
 ## Usage
 
 Note: This repository uses git submodules to keep track of [LLama.cpp](https://github.com/ggerganov/llama.cpp).
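For context, this is roughly what Go-side usage looks like once the change lands. A minimal sketch only: the model path is illustrative, and `Predict`, `SetTokens` and `Free` are assumed from the existing go-llama.cpp API rather than taken from this diff.

```go
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// The model file must now be a gguf conversion; ggml v3 files only work
	// with the pre-gguf tag. The path below is illustrative.
	l, err := llama.New(
		"./openllama-3b-v2-q4_0.gguf",
		llama.SetContext(512),
	)
	if err != nil {
		panic(err)
	}
	defer l.Free()

	out, err := l.Predict("Hello world", llama.SetTokens(64))
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
}
```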
diff --git a/binding.cpp b/binding.cpp
index 6d17c7d..aa088b3 100644
--- a/binding.cpp
+++ b/binding.cpp
@@ -47,14 +47,10 @@ int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings) {
 
     int n_past = 0;
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
 
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
 
     if (embd_inp.size() > 0) {
         if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
@@ -83,9 +79,6 @@ int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tok
 
     for (int i = 0; i < tokenSize; i++) {
         auto token_str = llama_token_to_str(ctx, tokens[i]);
-        if (token_str == nullptr) {
-            continue;
-        }
         std::vector my_vector;
         std::string str_token(token_str); // create a new std::string from the char*
         params_p->prompt += str_token;
@@ -185,9 +178,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
     std::vector<llama_token> embd_inp;
     if ( !params.prompt.empty() || session_tokens.empty() ) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
         embd_inp = session_tokens;
     }
@@ -251,13 +241,10 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
         fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
         for (int i = 0; i < (int) guidance_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
         }
     }
 
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     grammar_parser::parse_state parsed_grammar;
     llama_grammar * grammar = NULL;
     if (!params.grammar.empty()) {
@@ -271,7 +258,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         fprintf(stderr, "\n");
 
         {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
             if (it != params.logit_bias.end() && it->second == -INFINITY) {
                 fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
@@ -301,7 +288,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
 
     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
         llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
         llama_reset_timings(ctx);
     }
@@ -475,7 +462,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
             // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
             auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
             llama_sample_repetition_penalty(ctx, &candidates_p,
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -484,7 +471,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                 last_n_repeat, alpha_frequency, alpha_presence);
             if (!penalize_nl) {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
             }
             if (grammar != NULL) {
                 llama_sample_grammar(ctx, &candidates_p, grammar);
             }
@@ -530,7 +517,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
             // call the token callback, no need to check if one is actually registered, that will
             // be handled on the Go side.
             auto token_str = llama_token_to_str(ctx, id);
-            if (!tokenCallback(state_pr, (char*)token_str)) {
+            if (!tokenCallback(state_pr, (char*)token_str.c_str())) {
                 break;
             }
         } else {
@@ -547,7 +534,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }
 
         for (auto id : embd) {
-            res += llama_token_to_str(ctx, id);
+            res += llama_token_to_str(ctx, id).c_str();
         }
 
         // if not currently processing queued inputs;
@@ -576,7 +563,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx)) {
             break;
         }
     }
@@ -734,7 +721,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
     params->path_prompt_cache = session_file;
 
     if (ignore_eos) {
-        params->logit_bias[llama_token_eos()] = -INFINITY;
+        params->ignore_eos = true;
     }
     if(antiprompt_count > 0) {
       params->antiprompt = create_vector(antiprompt, antiprompt_count);
@@ -759,8 +746,8 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
     return params;
 }
 
-void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
-    return load_binding_model(fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale, rms_norm_eps, n_gqa);
+void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
+    return load_binding_model(fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale);
 }
 
 /*
@@ -778,7 +765,7 @@ struct llama_binding_state {
     llama_model * model;
 };
 
-void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);
 
 common.cpp:
@@ -792,7 +779,7 @@ gpt_params* create_gpt_params(const std::string& fname) {
     return lparams;
 }
 
-void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
     // load the model
     gpt_params * lparams = create_gpt_params(fname);
     llama_model * model;
@@ -807,19 +794,6 @@ void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f
     lparams->n_gpu_layers = n_gpu_layers;
     lparams->use_mmap = mmap;
 
-    // Keep sane defaults
-    if (n_gqa != 0) {
-        lparams->n_gqa = n_gqa;
-    } else {
-        lparams->n_gqa = 1;
-    }
-
-    if (rms_norm_eps != 0.0f) {
-        lparams->rms_norm_eps = rms_norm_eps;
-    } else {
-        lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-    }
-
     lparams->low_vram = low_vram;
     if (rope_freq_base != 0.0f) {
         lparams->rope_freq_base = rope_freq_base;
diff --git a/binding.h b/binding.h
index 77d116f..9f3d2bb 100644
--- a/binding.h
+++ b/binding.h
@@ -28,9 +28,7 @@ void* load_model(const char *fname,
                  const char *tensorsplit,
                  bool numa,
                  float rope_freq_base,
-                 float rope_freq_scale,
-                 float rms_norm_eps,
-                 int n_gqa);
+                 float rope_freq_scale);
 
 int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);
diff --git a/llama.cpp b/llama.cpp
index dadbed9..6381d4e 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit dadbed99e65252d79f81101a392d0d6497b86caa
+Subproject commit 6381d4e110bd0ec02843a60bbeb8b6fc37a9ace9
diff --git a/llama.go b/llama.go
index 6f0276d..b7bdadc 100644
--- a/llama.go
+++ b/llama.go
@@ -1,6 +1,6 @@
 package llama
 
-// #cgo CXXFLAGS: -I${SRCDIR}/llama.cpp/examples -I${SRCDIR}/llama.cpp
+// #cgo CXXFLAGS: -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp
 // #cgo LDFLAGS: -L${SRCDIR}/ -lbinding -lm -lstdc++
 // #cgo darwin LDFLAGS: -framework Accelerate
 // #cgo darwin CXXFLAGS: -std=c++11
@@ -30,7 +30,6 @@ func New(model string, opts ...ModelOption) (*LLama, error) {
 		C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
 		C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
 		C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
-		C.float(mo.RMSNormEPS), C.int(mo.GQA),
 	)
 
 	if result == nil {
diff --git a/options.go b/options.go
index 0bd5f35..3cfe077 100644
--- a/options.go
+++ b/options.go
@@ -15,8 +15,6 @@ type ModelOptions struct {
 	TensorSplit   string
 	FreqRopeBase  float32
 	FreqRopeScale float32
-	RMSNormEPS    float32
-	GQA           int
 }
 
 type PredictOptions struct {
@@ -63,7 +61,6 @@ var DefaultModelOptions ModelOptions = ModelOptions{
 	Seed:       0,
 	F16Memory:  false,
 	MLock:      false,
-	GQA:        1,
 	Embeddings: false,
 	MMap:       true,
 	LowVRAM:    false,
@@ -100,18 +97,6 @@ func SetContext(c int) ModelOption {
 	}
 }
 
-func WithGQA(gqa int) ModelOption {
-	return func(p *ModelOptions) {
-		p.GQA = gqa
-	}
-}
-
-func WithRMSNormEPS(rms float32) ModelOption {
-	return func(p *ModelOptions) {
-		p.RMSNormEPS = rms
-	}
-}
-
 func WithRopeFreqBase(f float32) ModelOption {
 	return func(p *ModelOptions) {
 		p.FreqRopeBase = f
diff --git a/patches/1902-cuda.patch b/patches/1902-cuda.patch
index a2c7b78..6327c44 100644
--- a/patches/1902-cuda.patch
+++ b/patches/1902-cuda.patch
@@ -1,8 +1,8 @@
-diff --git a/examples/common.cpp b/examples/common.cpp
-index bd39d92..17ff47e 100644
---- a/examples/common.cpp
-+++ b/examples/common.cpp
-@@ -701,18 +701,93 @@ std::tuple llama_init_from_gpt_par
+diff --git a/common/common.cpp b/common/common.cpp
+index d7e1a57..d4db9eb 100644
+--- a/common/common.cpp
++++ b/common/common.cpp
+@@ -678,19 +678,6 @@ std::tuple llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
@@ -19,9 +19,13 @@ index bd39d92..17ff47e 100644
-    }
-    }
-
-    return std::make_tuple(model, lctx);
+    if (params.ignore_eos) {
+        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+    }
+@@ -765,3 +752,77 @@ std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token
+     return std::string(result.data(), result.size());
 }
-+
+
+
+gpt_params* create_gpt_params(const std::string& fname) {
+    gpt_params* lparams = new gpt_params;
+    fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
+    // Initialize the 'model' member with the 'fname' parameter
+    lparams->model = fname;
+    return lparams;
+}
+
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
+    // load the model
+    gpt_params * lparams = create_gpt_params(fname);
+    llama_model * model;
@@ -48,19 +52,6 @@ index bd39d92..17ff47e 100644
+    lparams->n_gpu_layers = n_gpu_layers;
+    lparams->use_mmap = mmap;
+
-+    // Keep sane defaults
-+    if (n_gqa != 0) {
-+        lparams->n_gqa = n_gqa;
-+    } else {
-+        lparams->n_gqa = 1;
-+    }
-+
-+    if (rms_norm_eps != 0.0f) {
-+        lparams->rms_norm_eps = rms_norm_eps;
-+    } else {
-+        lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-+    }
-+
+    lparams->low_vram = low_vram;
+    if (rope_freq_base != 0.0f) {
+        lparams->rope_freq_base = rope_freq_base;
@@ -110,19 +101,19 @@ index bd39d92..17ff47e 100644
+    return state;
+}
\ No newline at end of file
-diff --git a/examples/common.h b/examples/common.h
-index 375bc0a..7e7f356 100644
---- a/examples/common.h
-+++ b/examples/common.h
-@@ -112,3 +112,10 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s
-
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+diff --git a/common/common.h b/common/common.h
+index c50a6ed..40c691f 100644
+--- a/common/common.h
++++ b/common/common.h
+@@ -128,3 +128,11 @@ std::string llama_token_to_str(
+ std::string llama_token_to_str_bpe(
+     const struct llama_context * ctx,
+     llama_token token);
++
+
+struct llama_binding_state {
+    llama_context * ctx;
+    llama_model * model;
+};
+
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
-\ No newline at end of file
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);
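Migration note, not part of the diff: callers that previously tuned grouped-query attention or the RMS-norm epsilon through model options simply drop those calls, since with gguf these values are carried in the model file's metadata. A hedged before/after sketch, assuming the rest of the option API stays as shown in options.go above and the file name is illustrative:

```go
package main

import (
	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Before (pre-gguf tag), LLaMA-2 70B ggml files needed explicit hints:
	//
	//   l, err := llama.New("llama-2-70b.ggmlv3.q4_0.bin",
	//       llama.SetContext(512),
	//       llama.WithGQA(8),
	//       llama.WithRMSNormEPS(1e-5),
	//   )
	//
	// After this PR the two options (and the matching C parameters) are gone,
	// so the call shrinks to the remaining options only.
	l, err := llama.New("llama-2-70b.Q4_0.gguf",
		llama.SetContext(512),
	)
	if err != nil {
		panic(err)
	}
	defer l.Free()
}
```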