
Sync with master, initial gguf implementation #180

Merged 5 commits on Aug 22, 2023
10 changes: 5 additions & 5 deletions Makefile
@@ -36,7 +36,7 @@ endif
BUILD_TYPE?=
# keep standard at C11 and C++11
CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/common -I./common -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =

# warnings
@@ -176,7 +176,7 @@ $(info )
# Use this if you want to set the default behavior

llama.cpp/grammar-parser.o:
cd build && cp -rf examples/CMakeFiles/common.dir/grammar-parser.cpp.o ../llama.cpp/grammar-parser.o
cd build && cp -rf common/CMakeFiles/common.dir/grammar-parser.cpp.o ../llama.cpp/grammar-parser.o

llama.cpp/ggml-alloc.o:
cd build && cp -rf CMakeFiles/ggml.dir/ggml-alloc.c.o ../llama.cpp/ggml-alloc.o
@@ -201,10 +201,10 @@ llama.cpp/llama.o:
cd build && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o

llama.cpp/common.o:
cd build && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
cd build && cp -rf common/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o

binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/common binding.cpp -o binding.o -c $(LDFLAGS)

## https://github.com/ggerganov/llama.cpp/pull/1902
prepare:
@@ -221,5 +221,5 @@ clean:
rm -rf build

test: libbinding.a
test -f ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/open-llama-7B-v2-open-instruct-GGML/resolve/main/open-llama-7b-v2-open-instruct.ggmlv3.q2_K.bin -O ggllm-test-model.bin
test -f ggllm-test-model.bin || wget -q https://huggingface.co/klosax/openllama-3b-v2-gguf/resolve/main/openllama-3b-v2-q4_0.gguf -O ggllm-test-model.bin
C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=ggllm-test-model.bin go test -v ./...
6 changes: 6 additions & 0 deletions README.md
@@ -8,6 +8,12 @@ Check out [this](https://about.sourcegraph.com/blog/go/gophercon-2018-adventures

If you are looking for a high-level OpenAI compatible API, check out [here](https://github.com/go-skynet/llama-cli).

## Attention!

Since https://github.com/go-skynet/go-llama.cpp/pull/180 was merged, go-llama.cpp is no longer compatible with the `ggml` file format; it works only with the new `gguf` file format. See also the upstream PR: https://github.com/ggerganov/llama.cpp/pull/2398.

If you still need the `ggml` format, use the https://github.com/go-skynet/go-llama.cpp/releases/tag/pre-gguf tag.

## Usage

Note: This repository uses git submodules to keep track of [LLama.cpp](https://github.com/ggerganov/llama.cpp).
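For context, here is a minimal sketch of driving the binding against a `gguf` model after this change. `New`, `SetContext`, and `WithRopeFreqBase` appear in the diffs below; `Predict` and `Free` are assumed from the repository's existing Go API, and the model path is a placeholder.

```go
package main

import (
	"fmt"
	"log"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// After this PR only gguf files load; ggml-format models need the pre-gguf tag.
	l, err := llama.New("openllama-3b-v2-q4_0.gguf",
		llama.SetContext(512),
		llama.WithRopeFreqBase(10000),
	)
	if err != nil {
		log.Fatalf("loading model: %v", err)
	}
	defer l.Free()

	out, err := l.Predict("The capital of France is")
	if err != nil {
		log.Fatalf("predict: %v", err)
	}
	fmt.Println(out)
}
```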
52 changes: 13 additions & 39 deletions binding.cpp
@@ -47,14 +47,10 @@ int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings) {

int n_past = 0;

// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');

// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);

if (embd_inp.size() > 0) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
@@ -83,9 +79,6 @@ int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tok

for (int i = 0; i < tokenSize; i++) {
auto token_str = llama_token_to_str(ctx, tokens[i]);
if (token_str == nullptr) {
continue;
}
std::vector<std::string> my_vector;
std::string str_token(token_str); // create a new std::string from the char*
params_p->prompt += str_token;
@@ -185,9 +178,6 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

std::vector<llama_token> embd_inp;
if ( !params.prompt.empty() || session_tokens.empty() ) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');

embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
embd_inp = session_tokens;
@@ -251,13 +241,10 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
}
}

// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);

grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL;
if (!params.grammar.empty()) {
@@ -271,7 +258,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
fprintf(stderr, "\n");

{
auto it = params.logit_bias.find(llama_token_eos());
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr,
"%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
@@ -301,7 +288,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

// do one empty run to warm up the model
{
const std::vector<llama_token> tmp = { llama_token_bos(), };
const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx);
}
@@ -475,7 +462,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

// Apply penalties
float nl_logit = logits[llama_token_nl()];
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -484,7 +471,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
logits[llama_token_nl()] = nl_logit;
logits[llama_token_nl(ctx)] = nl_logit;
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &candidates_p, grammar);
@@ -530,7 +517,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
// call the token callback, no need to check if one is actually registered, that will
// be handled on the Go side.
auto token_str = llama_token_to_str(ctx, id);
if (!tokenCallback(state_pr, (char*)token_str)) {
if (!tokenCallback(state_pr, (char*)token_str.c_str())) {
break;
}
} else {
@@ -547,7 +534,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
}

for (auto id : embd) {
res += llama_token_to_str(ctx, id);
res += llama_token_to_str(ctx, id).c_str();
}

// if not currently processing queued inputs;
@@ -576,7 +563,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
}

// end of text token
if (!embd.empty() && embd.back() == llama_token_eos()) {
if (!embd.empty() && embd.back() == llama_token_eos(ctx)) {
break;
}
}
@@ -734,7 +721,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
params->path_prompt_cache = session_file;

if (ignore_eos) {
params->logit_bias[llama_token_eos()] = -INFINITY;
params->ignore_eos = true;
}
if(antiprompt_count > 0) {
params->antiprompt = create_vector(antiprompt, antiprompt_count);
@@ -759,8 +746,8 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
return params;
}

void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
return load_binding_model(fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale, rms_norm_eps, n_gqa);
void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
return load_binding_model(fname, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, low_vram, n_gpu_layers, n_batch, maingpu, tensorsplit, numa, rope_freq_base, rope_freq_scale);
}

/*
@@ -778,7 +765,7 @@ struct llama_binding_state {
llama_model * model;
};

void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);

common.cpp:

@@ -792,7 +779,7 @@ gpt_params* create_gpt_params(const std::string& fname) {
return lparams;
}

void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
// load the model
gpt_params * lparams = create_gpt_params(fname);
llama_model * model;
@@ -807,19 +794,6 @@ void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f
lparams->n_gpu_layers = n_gpu_layers;
lparams->use_mmap = mmap;

// Keep sane defaults
if (n_gqa != 0) {
lparams->n_gqa = n_gqa;
} else {
lparams->n_gqa = 1;
}

if (rms_norm_eps != 0.0f) {
lparams->rms_norm_eps = rms_norm_eps;
} else {
lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
}

lparams->low_vram = low_vram;
if (rope_freq_base != 0.0f) {
lparams->rope_freq_base = rope_freq_base;
4 changes: 1 addition & 3 deletions binding.h
@@ -28,9 +28,7 @@ void* load_model(const char *fname,
const char *tensorsplit,
bool numa,
float rope_freq_base,
float rope_freq_scale,
float rms_norm_eps,
int n_gqa);
float rope_freq_scale);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

2 changes: 1 addition & 1 deletion llama.cpp
3 changes: 1 addition & 2 deletions llama.go
@@ -1,6 +1,6 @@
package llama

// #cgo CXXFLAGS: -I${SRCDIR}/llama.cpp/examples -I${SRCDIR}/llama.cpp
// #cgo CXXFLAGS: -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp
// #cgo LDFLAGS: -L${SRCDIR}/ -lbinding -lm -lstdc++
// #cgo darwin LDFLAGS: -framework Accelerate
// #cgo darwin CXXFLAGS: -std=c++11
@@ -30,7 +30,6 @@ func New(model string, opts ...ModelOption) (*LLama, error) {
C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
C.float(mo.RMSNormEPS), C.int(mo.GQA),
)

if result == nil {
15 changes: 0 additions & 15 deletions options.go
@@ -15,8 +15,6 @@ type ModelOptions struct {
TensorSplit string
FreqRopeBase float32
FreqRopeScale float32
RMSNormEPS float32
GQA int
}

type PredictOptions struct {
@@ -63,7 +61,6 @@ var DefaultModelOptions ModelOptions = ModelOptions{
Seed: 0,
F16Memory: false,
MLock: false,
GQA: 1,
Embeddings: false,
MMap: true,
LowVRAM: false,
@@ -100,18 +97,6 @@ func SetContext(c int) ModelOption {
}
}

func WithGQA(gqa int) ModelOption {
return func(p *ModelOptions) {
p.GQA = gqa
}
}

func WithRMSNormEPS(rms float32) ModelOption {
return func(p *ModelOptions) {
p.RMSNormEPS = rms
}
}

func WithRopeFreqBase(f float32) ModelOption {
return func(p *ModelOptions) {
p.FreqRopeBase = f
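Because `WithGQA` and `WithRMSNormEPS` are removed above, existing callers simply drop those options when migrating to gguf models; a sketch of the before/after, assuming (as the upstream gguf work implies) that both values are now read from the model file's metadata:

```go
package main

import (
	"log"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Pre-gguf, some models (e.g. 70B-style GQA models) needed manual tuning:
	//   llama.New("model.bin", llama.SetContext(2048), llama.WithGQA(8), llama.WithRMSNormEPS(1e-5))
	// With gguf both values are stored in the model file, so the call shrinks to:
	l, err := llama.New("model.gguf",
		llama.SetContext(2048),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer l.Free()
}
```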
53 changes: 22 additions & 31 deletions patches/1902-cuda.patch
@@ -1,8 +1,8 @@
diff --git a/examples/common.cpp b/examples/common.cpp
index bd39d92..17ff47e 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -701,18 +701,93 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
diff --git a/common/common.cpp b/common/common.cpp
index d7e1a57..d4db9eb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -678,19 +678,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
return std::make_tuple(nullptr, nullptr);
}

@@ -19,9 +19,13 @@ index bd39d92..17ff47e 100644
- }
- }
-
return std::make_tuple(model, lctx);
if (params.ignore_eos) {
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
}
@@ -765,3 +752,77 @@ std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token
return std::string(result.data(), result.size());
}
+
+
+gpt_params* create_gpt_params(const std::string& fname) {
+ gpt_params* lparams = new gpt_params;
@@ -33,7 +37,7 @@ index bd39d92..17ff47e 100644
+ return lparams;
+}
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa) {
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale) {
+ // load the model
+ gpt_params * lparams = create_gpt_params(fname);
+ llama_model * model;
@@ -48,19 +52,6 @@ index bd39d92..17ff47e 100644
+ lparams->n_gpu_layers = n_gpu_layers;
+ lparams->use_mmap = mmap;
+
+ // Keep sane defaults
+ if (n_gqa != 0) {
+ lparams->n_gqa = n_gqa;
+ } else {
+ lparams->n_gqa = 1;
+ }
+
+ if (rms_norm_eps != 0.0f) {
+ lparams->rms_norm_eps = rms_norm_eps;
+ } else {
+ lparams->rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+ }
+
+ lparams->low_vram = low_vram;
+ if (rope_freq_base != 0.0f) {
+ lparams->rope_freq_base = rope_freq_base;
@@ -110,19 +101,19 @@ index bd39d92..17ff47e 100644
+ return state;
+}
\ No newline at end of file
diff --git a/examples/common.h b/examples/common.h
index 375bc0a..7e7f356 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -112,3 +112,10 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
diff --git a/common/common.h b/common/common.h
index c50a6ed..40c691f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -128,3 +128,11 @@ std::string llama_token_to_str(
std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);
+
+
+struct llama_binding_state {
+ llama_context * ctx;
+ llama_model * model;
+};
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, float rms_norm_eps, int n_gqa);
\ No newline at end of file
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale);