
Commit 24b85a9

First import
1 parent 4b01db1 commit 24b85a9

10 files changed: +680, -0 lines

.gitmodules (new file, +3 lines)

[submodule "llama.cpp"]
	path = llama.cpp
	url = https://github.com/ggerganov/llama.cpp

LICENSE (new file, +21 lines)

MIT License

Copyright (c) 2023 go-skynet authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Makefile (new file, +153 lines)

ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
	ifneq ($(UNAME_P),arm)
		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
		ifeq ($(SYSCTL_M),1)
			# UNAME_P := arm
			# UNAME_M := arm64
			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
		endif
	endif
endif

#
# Compile flags
#

# keep standard at C11 and C++11
CFLAGS   = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS  =

# warnings
CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
	CFLAGS   += -pthread
	CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
	CFLAGS   += -pthread
	CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
	CFLAGS   += -pthread
	CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),NetBSD)
	CFLAGS   += -pthread
	CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),OpenBSD)
	CFLAGS   += -pthread
	CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
	CFLAGS   += -pthread
	CXXFLAGS += -pthread
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
	# Use all CPU extensions that are available:
	CFLAGS += -march=native -mtune=native
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
	ifneq (,$(findstring POWER9,$(POWER9_M)))
		CFLAGS   += -mcpu=power9
		CXXFLAGS += -mcpu=power9
	endif
	# Require c++23's std::byteswap for big-endian support.
	ifeq ($(UNAME_M),ppc64)
		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
	endif
endif
ifndef LLAMA_NO_ACCELERATE
	# Mac M1 - include Accelerate framework.
	# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
	ifeq ($(UNAME_S),Darwin)
		CFLAGS  += -DGGML_USE_ACCELERATE
		LDFLAGS += -framework Accelerate
	endif
endif
ifdef LLAMA_OPENBLAS
	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
	LDFLAGS += -lopenblas
endif
ifdef LLAMA_GPROF
	CFLAGS   += -pg
	CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
	CFLAGS   += -mcpu=native
	CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
	# Raspberry Pi 1, 2, 3
	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
	# Raspberry Pi 4
	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
	# Raspberry Pi 4
	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

#
# Print build information
#

$(info I llama.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

llama.cpp/ggml.o:
	$(MAKE) -C llama.cpp ggml.o

llama.cpp/llama.o:
	$(MAKE) -C llama.cpp llama.o

llama.cpp/common.o:
	$(MAKE) -C llama.cpp common.o

binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

libbinding.a: binding.o
	ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o

clean:
	rm -rf *.o
	rm -rf *.a
	$(MAKE) -C llama.cpp clean

binding.cpp (new file, +195 lines)

#include "common.h"
#include "llama.h"
#include "binding.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#include <signal.h>
#endif

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
    if (signo == SIGINT) {
        _exit(130);
    }
}
#endif

int llama_predict(void* params_ptr, void* state_pr, char* result) {
    gpt_params* params_p = (gpt_params*) params_ptr;
    llama_context* ctx = (llama_context*) state_pr;

    gpt_params params = *params_p;

    if (params.seed <= 0) {
        params.seed = time(NULL);
    }

    std::mt19937 rng(params.seed);

    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');

    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
        params.n_keep = (int) embd_inp.size();
    }

    // determine newline token
    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);

    // TODO: replace with ring-buffer
    std::vector<llama_token> last_n_tokens(n_ctx);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

    int n_past     = 0;
    int n_remain   = params.n_predict;
    int n_consumed = 0;

    std::vector<llama_token> embd;
    std::string res = "";

    while (n_remain != 0) {
        // predict
        if (embd.size() > 0) {
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

                n_past = params.n_keep;

                // insert n_left/2 tokens at the start of embd from last_n_tokens
                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
            }

            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return 1;
            }
        }

        n_past += embd.size();
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed) {
            // out of user input, sample next token
            const int32_t top_k          = params.top_k;
            const float   top_p          = params.top_p;
            const float   temp           = params.temp;
            const float   repeat_penalty = params.repeat_penalty;

            llama_token id = 0;

            {
                auto logits = llama_get_logits(ctx);

                if (params.ignore_eos) {
                    logits[llama_token_eos()] = 0;
                }

                id = llama_sample_top_p_top_k(ctx,
                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);

                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(id);
            }

            // add it to the context
            embd.push_back(id);

            // decrement remaining sampling budget
            --n_remain;
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }

        for (auto id : embd) {
            res += llama_token_to_str(ctx, id);
        }

        // end of text token
        if (embd.back() == llama_token_eos()) {
            break;
        }
    }

#if defined (_WIN32)
    signal(SIGINT, SIG_DFL);
#endif
    strcpy(result, res.c_str());
    return 0;
}

void llama_free_model(void *state_ptr) {
    llama_context* ctx = (llama_context*) state_ptr;
    llama_free(ctx);
}

void llama_free_params(void* params_ptr) {
    gpt_params* params = (gpt_params*) params_ptr;
    delete params;
}

void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k,
                            float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16) {
    gpt_params* params = new gpt_params;
    params->seed = seed;
    params->n_threads = threads;
    params->n_predict = tokens;
    params->repeat_last_n = repeat_last_n;

    params->top_k = top_k;
    params->top_p = top_p;
    params->memory_f16 = memory_f16;
    params->temp = temp;
    params->repeat_penalty = repeat_penalty;

    params->prompt = prompt;
    params->ignore_eos = ignore_eos;

    return params;
}

void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock) {
    // load the model
    auto lparams = llama_context_default_params();

    lparams.n_ctx = n_ctx;
    lparams.n_parts = n_parts;
    lparams.seed = n_seed;
    lparams.f16_kv = memory_f16;
    lparams.use_mlock = mlock;

    return llama_init_from_file(fname, lparams);
}
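
The least obvious part of llama_predict is the context-swap branch: once n_past plus the pending tokens would exceed n_ctx, the binding keeps the first n_keep prompt tokens and re-feeds half of the overflowed window from last_n_tokens. The standalone sketch below only illustrates that arithmetic; the constants (and the sketch itself) are made-up numbers for illustration, not part of this commit.

```cpp
#include <cstdio>

// Illustrative numbers only: mirrors the window arithmetic in the
// "infinite text generation via context swapping" branch of llama_predict.
int main() {
    const int n_ctx  = 512;   // context size, as returned by llama_n_ctx()
    const int n_keep = 48;    // prompt tokens always kept (params.n_keep)
    int n_past       = 512;   // context is full; one more token would overflow

    if (n_past + 1 > n_ctx) {                  // same trigger as the binding
        const int n_left   = n_past - n_keep;  // 464 tokens beyond the keep region
        const int n_refeed = n_left / 2;       // 232 of them get re-evaluated
        n_past = n_keep;                       // generation resumes after the kept prefix

        std::printf("keep %d, re-feed %d, continue from n_past=%d\n",
                    n_keep, n_refeed, n_past);
    }
    return 0;
}
```

In the real loop the re-fed tokens come from last_n_tokens, so before sampling continues the model sees the kept prompt prefix plus the most recent half of the window that was dropped.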

binding.h (new file, +20 lines)

#ifdef __cplusplus
extern "C" {
#endif

#include <stdbool.h>

void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock);

void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
                            int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16);

void llama_free_params(void* params_ptr);

void llama_free_model(void* state);

int llama_predict(void* params_ptr, void* state_pr, char* result);

#ifdef __cplusplus
}
#endif
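
This header is the whole public surface of the binding, so a minimal sketch of how a C++ caller might drive it end to end follows: load a model, allocate parameters, run one prediction, then free both. The model path, the sampling values, and the 8 KiB result buffer are assumptions for illustration; the header does not say how large result must be (llama_predict copies the full generation into it with strcpy), so size it generously.

```cpp
#include <cstdio>
#include <vector>
#include "binding.h"

int main() {
    // Hypothetical model path and context settings; adjust for your setup.
    void* model = load_model("models/7B/ggml-model-q4_0.bin",
                             /*n_ctx=*/512, /*n_parts=*/-1, /*n_seed=*/0,
                             /*memory_f16=*/true, /*mlock=*/false);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // seed <= 0 makes the binding pick a time-based seed (see llama_predict).
    void* params = llama_allocate_params("The quick brown fox", /*seed=*/-1,
                                         /*threads=*/4, /*tokens=*/128,
                                         /*top_k=*/40, /*top_p=*/0.95f,
                                         /*temp=*/0.8f, /*repeat_penalty=*/1.3f,
                                         /*repeat_last_n=*/64,
                                         /*ignore_eos=*/false,
                                         /*memory_f16=*/true);

    // Caller-owned output buffer; the required size is an assumption here.
    std::vector<char> result(8192, 0);
    if (llama_predict(params, model, result.data()) == 0) {
        std::printf("%s\n", result.data());
    }

    llama_free_params(params);
    llama_free_model(model);
    return 0;
}
```

Such a caller would be built against the libbinding.a target defined in the Makefile above, linked with roughly the same LDFLAGS that build used (for example -framework Accelerate on macOS and -pthread elsewhere).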
