Commit 8369908

Merge branch 'master' of github.com:ggerganov/llama.cpp into grammar-example

* 'master' of github.com:ggerganov/llama.cpp:
  kv cache slot search improvements (ggml-org#3493)
  prompts : fix editorconfig checks after ggml-org#3416
  parallel : add option to load external prompt file (ggml-org#3416)
  server : reuse llama_sample_token common util (ggml-org#3494)
  llama : correct hparams comparison (ggml-org#3446)
  ci : fix xcodebuild destinations (ggml-org#3491)
  convert : update Falcon script for new HF config (ggml-org#3448)
  build : use std::make_tuple() for compatibility with older GCC versions (ggml-org#3488)
  common : process escape sequences in reverse prompts (ggml-org#3461)
  CLBlast: Fix handling of on-device tensor data
  server : fix incorrect num_tokens_predicted (ggml-org#3480)
  swift : disable ACCELERATE_NEW_LAPACK (ggml-org#3481)
  ci : add swift build via xcodebuild (ggml-org#3482)

2 parents: 94daebe + 9ca79d5

File tree: 12 files changed, +415 -216 lines

.github/workflows/build.yml (+25 -2)

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -253,6 +253,29 @@ jobs:
           -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

+  macOS-latest-swift:
+    runs-on: macos-latest
+
+    strategy:
+      matrix:
+        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+
   windows-latest-cmake:
     runs-on: windows-latest

Package.swift (+6 -3)

@@ -44,9 +44,12 @@ let package = Package(
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32"]),
             .define("GGML_USE_K_QUANTS"),
-            .define("GGML_USE_ACCELERATE"),
-            .define("ACCELERATE_NEW_LAPACK"),
-            .define("ACCELERATE_LAPACK_ILP64")
+            .define("GGML_USE_ACCELERATE")
+            // NOTE: NEW_LAPACK will required iOS version 16.4+
+            // We should consider add this in the future when we drop support for iOS 14
+            // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+            // .define("ACCELERATE_NEW_LAPACK"),
+            // .define("ACCELERATE_LAPACK_ILP64")
         ] + additionalSettings,
         linkerSettings: [
             .linkedFramework("Accelerate")

common/common.cpp (+12 -6)

@@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+            // store the external file name in params
+            params.prompt_file = argv[i];
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
@@ -361,7 +363,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
             params.use_mmap = false;
         } else if (arg == "--lora-scaled") {
             if (++i >= argc) {
@@ -373,7 +375,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
+            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
             params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
@@ -616,6 +618,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         process_escapes(params.prompt);
         process_escapes(params.input_prefix);
         process_escapes(params.input_suffix);
+        for (auto & antiprompt : params.antiprompt) {
+            process_escapes(antiprompt);
+        }
     }

     return true;
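Note: process_escapes() in common.cpp rewrites literal backslash sequences typed on the command line (for example the two characters "\" and "n") into their control-character equivalents; this hunk simply extends that treatment to each reverse prompt. A rough Python analogue of that kind of rewrite, for illustration only (the real C++ helper handles more escape sequences):

def process_escapes_sketch(text: str) -> str:
    # Map the common literal escapes to their control characters.
    escapes = {"n": "\n", "t": "\t", "\\": "\\", '"': '"', "'": "'"}
    out = []
    i = 0
    while i < len(text):
        if text[i] == "\\" and i + 1 < len(text) and text[i + 1] in escapes:
            out.append(escapes[text[i + 1]])
            i += 2
        else:
            out.append(text[i])
            i += 1
    return "".join(out)

print(process_escapes_sketch(r"User:\nAssistant:"))  # the reverse prompt now contains a real newline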
@@ -1020,10 +1025,11 @@ llama_token llama_sample_token(
             id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
         } else {
             // Temperature sampling
-            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
-            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
-            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
             llama_sample_temp(ctx, &cur_p, temp);

             {
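The last argument to these samplers is min_keep, the minimum number of candidates each filter must leave in the list. Tying it to params.n_probs keeps enough candidates alive to report the requested number of per-token probabilities instead of hard-coding 1. A toy illustration of that contract (plain Python, not the llama.cpp API):

def apply_filter(candidates, cutoff, min_keep):
    # A filter may shrink the candidate list, but never below min_keep entries.
    keep = max(cutoff, min_keep, 1)
    return candidates[:keep]

candidates = ["tok_a", "tok_b", "tok_c", "tok_d", "tok_e"]
# The cutoff alone would keep a single candidate here, but 3 probabilities were requested:
print(apply_filter(candidates, cutoff=1, min_keep=3))  # ['tok_a', 'tok_b', 'tok_c']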

common/common.h (+1)

@@ -79,6 +79,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
+    std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with

convert-falcon-hf-to-gguf.py (+79 -64)

@@ -4,6 +4,7 @@
 from __future__ import annotations

 import argparse
+import contextlib
 import json
 import os
 import struct
@@ -20,10 +21,10 @@
 import gguf


-def count_model_parts(dir_model: Path) -> int:
+def count_model_parts(dir_model: Path, prefix: str) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
+        if filename.startswith(prefix):
             num_parts += 1

     if num_parts > 0:
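With the new prefix argument, the same helper counts either safetensors or PyTorch shards. A small usage sketch (the directory name and shard layout are hypothetical, following the usual Hugging Face naming):

import os
from pathlib import Path

def count_model_parts(dir_model: Path, prefix: str) -> int:
    # Condensed version of the helper above: count shard files by name prefix.
    return sum(1 for filename in os.listdir(dir_model) if filename.startswith(prefix))

dir_model = Path("falcon-7b")                                   # hypothetical local checkpoint
num_parts = count_model_parts(dir_model, "model-00")            # model-00001-of-00002.safetensors, ...
if not num_parts:
    num_parts = count_model_parts(dir_model, "pytorch_model-")  # pytorch_model-00001-of-00002.bin, ...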
@@ -77,30 +78,36 @@ def parse_args() -> argparse.Namespace:
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
     hparams = json.load(f)

-if hparams["architectures"][0] != "RWForCausalLM":
+if hparams["architectures"][0] != "FalconForCausalLM":
     print("Model architecture not supported: " + hparams["architectures"][0])

     sys.exit(1)

 # get number of model parts
-num_parts = count_model_parts(dir_model)
+num_parts = count_model_parts(dir_model, "model-00")
+if num_parts:
+    is_safetensors = True
+    from safetensors import safe_open
+else:
+    is_safetensors = False
+    num_parts = count_model_parts(dir_model, "pytorch_model-")

 ARCH=gguf.MODEL_ARCH.FALCON
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

-block_count = hparams["n_layer"]
+block_count = hparams["num_hidden_layers"]

 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
 gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams:
-    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+if "num_kv_heads" in hparams:
+    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
 else:
     gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
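This branch imports safe_open only when safetensors shards are present; the rewritten loading loop further down then uses contextlib.nullcontext so both formats go through a single with-block. A condensed sketch of that pattern (the shard path is hypothetical; requires torch and safetensors):

import contextlib

import torch
from safetensors import safe_open

def open_model_part(path, is_safetensors: bool):
    # safe_open() is already a context manager; nullcontext() wraps the plain
    # dict returned by torch.load() so both can be entered the same way.
    if is_safetensors:
        return safe_open(path, framework="pt", device="cpu")
    return contextlib.nullcontext(torch.load(path, map_location="cpu"))

with open_model_part("model-00001-of-00002.safetensors", is_safetensors=True) as part:
    for name in part.keys():
        data = part.get_tensor(name)  # a torch checkpoint would use part[name] instead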
@@ -146,8 +153,8 @@ def parse_args() -> argparse.Namespace:
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

 # params for qkv transform
-n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+n_head = hparams["num_attention_heads"]
+n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1

 head_dim = hparams["hidden_size"] // n_head

@@ -156,6 +163,10 @@ def parse_args() -> argparse.Namespace:

 if num_parts == 0:
     part_names = iter(("pytorch_model.bin",))
+elif is_safetensors:
+    part_names = (
+        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
+    )
 else:
     part_names = (
         f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
@@ -165,60 +176,64 @@ def parse_args() -> argparse.Namespace:
     if args.vocab_only:
         break
     print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        # QKV tensor transform
-        # The original query_key_value tensor contains n_head_kv "kv groups",
-        # each consisting of n_head/n_head_kv query weights followed by one key
-        # and one value weight (shared by all query heads in the kv group).
-        # This layout makes it a big pain to work with in GGML.
-        # So we rearrange them here,, so that we have n_head query weights
-        # followed by n_head_kv key weights followed by n_head_kv value weights,
-        # in contiguous fashion.
-        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
-        if "query_key_value" in name:
-            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
-            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            data = torch.cat((q,k,v)).reshape_as(data)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
+    if is_safetensors:
+        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
+    else:
+        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
+
+    with ctx as model_part:
+        for name in model_part.keys():
+            data = model_part.get_tensor(name) if is_safetensors else model_part[name]
+
+            old_dtype = data.dtype
+
+            # convert any unsupported data types to float32
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
+                data = data.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here,, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data = torch.cat((q,k,v)).reshape_as(data)
+
+            data = data.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+            if new_name is None:
+                print("Can not map tensor '" + name + "'")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+            gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")

examples/jeopardy/README.md (+1 -1)

@@ -2,7 +2,7 @@

 This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.

-The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.


 Step 1: Open jeopardy.sh and modify the following:
