
Commit 344c106

feat: Update llama.cpp
1 parent 710e19a commit 344c106

2 files changed: +72 -4 lines changed


llama_cpp/llama_cpp.py (+71 -3)
@@ -227,6 +227,7 @@
 # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
 # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
 # LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+# LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -257,6 +258,7 @@
 LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
+LLAMA_VOCAB_PRE_TYPE_GPT4O = 29


 # // note: these values should be synchronized with ggml_rope
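For reference, the new pre-tokenizer id becomes importable from the low-level binding module once a build containing this commit is installed. A minimal sketch (nothing assumed beyond the constant added above):

import llama_cpp.llama_cpp as llama_cpp

# New in this commit: pre-tokenizer id for GPT-4o-style tokenization, mirroring llama.h.
assert llama_cpp.LLAMA_VOCAB_PRE_TYPE_GPT4O == 29
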
@@ -1357,6 +1359,12 @@ def llama_model_n_head(model: llama_model_p, /) -> int:
     ...


+# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -3375,8 +3383,8 @@ class llama_sampler_i(ctypes.Structure):
 # struct llama_sampler {
-#     struct llama_sampler_i  * iface;
-#     llama_sampler_context_t   ctx;
+#     const struct llama_sampler_i * iface;
+#     llama_sampler_context_t        ctx;
 # };
 class llama_sampler(ctypes.Structure):
     _fields_ = [
@@ -3410,6 +3418,18 @@ class llama_sampler(ctypes.Structure):
 # // mirror of llama_sampler_i:
+# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+@ctypes_function(
+    "llama_sampler_init",
+    [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init(
+    iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, /
+) -> llama_sampler_p:
+    ...
+
+
 # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
 @ctypes_function(
     "llama_sampler_name",
@@ -3627,6 +3647,17 @@ def llama_sampler_init_xtc(
     ...


+# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+@ctypes_function(
+    "llama_sampler_init_top_n_sigma",
+    [ctypes.c_float],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
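Per the linked paper, top-nσ keeps only candidates whose logits lie within n standard deviations of the maximum logit before the final draw. A sketch of adding the new sampler to a chain, assuming the chain helpers (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_dist, llama_sampler_free) already bound in this module:

import llama_cpp.llama_cpp as llama_cpp

chain = llama_cpp.llama_sampler_chain_init(llama_cpp.llama_sampler_chain_default_params())

# Keep logits within 1.0 standard deviation of the max, then sample from the remainder.
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_n_sigma(1.0))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))  # seeded draw

# The chain would then be used with llama_sampler_sample(chain, ctx, -1) during decoding.
llama_cpp.llama_sampler_free(chain)
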
@@ -3685,6 +3716,43 @@ def llama_sampler_init_grammar(
     ...


+# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+# /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+# /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+#         const struct llama_vocab * vocab,
+#                       const char * grammar_str,
+#                       const char * grammar_root,
+#                      const char ** trigger_patterns,
+#                             size_t num_trigger_patterns,
+#                const llama_token * trigger_tokens,
+#                             size_t num_trigger_tokens);
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy_patterns",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        ctypes.POINTER(llama_token),
+        ctypes.c_size_t,
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy_patterns(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_patterns: CtypesArray[bytes],
+    num_trigger_patterns: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /,
+) -> llama_sampler_p:
+    ...
+
+
 # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
 # LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
 #     int32_t penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
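The lazy grammar sampler starts enforcing the grammar only after one of the trigger patterns or trigger tokens appears in the output. A rough ctypes sketch of assembling the arguments; the grammar text and trigger pattern are placeholders, and it assumes llama_model_get_vocab is bound in this module (it is part of the upstream llama.h API) and that `model` was loaded as in the n_head_kv sketch above:

import ctypes
import llama_cpp.llama_cpp as llama_cpp

grammar = b'root ::= "yes" | "no"'   # placeholder GBNF grammar
patterns = [b"<tool_call>"]          # placeholder trigger pattern

pattern_arr = (ctypes.c_char_p * len(patterns))(*patterns)  # maps to const char **

vocab = llama_cpp.llama_model_get_vocab(model)
sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
    vocab,
    grammar,
    b"root",
    pattern_arr,
    len(patterns),
    None,  # no trigger tokens in this sketch
    0,
)
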
@@ -3737,7 +3805,7 @@ def llama_sampler_init_dry(
     dry_base: float,
     dry_allowed_length: int,
     dry_penalty_last_n: int,
-    seq_breakers: CtypesArray[bytes],
+    seq_breakers,
     num_breakers: int,
     /,
 ) -> llama_sampler_p:
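The seq_breakers annotation is presumably loosened so callers can pass a ctypes array of c_char_p rather than a plain Python list of bytes. A sketch of building such an array, using commonly cited DRY sequence breakers purely as an illustration:

import ctypes

seq_breakers = [b"\n", b":", b'"', b"*"]
breaker_arr = (ctypes.c_char_p * len(seq_breakers))(*seq_breakers)
# breaker_arr and len(seq_breakers) are then passed as the seq_breakers and
# num_breakers arguments of llama_sampler_init_dry.
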

vendor/llama.cpp (+1 -1, submodule pointer update)
