
Commit fcf9298

Merge pull request #31 from menloresearch/update-dev-from-master-2025-03-27-00-08
Sync master with upstream release b4966
2 parents c50af8e + bd40678 commit fcf9298

File tree

17 files changed: +740 -662 lines changed


ci/README.md

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ docker run --privileged -it \
 Inside the container, execute the following commands:

 ```bash
-apt update -y && apt install -y cmake git python3.10-venv wget
+apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
 git config --global --add safe.directory /ws
 GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
 ```

common/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -114,8 +114,8 @@ if (LLAMA_LLGUIDANCE)

     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.6.12:
-        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        # v0.7.10:
+        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE

common/llguidance.cpp

Lines changed: 30 additions & 47 deletions

@@ -11,25 +11,24 @@ struct llama_sampler_llg {
     std::string grammar_kind;
     std::string grammar_data;
     LlgTokenizer * tokenizer;
-    LlgConstraint * grammar;
-    LlgMaskResult llg_res;
-    bool has_llg_res;
+    LlgMatcher * grammar;
 };

-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
     LlgConstraintInit cinit;
     llg_constraint_init_set_defaults(&cinit, tokenizer);
     const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
     if (log_level && *log_level) {
         cinit.log_stderr_level = atoi(log_level);
     }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
         return nullptr;
     }
+
     return c;
 }

@@ -40,54 +39,39 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
     }
 }

 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
             } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                 ctx->grammar = nullptr;
+                return;
             }
         }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
             }
         }
     }
 }

 static void llama_sampler_llg_reset(llama_sampler * smpl) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
     }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar = grammar_new;
-    ctx->has_llg_res = false;
 }

 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
         if (ctx->grammar) {
             result_ctx->grammar_kind = ctx->grammar_kind;
             result_ctx->grammar_data = ctx->grammar_data;
-            result_ctx->grammar = llg_clone_constraint(ctx->grammar);
+            result_ctx->grammar = llg_clone_matcher(ctx->grammar);
             result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
         }
     }
@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_llg *) smpl->ctx;

     if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
         llg_free_tokenizer(ctx->tokenizer);
     }

@@ -239,25 +223,24 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
             /* .grammar_data = */ grammar_data,
             /* .tokenizer = */ tokenizer,
             /* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-            /* .llg_res = */ {},
-            /* .has_llg_res = */ false,
         };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
     } else {
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_kind = */ {},
             /* .grammar_data = */ {},
             /* .tokenizer = */ nullptr,
             /* .grammar = */ nullptr,
-            /* .llg_res = */ {},
-            /* .has_llg_res = */ false,
         };
     }

     return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx = */ ctx
-    );
+        /* .ctx = */ ctx);
 }

 #else
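
The switch from `LlgConstraint`/`LlgMaskResult` to `LlgMatcher` above leaves the sampler with a single packed bitmask: one bit per vocabulary token, 32 tokens per `uint32_t`, which is why the constructor asserts that `((size_t) n_tokens + 31) / 32 * 4` equals `llg_matcher_get_mask_byte_size()`. The sketch below is a standalone illustration of that word/bit arithmetic only; it does not link against llguidance, and the toy vocabulary size and allowed-token set are invented for the example.

```cpp
// Standalone sketch of the packed token bitmask used by the sampler above.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const size_t n_vocab = 100;                  // toy vocabulary size (invented)
    const size_t n_words = (n_vocab + 31) / 32;  // 32 tokens per uint32_t word
    std::vector<uint32_t> mask(n_words, 0);

    // pretend the grammar currently allows only these tokens (invented values)
    const uint32_t allowed[] = {3, 42, 99};
    for (uint32_t token : allowed) {
        mask[token / 32] |= 1u << (token % 32);   // set the token's bit
    }

    // same rejection test the sampler uses: a cleared bit forces the logit to -INFINITY
    std::vector<float> logits(n_vocab, 0.0f);
    for (uint32_t token = 0; token < n_vocab; ++token) {
        if ((mask[token / 32] & (1u << (token % 32))) == 0) {
            logits[token] = -INFINITY;
        }
    }

    std::printf("mask bytes: %zu\n", n_words * 4);            // == (n_vocab + 31) / 32 * 4
    std::printf("logit[42]=%f logit[50]=%f\n", logits[42], logits[50]);
    return 0;
}
```

Keeping the mask in this packed form lets the apply step reject a token with one AND and one shift instead of tracking separate per-call state such as the old `has_llg_res` flag.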

convert_hf_to_gguf.py

Lines changed: 7 additions & 5 deletions

@@ -1752,7 +1752,7 @@ class Mistral3Model(LlamaModel):

     # we need to merge the text_config into the root level of hparams
     def __init__(self, *args, **kwargs):
-        hparams = Model.load_hparams(kwargs["dir_model"])
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
         kwargs["hparams"] = hparams
@@ -3385,7 +3385,7 @@ class Gemma3Model(Model):

     # we need to merge the text_config into the root level of hparams
     def __init__(self, *args, **kwargs):
-        hparams = Model.load_hparams(kwargs["dir_model"])
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
         kwargs["hparams"] = hparams
@@ -3803,8 +3803,6 @@ def set_gguf_parameters(self):
     _tok_embd = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid # unused
-
         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
         tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

@@ -3814,6 +3812,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             logger.debug("A_log --> A ==> " + new_name)
             data_torch = -torch.exp(data_torch)

+        # [4 1 8192 1] -> [4 8192 1 1]
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
         # assuming token_embd.weight is seen before output.weight
         if self._tok_embd is not None and new_name == output_name:
             if torch.equal(self._tok_embd, data_torch):
@@ -5358,7 +5360,7 @@ def main() -> None:
            logger.error(f"Model {model_architecture} is not supported")
            sys.exit(1)

-        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+        model_instance = model_class(dir_model, output_type, fname_out,
                                      is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
                                      eager=args.no_lazy,
                                      metadata_override=args.metadata, model_name=args.model_name,
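
The new `SSM_CONV1D` branch above only reshapes the tensor: the comment `[4 1 8192 1] -> [4 8192 1 1]` describes dropping the singleton dimensions with `squeeze()` and letting the writer pad the shape back out to ggml's fixed four dimensions. Below is a standalone sketch of that shape transform only (illustrative; the dimension values come from the comment, not from a real model, and the converter itself uses `data_torch.squeeze()`).

```cpp
// Standalone sketch: squeeze out size-1 dims, then pad back to 4 dims with trailing 1s.
#include <cstdio>
#include <vector>

static std::vector<long> squeeze_to_4d(const std::vector<long> & shape) {
    std::vector<long> out;
    for (long d : shape) {
        if (d != 1) {
            out.push_back(d);   // keep only non-singleton dimensions
        }
    }
    while (out.size() < 4) {
        out.push_back(1);       // ggml tensors always carry four dimensions
    }
    return out;
}

int main() {
    const std::vector<long> conv1d_shape = {4, 1, 8192, 1};  // shape from the diff comment
    for (long d : squeeze_to_4d(conv1d_shape)) {
        std::printf("%ld ", d);  // prints: 4 8192 1 1
    }
    std::printf("\n");
    return 0;
}
```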

docs/build.md

Lines changed: 2 additions & 1 deletion

@@ -191,7 +191,7 @@ The following compilation options are also available to tweak performance:

 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
+| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
 | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
 | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@@ -218,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be

 ```bash
 cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+cmake --build build --config Release
 ```

 This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.

examples/llava/clip.cpp

Lines changed: 4 additions & 1 deletion

@@ -2989,7 +2989,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     assert(itype < GGML_TYPE_COUNT);
     ggml_type type = static_cast<ggml_type>(itype);

-    auto * ctx_clip = clip_model_load(fname_inp, 2);
+    auto * ctx_clip = clip_init(fname_inp, clip_context_params{
+        /* use_gpu */ false,
+        /* verbosity */ 2,
+    });

     const auto & ctx_src = ctx_clip->ctx_gguf;
     const auto & ctx_data = ctx_clip->ctx_data;
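
Here `clip_model_load(fname_inp, 2)` gives way to `clip_init()` taking a `clip_context_params` aggregate, so new options can be added later without touching the call signature again. Below is a standalone sketch of the same commented positional-initialization pattern; `clip_context_params_sketch` is a stand-in that mirrors only the two fields visible in this hunk (`use_gpu`, `verbosity`), not the real definition in `clip.h`.

```cpp
// Standalone sketch of the params-struct call style used at the clip_init() site above.
#include <cstdio>

struct clip_context_params_sketch {
    bool use_gpu;
    int  verbosity;
};

static void init_clip_sketch(const char * fname, const clip_context_params_sketch & params) {
    // quantization only needs the CPU-side weights, so use_gpu stays false here
    std::printf("loading %s (use_gpu=%d, verbosity=%d)\n", fname, params.use_gpu, params.verbosity);
}

int main() {
    init_clip_sketch("model.gguf", clip_context_params_sketch{
        /* use_gpu */ false,
        /* verbosity */ 2,
    });
    return 0;
}
```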
