Commit 267c139

common : refactor downloading system, handle mmproj with -hf option (#12694)
* (wip) refactor downloading system [no ci]
* fix all examples
* fix mmproj with -hf
* gemma3: update readme
* only handle mmproj in llava example
* fix multi-shard download
* windows: fix problem with std::min and std::max
* fix 2
1 parent f423981 commit 267c139
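
The core of the refactor is the new `common_params_model` struct (see the common/common.h diff below), which folds the previously separate `model`, `model_url`, `hf_repo`, and `hf_file` strings into one reusable bundle shared by the main model, the speculative draft model, the vocoder, and the multimodal projector. Below is a minimal, self-contained sketch of the resulting layout; the struct mirrors the diff, while `main()` and the repo names are only illustrative and not part of the commit:

```cpp
// Sketch only: mirrors the struct added in common/common.h below; not the real llama.cpp headers.
#include <cstdio>
#include <string>

struct common_params_model {
    std::string path;     // model local path (resolved after download)
    std::string url;      // model url to download
    std::string hf_repo;  // HF repo
    std::string hf_file;  // HF file
};

struct common_params {
    common_params_model model;   // main model
    common_params_model mmproj;  // multimodal projector, now downloadable via -hf as well
};

int main() {
    common_params params;
    // roughly what `-hf ggml-org/gemma-3-4b-it-GGUF` sets up (illustrative values):
    params.model.hf_repo  = "ggml-org/gemma-3-4b-it-GGUF";
    params.mmproj.hf_repo = params.model.hf_repo; // projector fetched from the same repo
    std::printf("model repo: %s, mmproj repo: %s\n",
                params.model.hf_repo.c_str(), params.mmproj.hf_repo.c_str());
    return 0;
}
```

The download and path-resolution logic itself moved into common/arg.cpp, whose large diff is not rendered on this page.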

File tree

19 files changed: +673 −635 lines changed

common/arg.cpp

+604 −71
Large diffs are not rendered by default.

common/common.cpp

+4 −491
Large diffs are not rendered by default.

common/common.h

+12 −32

@@ -184,6 +184,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = ""; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -197,19 +204,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -267,12 +266,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -347,7 +344,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -546,23 +543,6 @@ struct llama_model_params common_model_params_to_llama(common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
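
With the header change in place, every consumer that used to pass `params.model` directly now reads the `path` member, as the per-example diffs below show. A tiny self-contained illustration of the pattern follows; the `load_model` helper and the file name are stand-ins for this sketch, not llama.cpp APIs (the real call sites use `llama_model_load_from_file`):

```cpp
#include <cstdio>
#include <string>

struct common_params_model { std::string path, url, hf_repo, hf_file; };
struct common_params       { common_params_model model; };

// stand-in for the real loader; it only checks that a non-empty path was given
static bool load_model(const char * path) { return path != nullptr && *path != '\0'; }

int main() {
    common_params params;
    params.model.path = "models/example-q4_k_m.gguf"; // hypothetical local file
    // before this commit: load_model(params.model.c_str());
    // after this commit:  load_model(params.model.path.c_str());
    if (!load_model(params.model.path.c_str())) {
        std::fprintf(stderr, "unable to load model\n");
        return 1;
    }
    return 0;
}
```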

examples/batched-bench/batched-bench.cpp

+1 −1

@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);

examples/batched/batched.cpp

+1 −1

@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);

examples/export-lora/export-lora.cpp

+1 −1

@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());

examples/gritlm/gritlm.cpp

+1 −1

@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);

examples/llava/README-gemma3.md

+20 −0

@@ -4,6 +4,26 @@
 >
 > This is very experimental, only used for demo purpose.
 
+## Quick started
+
+You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-gemma3-cli
+
+# alternatively, install from brew (MacOS)
+brew install llama.cpp
+
+# run it
+llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
+
+# note: 1B model does not support vision
+```
+
 ## How to get mmproj.gguf?
 
 ```bash

examples/llava/gemma3-cli.cpp

+3 −3

@@ -78,7 +78,7 @@ struct gemma3_context {
     }
 
     void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
+        const char * clip_path = params.mmproj.path.c_str();
         ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
     }
 
@@ -232,13 +232,13 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty()) {
+    if (params.mmproj.path.empty()) {
         show_additional_info(argc, argv);
         return 1;
     }
 
     gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
+    printf("%s: %s\n", __func__, params.model.path.c_str());
 
     bool is_single_turn = !params.prompt.empty() && !params.image.empty();
 
examples/llava/llava-cli.cpp

+3 −3

@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) {
 }
 
 static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();
 
     auto prompt = params->prompt;
     if (prompt.empty()) {
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }

examples/llava/minicpmv-cli.cpp

+3 −3

@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
 }
 
 static struct clip_ctx * clip_init_context(common_params * params) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();
 
     auto prompt = params->prompt;
     if (prompt.empty()) {
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty() || (params.image.empty())) {
+    if (params.mmproj.path.empty() || (params.image.empty())) {
         show_additional_info(argc, argv);
         return 1;
     }

examples/llava/qwen2vl-cli.cpp

+3 −3

@@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -323,7 +323,7 @@ static struct llama_model * llava_init(common_params * params) {
 }
 
 static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();
 
     auto prompt = params->prompt;
     if (prompt.empty()) {
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }

examples/parallel/parallel.cpp

+1 −1

@@ -405,7 +405,7 @@ int main(int argc, char ** argv) {
         params.prompt_file = "used built-in defaults";
     }
     LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
 
     LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
     LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);

examples/passkey/passkey.cpp

+1 −1

@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);

examples/server/server.cpp

+8 −11

@@ -1876,7 +1876,7 @@ struct server_context {
     }
 
     bool load_model(const common_params & params) {
-        SRV_INF("loading model '%s'\n", params.model.c_str());
+        SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
         params_base = params;
 
@@ -1886,7 +1886,7 @@ struct server_context {
         ctx = llama_init.context.get();
 
         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
             return false;
         }
 
@@ -1897,16 +1897,13 @@ struct server_context {
         add_bos_token = llama_vocab_get_add_bos(vocab);
         has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
 
-        if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
-            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
 
             auto params_dft = params_base;
 
             params_dft.devices = params_base.speculative.devices;
-            params_dft.hf_file = params_base.speculative.hf_file;
-            params_dft.hf_repo = params_base.speculative.hf_repo;
             params_dft.model = params_base.speculative.model;
-            params_dft.model_url = params_base.speculative.model_url;
             params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
@@ -1920,12 +1917,12 @@ struct server_context {
             model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
                 return false;
             }
 
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
 
                 return false;
             }
@@ -3865,7 +3862,7 @@ int main(int argc, char ** argv) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_path", ctx_server.params_base.model },
+            { "model_path", ctx_server.params_base.model.path },
            { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
            { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
            { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -4131,7 +4128,7 @@ int main(int argc, char ** argv) {
            {"object", "list"},
            {"data", {
                {
-                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
                    {"object", "model"},
                    {"created", std::time(0)},
                    {"owned_by", "llamacpp"},

examples/speculative-simple/speculative-simple.cpp

+1 −1

@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }

examples/speculative/speculative.cpp

+1 −1

@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }

examples/tts/tts.cpp

+1 −6

@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
 
-    // TODO: refactor in a common struct
-    params.model = params.vocoder.model;
-    params.model_url = params.vocoder.model_url;
-    params.hf_repo = params.vocoder.hf_repo;
-    params.hf_file = params.vocoder.hf_file;
-
+    params.model = params.vocoder.model;
     params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
