
common : refactor downloading system, handle mmproj with -hf option #12694

Merged · 8 commits · Apr 1, 2025
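
This change consolidates the separate `model`, `model_url`, `hf_repo`, and `hf_file` strings in `common_params` (and in the speculative and vocoder sub-structs) into a single `common_params_model` struct, and turns the multimodal projector (`mmproj`) into a `common_params_model` as well, so it can be resolved through the `-hf` option like the main model. The URL/Hugging Face download helpers (`common_load_model_from_url`, `common_load_model_from_hf`, `common_get_hf_file`) disappear from `common.h`; judging by the large additions to `common/arg.cpp`, the download logic now lives alongside argument parsing.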
675 changes: 604 additions & 71 deletions common/arg.cpp

Large diffs are not rendered by default.

495 changes: 4 additions & 491 deletions common/common.cpp

Large diffs are not rendered by default.

44 changes: 12 additions & 32 deletions common/common.h
@@ -184,6 +184,13 @@ struct common_params_sampling {
std::string print() const;
};

struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
};

struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

@@ -197,19 +204,11 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;

std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT

std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
struct common_params_model model;
};

struct common_params_vocoder {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT

std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
struct common_params_model model;

std::string speaker_file = ""; // speaker file path // NOLINT

@@ -267,12 +266,10 @@ struct common_params {
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;

std::string model = ""; // model path // NOLINT
struct common_params_model model;

std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -347,7 +344,7 @@ struct common_params {
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
struct common_params_model mmproj;
std::vector<std::string> image; // path to image file(s)

// embedding
@@ -546,23 +543,6 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
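
The header change above means every model source (main, draft, vocoder, mmproj) is now described by the same struct. A minimal sketch of the new field layout, with field names taken from the `common.h` diff and everything else (values, helper name) purely illustrative:

```cpp
#include "common.h"

// Illustrative only: demonstrates the consolidated common_params_model fields.
// In real usage, argument parsing and the download logic populate
// params.model.path before the model is loaded.
static void fill_model_sources(common_params & params) {
    params.model.hf_repo          = "ggml-org/gemma-3-4b-it-GGUF";  // was params.hf_repo
    params.mmproj.hf_repo         = params.model.hf_repo;           // mmproj now shares the same struct
    params.speculative.model.path = "models/draft.gguf";            // was params.speculative.model
    params.vocoder.model.url      = "https://example.com/voc.gguf"; // was params.vocoder.model_url
}
```

Loading then goes through `params.model.path`, which is what the example updates below switch to (`llama_model_load_from_file(params.model.path.c_str(), ...)`).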

2 changes: 1 addition & 1 deletion examples/batched-bench/batched-bench.cpp
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {

llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);

2 changes: 1 addition & 1 deletion examples/batched/batched.cpp
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__);

2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {

g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());

2 changes: 1 addition & 1 deletion examples/gritlm/gritlm.cpp
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {

llama_backend_init();

llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);

20 changes: 20 additions & 0 deletions examples/llava/README-gemma3.md
@@ -4,6 +4,26 @@
>
> This is very experimental, only used for demo purposes.
## Quick start

You can use the pre-quantized models from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account:

```bash
# build
cmake -B build
cmake --build build --target llama-gemma3-cli

# alternatively, install from brew (MacOS)
brew install llama.cpp

# run it
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF

# note: 1B model does not support vision
```

## How to get mmproj.gguf?

```bash

6 changes: 3 additions & 3 deletions examples/llava/gemma3-cli.cpp
@@ -78,7 +78,7 @@ struct gemma3_context {
}

void init_clip_model(common_params & params) {
const char * clip_path = params.mmproj.c_str();
const char * clip_path = params.mmproj.path.c_str();
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
}

@@ -232,13 +232,13 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty()) {
if (params.mmproj.path.empty()) {
show_additional_info(argc, argv);
return 1;
}

gemma3_context ctx(params);
printf("%s: %s\n", __func__, params.model.c_str());
printf("%s: %s\n", __func__, params.model.path.c_str());

bool is_single_turn = !params.prompt.empty() && !params.image.empty();

6 changes: 3 additions & 3 deletions examples/llava/llava-cli.cpp
@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {

llama_model_params model_params = common_model_params_to_llama(*params);

llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -234,7 +234,7 @@ static struct llava_init(common_params * params) {
}

static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();

auto prompt = params->prompt;
if (prompt.empty()) {
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}

6 changes: 3 additions & 3 deletions examples/llava/minicpmv-cli.cpp
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {

llama_model_params model_params = common_model_params_to_llama(*params);

llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
}

static struct clip_ctx * clip_init_context(common_params * params) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();

auto prompt = params->prompt;
if (prompt.empty()) {
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty() || (params.image.empty())) {
if (params.mmproj.path.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}

6 changes: 3 additions & 3 deletions examples/llava/qwen2vl-cli.cpp
@@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {

llama_model_params model_params = common_model_params_to_llama(*params);

llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -323,7 +323,7 @@ static struct llava_init(common_params * params) {
}

static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();

auto prompt = params->prompt;
if (prompt.empty()) {
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}

2 changes: 1 addition & 1 deletion examples/parallel/parallel.cpp
@@ -405,7 +405,7 @@ int main(int argc, char ** argv) {
params.prompt_file = "used built-in defaults";
}
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());

LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);

2 changes: 1 addition & 1 deletion examples/passkey/passkey.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {

llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);

19 changes: 8 additions & 11 deletions examples/server/server.cpp
@@ -1876,7 +1876,7 @@ struct server_context {
}

bool load_model(const common_params & params) {
SRV_INF("loading model '%s'\n", params.model.c_str());
SRV_INF("loading model '%s'\n", params.model.path.c_str());

params_base = params;

Expand All @@ -1886,7 +1886,7 @@ struct server_context {
ctx = llama_init.context.get();

if (model == nullptr) {
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
return false;
}

@@ -1897,16 +1897,13 @@ struct server_context {
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());

auto params_dft = params_base;

params_dft.devices = params_base.speculative.devices;
params_dft.hf_file = params_base.speculative.hf_file;
params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
@@ -1920,12 +1917,12 @@ struct server_context {
model_dft = llama_init_dft.model.get();

if (model_dft == nullptr) {
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
return false;
}

if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());

return false;
}
@@ -3865,7 +3862,7 @@ int main(int argc, char ** argv) {
json data = {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
{ "model_path", ctx_server.params_base.model.path },
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -4131,7 +4128,7 @@ int main(int argc, char ** argv) {
{"object", "list"},
{"data", {
{
{"id", params.model_alias.empty() ? params.model : params.model_alias},
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},

2 changes: 1 addition & 1 deletion examples/speculative-simple/speculative-simple.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.speculative.model.empty()) {
if (params.speculative.model.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}

2 changes: 1 addition & 1 deletion examples/speculative/speculative.cpp
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.speculative.model.empty()) {
if (params.speculative.model.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}

7 changes: 1 addition & 6 deletions examples/tts/tts.cpp
@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {

const llama_vocab * vocab = llama_model_get_vocab(model_ttc);

// TODO: refactor in a common struct
params.model = params.vocoder.model;
params.model_url = params.vocoder.model_url;
params.hf_repo = params.vocoder.hf_repo;
params.hf_file = params.vocoder.hf_file;

params.model = params.vocoder.model;
params.embedding = true;

common_init_result llama_init_cts = common_init_from_params(params);