Skip to content

Commit 8077850

Browse files
authored
[NPU GGUF] Add simple example (#12853)
1 parent 348dc80 commit 8077850

File tree

4 files changed

+310
-5
lines changed

4 files changed

+310
-5
lines changed

docs/mddocs/Quickstart/npu_quickstart.md

+10-5
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ Refer to the following table for verified models:
193193
| LLaMA 3.2 | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | Meteor Lake, Lunar Lake, Arrow Lake |
194194
| DeepSeek-R1 | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B), [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | Meteor Lake, Lunar Lake, Arrow Lake |
195195

196-
### Setup for running llama.cpp
196+
### Run GGUF model using CLI tool
197+
#### Setup for running llama.cpp
197198

198199
First, create a directory for `llama.cpp`. For instance, use the following command to create a `llama-cpp-npu` directory and enter it.
199200

@@ -208,23 +209,27 @@ Then, please run the following command with **administrator privilege in Minifor
208209
init-llama-cpp.bat
209210
```
210211

211-
### Model Download
212+
#### Model Download
212213

213214
Before running, you should download or copy a community GGUF model to your current directory. For instance, `DeepSeek-R1-Distill-Qwen-7B-Q6_K.gguf` of [DeepSeek-R1-Distill-Qwen-7B-GGUF](https://huggingface.co/lmstudio-community/DeepSeek-R1-Distill-Qwen-7B-GGUF/tree/main).
214215

215-
### Run the quantized model
216+
#### Run the quantized model
216217

217218
Please refer to [Runtime Configurations](#runtime-configurations) before running the following command in Miniforge Prompt.
218219

219220
```cmd
220221
llama-cli-npu.exe -m DeepSeek-R1-Distill-Qwen-7B-Q6_K.gguf -n 32 --prompt "What is AI?"
221222
```
222223

224+
You can run `llama-cli-npu.exe -h` for more details about the meaning of each parameter.
225+
226+
### Run GGUF model using llama.cpp C++ API
227+
228+
IPEX-LLM also supports the `llama.cpp` C++ API for running GGUF models on Intel NPU. Refer to the [Simple Example](../../../python/llm/example/NPU/llama.cpp/) for detailed usage.
229+
223230
> **Note**:
224231
>
225232
> - **Warmup on first run**: When running specific GGUF models on NPU for the first time, you might notice delays of up to several minutes before the first token is generated. This delay is caused by blob compilation.
226-
> - For more details about meaning of each parameter, you can use `llama-cli-npu.exe -h`.
227-
228233
229234
## Accuracy Tuning
230235

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
cmake_minimum_required(VERSION 3.10)

project(LLM_NPU_EXAMPLE VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Locate the prebuilt libraries and headers.
#  - CONDA_ENV_DIR set:   use the ipex-llm[npu] packages under that site-packages directory.
#  - CONDA_ENV_DIR unset: expect the libraries and an include/ directory next to this file.
if(DEFINED ENV{CONDA_ENV_DIR})
    # The variable is typically set from cmd with backslashes; normalise it to a CMake path.
    file(TO_CMAKE_PATH "$ENV{CONDA_ENV_DIR}" ENV_DIR)
    set(LIBRARY_DIR ${ENV_DIR}/bigdl-core-npu)
    include_directories(${LIBRARY_DIR}/include/npu)
    include_directories(${LIBRARY_DIR}/include/llamacpp)
    # Extra runtime dependencies that have to sit next to the executable.
    set(DLL_DIR ${ENV_DIR}/intel_npu_acceleration_library/lib/Release)
else()
    set(LIBRARY_DIR ${CMAKE_CURRENT_SOURCE_DIR})
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
endif()

# Imported targets for the prebuilt libraries the example links against.
add_library(npu_llm STATIC IMPORTED)
set_target_properties(npu_llm PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/npu_llm.lib)

add_library(llama STATIC IMPORTED)
set_target_properties(llama PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/llama.lib)

add_library(common STATIC IMPORTED)
set_target_properties(common PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/common.lib)

add_library(ggml STATIC IMPORTED)
set_target_properties(ggml PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/ggml.lib)

set(TARGET simple)
add_executable(${TARGET} simple.cpp)
# An explicit DESTINATION is required by CMake < 3.14, which the minimum version above still allows.
install(TARGETS ${TARGET} RUNTIME DESTINATION bin)
target_link_libraries(${TARGET} PRIVATE npu_llm common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# Copy the runtime DLLs next to the built executable. $<TARGET_FILE_DIR:...> resolves to
# build/Release for a multi-config Release build and stays correct for other configurations.
add_custom_command(TARGET ${TARGET} POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${LIBRARY_DIR}/npu_llm.dll
        ${LIBRARY_DIR}/llama.dll
        ${LIBRARY_DIR}/ggml.dll
        $<TARGET_FILE_DIR:${TARGET}>
    COMMENT "Copying npu_llm.dll llama.dll ggml.dll to the ${TARGET} output directory\n"
)

# DLL_DIR is only known when CONDA_ENV_DIR is set. Without this guard the source path would
# expand to "/" and the whole drive root would be copied into the build tree.
if(DEFINED DLL_DIR)
    add_custom_command(TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_directory
            ${DLL_DIR}
            $<TARGET_FILE_DIR:${TARGET}>
        COMMENT "Copying dependency to the ${TARGET} output directory\n"
    )
endif()
+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# (Experimental) Example of running GGUF model using llama.cpp C++ API on NPU
2+
In this directory, you will find a simple C++ example on how to run GGUF models on Intel NPUs using the `llama.cpp` C++ API. See the table below for verified models.
3+
4+
## Verified Models
5+
6+
| Model | Model link |
7+
|:--|:--|
8+
| LLaMA 3.2 | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
9+
| DeepSeek-R1 | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B), [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |
10+
11+
Please refer to [Quickstart](../../../../../docs/mddocs/Quickstart/npu_quickstart.md#experimental-llamacpp-support) for details about verified platforms.
12+
13+
## 0. Prerequisites
14+
For `ipex-llm` NPU support, please refer to [Quickstart](../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations.
15+
16+
## 1. Install & Runtime Configurations
17+
### 1.1 Installation on Windows
18+
We suggest using conda to manage environment:
19+
```cmd
20+
conda create -n llm python=3.11
21+
conda activate llm
22+
23+
:: for building the example
24+
pip install cmake
25+
26+
:: install ipex-llm with 'npu' option
27+
pip install --pre --upgrade ipex-llm[npu]
28+
```
29+
30+
Please refer to [Quickstart](../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for more details about `ipex-llm` installation on Intel NPU.
31+
32+
### 1.2 Runtime Configurations
33+
Please refer to [Quickstart](../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device.
34+
35+
## 2. Build C++ Example `simple`
36+
37+
- You can run the cmake commands below in cmd to build `simple` yourself. Don't forget to replace `<CONDA_ENV_DIR>` below with your own path.
38+
39+
```cmd
40+
:: under current directory
41+
:: please replace below conda env dir with your own path
42+
set CONDA_ENV_DIR=C:\Users\arda\miniforge3\envs\llm\Lib\site-packages
43+
mkdir build
44+
cd build
45+
cmake ..
46+
cmake --build . --config Release -j
47+
cd Release
48+
```
49+
50+
- You can also directly use our released `simple.exe` which has the same usage as this example `simple.cpp`
51+
52+
## 3. Run `simple`
53+
54+
Once `simple` is built, you can use it to run a GGUF model:
55+
56+
```cmd
57+
:: Run simple text completion
58+
simple.exe -m <gguf_model_path> -n 64 -p "Once upon a time,"
59+
```
60+
61+
> **Note**:
62+
>
63+
> **Warmup on first run**: When running specific GGUF models on NPU for the first time, you might notice delays of up to several minutes before the first token is generated. This delay is caused by blob compilation.
+186
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
//
2+
// Copyright 2016 The BigDL Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
//
16+
// This file is copied from https://github.com/ggerganov/llama.cpp/blob/3f1ae2e32cde00c39b96be6d01c2997c29bae555/examples/simple/simple.cpp
17+
18+
#include "arg.h"
19+
#include "common.h"
20+
#include "log.h"
21+
#include "llama.h"
22+
23+
#include <vector>
24+
25+
// Usage callback handed to gpt_params_parse(): logs a single example command line.
// The first argument (argc) is unused; argv[0] supplies the program name shown in the example.
static void print_usage(int /*argc*/, char ** argv) {
    const char * program_name = argv[0];

    LOG("\nexample usage:\n");
    LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", program_name);
    LOG("\n");
}
30+
31+
int main(int argc, char ** argv) {
32+
gpt_params params;
33+
34+
params.prompt = "Hello my name is";
35+
params.n_predict = 32;
36+
37+
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
38+
return 1;
39+
}
40+
41+
gpt_init();
42+
43+
// total length of the sequence including the prompt
44+
const int n_predict = params.n_predict;
45+
46+
// init LLM
47+
48+
llama_backend_init();
49+
llama_numa_init(params.numa);
50+
51+
// initialize the model
52+
53+
llama_model_params model_params = llama_model_params_from_gpt_params(params);
54+
55+
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
56+
57+
if (model == NULL) {
58+
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
59+
return 1;
60+
}
61+
62+
// initialize the context
63+
64+
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
65+
66+
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
67+
68+
if (ctx == NULL) {
69+
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
70+
return 1;
71+
}
72+
73+
auto sparams = llama_sampler_chain_default_params();
74+
75+
sparams.no_perf = false;
76+
77+
llama_sampler * smpl = llama_sampler_chain_init(sparams);
78+
79+
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
80+
81+
// tokenize the prompt
82+
83+
std::vector<llama_token> tokens_list;
84+
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
85+
86+
const int n_ctx = llama_n_ctx(ctx);
87+
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
88+
89+
LOG("\n");
90+
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
91+
92+
// make sure the KV cache is big enough to hold all the prompt and generated tokens
93+
if (n_kv_req > n_ctx) {
94+
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
95+
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
96+
return 1;
97+
}
98+
99+
// print the prompt token-by-token
100+
101+
LOG("\n");
102+
103+
for (auto id : tokens_list) {
104+
LOG("%s", llama_token_to_piece(ctx, id).c_str());
105+
}
106+
107+
// create a llama_batch with size 512
108+
// we use this object to submit token data for decoding
109+
110+
llama_batch batch = llama_batch_init(512, 0, 1);
111+
112+
// evaluate the initial prompt
113+
for (size_t i = 0; i < tokens_list.size(); i++) {
114+
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
115+
}
116+
117+
// llama_decode will output logits only for the last token of the prompt
118+
batch.logits[batch.n_tokens - 1] = true;
119+
120+
if (llama_decode(ctx, batch) != 0) {
121+
LOG("%s: llama_decode() failed\n", __func__);
122+
return 1;
123+
}
124+
125+
// main loop
126+
127+
int n_cur = batch.n_tokens;
128+
int n_decode = 0;
129+
130+
const auto t_main_start = ggml_time_us();
131+
132+
while (n_cur <= n_predict) {
133+
// sample the next token
134+
{
135+
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
136+
137+
// is it an end of generation?
138+
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
139+
LOG("\n");
140+
141+
break;
142+
}
143+
144+
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
145+
fflush(stdout);
146+
147+
// prepare the next batch
148+
llama_batch_clear(batch);
149+
150+
// push this new token for next evaluation
151+
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
152+
153+
n_decode += 1;
154+
}
155+
156+
n_cur += 1;
157+
158+
// evaluate the current batch with the transformer model
159+
if (llama_decode(ctx, batch)) {
160+
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
161+
return 1;
162+
}
163+
}
164+
165+
LOG("\n");
166+
167+
const auto t_main_end = ggml_time_us();
168+
169+
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
170+
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
171+
172+
LOG("\n");
173+
llama_perf_sampler_print(smpl);
174+
llama_perf_context_print(ctx);
175+
176+
LOG("\n");
177+
178+
llama_batch_free(batch);
179+
llama_sampler_free(smpl);
180+
llama_free(ctx);
181+
llama_free_model(model);
182+
183+
llama_backend_free();
184+
185+
return 0;
186+
}

0 commit comments

Comments
 (0)