Skip to content

Commit f8c2bb2

Browse files
authored
[NPU] optimize qwen2 prefill performance for C++ (#12451)
1 parent 8331875 commit f8c2bb2

File tree

2 files changed: +10 −1 lines changed
  • python/llm
    • example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples
    • src/ipex_llm/transformers/npu_pipeline_model

2 files changed: +10 −1 lines changed

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py

+5 lines added
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
from packaging import version
2525
import os
2626
import shutil
27+
import time
28+
2729

2830
logger = logging.get_logger(__name__)
2931

@@ -55,6 +57,7 @@
5557
model_path = args.repo_id_or_model_path
5658
save_dir = args.save_directory
5759

60+
t0 = time.perf_counter()
5861
model = AutoModelForCausalLM.from_pretrained(model_path,
5962
optimize_model=True,
6063
pipeline=True,
@@ -69,6 +72,7 @@
6972
trust_remote_code=True,
7073
convert_model=True,
7174
save_directory=save_dir)
75+
t1 = time.perf_counter()
7276

7377
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
7478

@@ -81,5 +85,6 @@
8185
tokenizer.save_pretrained(save_dir)
8286

8387
print("-" * 80)
88+
print(f"Convert model cost {t1 - t0}s.")
8489
print(f"finish save model to {save_dir}")
8590
print("success shut down")

python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py

+5 lines added, −1 line removed
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
135135
if mode == "decode":
136136
input_len = 1
137137
decoder_name = f"decoder_layer_{layer_idx}"
138+
npu_dpu_groups = None
138139
else:
139140
input_len = kv_len
140141
decoder_name = "decoder_layer_prefill"
142+
npu_dpu_groups = 6
143+
141144
single_decoder = LowBitQwenMultiDecoderlayer(
142145
[1, input_len, num_heads * head_dim],
143146
input_layernorm_weights=None,
@@ -162,7 +165,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
162165
)
163166
rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
164167
decoder_name,
165-
temp_dir, True, False)
168+
temp_dir, True, False,
169+
npu_dpu_groups=npu_dpu_groups)
166170

167171
# 0, 1, 2 are input_embed/attention_mask/position_id
168172
if mode == "decode":

0 commit comments

Comments (0)