Skip to content

Commit 24b46b2

Browse files
authored
[NPU] further fix of qwen2 int8 pipeline & C++ (#12449)
* fix
* fix style
1 parent 303b104 commit 24b46b2

File tree

2 files changed

+8
-2
lines changed

2 files changed

+8
-2
lines changed

python/llm/src/ipex_llm/transformers/npu_model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ def optimize_npu_model(cls, *args, **kwargs):
231231
from intel_npu_acceleration_library.compiler import create_npu_kernels
232232

233233
model = kwargs.pop("model")
234-
qtype = kwargs.pop("qtype", "sym_int4")
234+
qtype = kwargs.pop("qtype", "sym_int4_rtn")
235235
mixed_precision = kwargs.pop("mixed_precision", False)
236236
quantization_group_size = kwargs.pop("quantization_group_size", 0)
237237
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
@@ -280,6 +280,7 @@ def optimize_npu_model(cls, *args, **kwargs):
280280
max_prompt_len=max_prompt_len,
281281
transpose_value_cache=transpose_value_cache,
282282
group_size=quantization_group_size,
283+
qtype=qtype,
283284
convert_model=convert_model,
284285
save_directory=save_directory)
285286
model.save_low_bit = types.MethodType(save_low_bit, model)

python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,18 @@ def convert_llm(model: torch.nn.Module,
193193
max_prompt_len: int,
194194
transpose_value_cache: bool,
195195
group_size: int,
196+
qtype: str,
196197
convert_model: bool=False,
197198
save_directory: str=None):
198199
# whether to set layernorm weight as const
199200
layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
200201
if group_size == 0:
201202
n_splits_linear = 1
202-
n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
203+
if qtype == "sym_int8_rtn":
204+
# do not split mlp down_proj for Qwen2-7B & sym_int8
205+
n_splits_down_proj = 1
206+
else:
207+
n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
203208
else:
204209
n_splits_linear = model.config.hidden_size // group_size
205210
n_splits_down_proj = model.config.intermediate_size // group_size

0 commit comments

Comments (0)