Commit 508c364

Add precision option in PP inference examples (#11440)
Parent: e9e8f9b

13 files changed, +31 -27 lines

python/llm/example/GPU/Pipeline-Parallel-Inference/README.md (+2)

@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 
 For optimal performance, it is recommended to set several environment variables. We provide example usages as following:
 
+> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
+
 </details>
 
 <details>
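
For illustration, the README note above can be exercised by passing a non-default precision to the run scripts in this commit. This is only a sketch, assuming the Llama-2 example model, two GPUs, and 'fp8' as one of the values the note mentions:

# Sketch only: run the pipeline-parallel example with FP8 instead of the
# default 'sym_int4' precision; 'fp6' can be substituted the same way.
NUM_GPUS=2
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'fp8'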

python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py (+4 -2)

@@ -34,24 +34,26 @@
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
     parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     try:
         model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                     load_in_4bit=True,
+                                                     load_in_low_bit=low_bit,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,
                                                      torch_dtype=torch.float16,
                                                      pipeline_parallel_stages=args.gpu_num)
     except:
         model = AutoModel.from_pretrained(model_path,
-                                          load_in_4bit=True,
+                                          load_in_low_bit=low_bit,
                                           optimize_model=True,
                                           trust_remote_code=True,
                                           use_cache=True,
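
Because the new `--low-bit` argument defaults to 'sym_int4', invocations that omit the flag should behave as before the switch from `load_in_4bit=True` to `load_in_low_bit`; the flag only matters when another precision is wanted. As a sketch (the model path is just the Llama-2 example), the following two runs are expected to be equivalent after this change:

# Default precision: --low-bit falls back to 'sym_int4'.
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2
# Explicit precision: same result, spelled out.
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2 --low-bit 'sym_int4'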

python/llm/example/GPU/Pipeline-Parallel-Inference/run_baichuan2_arc_2_card.sh (+2 -2)

@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Baichuan2-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Baichuan2-13B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_chatglm_arc_2_card.sh (+1 -1)

@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
 NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_codellama_arc_2_card.sh (+3 -3)

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run CodeLlama-7b-Instruct-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-13b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-34b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh (+3 -3)

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Llama-2-7b-chat-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Llama-2-13b-chat-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Meta-Llama-3-8B-Instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_mistral_arc_2_card.sh (+2 -2)

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Mistral-7B-v0.1
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Mixtral-8x7B-Instruct-v0.1
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_phi3_arc_2_card.sh (+2 -2)

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Phi-3-medium-4k-instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Phi-3-mini-4k-instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh (+5 -5)

@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-14B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-32B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-MoE-A2.7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run CodeQwen1.5-7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen2_arc_2_card.sh (+1 -1)

@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen2-7B-Instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_solar_arc_2_card.sh (+1 -1)

@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run SOLAR-10.7B-Instruct-v1.0
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_vicuna_arc_2_card.sh (+3 -3)

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run vicuna-7b-v1.3
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-13b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-33b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_yi_arc_2_card.sh (+2 -2)

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Yi-6B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Yi-34B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
