Commit 508c364

Add precision option in PP inference examples (#11440)
Parent: e9e8f9b

13 files changed, +31 -27 lines

python/llm/example/GPU/Pipeline-Parallel-Inference/README.md (+2)

@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 
 For optimal performance, it is recommended to set several environment variables. We provide example usages as following:
 
+> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
+
 </details>
 
 <details>
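
For illustration, the README note above can be exercised by passing a non-default precision to the run scripts in this commit. This is only a sketch, assuming the Llama-2 example model, two GPUs, and 'fp8' as one of the values the note mentions:

# Sketch only: run the pipeline-parallel example with FP8 instead of the
# default 'sym_int4' precision; 'fp6' can be substituted the same way.
NUM_GPUS=2
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'fp8'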

python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py (+4 -2)

@@ -34,24 +34,26 @@
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
     parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     try:
         model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                     load_in_4bit=True,
+                                                     load_in_low_bit=low_bit,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,
                                                      torch_dtype=torch.float16,
                                                      pipeline_parallel_stages=args.gpu_num)
     except:
         model = AutoModel.from_pretrained(model_path,
-                                          load_in_4bit=True,
+                                          load_in_low_bit=low_bit,
                                           optimize_model=True,
                                           trust_remote_code=True,
                                           use_cache=True,
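
Because the new `--low-bit` argument defaults to 'sym_int4', invocations that omit the flag should behave as before the switch from `load_in_4bit=True` to `load_in_low_bit`; the flag only matters when another precision is wanted. As a sketch (the model path is just the Llama-2 example), the following two runs are expected to be equivalent after this change:

# Default precision: --low-bit falls back to 'sym_int4'.
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2
# Explicit precision: same result, spelled out.
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2 --low-bit 'sym_int4'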

python/llm/example/GPU/Pipeline-Parallel-Inference/run_baichuan2_arc_2_card.sh (+2 -2)

@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Baichuan2-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Baichuan2-13B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_chatglm_arc_2_card.sh (+1 -1)

@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
 NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_codellama_arc_2_card.sh (+3 -3)

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run CodeLlama-7b-Instruct-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-13b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-34b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh (+3 -3)

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Llama-2-7b-chat-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Llama-2-13b-chat-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Meta-Llama-3-8B-Instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_mistral_arc_2_card.sh (+2 -2)

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Mistral-7B-v0.1
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Mixtral-8x7B-Instruct-v0.1
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_phi3_arc_2_card.sh (+2 -2)

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Phi-3-medium-4k-instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Phi-3-mini-4k-instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh (+5 -5)

@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-14B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-32B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-MoE-A2.7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run CodeQwen1.5-7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen2_arc_2_card.sh (+1 -1)

@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen2-7B-Instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_solar_arc_2_card.sh (+1 -1)

@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run SOLAR-10.7B-Instruct-v1.0
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_vicuna_arc_2_card.sh (+3 -3)

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run vicuna-7b-v1.3
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-13b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-33b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

python/llm/example/GPU/Pipeline-Parallel-Inference/run_yi_arc_2_card.sh (+2 -2)

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Yi-6B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Yi-34B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
