@@ -240,17 +240,19 @@ Modify the `--load-in-low-bit` value to `fp6`, `fp8`, `fp8_e4m3` or `fp16`
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \
-  --model $model \
+  --model $model \
   --trust-remote-code \
-  --gpu-memory-utilization 0.75 \
-  --device xpu \
+  --block-size 8 \
+  --gpu-memory-utilization 0.9 \
+  --device xpu \
   --dtype float16 \
   --enforce-eager \
   --load-in-low-bit sym_int4 \
-  --max-model-len 4096 \
-  --max-num-batched-tokens 10240 \
-  --max-num-seqs 12 \
-  --tensor-parallel-size 1
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
+  --tensor-parallel-size 1 \
+  --disable-async-output-proc \
+  --distributed-executor-backend ray
 ```

 then run following command to start vllm service
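For a quick sanity check of the retuned flags in this hunk (smaller `--max-model-len`/`--max-num-batched-tokens`, `--block-size 8`, the ray executor backend), a request against the server's OpenAI-compatible endpoints is usually enough. The sketch below is illustrative and not part of the documented script: it assumes the service is listening on port 8000 as in the snippet and that `$served_model_name` still holds the value passed to `--served-model-name`.

```bash
# Minimal smoke test (assumed setup: server on localhost:8000, $served_model_name
# set to the same value as in the launch script above).
curl http://localhost:8000/v1/models          # the served model name should appear here

curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d "{
        \"model\": \"$served_model_name\",
        \"prompt\": \"San Francisco is a\",
        \"max_tokens\": 32
      }"
```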
@@ -678,8 +680,8 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --load-in-low-bit fp8 \
   --max-model-len 4096 \
   --max-num-batched-tokens 10240 \
-  --max-num-seqs 12 \
   --tensor-parallel-size 1 \
+  --distributed-executor-backend ray \
   --enable-lora \
   --lora-modules sql-lora=$SQL_LOARA
 ```
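Since this hunk keeps `--enable-lora` and the `sql-lora` registration, a hedged usage sketch may help: with vLLM's OpenAI-compatible server, a LoRA adapter registered through `--lora-modules` can be requested by its registered name in the `model` field. The port and prompt below are assumptions for illustration only.

```bash
# Request routed to the LoRA adapter registered as "sql-lora" in the command above
# (assumes the server runs on localhost:8000 with LoRA serving enabled).
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "sql-lora",
        "prompt": "Write a SQL query that lists all customers.",
        "max_tokens": 64
      }'
```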
@@ -742,16 +744,37 @@ vLLM Serving can be deployed as a server that implements the OpenAI API protocol

 ```bash
 #!/bin/bash
-model="/llm/models/Meta-Llama-3.1-8B-Instruct"
-served_model_name="llama-3.1-8b"
-...
+model="/llm/models/Qwen1.5-14B-Chat"
+served_model_name="Qwen1.5-14B-Chat"
+
+#export SYCL_CACHE_PERSISTENT=1
+export CCL_WORKER_COUNT=4
+export FI_PROVIDER=shm
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+export CCL_ATL_SHM=1
+
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=0
+
+source /opt/intel/1ccl-wks/setvars.sh
+
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \
   --model $model \
-  ...
+  --trust-remote-code \
+  --gpu-memory-utilization 0.9 \
+  --device xpu \
+  --dtype float16 \
+  --enforce-eager \
+  --load-in-low-bit fp8 \
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
   --api-key <your-api-key> \
-  --tensor-parallel-size 2
+  --tensor-parallel-size 4 \
+  --distributed-executor-backend ray
 ```

 2. Send http request with `api-key` header to verify the model has deployed successfully.
@@ -761,7 +784,7 @@ curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer <your-api-key>" \
   -d '{
-  "model": "llama-3.1-8b",
+  "model": "Qwen1.5-14B-Chat",
   "prompt": "San Francisco is a",
   "max_tokens": 128
 }'
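The same deployment can also be exercised through the chat endpoint that vLLM's OpenAI-compatible server exposes. The sketch below assumes the server, port, served model name, and `<your-api-key>` placeholder from the snippets above.

```bash
# Chat-style variant of the verification request (assumes the Qwen1.5-14B-Chat
# deployment and <your-api-key> placeholder from the example above).
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <your-api-key>" \
  -d '{
        "model": "Qwen1.5-14B-Chat",
        "messages": [{"role": "user", "content": "What is vLLM?"}],
        "max_tokens": 128
      }'
```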
@@ -785,17 +808,16 @@ docker run -itd \
     --restart always $DOCKER_IMAGE
   ```

-  Then you should start the docker on host that make sure you can visit vLLM backend serving.
+  Then you should start the docker on host that make sure you can visit vLLM backend serving.

 4. After installation, you can access Open WebUI at <http://localhost:3000>. Enjoy! 😄

 #### Serving with FastChat

 We can set up model serving using `IPEX-LLM` as backend using FastChat, the following steps gives an example of how to deploy a demo using FastChat.

+1. **Start the Docker Container**

-1. **Start the Docker Container**
-
   Run the following command to launch a Docker container with device access:

   ```bash
@@ -817,8 +839,9 @@ We can set up model serving using `IPEX-LLM` as backend using FastChat, the foll
   ```

 2. **Start the FastChat Service**
-
+
   Enter the container and start the FastChat service:
+
   ```bash
   #/bin/bash