@@ -240,17 +240,19 @@ Modify the `--load-in-low-bit` value to `fp6`, `fp8`, `fp8_e4m3` or `fp16`
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \
-  --model $model \
+  --model $model \
   --trust-remote-code \
-  --gpu-memory-utilization 0.75 \
-  --device xpu \
+  --block-size 8 \
+  --gpu-memory-utilization 0.9 \
+  --device xpu \
   --dtype float16 \
   --enforce-eager \
   --load-in-low-bit sym_int4 \
-  --max-model-len 4096 \
-  --max-num-batched-tokens 10240 \
-  --max-num-seqs 12 \
-  --tensor-parallel-size 1
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
+  --tensor-parallel-size 1 \
+  --disable-async-output-proc \
+  --distributed-executor-backend ray
 ```
 
 then run the following command to start the vLLM service
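Once the service has started, a quick sanity check is to query the OpenAI-compatible model list endpoint (a minimal sketch, assuming the server is reachable locally on the port 8000 configured above):

```bash
# List the models served by the vLLM OpenAI-compatible server;
# assumes the service started above is listening on localhost:8000.
curl http://localhost:8000/v1/models
```

The response should include the name passed via `--served-model-name`.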
@@ -678,8 +680,8 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --load-in-low-bit fp8 \
   --max-model-len 4096 \
   --max-num-batched-tokens 10240 \
-  --max-num-seqs 12 \
   --tensor-parallel-size 1 \
+  --distributed-executor-backend ray \
   --enable-lora \
   --lora-modules sql-lora=$SQL_LOARA
 ```
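For reference, a request can be routed to the adapter by using the name it was registered under with `--lora-modules` (a minimal sketch, assuming the server above listens on localhost:8000; the prompt text is an arbitrary placeholder):

```bash
# Completion request targeting the "sql-lora" adapter registered above;
# localhost:8000 and the prompt are assumptions for illustration.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "sql-lora",
        "prompt": "SELECT name FROM",
        "max_tokens": 32
    }'
```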
@@ -742,16 +744,37 @@ vLLM Serving can be deployed as a server that implements the OpenAI API protocol
 
 ```bash
 #!/bin/bash
-model="/llm/models/Meta-Llama-3.1-8B-Instruct"
-served_model_name="llama-3.1-8b"
-...
+model="/llm/models/Qwen1.5-14B-Chat"
+served_model_name="Qwen1.5-14B-Chat"
+
+#export SYCL_CACHE_PERSISTENT=1
+export CCL_WORKER_COUNT=4
+export FI_PROVIDER=shm
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+export CCL_ATL_SHM=1
+
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=0
+
+source /opt/intel/1ccl-wks/setvars.sh
+
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \
   --model $model \
-  ...
+  --trust-remote-code \
+  --gpu-memory-utilization 0.9 \
+  --device xpu \
+  --dtype float16 \
+  --enforce-eager \
+  --load-in-low-bit fp8 \
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
   --api-key <your-api-key> \
-  --tensor-parallel-size 2
+  --tensor-parallel-size 4 \
+  --distributed-executor-backend ray
 ```
 
 2. Send an HTTP request with the `api-key` header to verify that the model has been deployed successfully.
@@ -761,7 +784,7 @@ curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer <your-api-key>" \
   -d '{
-    "model": "llama-3.1-8b",
+    "model": "Qwen1.5-14B-Chat",
     "prompt": "San Francisco is a",
     "max_tokens": 128
   }'
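Since the deployed model is a chat model, the OpenAI-compatible chat endpoint can be exercised the same way (a sketch assuming the same server, port 8000, and api-key as above; the message content is an arbitrary example):

```bash
# Chat-style request against the same server; reuses the key
# passed via --api-key when the service was started.
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer <your-api-key>" \
    -d '{
        "model": "Qwen1.5-14B-Chat",
        "messages": [{"role": "user", "content": "What is San Francisco known for?"}],
        "max_tokens": 128
    }'
```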
@@ -785,17 +808,16 @@ docker run -itd \
   --restart always $DOCKER_IMAGE
 ```
 
-Then you should start the docker on host that make sure you can visit vLLM backend serving.
+Then start the Docker container on a host that can reach the vLLM backend serving endpoint.
 
 4. After installation, you can access Open WebUI at <http://localhost:3000>. Enjoy! 😄
 
 #### Serving with FastChat
 
 We can set up model serving with `IPEX-LLM` as the backend using FastChat; the following steps give an example of how to deploy a demo with FastChat.
 
+1. **Start the Docker Container**
 
-1. **Start the Docker Container**
-
    Run the following command to launch a Docker container with device access:
 
    ```bash
@@ -817,8 +839,9 @@ We can set up model serving using `IPEX-LLM` as backend using FastChat, the foll
    ```
 
 2. **Start the FastChat Service**
-
+
    Enter the container and start the FastChat service:
+
    ```bash
    #!/bin/bash
 