1 parent cb7b089 commit 8331875
docker/llm/serving/xpu/docker/vllm_offline_inference.py
@@ -54,6 +54,8 @@
           disable_async_output_proc=True,
           distributed_executor_backend="ray",
           max_model_len=2000,
+          trust_remote_code=True,
+          block_size=8,
           max_num_batched_tokens=2000)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
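For context, the constructor call in vllm_offline_inference.py ends up looking roughly like the sketch below. This is a minimal, self-contained reconstruction rather than the repo's exact script: the model path, prompts, sampling parameters, and the device argument are illustrative assumptions; only the engine arguments visible in the diff come from this commit.

# Minimal sketch of vLLM offline inference reflecting this commit.
# Assumptions: model path, prompts, sampling settings, and device="xpu"
# are illustrative; only the engine arguments shown in the diff are
# taken from the commit itself.
from vllm import LLM, SamplingParams

prompts = ["What is AI?"]  # hypothetical prompt
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="YOUR_MODEL_PATH",             # hypothetical model path
          device="xpu",                        # assumed for this XPU serving image
          disable_async_output_proc=True,
          distributed_executor_backend="ray",
          max_model_len=2000,
          trust_remote_code=True,              # added: allow custom model code from the hub
          block_size=8,                        # added: smaller KV-cache block size
          max_num_batched_tokens=2000)

# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)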