-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentrypoint.sh
52 lines (41 loc) · 1.35 KB
/
entrypoint.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Launch the vLLM OpenAI-compatible API server, configured through TGI-like
# environment variables.
#
# Environment variables (all optional):
#   NUM_SHARD             default: line count of `nvidia-smi --list-gpus`
#   MODEL_PATH            passed as --model (default: /repository)
#   MAX_MODEL_LEN         passed as --max-model-len; -1 (default) omits the flag
#   DISABLE_LOG_REQUESTS  any value other than "false" adds --disable-log-requests
#   DISABLE_LOG_STATS     any value other than "false" adds --disable-log-stats
#   DTYPE                 passed as --dtype unless it is "auto" (default)
#   ENFORCE_EAGER         any value other than "false" adds --enforce-eager
#   QUANTIZATION          passed as --quantization when non-empty
#   SEED                  passed as --seed unless it is 0 (default)
#   TENSOR_PARALLEL_SIZE  passed as --tensor-parallel-size (default: $NUM_SHARD)
#   TRUST_REMOTE_CODE     any value other than "false" adds --trust-remote-code
#
# The script has no visible shebang, so it sticks to POSIX sh constructs and
# works whether it is started with sh or bash.

# Set TGI like environment variables
NUM_SHARD=${NUM_SHARD:-$(nvidia-smi --list-gpus | wc -l)}
MODEL_PATH=${MODEL_PATH:-"/repository"}
# Doubled dash on purpose: `:-` is the default operator, the second `-` is the
# minus sign of the default value -1 (meaning "do not pass --max-model-len").
MAX_MODEL_LEN=${MAX_MODEL_LEN:--1}
DISABLE_LOG_REQUESTS=${DISABLE_LOG_REQUESTS:-"false"}
DISABLE_LOG_STATS=${DISABLE_LOG_STATS:-"false"}
DTYPE=${DTYPE:-"auto"}
ENFORCE_EAGER=${ENFORCE_EAGER:-"false"}
QUANTIZATION=${QUANTIZATION:-""}
SEED=${SEED:-0}
TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-$NUM_SHARD}
TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-"false"}

# Build the server command line in the positional parameters instead of a
# string passed to eval: each value stays exactly one argument, so spaces,
# quotes or shell metacharacters in the environment cannot be re-interpreted.
set -- python3 -m vllm.entrypoints.openai.api_server \
  --host 0.0.0.0 \
  --port 80 \
  --model "$MODEL_PATH" \
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"

# Append --max-model-len if its value is not -1. String comparison is used so a
# malformed value is forwarded (and rejected by the server's argument parser)
# rather than silently dropped by a failing numeric test.
if [ "$MAX_MODEL_LEN" != "-1" ]; then
  set -- "$@" --max-model-len "$MAX_MODEL_LEN"
fi

if [ "$ENFORCE_EAGER" != "false" ]; then
  set -- "$@" --enforce-eager
fi

if [ "$DISABLE_LOG_STATS" != "false" ]; then
  set -- "$@" --disable-log-stats
fi

if [ "$DISABLE_LOG_REQUESTS" != "false" ]; then
  set -- "$@" --disable-log-requests
fi

if [ "$DTYPE" != "auto" ]; then
  set -- "$@" --dtype "$DTYPE"
fi

if [ "$TRUST_REMOTE_CODE" != "false" ]; then
  set -- "$@" --trust-remote-code
fi

# Append --seed only for a non-default seed.
if [ "$SEED" != "0" ]; then
  set -- "$@" --seed "$SEED"
fi

if [ -n "$QUANTIZATION" ]; then
  set -- "$@" --quantization "$QUANTIZATION"
fi

# Execute the command
"$@"