Commit cd2f63f

[CI/CD] add neuron docker and ci test scripts (vllm-project#3571)
1 parent 87fa80c commit cd2f63f

6 files changed: +103 −4 lines changed

.buildkite/run-neuron-test.sh

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build -t neuron -f Dockerfile.neuron .

# Set up cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'

.buildkite/test-template.j2

Lines changed: 5 additions & 0 deletions
@@ -21,6 +21,11 @@ steps:
       queue: amd
     command: bash .buildkite/run-amd-test.sh
 
+  - label: "Neuron Test"
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+
   - label: "CPU Test"
     command: bash .buildkite/run-cpu-test.sh

Dockerfile.neuron

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..

CMD ["/bin/bash"]

setup.py

Lines changed: 2 additions & 1 deletion
@@ -204,7 +204,8 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed
+    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
+                                                     False)
 
 
 def _is_cpu() -> bool:
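
Editor's note: os.environ.get returns the variable's raw string value when it is set, so even VLLM_BUILD_WITH_NEURON=0 makes the returned expression truthy. A stricter opt-in check (a sketch, not what this commit does) would compare against an explicit value:

import os

# Truthy only for an explicit opt-in; "0" or unset stays False.
build_with_neuron = os.environ.get("VLLM_BUILD_WITH_NEURON", "0") == "1"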

vllm/engine/async_llm_engine.py

Lines changed: 2 additions & 2 deletions
@@ -335,8 +335,8 @@ def from_engine_args(
         engine_config = engine_args.create_engine_config()
 
         if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
-                                      "async engine yet.")
+            from vllm.executor.neuron_executor import NeuronExecutorAsync
+            executor_class = NeuronExecutorAsync
         elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
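
Editor's note: with NeuronExecutorAsync wired in, the async engine behind the API server can now be constructed on Neuron devices. A minimal sketch of the path this unblocks, reusing the flag values from the CI script above (assumes a Neuron host with both devices available):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Mirrors the flags passed to vllm.entrypoints.api_server in run-neuron-test.sh.
engine_args = AsyncEngineArgs(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    max_model_len=128,
    block_size=128,
    device="neuron",
    tensor_parallel_size=2,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)  # previously raised NotImplementedError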

vllm/executor/neuron_executor.py

Lines changed: 21 additions & 1 deletion
@@ -1,9 +1,10 @@
 from typing import Dict, List, Set, Tuple
 
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async
 
 logger = init_logger(__name__)
@@ -73,3 +74,22 @@ def check_health(self) -> None:
         # NeuronExecutor will always be healthy as long as
         # it's running.
         return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return
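
Editor's note: make_async, imported above from vllm.utils, turns the blocking driver_worker.execute_model call into an awaitable by running it in a thread pool, keeping the event loop responsive. Roughly, as a sketch of the idea rather than vllm's exact helper:

import asyncio
from functools import partial
from typing import Callable

def make_async(func: Callable):
    """Wrap a blocking callable so it can be awaited; each call runs in the
    event loop's default thread-pool executor."""
    async def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, partial(func, *args, **kwargs))
    return _async_wrapper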
