# llm/vicuna/serve.yaml

# Hardware requested for the node that builds and serves the model.
resources:
  # One A100 GPU (written as <accelerator name>:<count>).
  accelerators: A100:1
  # Disk size; presumably in GB (SkyPilot's unit) - confirm against the
  # SkyPilot version in use.
  disk_size: 1024
  # Note: disk_tier is not available in skypilot<=0.2.5; using it requires
  # installing SkyPilot from source.
  disk_tier: high

setup: |
  # Reuse the `chatbot` conda environment if it already exists; otherwise
  # create it with Python 3.9 and activate it.
  conda activate chatbot
  if [ $? -ne 0 ]; then
    conda create -n chatbot python=3.9 -y
    # Stop here rather than installing into whatever environment is active.
    conda activate chatbot || exit 1
  fi

  # From this point on, abort at the first failing command so a broken
  # install is reported directly instead of surfacing later as an import
  # error while merging the weights.
  set -e

  # MODEL_SIZE selects which checkpoints are fetched below; fail early with
  # a clear message if it was not provided.
  : "${MODEL_SIZE:?MODEL_SIZE must be set (e.g. 7)}"

  # Install dependencies
  pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
  pip install git+https://github.com/lm-sys/FastChat.git
  # transformers is pinned to a specific commit.
  pip install git+https://github.com/huggingface/transformers.git@41a2f3529c6b56866c317031375ffd3e7b8bea01

  # Apply the Vicuna v1.1 delta to the base LLaMA weights and write the
  # merged model to ~/vicuna-${MODEL_SIZE}b, which the run step serves.
  echo "Downloading model..."
  python3 -m fastchat.model.apply_delta \
    --base huggyllama/llama-${MODEL_SIZE}b \
    --target ~/vicuna-${MODEL_SIZE}b \
    --delta lmsys/vicuna-${MODEL_SIZE}b-delta-v1.1

run: |
  conda activate chatbot

  # Wait until a background FastChat service reports that its HTTP server
  # is listening, instead of guessing with a fixed sleep (loading model
  # weights can take far longer than a few seconds).
  # Args: <display name> <pid> <log file> <timeout in seconds>
  # Returns non-zero if the process died during startup. On timeout it only
  # warns and returns zero, so the launch proceeds as it did with a sleep.
  wait_until_serving() {
    local name=$1 pid=$2 log=$3 timeout=$4 waited=0
    until grep -q 'Uvicorn running on' "$log" 2>/dev/null; do
      if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: ${name} exited during startup; see ${log}" >&2
        return 1
      fi
      if [ "$waited" -ge "$timeout" ]; then
        echo "WARNING: ${name} not confirmed ready after ${timeout}s; continuing" >&2
        return 0
      fi
      sleep 1
      waited=$((waited + 1))
    done
  }

  # The controller must be up before the worker starts.
  # -u keeps Python output unbuffered so the readiness line reaches the log
  # promptly.
  echo 'Starting controller...'
  python3 -u -m fastchat.serve.controller > ~/controller.log 2>&1 &
  wait_until_serving 'controller' $! ~/controller.log 60 || exit 1

  # Serve the merged weights produced by the setup step.
  echo 'Starting model worker...'
  python3 -u -m fastchat.serve.model_worker --model-path ~/vicuna-${MODEL_SIZE}b > ~/model_worker.log 2>&1 &
  wait_until_serving 'model worker' $! ~/model_worker.log 600 || exit 1

  # Foreground process: the web UI. Output is mirrored to ~/gradio.log.
  echo 'Starting gradio server...'
  python3 -m fastchat.serve.gradio_web_server --share | tee ~/gradio.log

# Environment variables exported to both the setup and run scripts.
envs:
  # Parameter count (in billions) of the model to build and serve; it is
  # substituted into the base, delta and target model names. Quoted because
  # environment variable values are strings.
  MODEL_SIZE: "7"