more-tests.yml
name: Run parallel prefill
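# Runs generate and AOTI export/run across prefill, compile, dtype, and SDPA-backend configurations.
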
on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
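  # Run generate across dtypes and sampling settings, in eager, compiled, and sequential-prefill configurations.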
  test-cuda:
    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.4"
      timeout: 60
      script: |
        set -xeou pipefail

        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
echo "::group::Download checkpoints"
# Install requirements
./install/install_requirements.sh cuda
pip3 list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
echo "::endgroup::"
echo "::group::Download checkpoints"
mkdir -p checkpoints/stories15M
pushd checkpoints/stories15M
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
popd
echo "::endgroup::"
echo "::group::Run inference"
export MODEL_DIR=checkpoints/stories15M/
export MODEL_PATH=${MODEL_DIR}/stories15M.pt
export MODEL_NAME=stories15M

        for DTYPE in bfloat16 float16 float32; do
          ###################################################################
          # group with different temperatures and top-k values
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0.9
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 1.0
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 100
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 200
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 500
          ###################################################################
          # same group, with compile and compile-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0 --compile --compile-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0.9 --compile --compile-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 1.0 --compile --compile-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 100 --compile --compile-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 200 --compile --compile-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 500 --compile --compile-prefill
          ###################################################################
          # same group, with sequential prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0 --sequential-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0.9 --sequential-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 1.0 --sequential-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 100 --sequential-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 200 --sequential-prefill
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 500 --sequential-prefill
          ###################################################################
          # same group, with sequential prefill and compile
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0 --sequential-prefill --compile
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 0.9 --sequential-prefill --compile
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --temperature 1.0 --sequential-prefill --compile
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 100 --sequential-prefill --compile
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 200 --sequential-prefill --compile
          python torchchat.py generate --checkpoint-path ${MODEL_PATH} --device cpu --dtype ${DTYPE} --top-k 500 --sequential-prefill --compile
        done
echo "tests complete"
echo "******************************************"
echo "::endgroup::"
  test-sdpa-backends-export:
    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.4"
      timeout: 60
      script: |
        set -xeou pipefail

        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
echo "::group::Download checkpoints"
# Install requirements
./install/install_requirements.sh cuda
pip3 list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
echo "::endgroup::"
echo "::group::Download checkpoints"
mkdir -p checkpoints/stories15M
pushd checkpoints/stories15M
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
popd
echo "::endgroup::"
echo "::group::Run inference"
export MODEL_DIR=checkpoints/stories15M/
export MODEL_PATH=${MODEL_DIR}/stories15M.pt
export MODEL_NAME=stories15M
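
        # Build the native runner (cmake-out/aoti_run) used for the exported model below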
        ./torchchat/utils/scripts/build_native.sh aoti

        for DEVICE in cpu cuda; do
          # Depending on how the parameter passing works, we may only be able to use bfloat16 for aoti_run, similar to runner-cuda-dtype.yml
          # (although the runner environment should not have an opinion about what we use in the artifact, and we might suitably abstract that)
          for DTYPE in bfloat16 float16 float32; do
            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
              echo "***************************************************************"
              echo "*** $DEVICE $DTYPE $SDPA"
              ###################################################################
              # Export DSO and run with Python
              python torchchat.py export --output-dso dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
              python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time"
              ###################################################################
              # Export AOTI package and run with aoti_run
              python torchchat.py export --output-aoti /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
              ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
              ###################################################################
            done
          done
        done

        echo "tests complete"
        echo "******************************************"
        echo "::endgroup::"