# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
name: Release Llama 3.1 8B FP8 Benchmarking Tests from IREE TOM

on:
  workflow_dispatch:
  schedule:
    # Run every 6 hours, every day.
    - cron: "0 */6 * * *"

permissions:
  contents: write

concurrency:
  # This workflow only triggers on schedule and workflow_dispatch, so
  # `github.event.number` is never set and the group key resolves to the
  # commit SHA. The workflow name is prepended to avoid conflicts between
  # different workflows running on the same commit.
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
  cancel-in-progress: true
jobs:
  test_llama_large:
    # Only run the scheduled job in the upstream repo; any repo can still
    # trigger manually via workflow_dispatch.
    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
    timeout-minutes: 240
    name: "Release: Llama 8B FP8 Benchmarking Tests from IREE TOM"
    strategy:
      matrix:
        # Quoted so YAML does not parse the version as the float 3.11.
        version: ["3.11"]
      fail-fast: false
    runs-on: linux-mi325-1gpu-ossci-nod-ai
    defaults:
      run:
        shell: bash
    env:
      VENV_DIR: ${{ github.workspace }}/.venv
      # Gates the "Run Offline serving" steps below; flip to ENABLED to run them.
      OFFLINE_SERVING: DISABLED
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Get Current Date And Check if it is serving day
        id: date
        run: |
          echo "date=$(date -u +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
          # ISO weekday 2 == Tuesday.
          # NOTE(review): IS_SERVING_DAY is exported but not read by any later
          # step in this workflow — confirm whether it is still needed.
          if [ "$(date -u +%u)" = "2" ]; then
            echo "IS_SERVING_DAY=true" >> "$GITHUB_ENV"
          else
            echo "IS_SERVING_DAY=false" >> "$GITHUB_ENV"
          fi
      - name: "Setting up Python"
        id: setup_python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: ${{ matrix.version }}
      - name: Create Python venv
        run: |
          python -m venv "${VENV_DIR}"
          source "${VENV_DIR}/bin/activate"
      - name: Install pip deps and build IREE
        run: |
          bash scripts/setenv.sh --tom
          mkdir -p output_artifacts
          # Record tool versions alongside the benchmark artifacts.
          pip freeze | grep -E 'iree|amdshark' > "$(pwd)/output_artifacts/version.txt"
          cd iree
          echo -n "IREE " >> "$(pwd)/../output_artifacts/version.txt"
          git log -1 --pretty=%H >> "$(pwd)/../output_artifacts/version.txt"
          cd -
      - name: Run export and compile
        id: export_compile
        run: |
          export PATH=$PWD/iree-build/tools/:$PATH
          export PYTHONPATH=$PWD/iree-build/compiler/bindings/python:$PWD/iree-build/runtime/bindings/python
          bash scripts/export_and_compile.sh \
            --irpa /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
            --dtype fp8 --bs-prefill 4 --bs-decode 4 2>&1 | tee "$(pwd)/output_artifacts/export_and_compilation.log"
      - name: Validate VMFB Responses
        id: validate_vmfb
        if: ${{ steps.export_compile.outcome == 'success' }}
        # Numerics validation is informational; do not fail the job on it.
        continue-on-error: true
        run: |
          echo "Validate Responses"
          export PATH=$PWD/iree-build/tools/:$PATH
          export PYTHONPATH=$PWD/iree-build/compiler/bindings/python:$PWD/iree-build/runtime/bindings/python
          bash scripts/validate_numerics.sh \
            --irpa /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
            --vmfb $(pwd)/output_artifacts/output.vmfb \
            --config $(pwd)/output_artifacts/config_attn.json \
            --tokenizer /amdshark-dev/8b/instruct/tokenizer.json \
            --tokenizer_config /amdshark-dev/8b/instruct/tokenizer_config.json \
            --steps 64 \
            --kv-cache-dtype float8_e4m3fnuz | tee output_artifacts/run_llm_vmfb.log
      - name: Run IREE Benchmark Module
        if: ${{ steps.export_compile.outcome == 'success' }}
        run: |
          export PATH=$PWD/iree-build/tools/:$PATH
          export PYTHONPATH=$PWD/iree-build/compiler/bindings/python:$PWD/iree-build/runtime/bindings/python
          bash scripts/run_iree_benchmark.sh --bs-prefill 4 --bs-decode 4 \
            --parameters /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
            --model llama-8B-FP8 2>&1 | tee "$(pwd)/output_artifacts/iree_benchmark.log"
          python scripts/utils.py \
            --combine-json $(pwd)/output_artifacts/benchmark_module \
            --output-json $(pwd)/output_artifacts/consolidated_benchmark.json \
            --append-isl
      - name: Run online serving
        if: ${{ steps.export_compile.outcome == 'success' }}
        run: |
          export PATH=$PWD/iree-build/tools/:$PATH
          export PYTHONPATH=$PWD/iree-build/compiler/bindings/python:$PWD/iree-build/runtime/bindings/python
          cd shortfin
          export ROCR_VISIBLE_DEVICES=0
          bash ../scripts/run_serving.sh \
            --irpa /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
            --tokenizer_json /amdshark-dev/8b/instruct/tokenizer.json \
            --vmfb ../output_artifacts/output.vmfb \
            --model_config ../output_artifacts/config_attn.json \
            --port 8900 | tee ../output_artifacts/serving.log
          cd ..
      - name: "Run Offline serving :chat"
        if: ${{ env.OFFLINE_SERVING == 'ENABLED' }}
        run: |
          # Chat mode only runs on Mondays (ISO weekday 1).
          if [ "$(date +%u)" -eq 1 ]; then
            cd shortfin
            export ROCR_VISIBLE_DEVICES=0
            bash ../scripts/run_offline_serving.sh \
              --irpa /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
              --tokenizer_json /amdshark-dev/8b/instruct/tokenizer.json \
              --vmfb ../output_artifacts/output.vmfb \
              --model_config ../output_artifacts/config_attn.json \
              --mode chat
            cd ..
            python scripts/utils.py \
              --combine-json $(pwd)/output_artifacts/chat \
              --output-json $(pwd)/output_artifacts/consolidated_chat_serving.json
          fi
      - name: "Run Offline serving :reasoning"
        if: ${{ env.OFFLINE_SERVING == 'ENABLED' }}
        run: |
          # Reasoning mode only runs on Wednesdays (ISO weekday 3).
          if [ "$(date +%u)" -eq 3 ]; then
            cd shortfin
            export ROCR_VISIBLE_DEVICES=0
            bash ../scripts/run_offline_serving.sh \
              --irpa /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
              --tokenizer_json /amdshark-dev/8b/instruct/tokenizer.json \
              --vmfb ../output_artifacts/output.vmfb \
              --model_config ../output_artifacts/config_attn.json \
              --mode reasoning
            cd ..
            python scripts/utils.py \
              --combine-json $(pwd)/output_artifacts/reasoning \
              --output-json $(pwd)/output_artifacts/consolidated_reasoning_serving.json
          fi
      - name: "Run Offline serving :summary"
        if: ${{ env.OFFLINE_SERVING == 'ENABLED' }}
        run: |
          # Summary mode only runs on Fridays (ISO weekday 5).
          if [ "$(date +%u)" -eq 5 ]; then
            cd shortfin
            export ROCR_VISIBLE_DEVICES=0
            bash ../scripts/run_offline_serving.sh \
              --irpa /amdshark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
              --tokenizer_json /amdshark-dev/8b/instruct/tokenizer.json \
              --vmfb ../output_artifacts/output.vmfb \
              --model_config ../output_artifacts/config_attn.json \
              --mode summary
            cd ..
            python scripts/utils.py \
              --combine-json $(pwd)/output_artifacts/summary \
              --output-json $(pwd)/output_artifacts/consolidated_summary_serving.json
          fi
      - name: Upload log files
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
          name: llama-logs
          path: |
            output_artifacts/consolidated*.json
            output_artifacts/*.log
            output_artifacts/version.txt
      - name: Cleanup output artifacts
        if: always()
        run: |
          rm -rf output_artifacts
          test ! -d output_artifacts && echo "Output artifacts are removed"
# New job to push logs to amd-shark-ai-reports repository
push_logs:
name: "Push log llama 8B FP8"
needs: [ test_llama_large ]
if: always()
runs-on: ubuntu-24.04
steps:
- name: Download log artifacts
uses: actions/download-artifact@v4
with:
name: llama-logs
path: logs
- name: Checkout Target Repo
if: always()
uses: actions/checkout@v4
with:
repository: nod-ai/amd-shark-ai-reports
token: ${{ secrets.AMD_SHARK_AI_GITHUB_TOKEN }}
path: amd-shark-ai-reports
- name: Push artifacts
if: always()
run: |
git config --global user.name "GitHub Actions Bot"
git config --global user.email ""
date=$(date -u +'%Y-%m-%d-%H'h)
mkdir -p "amd-shark-ai-reports/$date/llama_3.1-8b-fp8"
cp -v logs/*.json "amd-shark-ai-reports/$date/llama_3.1-8b-fp8" || true
cp -v logs/*.log "amd-shark-ai-reports/$date/llama_3.1-8b-fp8" || true
cp -v logs/version.txt "amd-shark-ai-reports/$date/llama_3.1-8b-fp8"
cd amd-shark-ai-reports
git pull
git add $date
git commit -m "Add CI Llama 8B FP8 logs on $(date -u +'%Y-%m-%d-%H'h)"
git push origin main
rm -rf ../logs