# SimpleFSDP 8 GPU Integration Tests #836
# Workflow display name.
name: SimpleFSDP 8 GPU Integration Tests

# Triggers: pushes to main and pull requests that touch the SimpleFSDP
# experiment or this workflow file, plus a recurring schedule.
on:
  push:
    branches: [main]
    paths:
      - 'torchtitan/experiments/simple_fsdp/**'
      - '.github/workflows/integration_test_8gpu_simple_fsdp.yaml'
  pull_request:
    paths:
      - 'torchtitan/experiments/simple_fsdp/**'
      - '.github/workflows/integration_test_8gpu_simple_fsdp.yaml'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'
# Concurrency group key: on refs/heads/main it is built from the run number,
# on every other ref from the ref itself. With cancel-in-progress enabled, a
# newer run in the same group cancels the one still in progress.
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
# Default shell for `run` steps: a login bash (-l) that exits on the first
# failing command (-e) and treats a failure anywhere in a pipeline as a
# failure of the whole pipeline (-o pipefail).
defaults:
  run:
    shell: bash -l -eo pipefail {0}
jobs:
  # Runs the SimpleFSDP integration tests and numerics unit tests on 8 GPUs by
  # delegating to the reusable Linux job from pytorch/test-infra.
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        # `|| true` keeps a failing nvidia-smi from aborting the script under `set -e`.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        # -p: do not fail (and abort under `set -e`) if the directory already exists.
        mkdir -p artifacts-to-be-uploaded
        python -m torchtitan.experiments.simple_fsdp.tests.integration_tests artifacts-to-be-uploaded --ngpu 8

        # Run the numerics unit tests of SimpleFSDP
        torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py -v