# Workflow file captured from run of PR #6410 — "feat: add planning result logging (#4022)"
# (GitHub web-page chrome from the "Workflow file for this run" view removed)
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: CUDA Unit Test CI

on:
  push:
    branches:
      # only run tests on main branch & nightly; release should be triggered manually
      - nightly
      - main
    # doc/config-only changes (and the other per-letter workflow files) don't
    # need a GPU test run
    paths-ignore:
      - "docs/*"
      - "third_party/*"
      - ".gitignore"
      - "*.md"
      - ".github/workflows/[bcdprv]*.yml"
      - ".github/workflows/unittest_ci_cpu.yml"
      - ".github/scripts/*.sh"
      - ".github/scripts/*.py"
  pull_request:
    paths-ignore:
      - "docs/*"
      - "third_party/*"
      - ".gitignore"
      - "*.md"
      - ".github/workflows/[bcdprv]*.yml"
      - ".github/workflows/unittest_ci_cpu.yml"
      - ".github/scripts/*.sh"
      - ".github/scripts/*.py"
  workflow_dispatch:
    inputs:
      channel:
        description: "Channel to use for torch and fbgemm"
        required: true
        type: choice
        options:
          - release
          - nightly
          - test
jobs:
  unittest_ci_gpu:
    strategy:
      # run every matrix leg to completion even if one fails
      fail-fast: false
      matrix:
        cuda-tag: ["cu126", "cu128", "cu129", "cu130"]
        os:
          - linux.g5.12xlarge.nvidia.gpu
        python:
          - version: "3.10"
            tag: "py310"
          - version: "3.11"
            tag: "py311"
          - version: "3.12"
            tag: "py312"
          - version: "3.13"
            tag: "py313"
          - version: "3.14"
            tag: "py314"
          # second 3.14 leg: the free-threaded (no-GIL) build
          - version: "3.14"
            tag: "py314"
            free_threaded: true
        # event-type flags used only to drive the excludes below
        is_pr:
          - ${{ github.event_name == 'pull_request' }}
        is_main_push: # for main branch
          - ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
        exclude:
          # pull requests: drop cu126/cu128/cu129 entirely, and all cu130 legs
          # except Python 3.14 (both GIL and free-threaded variants)
          - is_pr: true
            cuda-tag: "cu126"
          - is_pr: true
            cuda-tag: "cu128"
          - is_pr: true
            cuda-tag: "cu129"
          - is_pr: true
            cuda-tag: "cu130"
            python:
              version: "3.10"
          - is_pr: true
            cuda-tag: "cu130"
            python:
              version: "3.11"
          - is_pr: true
            cuda-tag: "cu130"
            python:
              version: "3.12"
          - is_pr: true
            cuda-tag: "cu130"
            python:
              version: "3.13"
          # main-branch pushes: thin the cuda × python cross-product
          - is_main_push: true
            cuda-tag: "cu126"
            python:
              version: "3.11"
          - is_main_push: true
            cuda-tag: "cu126"
            python:
              version: "3.12"
          - is_main_push: true
            cuda-tag: "cu126"
            python:
              version: "3.13"
          - is_main_push: true
            cuda-tag: "cu128"
            python:
              version: "3.10"
          - is_main_push: true
            cuda-tag: "cu128"
            python:
              version: "3.12"
          - is_main_push: true
            cuda-tag: "cu128"
            python:
              version: "3.14"
          - is_main_push: true
            cuda-tag: "cu129"
            python:
              version: "3.11"
          - is_main_push: true
            cuda-tag: "cu129"
            python:
              version: "3.13"
          - is_main_push: true
            cuda-tag: "cu129"
            python:
              version: "3.14"
              free_threaded: true
    # delegate runner provisioning to the shared pytorch/test-infra GPU workflow
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    with:
      runner: ${{ matrix.os }}
      timeout: 60
      script: |
        ldd --version
        # free-threaded legs need the python-freethreading conda package
        if [[ "${{ matrix.python.free_threaded }}" == "true" ]]; then
          conda create -y --name build_binary python-freethreading=${{ matrix.python.version }}
        else
          conda create -y --name build_binary python=${{ matrix.python.version }}
        fi
        conda run -n build_binary \
          python -c "import sys; print(f'python GIL enabled: {sys._is_gil_enabled()}') if sys.version >= '3.13' else print('python GIL enabled: true')"
        conda info
        python --version
        conda run -n build_binary python --version
        # pick the wheel index: explicit release/test channel, or nightly when
        # no channel input is given (push / pull_request events)
        if [[ "${{ inputs.channel }}" = "release" ]]; then
          index_url=https://download.pytorch.org/whl/${{ matrix.cuda-tag }}
        elif [ -z "${{ inputs.channel }}" ]; then
          index_url=https://download.pytorch.org/whl/nightly/${{ matrix.cuda-tag }}
        else
          index_url=https://download.pytorch.org/whl/${{ inputs.channel }}/${{ matrix.cuda-tag }}
        fi
        echo "index_url: $index_url"
        if [[ "${{ matrix.python.version }}" = "3.14" ]]; then
          # temporary workaround for torch package issue in python 3.14
          conda run -n build_binary pip install packaging
        fi
        conda run -n build_binary \
          pip install torch --index-url $index_url
        conda run -n build_binary \
          python -c "import torch; print(torch.__version__)"
        echo "torch succeeded"
        conda run -n build_binary \
          python -c "import torch.distributed"
        # fix: this success message previously appeared after the torchrec
        # import check instead of here, after the torch.distributed check
        echo "torch.distributed succeeded"
        conda run -n build_binary \
          pip install fbgemm-gpu --index-url $index_url
        conda run -n build_binary \
          python -c "import fbgemm_gpu; print(fbgemm_gpu.__version__)"
        echo "fbgemm_gpu succeeded"
        conda run -n build_binary \
          pip install -r requirements.txt
        conda run -n build_binary \
          python setup.py bdist_wheel \
          --python-tag=${{ matrix.python.tag }}
        conda run -n build_binary \
          python -c "import torchrec"
        echo "torchrec succeeded"
        conda run -n build_binary \
          python -c "import numpy"
        echo "numpy succeeded"
        conda run -n build_binary \
          pip install pytest
        conda run -n build_binary \
          python -m pytest torchrec -v -s \
          -W ignore::pytest.PytestCollectionWarning \
          --continue-on-collection-errors \
          --ignore=torchrec/distributed/tests/test_comm.py \
          --ignore=torchrec/distributed/tests/test_infer_shardings.py \
          --ignore=torchrec/distributed/tests/test_keyed_jagged_tensor_pool.py \
          --ignore=torchrec/distributed/tests/test_pt2_multiprocess.py \
          --ignore=torchrec/distributed/tests/test_pt2.py \
          --ignore=torchrec/distributed/tests/test_quant_model_parallel.py \
          --ignore=torchrec/distributed/tests/test_quant_pruning.py \
          --ignore=torchrec/distributed/tests/test_quant_sequence_model_parallel.py \
          --ignore=torchrec/distributed/tests/test_cache_prefetch.py \
          --ignore=torchrec/distributed/tests/test_fp_embeddingbag_single_rank.py \
          --ignore=torchrec/distributed/tests/test_infer_utils.py \
          --ignore=torchrec/distributed/tests/test_fx_jit.py \
          --ignore=torchrec/distributed/tests/test_model_parallel_hierarchical.py \
          --ignore-glob=**/test_utils/ \
          --ignore-glob='torchrec/metrics/' \
          --ignore-glob='*test_train_pipeline*' \
          --ignore-glob='torchrec/distributed/tests/test_model_parallel_gloo*' \
          --ignore-glob='torchrec/inference/inference_legacy/tests*' \
          --ignore-glob='*test_model_parallel_nccl*' \
          -k "not _disabled_in_oss_compatibility"
# Cancel in-flight runs for the same workflow + PR (or commit) so only the
# latest run proceeds; workflow_dispatch runs get their own group.
concurrency:
  # NOTE(review): the original group also interpolated `${{ inputs.repository }}`,
  # but this workflow declares no `repository` input (only `channel`), so that
  # segment always expanded to the empty string; it has been dropped.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true