# CUDA Nightly
# Workflow file for run: CUDA Nightly #69

# Nightly workflow: installs PyTorch and the project, runs the test suite,
# then runs a series of best-effort benchmarks and uploads whatever JSON and
# Markdown output they produced as build artifacts.
name: CUDA Nightly

on:
  schedule:
    # Once a day at 06:00 (GitHub evaluates cron schedules in UTC).
    - cron: "0 6 * * *"
  # Also allow manual runs from the Actions tab.
  workflow_dispatch:

# Least privilege: the steps below only check out the repository and upload
# artifacts, so a read-only token is sufficient.
permissions:
  contents: read

jobs:
  cuda-tests-bench:
    # NOTE(review): standard GitHub-hosted `ubuntu-latest` runners presumably
    # have no NVIDIA GPU, in which case the "Show CUDA availability" step
    # reports False and every `--device cuda` benchmark below fails (and is
    # tolerated). Confirm, and switch to a GPU-backed runner label if this job
    # is meant to exercise CUDA.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: "3.12"

      - name: Install PyTorch (prefer CUDA if available)
        run: |
          python -m pip install --upgrade pip wheel
          # Try the CUDA 12.1 wheel index first. The default-index install
          # runs only if that command fails; it is not triggered merely by
          # the runner lacking a GPU.
          pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121 || pip install torch torchvision

      - name: Install project dependencies
        run: |
          # Best-effort editable install. The extras spec is quoted so the
          # shell cannot glob-expand `[test]`, and a failure is surfaced as a
          # warning annotation instead of being silently discarded.
          pip install -e ".[test]" || echo "::warning::pip install -e .[test] failed; continuing without the editable install"
          # Optional requirement files: installed only when present.
          if [ -f requirements/requirements-medical.txt ]; then pip install -r requirements/requirements-medical.txt; fi
          if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi

      - name: Show CUDA availability
        run: |
          python - <<'PY'
          import torch
          print("torch:", torch.__version__)
          print("cuda_available:", torch.cuda.is_available())
          print("cuda_device_count:", torch.cuda.device_count())
          PY

      # The test suite is blocking: a failure here fails the job and skips
      # the benchmark and upload steps that follow.
      - name: Run test suite
        run: pytest -q

      - name: Run quick benchmarks sweep
        run: |
          mkdir -p benchmark_results
          # Non-fatal: a failure is reported as a warning annotation.
          python benchmarks/run_benchmarks.py \
            --models biobert,clinicalbert \
            --datasets benchmarks/datasets/mimic_notes_sample.jsonl,benchmarks/datasets/pubmed_sample.jsonl \
            --batch-sizes 1 4 \
            --seq-lengths 128 \
            --iterations 10 \
            --device cuda \
            --output benchmark_results/ci_quick.json || echo "::warning::Quick benchmarks sweep failed (non-blocking)"

      - name: Depthwise CUDA microbench (non-blocking)
        run: |
          # Run Triton depthwise vs eager microbenchmarks on CUDA (if available). Non-fatal.
          python benchmarks/benchmark_imaging.py \
            --device cuda --conv-type 2d --in-ch 32 --batch 8 \
            --depthwise-bench --depthwise-bench-iters 30 \
            --depthwise-bench-sizes 8x128x128,32x256x256,64x512x512 || echo "::warning::Depthwise CUDA microbench failed (non-blocking)"

      - name: Softmax×V CUDA microbench (non-blocking)
        run: |
          # Run gated Triton softmax×V vs eager benchmark on CUDA (if available). Non-fatal.
          mkdir -p benchmark_results
          MEDVLLM_ENABLE_TRITON_SOFTMAXV=1 \
          python benchmarks/benchmark_attention.py \
            --device cuda --dtype bf16 \
            --seq 512 --heads 8 --dim 64 --iters 30 \
            --attn-softmaxv-bench --enable-triton-softmaxv \
            --save benchmark_results/attn_softmaxv_ci.json || echo "::warning::Softmax-V CUDA microbench failed (non-blocking)"

      - name: Separable 3D CUDA microbench (non-blocking)
        run: |
          # Run depthwise separable 3D Triton vs eager microbenchmark on CUDA (if available). Non-fatal.
          mkdir -p benchmark_results
          MEDVLLM_ENABLE_TRITON_SEP3D=1 \
          python benchmarks/benchmark_separable_conv3d.py || echo "::warning::Separable 3D CUDA microbench failed (non-blocking)"

      - name: Run training performance benchmark
        run: |
          # Create the output directory here too, so this step does not depend
          # on an earlier benchmark step having created it.
          mkdir -p benchmark_results
          # Non-fatal: a failure is reported as a warning annotation.
          python benchmarks/benchmark_training.py \
            --epochs 1 \
            --batch-size 4 \
            --seq-length 64 \
            --dataset-size 32 \
            --hidden-dim 64 \
            --vocab-size 1024 \
            --num-classes 8 \
            --device cuda \
            --use-real-adapter \
            --adapter biobert \
            --dataset-file benchmarks/datasets/mimic_notes_sample.jsonl \
            --output benchmark_results/train_ci_quick.json || echo "::warning::Training performance benchmark failed (non-blocking)"

      - name: Upload benchmark artifacts
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: benchmark_results/*.json
          # Every benchmark above is best-effort, so there may be nothing to
          # upload; warn rather than fail in that case.
          if-no-files-found: warn

      - name: Generate Markdown benchmark report
        run: |
          mkdir -p reports
          # Non-fatal: a failure is reported as a warning annotation.
          python benchmarks/generate_report.py \
            --results-dir benchmark_results \
            --output reports/benchmark_summary_ci.md || echo "::warning::Benchmark report generation failed (non-blocking)"

      - name: Upload benchmark report artifact
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-report
          path: reports/*.md
          # Report generation is best-effort; warn rather than fail when no
          # report was produced.
          if-no-files-found: warn