4 changes: 4 additions & 0 deletions .github/runs-on.yml
@@ -34,6 +34,10 @@ runners:
cpu: 16
family: ["c6g", "c7g"]
image: linux-arm64
linux-arm64-gpu:
family: ["g5g.xlarge"]
image: linux-arm64
spot: "false"
windows-gpu:
family: ["g4dn.2xlarge"]
image: windows-amd64
43 changes: 38 additions & 5 deletions .github/workflows/cuda13.yml
@@ -36,6 +36,29 @@ jobs:
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
build/testxgboost python-package/dist/*.whl

build-cuda13-arm64:
name: Build CUDA 13 (ARM64)
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-arm64-cpu
- tag=cuda13-build-cuda13-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
- uses: actions/checkout@v4
with:
submodules: "true"
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: |
bash ops/pipeline/build-cuda13.sh
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13-arm64 \
python-package/dist/*.whl
test-cpp-cuda13:
name: Google Test (C++) with CUDA 13
needs: [build-cuda13]
@@ -62,12 +85,22 @@ jobs:
- run: |
bash ops/pipeline/test-cpp-cuda13.sh
test-python-cuda13:
name: Run Python tests with CUDA 13
needs: [build-cuda13]
name: Run Python tests with CUDA 13 (${{ matrix.description }})
needs: [build-cuda13, build-cuda13-arm64]
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-amd64-gpu
- tag=cuda13-test-python-cuda13
- runner=${{ matrix.runner }}
- tag=cuda13-test-python-cuda13-${{ matrix.description }}
strategy:
fail-fast: false
matrix:
include:
- description: amd64
runner: linux-amd64-gpu
artifact_from: build-cuda13
- description: arm64
runner: linux-arm64-gpu
artifact_from: build-cuda13-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
@@ -80,7 +113,7 @@ jobs:
run: |
python3 ops/pipeline/manage-artifacts.py download \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
--prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
--dest-dir wheelhouse \
*.whl
- name: Run Python tests
29 changes: 28 additions & 1 deletion .github/workflows/main.yml
@@ -94,6 +94,28 @@ jobs:
bash ops/pipeline/build-cuda.sh \
xgb-ci.gpu_build_rockylinux8_dev_ver enable-rmm

build-cuda-arm64:
name: Build CUDA + manylinux_2_28_aarch64 wheel
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-arm64-cpu
- tag=main-build-cuda-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
- uses: actions/checkout@v4
with:
submodules: "true"
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: bash ops/pipeline/build-cuda-arm64.sh
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda-arm64 \
python-package/dist/*.whl

build-python-wheels-arm64:
name: Build manylinux_2_28_aarch64 wheel
runs-on:
@@ -211,7 +233,7 @@ jobs:

test-python-wheel:
name: Run Python tests (${{ matrix.description }})
needs: [build-cuda, build-python-wheels-arm64]
needs: [build-cuda, build-cuda-arm64, build-python-wheels-arm64]
runs-on:
- runs-on
- runner=${{ matrix.runner }}
@@ -242,6 +264,11 @@ jobs:
suite: cpu-arm64
runner: linux-arm64-cpu
artifact_from: build-python-wheels-arm64
- description: gpu-arm64
image_repo: xgb-ci.gpu_aarch64
suite: gpu-arm64
runner: linux-arm64-gpu
artifact_from: build-cuda-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
21 changes: 21 additions & 0 deletions doc/contrib/ci.rst
@@ -198,6 +198,15 @@ Examples: useful tasks for local development
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \
-- ops/pipeline/build-cuda-impl.sh

* Build XGBoost with GPU support on Linux ARM64

.. code-block:: bash

export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
python3 ops/docker_run.py \
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8_aarch64:main \
-- ops/pipeline/build-cuda-impl.sh

* Run Python tests

.. code-block:: bash
@@ -217,6 +226,16 @@ Examples: useful tasks for local development
--use-gpus \
-- ops/pipeline/test-python-wheel-impl.sh gpu

* Run Python tests with GPU algorithm on Linux ARM64

.. code-block:: bash

export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
python3 ops/docker_run.py \
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_aarch64:main \
--use-gpus \
-- ops/pipeline/test-python-wheel-impl.sh gpu-arm64

* Run Python tests with GPU algorithm, with multiple GPUs

.. code-block:: bash
@@ -287,6 +306,8 @@ To opt into self-hosted runners (enabled by RunsOn), we use the following special
- tag=[unique tag that uniquely identifies the job in the GH Action workflow]

where the runner is defined in ``.github/runs-on.yml``.
ARM64 CUDA wheels are built on the ``linux-arm64-cpu`` runner; the corresponding
GPU tests run on the ``linux-arm64-gpu`` runner, which provisions an AWS Graviton
instance with an NVIDIA GPU (``g5g`` family).
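
For example, the ARM64 GPU test suite can be exercised locally through the same
entry point the workflow uses (a sketch; it assumes the ``xgb-ci.gpu_aarch64``
image and a wheel already downloaded into ``wheelhouse/``):

.. code-block:: bash

   # Same invocation as the test-python-wheel job with suite=gpu-arm64
   bash ops/pipeline/test-python-wheel.sh gpu-arm64 xgb-ci.gpu_aarch64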

===================================================================
The Lay of the Land: how CI pipelines are organized in the codebase
7 changes: 6 additions & 1 deletion doc/install.rst
@@ -67,7 +67,7 @@ Capabilities of binary wheels for each platform:
+=====================+=========+======================+
| Linux x86_64 | |tick| | |tick| |
+---------------------+---------+----------------------+
| Linux aarch64 | |cross| | |cross| |
| Linux aarch64 | |tick| | |cross| |
+---------------------+---------+----------------------+
| MacOS x86_64 | |cross| | |cross| |
+---------------------+---------+----------------------+
@@ -76,6 +76,11 @@
| Windows | |tick| | |cross| |
+---------------------+---------+----------------------+

Linux aarch64 wheels now ship with CUDA support: ``pip install xgboost`` on an
ARM64 machine with an NVIDIA GPU (for example, NVIDIA Jetson or a GPU-equipped
AWS Graviton instance) provides the same GPU functionality as the Linux x86_64
wheel. Multi-node, multi-GPU training is not yet supported by the aarch64 wheel.
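
A minimal way to confirm that the installed aarch64 wheel was built with CUDA
support (a sketch; the exact keys reported by ``xgboost.build_info()`` may vary
between releases):

.. code-block:: bash

   pip install xgboost
   # Expect True when the wheel ships the CUDA-enabled library
   python -c "import xgboost; print(xgboost.build_info()['USE_CUDA'])"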

Minimal installation (CPU-only)
*******************************
The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning.
75 changes: 75 additions & 0 deletions ops/pipeline/build-cuda-arm64.sh
@@ -0,0 +1,75 @@
#!/bin/bash
## Build XGBoost with CUDA for Linux ARM64

set -euo pipefail

if [[ -z "${GITHUB_SHA:-}" ]]
then
echo "Make sure to set environment variable GITHUB_SHA"
exit 1
fi

IMAGE_REPO="xgb-ci.gpu_build_rockylinux8_aarch64"
export USE_FEDERATED=1
export USE_RMM=0

source ops/pipeline/classify-git-branch.sh
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

WHEEL_TAG=manylinux_2_28_aarch64
BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"

echo "--- Build with CUDA (ARM64)"

if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
export BUILD_ONLY_SM75=1
else
export BUILD_ONLY_SM75=0
fi

set -x

python3 ops/docker_run.py \
--image-uri ${BUILD_IMAGE_URI} \
--run-args='-e BUILD_ONLY_SM75 -e USE_RMM -e USE_FEDERATED' \
-- ops/pipeline/build-cuda-impl.sh

echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
python3 ops/docker_run.py \
--image-uri ${MANYLINUX_IMAGE_URI} \
-- auditwheel repair --only-plat \
--plat ${WHEEL_TAG} python-package/dist/*.whl
python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \
wheelhouse/*.whl
mv -v wheelhouse/*.whl python-package/dist/
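# Sanity check: the repaired wheel must vendor libgomp.so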
if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then
echo "error: libgomp.so was not vendored in the wheel"
exit 1
fi

# Check size of wheel
pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl

echo "--- Generate meta info"
python3 ops/script/format_wheel_meta.py \
--wheel-path python-package/dist/*.whl \
--commit-hash ${GITHUB_SHA} \
--platform-tag ${WHEEL_TAG} \
--meta-path python-package/dist/

echo "--- Upload Python wheel"
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
python-package/dist/*.whl
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME} --make-public \
python-package/dist/meta.json
fi

18 changes: 16 additions & 2 deletions ops/pipeline/build-cuda13.sh
@@ -9,15 +9,29 @@ then
exit 1
fi

IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
export USE_RMM=0
export USE_FEDERATED=0

ARCH=$(uname -m)
case "${ARCH}" in
x86_64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
WHEEL_TAG=manylinux_2_28_x86_64
;;
aarch64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
WHEEL_TAG=manylinux_2_28_aarch64
;;
*)
echo "Unsupported architecture: ${ARCH}"
exit 1
;;
esac

source ops/pipeline/classify-git-branch.sh
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

WHEEL_TAG=manylinux_2_28_x86_64
BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"

15 changes: 14 additions & 1 deletion ops/pipeline/test-python-wheel-cuda13.sh
@@ -6,7 +6,20 @@ set -euo pipefail
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
ARCH=$(uname -m)
case "${ARCH}" in
x86_64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
;;
aarch64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
;;
*)
echo "Unsupported architecture: ${ARCH}"
exit 1
;;
esac

IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"

set -x
7 changes: 6 additions & 1 deletion ops/pipeline/test-python-wheel-impl.sh
@@ -13,7 +13,7 @@ suite="$1"

# Cannot set -u before Conda env activation
case "$suite" in
gpu|mgpu)
gpu|mgpu|gpu-arm64)
source activate gpu_test
;;
cpu)
@@ -42,6 +42,11 @@ case "$suite" in
python -c 'from cupy.cuda import jitify; jitify._init_module()'
pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu
;;
gpu-arm64)
echo "-- Run Python tests, using a single GPU (ARM64)"
python -c 'from cupy.cuda import jitify; jitify._init_module()'
pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu
;;
mgpu)
echo "-- Run Python tests, using multiple GPUs"
python -c 'from cupy.cuda import jitify; jitify._init_module()'
4 changes: 2 additions & 2 deletions ops/pipeline/test-python-wheel.sh
@@ -5,14 +5,14 @@ set -euo pipefail

if [[ "$#" -lt 2 ]]
then
echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [image_repo]"
echo "Usage: $0 {gpu|mgpu|gpu-arm64|cpu|cpu-arm64} [image_repo]"
exit 1
fi

suite="$1"
image_repo="$2"

if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]]
if [[ "$suite" == "gpu" || "$suite" == "mgpu" || "$suite" == "gpu-arm64" ]]
then
gpu_option="--use-gpus"
else
1 change: 1 addition & 0 deletions ops/script/release_artifacts.py
@@ -154,6 +154,7 @@ def download_python_wheels(branch: str, commit_hash: str, outdir: Path) -> None:
]
cu13_platforms = [
"manylinux_2_28_x86_64",
"manylinux_2_28_aarch64",
]
minimal_platforms = [
"win_amd64",