4 changes: 4 additions & 0 deletions .github/runs-on.yml
@@ -34,6 +34,10 @@ runners:
cpu: 16
family: ["c6g", "c7g"]
image: linux-arm64
linux-arm64-gpu:
family: ["g5g.xlarge"]
image: linux-arm64
spot: "false"
windows-gpu:
family: ["g4dn.2xlarge"]
image: windows-amd64
43 changes: 38 additions & 5 deletions .github/workflows/cuda13.yml
@@ -36,6 +36,29 @@ jobs:
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
build/testxgboost python-package/dist/*.whl

build-cuda13-arm64:
name: Build CUDA 13 (ARM64)
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-arm64-cpu
- tag=cuda13-build-cuda13-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
- uses: actions/checkout@v4
with:
submodules: "true"
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: |
bash ops/pipeline/build-cuda13.sh
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13-arm64 \
python-package/dist/*.whl
test-cpp-cuda13:
name: Google Test (C++) with CUDA 13
needs: [build-cuda13]
@@ -62,12 +85,22 @@ jobs:
- run: |
bash ops/pipeline/test-cpp-cuda13.sh
test-python-cuda13:
name: Run Python tests with CUDA 13
needs: [build-cuda13]
name: Run Python tests with CUDA 13 (${{ matrix.description }})
needs: [build-cuda13, build-cuda13-arm64]
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-amd64-gpu
- tag=cuda13-test-python-cuda13
- runner=${{ matrix.runner }}
- tag=cuda13-test-python-cuda13-${{ matrix.description }}
strategy:
fail-fast: false
matrix:
include:
- description: amd64
runner: linux-amd64-gpu
artifact_from: build-cuda13
- description: arm64
runner: linux-arm64-gpu
artifact_from: build-cuda13-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
@@ -80,7 +113,7 @@ jobs:
run: |
python3 ops/pipeline/manage-artifacts.py download \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
--prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
--dest-dir wheelhouse \
*.whl
- name: Run Python tests
29 changes: 28 additions & 1 deletion .github/workflows/main.yml
@@ -94,6 +94,28 @@ jobs:
bash ops/pipeline/build-cuda.sh \
xgb-ci.gpu_build_rockylinux8_dev_ver enable-rmm

build-cuda-arm64:
name: Build CUDA + manylinux_2_28_aarch64 wheel
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-arm64-cpu
- tag=main-build-cuda-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
- uses: actions/checkout@v4
with:
submodules: "true"
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: bash ops/pipeline/build-cuda-arm64.sh
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda-arm64 \
python-package/dist/*.whl

build-python-wheels-arm64:
name: Build manylinux_2_28_aarch64 wheel
runs-on:
@@ -211,7 +233,7 @@ jobs:

test-python-wheel:
name: Run Python tests (${{ matrix.description }})
needs: [build-cuda, build-python-wheels-arm64]
needs: [build-cuda, build-cuda-arm64, build-python-wheels-arm64]
runs-on:
- runs-on
- runner=${{ matrix.runner }}
@@ -242,6 +264,11 @@ jobs:
suite: cpu-arm64
runner: linux-arm64-cpu
artifact_from: build-python-wheels-arm64
- description: gpu-arm64
image_repo: xgb-ci.gpu_aarch64
suite: gpu-arm64
runner: linux-arm64-gpu
artifact_from: build-cuda-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
21 changes: 21 additions & 0 deletions doc/contrib/ci.rst
@@ -198,6 +198,15 @@ Examples: useful tasks for local development
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \
-- ops/pipeline/build-cuda-impl.sh

* Build XGBoost with GPU support on Linux ARM64

.. code-block:: bash

export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
python3 ops/docker_run.py \
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8_aarch64:main \
-- ops/pipeline/build-cuda-impl.sh

* Run Python tests

.. code-block:: bash
@@ -217,6 +226,16 @@ Examples: useful tasks for local development
--use-gpus \
-- ops/pipeline/test-python-wheel-impl.sh gpu

* Run Python tests with GPU algorithm on Linux ARM64

.. code-block:: bash

export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
python3 ops/docker_run.py \
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_aarch64:main \
--use-gpus \
-- ops/pipeline/test-python-wheel-impl.sh gpu-arm64

* Run Python tests with GPU algorithm, with multiple GPUs

.. code-block:: bash
@@ -287,6 +306,8 @@ To opt into self-hosted runners (enabled by RunsOn), we use the following special
- tag=[unique tag that uniquely identifies the job in the GH Action workflow]

where the runner is defined in ``.github/runs-on.yml``.
ARM64 CUDA wheels are built on the ``linux-arm64-cpu`` runner; the corresponding
GPU tests run on the ``linux-arm64-gpu`` runner, which provisions an AWS Graviton
instance with an NVIDIA GPU (``g5g`` family).
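
For example, the ARM64 GPU test suite can be exercised locally through the same
entry point the workflow uses (a sketch; it assumes the ``xgb-ci.gpu_aarch64``
image and a wheel already downloaded into ``wheelhouse/``):

.. code-block:: bash

   # Same invocation as the test-python-wheel job with suite=gpu-arm64
   bash ops/pipeline/test-python-wheel.sh gpu-arm64 xgb-ci.gpu_aarch64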

===================================================================
The Lay of the Land: how CI pipelines are organized in the codebase
7 changes: 6 additions & 1 deletion doc/install.rst
@@ -67,7 +67,7 @@ Capabilities of binary wheels for each platform:
+=====================+=========+======================+
| Linux x86_64 | |tick| | |tick| |
+---------------------+---------+----------------------+
| Linux aarch64 | |cross| | |cross| |
| Linux aarch64 | |tick| | |cross| |
+---------------------+---------+----------------------+
| MacOS x86_64 | |cross| | |cross| |
+---------------------+---------+----------------------+
@@ -76,6 +76,11 @@
| Windows | |tick| | |cross| |
+---------------------+---------+----------------------+

Linux aarch64 wheels now ship with CUDA support: ``pip install xgboost`` on an
ARM64 machine with an NVIDIA GPU (for example, NVIDIA Jetson or a GPU-equipped
AWS Graviton instance) provides the same GPU functionality as the Linux x86_64
wheel. Multi-node, multi-GPU training is not yet supported by the aarch64 wheel.
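
A minimal way to confirm that the installed aarch64 wheel was built with CUDA
support (a sketch; the exact keys reported by ``xgboost.build_info()`` may vary
between releases):

.. code-block:: bash

   pip install xgboost
   # Expect True when the wheel ships the CUDA-enabled library
   python -c "import xgboost; print(xgboost.build_info()['USE_CUDA'])"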

Minimal installation (CPU-only)
*******************************
The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning.
75 changes: 75 additions & 0 deletions ops/pipeline/build-cuda-arm64.sh
@@ -0,0 +1,75 @@
#!/bin/bash
## Build XGBoost with CUDA for Linux ARM64

set -euo pipefail

if [[ -z "${GITHUB_SHA:-}" ]]
then
echo "Make sure to set environment variable GITHUB_SHA"
exit 1
fi

IMAGE_REPO="xgb-ci.gpu_build_rockylinux8_aarch64"
export USE_FEDERATED=1
export USE_RMM=0

source ops/pipeline/classify-git-branch.sh
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

WHEEL_TAG=manylinux_2_28_aarch64
BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"

echo "--- Build with CUDA (ARM64)"

if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
export BUILD_ONLY_SM75=1
else
export BUILD_ONLY_SM75=0
fi

set -x

python3 ops/docker_run.py \
--image-uri ${BUILD_IMAGE_URI} \
--run-args='-e BUILD_ONLY_SM75 -e USE_RMM -e USE_FEDERATED' \
-- ops/pipeline/build-cuda-impl.sh

echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
python3 ops/docker_run.py \
--image-uri ${MANYLINUX_IMAGE_URI} \
-- auditwheel repair --only-plat \
--plat ${WHEEL_TAG} python-package/dist/*.whl
python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \
wheelhouse/*.whl
mv -v wheelhouse/*.whl python-package/dist/
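# Sanity check: the repaired wheel must vendor libgomp.so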
if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then
echo "error: libgomp.so was not vendored in the wheel"
exit 1
fi

# Check size of wheel
pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl

echo "--- Generate meta info"
python3 ops/script/format_wheel_meta.py \
--wheel-path python-package/dist/*.whl \
--commit-hash ${GITHUB_SHA} \
--platform-tag ${WHEEL_TAG} \
--meta-path python-package/dist/

echo "--- Upload Python wheel"
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
python-package/dist/*.whl
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME} --make-public \
python-package/dist/meta.json
fi

18 changes: 16 additions & 2 deletions ops/pipeline/build-cuda13.sh
@@ -9,15 +9,29 @@ then
exit 1
fi

IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
export USE_RMM=0
export USE_FEDERATED=0

ARCH=$(uname -m)
case "${ARCH}" in
x86_64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
WHEEL_TAG=manylinux_2_28_x86_64
;;
aarch64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
WHEEL_TAG=manylinux_2_28_aarch64
;;
*)
echo "Unsupported architecture: ${ARCH}"
exit 1
;;
esac

source ops/pipeline/classify-git-branch.sh
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

WHEEL_TAG=manylinux_2_28_x86_64
BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"

15 changes: 14 additions & 1 deletion ops/pipeline/test-python-wheel-cuda13.sh
@@ -6,7 +6,20 @@ set -euo pipefail
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
ARCH=$(uname -m)
case "${ARCH}" in
x86_64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
;;
aarch64)
IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
;;
*)
echo "Unsupported architecture: ${ARCH}"
exit 1
;;
esac

IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"

set -x
7 changes: 6 additions & 1 deletion ops/pipeline/test-python-wheel-impl.sh
@@ -13,7 +13,7 @@ suite="$1"

# Cannot set -u before Conda env activation
case "$suite" in
gpu|mgpu)
gpu|mgpu|gpu-arm64)
source activate gpu_test
;;
cpu)
@@ -42,6 +42,11 @@ case "$suite" in
python -c 'from cupy.cuda import jitify; jitify._init_module()'
pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu
;;
gpu-arm64)
echo "-- Run Python tests, using a single GPU (ARM64)"
python -c 'from cupy.cuda import jitify; jitify._init_module()'
pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu
;;
mgpu)
echo "-- Run Python tests, using multiple GPUs"
python -c 'from cupy.cuda import jitify; jitify._init_module()'
4 changes: 2 additions & 2 deletions ops/pipeline/test-python-wheel.sh
@@ -5,14 +5,14 @@ set -euo pipefail

if [[ "$#" -lt 2 ]]
then
echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [image_repo]"
echo "Usage: $0 {gpu|mgpu|gpu-arm64|cpu|cpu-arm64} [image_repo]"
exit 1
fi

suite="$1"
image_repo="$2"

if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]]
if [[ "$suite" == "gpu" || "$suite" == "mgpu" || "$suite" == "gpu-arm64" ]]
then
gpu_option="--use-gpus"
else
1 change: 1 addition & 0 deletions ops/script/release_artifacts.py
@@ -154,6 +154,7 @@ def download_python_wheels(branch: str, commit_hash: str, outdir: Path) -> None:
]
cu13_platforms = [
"manylinux_2_28_x86_64",
"manylinux_2_28_aarch64",
]
minimal_platforms = [
"win_amd64",