diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 1a8098071ba3..000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,35 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. -# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "maven" - directory: "/jvm-packages" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-example" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "github-actions" - directory: / - schedule: - interval: "monthly" diff --git a/.github/runs-on.yml b/.github/runs-on.yml index d951a08e8273..e21895ee8c3b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -34,4 +34,3 @@ runners: cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: windows-amd64 - diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index d3208a1294d1..26e8fa34c119 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -15,20 +15,15 @@ jobs: timeout-minutes: 20 name: A job to run test in FreeBSD steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Test in FreeBSD - id: test - uses: vmactions/freebsd-vm@v1 - with: - usesh: true - prepare: | - pkg install -y cmake git ninja googletest - - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y cmake git ninja googletest bash + run: | + bash ops/pipeline/test-freebsd.sh diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index aec7e9d31087..8b7c71a82bf8 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,25 +19,25 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.7.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . 
- file: tests/ci_build/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host + - name: Build and push container + uses: docker/build-push-action@v6 + with: + context: . + file: ops/docker/dockerfile/Dockerfile.i386 + push: true + tags: localhost:5000/xgboost/build-32bit:latest + cache-from: type=gha + cache-to: type=gha,mode=max + - name: Build XGBoost + run: | + docker run --rm -v $PWD:/workspace -w /workspace \ + -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ + localhost:5000/xgboost/build-32bit:latest \ + bash ops/script/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 945f362685a4..53e695721887 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -1,100 +1,287 @@ -name: XGBoost-JVM-Tests +name: XGBoost CI (JVM packages) on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} - timeout-minutes: 30 - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=jvm-tests-build-containers-${{ matrix.container_id }} strategy: - fail-fast: false matrix: - os: [windows-latest, ubuntu-latest, macos-13] - + container_id: + - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm + - xgb-ci.jvm_gpu_build + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} - - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4.5.0 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml - use-mamba: true + build-jvm-manylinux2014: + name: >- + Build libxgboost4j.so targeting glibc 2.17 + (arch ${{ matrix.arch }}, runner ${{ matrix.runner }}) + needs: build-containers + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=jvm-tests-build-jvm-manylinux2014-${{ matrix.arch }} + strategy: + 
fail-fast: false + matrix: + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + - name: Upload libxgboost4j.so + run: | + libname=lib/libxgboost4j_linux_${{ matrix.arch }}_${{ github.sha }}.so + mv -v lib/libxgboost4j.so ${libname} + bash ops/pipeline/publish-artifact.sh ${libname} \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ - - name: Cache Maven packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + build-jvm-gpu: + name: Build libxgboost4j.so with CUDA + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=jvm-tests-build-jvm-gpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-jvm-gpu lib/libxgboost4j.so - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 + build-jvm-mac: + name: "Build libxgboost4j.dylib for ${{ matrix.description }}" + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - description: "MacOS (Apple Silicon)" + script: ops/pipeline/build-jvm-macos-apple-silicon.sh + libname: libxgboost4j_m1_${{ github.sha }}.dylib + runner: macos-14 + - description: "MacOS (Intel)" + script: ops/pipeline/build-jvm-macos-intel.sh + libname: libxgboost4j_intel_${{ github.sha }}.dylib + runner: macos-13 + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: bash ${{ matrix.script }} + - name: Upload libxgboost4j.dylib + run: | + mv -v lib/libxgboost4j.dylib ${{ matrix.libname }} + bash ops/pipeline/publish-artifact.sh ${{ matrix.libname }} \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - name: Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=jvm-tests-build-jvm-docs + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Unstash files + run: | + bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so + - run: bash ops/pipeline/build-jvm-doc.sh + - name: Upload JVM doc + run: | + bash 
ops/pipeline/publish-artifact.sh \ + jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \ + s3://xgboost-docs/ - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') + build-test-jvm-packages: + name: Build and test JVM packages (Linux, Scala ${{ matrix.scala_version }}) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=jvm-tests-build-test-jvm-packages-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + scala_version: ["2.12", "2.13"] + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm + - name: Build and test JVM packages (Scala ${{ matrix.scala_version }}) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: ${{ matrix.scala_version }} + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash \ + build-test-jvm-packages lib/libxgboost4j.so + if: matrix.scala_version == '2.13' - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + build-test-jvm-packages-other-os: + name: Build and test JVM packages (${{ matrix.os }}) + timeout-minutes: 30 + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [windows-latest, macos-13] + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '8' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: minimal + environment-file: ops/conda_env/minimal.yml + - name: Cache Maven packages + uses: actions/cache@v4 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + - name: Test XGBoost4J (Core) + run: | + cd jvm-packages + mvn test -B -pl :xgboost4j_2.12 + - name: Publish artifact xgboost4j.dll to S3 + run: | + cd lib/ + Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` + --acl public-read --region us-west-2 + if: | + (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + matrix.os == 'windows-latest' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp 
libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + test-jvm-packages-gpu: + name: Test JVM packages with CUDA (Scala ${{ matrix.scala_version }}) + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + - tag=jvm-tests-test-jvm-packages-gpu-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + scala_version: ["2.12", "2.13"] + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Unstash files + run: | + bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so + - run: bash ops/pipeline/test-jvm-gpu.sh + env: + SCALA_VERSION: ${{ matrix.scala_version }} - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + deploy-jvm-packages: + name: Deploy JVM packages to S3 (${{ matrix.variant.name }}) + needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu] + runs-on: + - runs-on + - runner=linux-amd64-cpu + - run-id=${{ github.run_id }} + - tag=jvm-tests-deploy-jvm-packages-${{ matrix.variant.name }}-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + variant: + - name: cpu + container_id: xgb-ci.jvm + artifact_from: build-test-jvm-packages + - name: gpu + container_id: xgb-ci.jvm_gpu_build + artifact_from: build-jvm-gpu + scala_version: ['2.12', '2.13'] + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh ${{ matrix.variant.container_id }} + - name: Unstash files + run: | + bash ops/pipeline/stash-artifacts.sh \ + unstash ${{ matrix.variant.artifact_from }} \ + lib/libxgboost4j.so + ls -lh lib/libxgboost4j.so + - name: Deploy JVM packages to S3 + run: | + bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant.name }} \ + ${{ matrix.variant.container_id }} ${{ matrix.scala_version }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000000..2c400b073988 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,119 @@ +name: XGBoost CI (Lint) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + build-containers: + name: Build CI containers + env: + CONTAINER_ID: xgb-ci.clang_tidy + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=lint-build-containers + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - 
run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ env.CONTAINER_ID }} + run: bash ops/docker_build.sh ${{ env.CONTAINER_ID }} + + clang-tidy: + name: Run clang-tidy + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=lint-clang-tidy + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.clang_tidy + - run: bash ops/pipeline/run-clang-tidy.sh + + python-mypy-lint: + runs-on: ubuntu-latest + name: Type and format checks for the Python package + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: python_lint + environment-file: ops/conda_env/python_lint.yml + - name: Run mypy + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 + - name: Run formatter + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 + - name: Run pylint + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 + + cpp-lint: + runs-on: ubuntu-latest + name: Code linting for C++ + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint + - name: Run lint + run: | + python3 ops/script/lint_cpp.py + bash ops/script/lint_cmake.sh + + lintr: + runs-on: ubuntu-latest + name: Run R linters on Ubuntu + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: r-lib/actions/setup-r@v2 + with: + r-version: "release" + - name: Cache R packages + uses: actions/cache@v4 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + - name: Run lintr + run: | + MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ + Rscript ops/script/lint_r.R $(pwd) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b75456f04b4a..2f579d3e9611 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,193 +1,322 @@ -# This is a basic workflow to help you get started with Actions +name: XGBoost CI -name: XGBoost-CI - -# Controls when the action will run. 
Triggers the workflow on push or pull request -# events but only for the master branch on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -# A workflow run is made up of one or more jobs that can run sequentially or in parallel +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-containers-${{ matrix.container_id }} strategy: - fail-fast: false matrix: - os: [macos-13] + container_id: + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_rockylinux8_dev_ver + - xgb-ci.gpu_build_r_rockylinux8 + - xgb-ci.gpu + - xgb-ci.gpu_dev_ver + - xgb-ci.cpu + - xgb-ci.manylinux_2_28_x86_64 + - xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} + + build-cpu: + name: Build CPU + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.cpu + - run: bash ops/pipeline/build-cpu.sh + - name: Stash CLI executable + run: bash ops/pipeline/stash-artifacts.sh stash build-cpu ./xgboost + + build-cpu-arm64: + name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + - tag=build-cpu-arm64 + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.aarch64 + - run: bash ops/pipeline/build-cpu-arm64.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-cpu-arm64 \ + ./xgboost python-package/dist/*.whl + - name: Upload Python wheel + run: | + bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - 
run: bash ops/pipeline/build-cuda.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-cuda \ + build/testxgboost ./xgboost python-package/dist/*.whl + - name: Upload Python wheel + run: | + for file in python-package/dist/*.whl python-package/dist/meta.json + do + bash ops/pipeline/publish-artifact.sh "${file}" \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + done + + build-cuda-with-rmm: + name: Build CUDA with RMM + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda-with-rmm + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - run: | + bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8 + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh \ + stash build-cuda-with-rmm build/testxgboost + - name: Upload Python wheel + run: | + bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ + s3://xgboost-nightly-builds/experimental_build_with_rmm/ + + build-cuda-with-rmm-dev: + name: Build CUDA with RMM (dev) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda-with-rmm-dev steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8_dev_ver + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - run: | + bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8_dev_ver - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} + build-manylinux2014: + name: Build manylinux2014_${{ matrix.arch }} wheel + needs: build-containers + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-manylinux2014-${{ matrix.arch }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + - name: Upload Python wheel + run: | + for wheel in python-package/dist/*.whl + do + bash ops/pipeline/publish-artifact.sh "${wheel}" \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + done + + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-gpu-rpkg steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh + - name: Upload R tarball + run: | + bash ops/pipeline/publish-artifact.sh xgboost_r_gpu_linux_*.tar.gz \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} + + test-cpp-gpu: + name: >- + Run Google Tests with GPUs + (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }}) + needs: [build-cuda, build-cuda-with-rmm] + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-cpp-gpu-${{ matrix.suite }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python-version: ["3.10"] + include: + - suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - suite: gpu-rmm + runner: linux-amd64-gpu + artifact_from: build-cuda-with-rmm + - suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu + - name: Unstash gtest + run: | + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + build/testxgboost + chmod +x build/testxgboost + - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} + test-python-wheel: + name: Run Python tests (${{ matrix.description }}) + needs: [build-cuda, build-cpu-arm64] + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-python-wheel-${{ matrix.description }} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ + include: + - description: single-gpu + container: xgb-ci.gpu + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: single-gpu-nightly-deps + container: xgb-ci.gpu_dev_ver + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: multiple-gpu + container: xgb-ci.gpu + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: multiple-gpu-nightly-deps + container: xgb-ci.gpu_dev_ver + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: cpu-amd64 + container: xgb-ci.cpu + suite: cpu + runner: linux-amd64-cpu + artifact_from: build-cuda + - description: cpu-arm64 + container: xgb-ci.aarch64 + suite: cpu-arm64 + runner: linux-arm64-cpu + artifact_from: build-cpu-arm64 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh ${{ matrix.container }} + - name: Unstash Python wheel + run: | + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + python-package/dist/*.whl ./xgboost + chmod +x ./xgboost + - name: Run Python tests, ${{ matrix.description }} + run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml new file mode 100644 index 000000000000..67c1bf57d3a2 --- /dev/null +++ b/.github/workflows/misc.yml @@ -0,0 +1,49 @@ +name: XGBoost CI (misc) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu-nonomp: + name: Test Google C++ unittest (CPU Non-OMP) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends ninja-build + - name: Build and install XGBoost + run: bash ops/script/build_via_cmake.sh -DUSE_OPENMP=OFF + - name: Run gtest binary + run: | + cd build + ctest --extra-verbose + + c-api-demo: + name: Test installing XGBoost lib + building the C API demo + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: cpp_test + environment-file: ops/conda_env/cpp_test.yml + - name: Build and run C API demo with shared + run: bash 
ops/pipeline/test-c-api-demo.sh diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 8f0ab1c68262..dc8de819e2bb 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -1,4 +1,4 @@ -name: XGBoost-Python-Tests +name: XGBoost CI (Python tests) on: [push, pull_request] @@ -14,335 +14,51 @@ concurrency: cancel-in-progress: true jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' - python-sdist-test: - # Use system toolchain instead of conda toolchain for macos and windows. - # MacOS has linker error if clang++ from conda-forge is used runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} + name: Test installing Python XGBoost from the source distribution (${{ matrix.os }}) strategy: + fail-fast: false matrix: - os: [macos-13, windows-latest] - python-version: ["3.10"] + os: [macos-13, windows-latest, ubuntu-latest] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. 
- python -c 'import xgboost' + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: sdist_test + environment-file: ops/conda_env/sdist_test.yml + - name: Install extra package for MacOS + run: | + mamba install -c conda-forge llvm-openmp + if: matrix.os == 'macos-13' + - name: Build and install XGBoost + run: bash ops/pipeline/test-python-sdist.sh python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: macos-13} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} + name: Test XGBoost Python package on macos-13 + runs-on: macos-13 timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . 
--wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: macos_cpu_test + environment-file: ops/conda_env/macos_cpu_test.yml + - run: bash ops/pipeline/test-python-macos.sh python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - + name: Test XGBoost Python package System Installation on Ubuntu + runs-on: ubuntu-latest steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4 with: submodules: 'true' - - name: Set up Python 3.10 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@v5 with: python-version: "3.10" - - - name: Install ninja - run: | - sudo apt-get update && sudo apt-get install -y ninja-build - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. 
-GNinja - ninja - - - name: Copy lib to system lib - run: | - cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" - - - name: Install XGBoost in Virtual Environment - run: | - cd python-package - pip install virtualenv - virtualenv venv - source venv/bin/activate && \ - pip install -v . --config-settings use_system_libxgboost=True && \ - python -c 'import xgboost' + - run: bash ops/pipeline/test-python-with-sysprefix.sh diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml deleted file mode 100644 index 1bbdedc3f9c6..000000000000 --- a/.github/workflows/python_wheels.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: XGBoost-Python-Wheels - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-wheels: - name: Build wheel for ${{ matrix.platform_id }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: macos-13 - platform_id: macosx_x86_64 - - os: macos-14 - platform_id: macosx_arm64 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml new file mode 100644 index 000000000000..ab13dfa395cd --- /dev/null +++ b/.github/workflows/python_wheels_macos.yml @@ -0,0 +1,53 @@ +name: Build Python wheels targeting MacOS + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +defaults: + run: + shell: bash -l {0} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + python-wheels-macos: + name: Build wheel for ${{ matrix.platform_id }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: macos-13 + platform_id: macosx_x86_64 + - os: macos-14 + platform_id: macosx_arm64 + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Set up homebrew + uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 + - 
name: Install libomp + run: brew install libomp + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: minimal + environment-file: ops/conda_env/minimal.yml + - name: Build wheels + run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} + - name: Upload Python wheel + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + run: | + python -m pip install awscli + python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index 4b506927e06c..da01f39f650b 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -22,23 +22,20 @@ jobs: container: image: rhub/debian-gcc-devel-nold steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . && \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" + - name: Install git and system packages + shell: bash + run: | + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Install dependencies + shell: bash -l {0} + run: | + /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + - name: Run R tests + shell: bash + run: | + cd R-package && \ + /tmp/R-devel/bin/R CMD INSTALL . 
&& \ + /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index c56d1f8ef943..fc0245f5752e 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -13,138 +13,91 @@ concurrency: cancel-in-progress: true jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} + runs-on: ${{ matrix.os }} + name: Test R on OS ${{ matrix.os }}, R ${{ matrix.r }}, Compiler ${{ matrix.compiler }}, Build ${{ matrix.build }} strategy: fail-fast: false matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} + include: + - os: windows-latest + r: release + compiler: mingw + build: autotools + - os: ubuntu-latest + r: release + compiler: none + build: cmake env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - steps: - - name: Install system dependencies - run: | - sudo apt update - sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" 
--task=check - if: matrix.config.compiler != 'none' - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' + - name: Install system dependencies + run: | + sudo apt update + sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev + if: matrix.os == 'ubuntu-latest' + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r }} + - name: Cache R packages + uses: actions/cache@v4 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + architecture: 'x64' + - uses: r-lib/actions/setup-tinytex@v2 + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + - name: Test R + run: | + python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler != 'none' + - name: Test R + run: | + python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler == 'none' test-R-on-Debian: name: Test R package on Debian runs-on: ubuntu-latest container: image: rhub/debian-gcc-release - steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc + - name: Install system dependencies + run: | + # Must run before checkout to have the latest git installed. + # No need to add pandoc, the container has it figured out. 
+ apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + - name: Trust git cloning project sources + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Install dependencies + shell: bash -l {0} + run: | + Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + - name: Test R + shell: bash -l {0} + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check + - uses: dorny/paths-filter@v3 + id: changes + with: + filters: | + r_package: + - 'R-package/**' + - name: Run document check + if: steps.changes.outputs.r_package == 'true' + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 85a9abb57e1b..f3837391b4fe 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -22,7 +22,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@v4 with: persist-credentials: false diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml new file mode 100644 index 000000000000..22456b1b68e5 --- /dev/null +++ b/.github/workflows/sycl_tests.yml @@ -0,0 +1,48 @@ +name: XGBoost CI (oneAPI) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +defaults: + run: + shell: bash -l {0} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu-sycl: + name: Test Google C++ unittest (CPU SYCL) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + - name: Run gtest + run: bash ops/pipeline/build-test-sycl.sh gtest + + python-sycl-tests-on-ubuntu: + name: Test XGBoost Python package with SYCL + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + - name: Test Python package + run: bash ops/pipeline/build-test-sycl.sh pytest diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 5e229db4c050..4a3e4747c3ff 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,20 +25,20 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" + - uses: actions/checkout@v4 + with: + submodules: 'true' + 
- name: Check latest RAPIDS and update conftest.sh + run: | + bash ops/script/update_rapids.sh + - name: Create Pull Request + uses: peter-evans/create-pull-request@v7 + if: github.ref == 'refs/heads/master' + with: + add-paths: | + ops/docker + branch: create-pull-request/update-rapids + base: master + title: "[CI] Update RAPIDS to latest stable" + commit-message: "[CI] Update RAPIDS to latest stable" diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 000000000000..f97daf761abf --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,53 @@ +name: XGBoost CI (Windows) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: powershell + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + build-win64-gpu: + name: Build XGBoost for Windows with CUDA + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-cpu + - tag=windows-build-win64-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: powershell ops/pipeline/build-win64-gpu.ps1 + - name: Stash files + run: | + powershell ops/pipeline/stash-artifacts.ps1 stash build-win64-gpu ` + build/testxgboost.exe xgboost.exe ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) + + test-win64-gpu: + name: Test XGBoost on Windows + needs: build-win64-gpu + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-gpu + - tag=windows-test-win64-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Unstash files + run: | + powershell ops/pipeline/stash-artifacts.ps1 unstash build-win64-gpu ` + build/testxgboost.exe xgboost.exe python-package/dist/*.whl + - run: powershell ops/pipeline/test-win64-gpu.ps1 diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py index 0b4594e2d2c0..c5a72724f707 100644 --- a/dev/prepare_jvm_release.py +++ b/dev/prepare_jvm_release.py @@ -203,7 +203,7 @@ def main(): ) print( "5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n" - " python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" + " python ops/script/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" " GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true" ) print( diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py index a0f8f796130e..bfcc813a0ef1 100644 --- a/dev/release-artifacts.py +++ b/dev/release-artifacts.py @@ -123,7 +123,7 @@ def make_python_sdist( with DirectoryExcursion(ROOT): with open("python-package/pyproject.toml", "r") as f: orig_pyproj_lines = f.read() - with open("tests/buildkite/remove_nccl_dep.patch", "r") as f: + with open("ops/patch/remove_nccl_dep.patch", "r") as f: patch_lines = f.read() subprocess.run( ["patch", "-p0"], input=patch_lines, check=True, text=True, encoding="utf-8" diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index af9e6556290c..d6effa0b09d4 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -14,11 +14,9 @@ project. ************** GitHub Actions ************** -The configuration files are located under the directory -`.github/workflows `_. - -Most of the tests listed in the configuration files run automatically for every incoming pull -requests and every update to branches. 
A few tests however require manual activation:
+We make extensive use of `GitHub Actions <https://github.com/features/actions>`_ to host our
+CI pipelines. Most of the tests listed in the configuration files run automatically for every
+incoming pull request and every update to branches. A few tests, however, require manual activation:

* R tests with ``noLD`` option: Run R tests using a custom-built R with compilation flag
  ``--disable-long-double``. See `this page <https://blog.r-hub.io/2019/05/21/nold/>`_ for more
@@ -26,18 +24,29 @@ requests and every update to branches. A few tests however require manual activa
   To invoke this test suite for a particular pull request, simply add a review comment
   ``/gha run r-nold-test``. (Ordinary comment won't work. It needs to be a review comment.)

-GitHub Actions is also used to build Python wheels targeting MacOS Intel and Apple Silicon. See
-`.github/workflows/python_wheels.yml
-<https://github.com/dmlc/xgboost/blob/master/.github/workflows/python_wheels.yml>`_. The
-``python_wheels`` pipeline sets up environment variables prefixed ``CIBW_*`` to indicate the target
-OS and processor. The pipeline then invokes the script ``build_python_wheels.sh``, which in turns
-calls ``cibuildwheel`` to build the wheel. The ``cibuildwheel`` is a library that sets up a
-suitable Python environment for each OS and processor target. Since we don't have Apple Silicon
-machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex
-task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
-``pip wheel``. Since XGBoost has a native library component, we created a customized build
-backend that hooks into ``pip``. The customized backend contains the glue code to compile the native
-library on the fly.)
+*******************************
+Self-Hosted Runners with RunsOn
+*******************************
+
+`RunsOn <https://runs-on.com/>`_ is a SaaS (Software as a Service) app that lets us easily create
+self-hosted runners to use with GitHub Actions pipelines. RunsOn uses
+`Amazon Web Services (AWS) <https://aws.amazon.com/>`_ under the hood to provision runners with
+access to various amounts of CPUs, memory, and NVIDIA GPUs. Thanks to this app, we are able to test
+GPU-accelerated and distributed algorithms of XGBoost while using the familiar interface of
+GitHub Actions.
+
+In GitHub Actions, jobs run on Microsoft-hosted runners by default.
+To opt into self-hosted runners (enabled by RunsOn), we use the following special syntax:
+
+.. code-block:: yaml
+
+  runs-on:
+    - runs-on
+    - runner=runner-name
+    - run-id=${{ github.run_id }}
+    - tag=[unique tag that identifies the job in the GH Action workflow]
+
+where the runner is defined in ``.github/runs-on.yml``.
*********************************************************
Reproduce CI testing environments using Docker containers
*********************************************************
You can reproduce the same testing environment as the CI pipelines by running Do
Prerequisites
=============
1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
-2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
+2. Install NVIDIA Docker runtime:
+   https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html.
   The runtime lets you access NVIDIA GPUs inside a Docker container.
+.. _build_run_docker_locally:

==============================================
Building and Running Docker containers locally
==============================================
-For your convenience, we provide the wrapper script ``tests/ci_build/ci_build.sh``.
You can use it as follows:
+For your convenience, we provide three wrapper scripts:
+
+* ``ops/docker_build.py``: Build a Docker container
+* ``ops/docker_build.sh``: Wrapper for ``ops/docker_build.py`` with a more concise interface
+* ``ops/docker_run.py``: Run a command inside a Docker container
+
+**To build a Docker container**, invoke ``docker_build.sh`` as follows:
+
+.. code-block:: bash
+
+  export BRANCH_NAME="master"  # Relevant for CI; for local testing, use "master"
+  bash ops/docker_build.sh CONTAINER_ID
+
+where ``CONTAINER_ID`` identifies the container. The wrapper script will look up the YAML file
+``ops/docker/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``,
+the script will use the corresponding entry from ``ci_container.yml``:
+
+.. code-block:: yaml
+
+  xgb-ci.gpu:
+    container_def: gpu
+    build_args:
+      CUDA_VERSION_ARG: "12.4.1"
+      NCCL_VERSION_ARG: "2.23.4-1"
+      RAPIDS_VERSION_ARG: "24.10"
+
+The ``container_def`` entry indicates where the Dockerfile is located. The container
+definition will be fetched from ``ops/docker/dockerfile/Dockerfile.CONTAINER_DEF`` where
+``CONTAINER_DEF`` is the value of ``container_def`` entry. In this example, the Dockerfile
+is ``ops/docker/dockerfile/Dockerfile.gpu``.
+
+The ``build_args`` entry lists all the build arguments for the Docker build. In this example,
+the build arguments are:
+
+.. code-block::
+
+  --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+    --build-arg RAPIDS_VERSION_ARG=24.10
+
+The build arguments provide inputs to the ``ARG`` instructions in the Dockerfile.
+
+.. note:: Inspect the logs from the CI pipeline to find what's going on under the hood
+
+  When invoked, ``ops/docker_build.sh`` logs the precise commands that it runs under the hood.
+  Using the example above:
+
+  .. code-block:: bash
+
+    # docker_build.sh calls docker_build.py...
+    python3 ops/docker_build.py --container-def gpu --container-id xgb-ci.gpu \
+      --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+      --build-arg RAPIDS_VERSION_ARG=24.10
+
+    ...
+
+    # .. and docker_build.py in turn calls "docker build"...
+    docker build --build-arg CUDA_VERSION_ARG=12.4.1 \
+      --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+      --build-arg RAPIDS_VERSION_ARG=24.10 \
+      --load --progress=plain \
+      --ulimit nofile=1024000:1024000 \
+      -t xgb-ci.gpu \
+      -f ops/docker/dockerfile/Dockerfile.gpu \
+      ops/
+
+  The logs come in handy when debugging the container builds. In addition, you can change
+  the build arguments to make changes to the container.
+
+**To run commands within a Docker container**, invoke ``docker_run.py`` as follows:
+
+.. code-block:: bash
+
+  python3 ops/docker_run.py --container-id "ID of the container" [--use-gpus] \
+    -- "command to run inside the container"
+
+where ``--use-gpus`` should be specified to expose NVIDIA GPUs to the Docker container.
+
+For example:

.. code-block:: bash

-  tests/ci_build/ci_build.sh --use-gpus --build-arg \
-    ...
+  # Run without GPU
+  python3 ops/docker_run.py --container-id xgb-ci.cpu \
+    -- bash ops/script/build_via_cmake.sh
+
+  # Run with NVIDIA GPU
+  python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+    -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+The ``docker_run.py`` script will convert these commands to the following invocations
+of ``docker run``:

..
code-block:: bash
+
+  docker run --rm --pid=host \
+    -w /workspace -v /path/to/xgboost:/workspace \
+    -e CI_BUILD_UID=<uid> -e CI_BUILD_USER=<user_name> \
+    -e CI_BUILD_GID=<gid> -e CI_BUILD_GROUP=<group_name> \
+    xgb-ci.cpu \
+    bash ops/script/build_via_cmake.sh

-where:

-* ``<CONTAINER_TYPE>`` is the identifier for the container. The wrapper script will use the
-  container definition (Dockerfile) located at ``tests/ci_build/Dockerfile.<CONTAINER_TYPE>``.
-  For example, setting the container type to ``gpu`` will cause the script to load the Dockerfile
-  ``tests/ci_build/Dockerfile.gpu``.
-* Specify ``--use-gpus`` to run any GPU code. This flag will grant the container access to all NVIDIA GPUs in the base machine. Omit the flag if the access to GPUs is not necessary.
-* ``<BUILD_ARG>`` is a build argument to be passed to Docker. Must be of form ``VAR=VALUE``.
-  Example: ``--build-arg CUDA_VERSION_ARG=11.0``. You can pass multiple ``--build-arg``.
-* ``<COMMAND>`` is the command to run inside the Docker container. This can be more than one argument.
-  Example: ``tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON``.
+  docker run --rm --pid=host --gpus all \
+    -w /workspace -v /path/to/xgboost:/workspace \
+    -e CI_BUILD_UID=<uid> -e CI_BUILD_USER=<user_name> \
+    -e CI_BUILD_GID=<gid> -e CI_BUILD_GROUP=<group_name> \
+    xgb-ci.gpu \
+    bash ops/pipeline/test-python-wheel-impl.sh gpu

-Optionally, you can set the environment variable ``CI_DOCKER_EXTRA_PARAMS_INIT`` to pass extra
-arguments to Docker. For example:
+Optionally, you can specify ``--run-args`` to pass extra arguments to ``docker run``:

.. code-block:: bash

   # Allocate extra space in /dev/shm to enable NCCL
-  export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
-  # Run multi-GPU test suite
-  tests/ci_build/ci_build.sh gpu --use-gpus --build-arg CUDA_VERSION_ARG=11.0 \
-    tests/ci_build/test_python.sh mgpu
+  # Also run the container with elevated privileges
+  python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+    --run-args='--shm-size=4g --privileged' \
+    -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+which translates to
+
+.. code-block:: bash
+
+  docker run --rm --pid=host --gpus all \
+    -w /workspace -v /path/to/xgboost:/workspace \
+    -e CI_BUILD_UID=<uid> -e CI_BUILD_USER=<user_name> \
+    -e CI_BUILD_GID=<gid> -e CI_BUILD_GROUP=<group_name> \
+    --shm-size=4g --privileged \
+    xgb-ci.gpu \
+    bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+*******************************************************************
+The Lay of the Land: how CI pipelines are organized in the codebase
+*******************************************************************
+The XGBoost project stores the configuration for its CI pipelines as part of the codebase.
+The git repository therefore stores not only the change history for its source code but also
+the change history for the CI pipelines.
+
+=================
+File Organization
+=================
+
+The CI pipelines are organized into the following directories and files:
+
+* ``.github/workflows/``: Definition of CI pipelines, using the GitHub Actions syntax
+* ``.github/runs-on.yml``: Configuration for the RunsOn service. Specifies the spec for
+  the self-hosted CI runners.
+* ``ops/conda_env/``: Definitions for Conda environments
+* ``ops/packer/``: Packer scripts to build VM images for Amazon EC2
+* ``ops/patch/``: Patch files
+* ``ops/pipeline/``: Shell scripts defining CI/CD pipelines. Most of these scripts can be run
+  locally (to assist with development and debugging); a few must run in the CI.
+* ``ops/script/``: Various utility scripts useful for testing
+* ``ops/docker/dockerfile/``: Dockerfiles to define containers
+* ``ops/docker/ci_container.yml``: Defines the mapping between Dockerfiles and containers.
+  Also specifies the build arguments to be used with each container. See
+  :ref:`build_run_docker_locally` to learn how this YAML file is used in the context of
+  a container build.
+* ``ops/docker_build.*``: Wrapper scripts to build and test CI containers. See
+  :ref:`build_run_docker_locally` for a detailed description.
+
+To inspect a given CI pipeline, inspect files in the following order:
+
+.. plot::
+  :nofigs:
+
+  from graphviz import Source
+  source = r"""
+    digraph ci_graph {
+      graph [fontname = "monospace"];
+      node [fontname = "monospace"];
+      edge [fontname = "monospace"];
+      0 [label=<.github/workflows/*.yml>, shape=box];
+      1 [label=<ops/pipeline/*.sh>, shape=box];
+      2 [label=<ops/pipeline/*-impl.sh>, shape=box];
+      3 [label=<ops/script/*.sh>, shape=box];
+      0 -> 1 [xlabel="Calls"];
+      1 -> 2 [xlabel="Calls,\nvia docker_run.py"];
+      2 -> 3 [xlabel="Calls"];
+      1 -> 3 [xlabel="Calls"];
+    }
+  """
+  Source(source, format='png').render('../_static/ci_graph', view=False)
+  Source(source, format='svg').render('../_static/ci_graph', view=False)
+
+.. figure:: ../_static/ci_graph.svg
+  :align: center
+  :figwidth: 80 %
+
+===================================
+Primitives used in the CI pipelines
+===================================
+
+------------------------
+Build and run containers
+------------------------
+
+See :ref:`build_run_docker_locally` to learn about the utility scripts for building and
+using containers.
+
+**What's the relationship between the VM image (for Amazon EC2) and the container image?**
+In the ``ops/packer/`` directory, we define Packer scripts to build VM images for Amazon EC2.
+The VM image contains the minimal set of drivers and system software that are needed to
+run the containers.
+
+We update container images much more often than VM images. Whereas VM images are
+updated sparingly (once every few months), container images are updated each time a branch
+or a pull request is updated. This way, developers can make changes to containers and
+see the results of the changes immediately in the CI run.
+
+------------------------------------------
+Stash artifacts, to move them between jobs
+------------------------------------------
+
+This primitive is useful when one pipeline job needs to consume the output
+from another job.
+We use `Amazon S3 <https://aws.amazon.com/s3/>`_ to store the stashed files.
+
+**To stash a file**:
+
+.. code-block:: bash
+
+  REMOTE_PREFIX="remote directory to place the artifact(s)"
+  bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" path/to/file
+
+The ``REMOTE_PREFIX`` argument, which is the second command-line argument
+for ``stash-artifacts.sh``, specifies the remote directory in which the artifact(s)
+should be placed. More precisely, the artifact(s) will be placed in
+``s3://{RUNS_ON_S3_BUCKET_CACHE}/cache/{GITHUB_REPOSITORY}/stash/{GITHUB_RUN_ID}/{REMOTE_PREFIX}/``
+where ``RUNS_ON_S3_BUCKET_CACHE``, ``GITHUB_REPOSITORY``, and ``GITHUB_RUN_ID`` are set by
+the CI. (RunsOn provisions an S3 bucket to stage cache, and its name is stored in the environment
+variable ``RUNS_ON_S3_BUCKET_CACHE``.)
+
+You can upload multiple files, possibly with wildcard globbing:
-To pass multiple extra arguments:

.. code-block:: bash

+  REMOTE_PREFIX="build-cuda"
+  bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" \
+    build/testxgboost python-package/dist/*.whl
+
+**To unstash a file**:
+
+..
code-block:: bash
+
+  REMOTE_PREFIX="remote directory to place the artifact(s)"
+  bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" path/to/file
+
+You can also use wildcard globbing. The script will download the matching artifacts
+from the remote directory.

.. code-block:: bash

-  export CI_DOCKER_EXTRA_PARAMS_INIT='-e VAR1=VAL1 -e VAR2=VAL2 -e VAR3=VAL3'
-
-********************************************
-Update pipeline definitions for BuildKite CI
-********************************************
-
-`BuildKite <https://buildkite.com/>`_ is a SaaS (Software as a Service) platform that orchestrates
-cloud machines to host CI pipelines. The BuildKite platform allows us to define CI pipelines as a
-declarative YAML file.
-
-The pipeline definitions are found in ``tests/buildkite/``:
-
-* ``tests/buildkite/pipeline-win64.yml``: This pipeline builds and tests XGBoost for the Windows platform.
-* ``tests/buildkite/pipeline-mgpu.yml``: This pipeline builds and tests XGBoost with access to multiple
-  NVIDIA GPUs.
-* ``tests/buildkite/pipeline.yml``: This pipeline builds and tests XGBoost with access to a single
-  NVIDIA GPU. Most tests are located here.
-
-****************************************
-Managing Elastic CI Stack with BuildKite
-****************************************
-
-BuildKite allows us to define cloud resources in
-a declarative fashion. Every configuration step is now documented explicitly as code.
-
-**Prerequisite**: You should have some knowledge of `CloudFormation <https://aws.amazon.com/cloudformation/>`_.
-CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3 etc) using
-a single YAML file.
-
-**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then
-set up a credential pair in order to provision resources on AWS. See
-`Creating an IAM user in your AWS account <https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html>`_.
-
-* Option 1. Give full admin privileges to your IAM user. This is the simplest option.
-* Option 2. Give a limited set of permissions to your IAM user, to reduce the possibility of messing up other resources.
-  For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``.
-
-=====================
-Worker Image Pipeline
-=====================
-Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and
-manually install the necessary packages. This process is not only laborious but also error-prone. You may
-forget to install a package or change a system configuration.
-
-No more. Now we have an automated pipeline for building images for worker machines.
-
-* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision
-  CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are
-  pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively.
-* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may
-  take some time.
-* Once the pipelines have been fully provisioned, run the script
-  ``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be
-  uploaded to the EC2 service. You can locate them in the EC2 console.
-* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs.
-  (For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite.
Consult the ``AWSRegion2AMI``
-  section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.)
-
-======================
-EC2 Autoscaling Groups
-======================
-In EC2, you can create auto-scaling groups, where you can dynamically adjust the number of worker instances according to
-workload. When a pull request is submitted, the following steps take place:
-
-1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server.
-2. BuildKite sends a signal to a `Lambda <https://aws.amazon.com/lambda/>`_ function named ``Autoscaling``.
-3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances.
-4. New worker instances run the test jobs. Test results are reported back to BuildKite.
-5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the autoscaling group
-   to scale down. Idle worker instances are shut down.
-
-To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``.
-Check the CloudFormation web console to verify successful provision of auto-scaling groups.
+  REMOTE_PREFIX="build-cuda"
+  # Download all files whose path matches the wildcard pattern python-package/dist/*.whl
+  bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" \
+    python-package/dist/*.whl
+
+-----------------------------------------
+Custom actions in ``dmlc/xgboost-devops``
+-----------------------------------------
+
+XGBoost implements a few custom
+`composite actions <https://docs.github.com/en/actions/creating-actions/creating-a-composite-action>`_
+to reduce duplicated code within workflow YAML files. The custom actions are hosted in a separate repository,
+`dmlc/xgboost-devops <https://github.com/dmlc/xgboost-devops>`_, to make it easy to test changes to the custom actions in
+a pull request or a fork.
+
+In a workflow file, we'd refer to ``dmlc/xgboost-devops/{custom-action}@main``. For example:
+
+.. code-block:: yaml
+
+  - uses: dmlc/xgboost-devops/miniforge-setup@main
+    with:
+      environment-name: cpp_test
+      environment-file: ops/conda_env/cpp_test.yml
+
+Each custom action consists of two components:
+
+* Main script (``dmlc/xgboost-devops/{custom-action}/action.yml``): dispatches to a specific version
+  of the implementation script (see the next item). The main script clones ``xgboost-devops`` from
+  a specified fork at a particular ref, allowing us to easily test changes to the custom action.
+* Implementation script (``dmlc/xgboost-devops/impls/{custom-action}/action.yml``): Implements the
+  custom action.
+
+This design was inspired by Mike Sarahan's work in
+`rapidsai/shared-actions <https://github.com/rapidsai/shared-actions>`_.
diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst
index bf18ad08cf53..60b3c4a13bd2 100644
--- a/doc/contrib/coding_guide.rst
+++ b/doc/contrib/coding_guide.rst
@@ -107,7 +107,7 @@ C++ interface of the R package, please make corresponding changes in ``src/init.
 Generating the Package and Running Tests
 ========================================
-The source layout of XGBoost is a bit unusual to normal R packages as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special cares need to be taken to generate a standard R tarball. Most of the tests are being run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub action, at the time of writing). There are helper scripts in ``tests/ci_build`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball.
+The source layout of XGBoost is a bit unusual compared to normal R packages, as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, special care needs to be taken to generate a standard R tarball. Most of the tests are run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub Actions, at the time of writing). There are helper scripts in ``ops/script`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball.

*********************************
Running Formatting Checks Locally
@@ -127,7 +127,7 @@ To run checks for Python locally, install the checkers mentioned previously and

.. code-block:: bash

   cd /path/to/xgboost/
-  python ./tests/ci_build/lint_python.py --fix
+  python ./ops/script/lint_python.py --fix

To run checks for R:
@@ -135,21 +135,21 @@ To run checks for R:

   cd /path/to/xgboost/
   R CMD INSTALL R-package/
-  Rscript tests/ci_build/lint_r.R $(pwd)
+  Rscript ops/script/lint_r.R $(pwd)

To run checks for cpplint locally:

.. code-block:: bash

   cd /path/to/xgboost/
-  python ./tests/ci_build/lint_cpp.py
+  python ./ops/script/lint_cpp.py

See next section for clang-tidy. For CMake scripts:

.. code-block:: bash

-  bash ./tests/ci_build/lint_cmake.sh
+  bash ./ops/script/lint_cmake.sh

Lastly, the linter for jvm-packages is integrated into the maven build process.
@@ -163,21 +163,21 @@ To run this check locally, run the following command from the top level source t

.. code-block:: bash

   cd /path/to/xgboost/
-  python3 tests/ci_build/tidy.py
+  python3 ops/script/run_clang_tidy.py

Also, the script accepts two optional integer arguments, namely ``--cpp`` and ``--cuda``. By default they are both set to 1, meaning that both C++ and CUDA code will be checked. If the CUDA toolkit is not installed on your machine, you'll encounter an error. To exclude CUDA source from linting, use:

.. code-block:: bash

   cd /path/to/xgboost/
-  python3 tests/ci_build/tidy.py --cuda=0
+  python3 ops/script/run_clang_tidy.py --cuda=0

Similarly, if you want to exclude C++ source from linting:

.. code-block:: bash

   cd /path/to/xgboost/
-  python3 tests/ci_build/tidy.py --cpp=0
+  python3 ops/script/run_clang_tidy.py --cpp=0

**********************************
Guide for handling user input data
diff --git a/doc/contrib/donate.rst b/doc/contrib/donate.rst
index b6171c412c74..ba7c75a942f9 100644
--- a/doc/contrib/donate.rst
+++ b/doc/contrib/donate.rst
@@ -13,9 +13,9 @@ DMLC/XGBoost has grown from a research project incubated in academia to one of t
 A robust and efficient **continuous integration (CI)** infrastructure is one of the most critical solutions to address the above challenge. A CI service will monitor an open-source repository and run a suite of integration tests for every incoming contribution. This way, the CI ensures that every proposed change in the codebase is compatible with existing functionalities. Furthermore, XGBoost can enable more thorough tests with a powerful CI infrastructure to cover cases which are closer to the production environment.

-There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to bring "too-intensive" tests.
In addition, they do not offer test machines with GPUs for testing XGBoost-GPU code base which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `BuildKite <https://buildkite.com/>`_ to organize CI pipelines.
+There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to run "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing the XGBoost GPU code base, which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `Amazon Web Services (AWS) <https://aws.amazon.com/>`_ to host the test machines, along with `GitHub Actions <https://github.com/features/actions>`_ and `RunsOn <https://runs-on.com/>`_ (a SaaS app) to organize the CI pipelines.

-The cloud-hosted test farm has recurring operating expenses. It utilizes a leading cloud provider (AWS) to accommodate variable workload. BuildKite launches worker machines on AWS on demand, to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed.
+The cloud-hosted test farm has recurring operating expenses. RunsOn launches worker machines on AWS on demand to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed.

To help defray the hosting cost, the XGBoost project seeks donations from third parties.
@@ -29,9 +29,9 @@ The Project Management Committee (PMC) of the XGBoost project appointed `Open So
 All expenses incurred for hosting CI will be submitted to the fiscal host with receipts. Only the expenses in the following categories will be approved for reimbursement:

-* Cloud expenses for the cloud test farm (https://buildkite.com/xgboost)
+* Cloud expenses for the cloud test farm
 * Cost of domain https://xgboost-ci.net
-* Monthly cost of using BuildKite
+* Annual subscription for RunsOn
 * Hosting cost of the User Forum (https://discuss.xgboost.ai)

 Administration of cloud CI infrastructure
diff --git a/doc/contrib/release.rst b/doc/contrib/release.rst
index c0370b14ed42..4548b1ffa9a2 100644
--- a/doc/contrib/release.rst
+++ b/doc/contrib/release.rst
@@ -17,7 +17,7 @@ Making a Release
 -----------------
 1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
-2. Create a release branch if this is a major release. Bump release version. There's a helper script ``tests/ci_build/change_version.py``.
+2. Create a release branch if this is a major release. Bump release version. There's a helper script ``ops/script/change_version.py``.
 3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.
 4. Create a tag on release branch, either on GitHub or locally.
 5. Make a release on GitHub tag page, which might be done with previous step if the tag is created on GitHub.
diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst
index aa58cd337020..857d7a067307 100644
--- a/doc/contrib/unit_tests.rst
+++ b/doc/contrib/unit_tests.rst
@@ -63,7 +63,7 @@ Run

..
code-block:: bash - python ./tests/ci_build/test_r_package.py --task=check + python ./ops/script/test_r_package.py --task=check at the root of the project directory. The command builds and checks the XGBoost r-package. Alternatively, if you want to just run the tests, you can use the following diff --git a/doc/jvm/api.rst b/doc/jvm/api.rst index b9e7821aa6fa..3d56cb2c9aa4 100644 --- a/doc/jvm/api.rst +++ b/doc/jvm/api.rst @@ -5,4 +5,5 @@ API Docs for the JVM packages * `XGBoost4J Java API <../jvm_docs/javadocs/index.html>`_ * `XGBoost4J Scala API <../jvm_docs/scaladocs/xgboost4j/index.html>`_ * `XGBoost4J-Spark Scala API <../jvm_docs/scaladocs/xgboost4j-spark/index.html>`_ +* `XGBoost4J-Spark-GPU Scala API <../jvm_docs/scaladocs/xgboost4j-spark-gpu/index.html>`_ * `XGBoost4J-Flink Scala API <../jvm_docs/scaladocs/xgboost4j-flink/index.html>`_ diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 6be7b451ce14..fbd9b4ce5672 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -32,7 +32,7 @@ def cd(path): path = normpath(path) cwd = os.getcwd() os.chdir(path) - print("cd " + path) + print("cd " + path, flush=True) try: yield path finally: @@ -41,7 +41,7 @@ def cd(path): def maybe_makedirs(path): path = normpath(path) - print("mkdir -p " + path) + print("mkdir -p " + path, flush=True) try: os.makedirs(path) except OSError as e: @@ -50,14 +50,14 @@ def maybe_makedirs(path): def run(command, **kwargs): - print(command) + print(command, flush=True) subprocess.run(command, shell=True, check=True, env=os.environ, **kwargs) def cp(source, target): source = normpath(source) target = normpath(target) - print("cp {0} {1}".format(source, target)) + print("cp {0} {1}".format(source, target), flush=True) shutil.copy(source, target) @@ -78,7 +78,7 @@ def native_build(args): subprocess.check_output("/usr/libexec/java_home").strip().decode() ) - print("building Java wrapper") + print("building Java wrapper", flush=True) with cd(".."): build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build" maybe_makedirs(build_dir) @@ -123,7 +123,7 @@ def native_build(args): run("cmake .. " + " ".join(args + [generator])) break except subprocess.CalledProcessError as e: - print(f"Failed to build with generator: {generator}", e) + print(f"Failed to build with generator: {generator}", e, flush=True) with cd(os.path.pardir): shutil.rmtree(build_dir) maybe_makedirs(build_dir) @@ -132,7 +132,7 @@ def native_build(args): run("cmake --build . 
--config Release" + maybe_parallel_build) - print("copying native library") + print("copying native library", flush=True) library_name, os_folder = { "Windows": ("xgboost4j.dll", "windows"), "Darwin": ("libxgboost4j.dylib", "macos"), @@ -153,7 +153,7 @@ def native_build(args): maybe_makedirs(output_folder) cp("../lib/" + library_name, output_folder) - print("copying train/test files") + print("copying train/test files", flush=True) # for xgboost4j maybe_makedirs("xgboost4j/src/test/resources") diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index be46dc261285..b8a7d3337f35 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -116,6 +116,22 @@ + + docs + + ON + true + true + true + + + xgboost4j + xgboost4j-spark + xgboost4j-spark-gpu + xgboost4j-flink + + + release diff --git a/tests/ci_build/conda_env/aarch64_test.yml b/ops/conda_env/aarch64_test.yml similarity index 100% rename from tests/ci_build/conda_env/aarch64_test.yml rename to ops/conda_env/aarch64_test.yml diff --git a/tests/ci_build/conda_env/cpp_test.yml b/ops/conda_env/cpp_test.yml similarity index 100% rename from tests/ci_build/conda_env/cpp_test.yml rename to ops/conda_env/cpp_test.yml diff --git a/tests/ci_build/conda_env/linux_cpu_test.yml b/ops/conda_env/linux_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/linux_cpu_test.yml rename to ops/conda_env/linux_cpu_test.yml diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml similarity index 97% rename from tests/ci_build/conda_env/linux_sycl_test.yml rename to ops/conda_env/linux_sycl_test.yml index bec01c4f95fc..1761787662ee 100644 --- a/tests/ci_build/conda_env/linux_sycl_test.yml +++ b/ops/conda_env/linux_sycl_test.yml @@ -18,6 +18,7 @@ dependencies: - pytest-timeout - pytest-cov - dask=2024.11 +- ninja - dpcpp_linux-64 - onedpl-devel - intel-openmp diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/ops/conda_env/macos_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/macos_cpu_test.yml rename to ops/conda_env/macos_cpu_test.yml diff --git a/tests/ci_build/conda_env/jvm_tests.yml b/ops/conda_env/minimal.yml similarity index 79% rename from tests/ci_build/conda_env/jvm_tests.yml rename to ops/conda_env/minimal.yml index 56e11dff27bb..efe972bd44d9 100644 --- a/tests/ci_build/conda_env/jvm_tests.yml +++ b/ops/conda_env/minimal.yml @@ -1,4 +1,4 @@ -name: jvm_tests +name: minimal channels: - conda-forge dependencies: diff --git a/tests/ci_build/conda_env/python_lint.yml b/ops/conda_env/python_lint.yml similarity index 100% rename from tests/ci_build/conda_env/python_lint.yml rename to ops/conda_env/python_lint.yml diff --git a/tests/ci_build/conda_env/sdist_test.yml b/ops/conda_env/sdist_test.yml similarity index 85% rename from tests/ci_build/conda_env/sdist_test.yml rename to ops/conda_env/sdist_test.yml index 3597b42c6132..c21cd2b701e1 100644 --- a/tests/ci_build/conda_env/sdist_test.yml +++ b/ops/conda_env/sdist_test.yml @@ -9,5 +9,3 @@ dependencies: - cmake - ninja - python-build -- c-compiler -- cxx-compiler diff --git a/tests/ci_build/conda_env/win64_test.yml b/ops/conda_env/win64_test.yml similarity index 100% rename from tests/ci_build/conda_env/win64_test.yml rename to ops/conda_env/win64_test.yml diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml new file mode 100644 index 000000000000..348bf90f8a1f --- /dev/null +++ b/ops/docker/ci_container.yml @@ -0,0 +1,72 @@ +## List of CI containers with definitions and build arguments + +# 
Each container will be built using the definition from +# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF + +rapids_versions: + stable: &rapids_version "24.10" + dev: &dev_rapids_version "24.12" + +xgb-ci.gpu_build_rockylinux8: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *rapids_version + +xgb-ci.gpu_build_rockylinux8_dev_ver: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *dev_rapids_version + +xgb-ci.gpu_build_r_rockylinux8: + container_def: gpu_build_r_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + R_VERSION_ARG: "4.3.2" + +xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *rapids_version + +xgb-ci.gpu_dev_ver: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *dev_rapids_version + RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" + +xgb-ci.clang_tidy: + container_def: clang_tidy + build_args: + CUDA_VERSION_ARG: "12.4.1" + +xgb-ci.cpu: + container_def: cpu + +xgb-ci.aarch64: + container_def: aarch64 + +xgb-ci.manylinux_2_28_x86_64: + container_def: manylinux_2_28_x86_64 + +xgb-ci.manylinux2014_x86_64: + container_def: manylinux2014_x86_64 + +xgb-ci.manylinux2014_aarch64: + container_def: manylinux2014_aarch64 + +xgb-ci.jvm: + container_def: jvm + +xgb-ci.jvm_gpu_build: + container_def: jvm_gpu_build + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" diff --git a/ops/docker/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml new file mode 100644 index 000000000000..e20f35fc8020 --- /dev/null +++ b/ops/docker/docker_cache_ecr.yml @@ -0,0 +1,4 @@ +## Constants for AWS ECR (Elastic Container Registry), used for the Docker cache + +DOCKER_CACHE_ECR_ID: "492475357299" +DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/tests/ci_build/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 similarity index 97% rename from tests/ci_build/Dockerfile.aarch64 rename to ops/docker/dockerfile/Dockerfile.aarch64 index 8d6cfaca39fa..9dff2a05230b 100644 --- a/tests/ci_build/Dockerfile.aarch64 +++ b/ops/docker/dockerfile/Dockerfile.aarch64 @@ -32,7 +32,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy similarity index 96% rename from tests/ci_build/Dockerfile.clang_tidy rename to ops/docker/dockerfile/Dockerfile.clang_tidy index 2e7751a20185..de7d9bd3f254 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/ops/docker/dockerfile/Dockerfile.clang_tidy @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04 ARG CUDA_VERSION_ARG @@ -44,7 +44,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu similarity index 92% rename from tests/ci_build/Dockerfile.cpu rename to ops/docker/dockerfile/Dockerfile.cpu index 22db93572207..a426ce5da30c 100644 
--- a/tests/ci_build/Dockerfile.cpu +++ b/ops/docker/dockerfile/Dockerfile.cpu @@ -41,8 +41,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ COPY conda_env/linux_cpu_test.yml /scripts/ RUN mamba create -n linux_cpu_test && \ mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n linux_cpu_test pip install buildkite-test-collector + mamba clean --all --yes # Install lightweight sudo (not bound to TTY) RUN set -ex; \ @@ -52,7 +51,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu similarity index 76% rename from tests/ci_build/Dockerfile.gpu rename to ops/docker/dockerfile/Dockerfile.gpu index 501726e9ffba..96a532fc2ff1 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -1,8 +1,10 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG + # Should be first 4 digits (e.g. 24.06) ARG NCCL_VERSION_ARG +ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" # Environment ENV DEBIAN_FRONTEND=noninteractive @@ -24,16 +26,16 @@ ENV PATH=/opt/miniforge/bin:$PATH RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ - python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cuda-version=$CUDA_SHORT_VER \ + mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ + python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ "nccl>=${NCCL_SHORT_VER}" \ - dask \ - dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ + "dask<=2024.10.0" \ + "distributed<=2024.10.0" \ + "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n gpu_test pip install buildkite-test-collector + mamba clean --all --yes ENV GOSU_VERSION=1.10 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ @@ -46,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 similarity index 97% rename from tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 index 159e5d776c16..2d18b1eeb315 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG R_VERSION_ARG @@ -52,7 +52,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve 
attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 similarity index 94% rename from tests/ci_build/Dockerfile.gpu_build_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 index 8869fb468e12..b686bfbb2b0d 100644 --- a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG @@ -53,7 +53,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ # Install RMM # Patch out -Werror # Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957 -RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ +RUN git clone -b branch-${RAPIDS_VERSION_ARG} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ pushd rmm && \ find . -name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \ mkdir build && \ @@ -76,7 +76,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.i386 b/ops/docker/dockerfile/Dockerfile.i386 similarity index 100% rename from tests/ci_build/Dockerfile.i386 rename to ops/docker/dockerfile/Dockerfile.i386 diff --git a/tests/ci_build/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm similarity index 97% rename from tests/ci_build/Dockerfile.jvm rename to ops/docker/dockerfile/Dockerfile.jvm index c4584747f5db..9fd62e52de93 100644 --- a/tests/ci_build/Dockerfile.jvm +++ b/ops/docker/dockerfile/Dockerfile.jvm @@ -37,7 +37,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build similarity index 97% rename from tests/ci_build/Dockerfile.jvm_gpu_build rename to ops/docker/dockerfile/Dockerfile.jvm_gpu_build index edb5918b8bbc..4983493a6878 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG @@ -48,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 similarity index 82% rename from tests/ci_build/Dockerfile.manylinux2014_aarch64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 index 9627e15c64a0..7800033f552d 100644 --- a/tests/ci_build/Dockerfile.manylinux2014_aarch64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_aarch64 +RUN yum 
update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ @@ -9,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 similarity index 82% rename from tests/ci_build/Dockerfile.manylinux2014_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 index 11beb116ee43..8214b598d8d4 100644 --- a/tests/ci_build/Dockerfile.manylinux2014_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_x86_64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ @@ -9,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 similarity index 92% rename from tests/ci_build/Dockerfile.manylinux_2_28_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 index 5e264e2f16e6..f5dac54b9b8f 100644 --- a/tests/ci_build/Dockerfile.manylinux_2_28_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 @@ -9,7 +9,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/entrypoint.sh b/ops/docker/entrypoint.sh similarity index 70% rename from tests/ci_build/entrypoint.sh rename to ops/docker/entrypoint.sh index a0c5f56bb52d..40135c197c73 100755 --- a/tests/ci_build/entrypoint.sh +++ b/ops/docker/entrypoint.sh @@ -1,12 +1,10 @@ #!/usr/bin/env bash -# This script is a wrapper creating the same user inside container as the one -# running the ci_build.sh outside the container. It also set the home directory -# for the user inside container to match the same absolute path as the workspace -# outside of container. Do not run this manually. It does not make sense. It is -# intended to be called by ci_build.sh only. +# This wrapper script propagates the user information from the host +# to the container. This way, any files generated by processes running +# in the container will be accessible in the host. 
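+#
+# Hypothetical manual invocation (ops/docker_run.py normally supplies these
+# variables automatically; the image and command below are only examples):
+#   docker run --rm -v "$PWD:/workspace" -w /workspace \
+#     -e CI_BUILD_UID="$(id -u)" -e CI_BUILD_USER="$(id -un)" \
+#     -e CI_BUILD_GID="$(id -g)" -e CI_BUILD_GROUP="$(id -gn)" \
+#     xgb-ci.cpu bash ops/script/build_via_cmake.sh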
-set -e +set -euo pipefail COMMAND=("$@") @@ -19,7 +17,11 @@ else rm /this_is_writable_file_system fi -if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then +## Assumption: the host passes correct user information via environment variables +## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP + +if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] +then groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \ "${CI_BUILD_USER}" || true diff --git a/ops/docker/extract_build_args.jq b/ops/docker/extract_build_args.jq new file mode 100644 index 000000000000..b35240edb626 --- /dev/null +++ b/ops/docker/extract_build_args.jq @@ -0,0 +1,12 @@ +## Example input: +## xgb-ci.gpu_build_r_rockylinux8 +## Example output: +## --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2 +def compute_build_args($input; $container_id): + $input | + .[$container_id] | + select(.build_args != null) | + .build_args | + to_entries | + map("--build-arg " + .key + "=" + .value) | + join(" "); diff --git a/ops/docker/extract_build_args.sh b/ops/docker/extract_build_args.sh new file mode 100755 index 000000000000..42a83047742c --- /dev/null +++ b/ops/docker/extract_build_args.sh @@ -0,0 +1,26 @@ +#!/bin/bash +## Extract container definition and build args from ops/docker/ci_container.yml, +## given the container ID. +## +## Example input: +## xgb-ci.clang_tidy +## Example output: +## CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.4.1' + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 [container_id]" + exit 1 +fi + +CONTAINER_ID="$1" +CONTAINER_DEF=$( + yq -o json ops/docker/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' +) +BUILD_ARGS=$( + yq -o json ops/docker/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" \ + 'include "ops/docker/extract_build_args"; + compute_build_args(.; $container_id)' +) +echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/docker_build.py b/ops/docker_build.py new file mode 100644 index 000000000000..1fed975ce223 --- /dev/null +++ b/ops/docker_build.py @@ -0,0 +1,137 @@ +""" +Wrapper script to build a Docker container with layer caching +""" + +import argparse +import itertools +import pathlib +import subprocess +import sys +from typing import Optional + +from docker_run import OPS_DIR, fancy_print_cli_args + + +def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: + parsed_build_args = dict() + for arg in raw_build_args: + try: + key, value = arg.split("=", maxsplit=1) + except ValueError as e: + raise ValueError( + f"Build argument must be of form KEY=VALUE. Got: {arg}" + ) from e + parsed_build_args[key] = value + return parsed_build_args + + +def docker_build( + container_id: str, + *, + build_args: dict[str, str], + dockerfile_path: pathlib.Path, + docker_context_path: pathlib.Path, + cache_from: Optional[str], + cache_to: Optional[str], +) -> None: + ## Set up command-line arguments to be passed to `docker build` + # Build args + docker_build_cli_args = list( + itertools.chain.from_iterable( + [["--build-arg", f"{k}={v}"] for k, v in build_args.items()] + ) + ) + # When building an image using a non-default driver, we need to specify + # `--load` to load it to the image store. 
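+    # (Without --load, an image built with a non-default driver such as
+    # docker-container would stay in the build cache and would not appear
+    # in the local image store.)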
+    # See https://docs.docker.com/build/builders/drivers/
+    docker_build_cli_args.append("--load")
+    # Layer caching
+    if cache_from:
+        docker_build_cli_args.extend(["--cache-from", cache_from])
+    if cache_to:
+        docker_build_cli_args.extend(["--cache-to", cache_to])
+    # Remaining CLI args
+    docker_build_cli_args.extend(
+        [
+            "--progress=plain",
+            "--ulimit",
+            "nofile=1024000:1024000",
+            "-t",
+            container_id,
+            "-f",
+            str(dockerfile_path),
+            str(docker_context_path),
+        ]
+    )
+    cli_args = ["docker", "build"] + docker_build_cli_args
+    fancy_print_cli_args(cli_args)
+    subprocess.run(cli_args, check=True, encoding="utf-8")
+
+
+def main(args: argparse.Namespace) -> None:
+    # Dockerfile to be used in docker build
+    dockerfile_path = (
+        OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}"
+    )
+    docker_context_path = OPS_DIR
+
+    build_args = parse_build_args(args.build_arg)
+
+    docker_build(
+        args.container_id,
+        build_args=build_args,
+        dockerfile_path=dockerfile_path,
+        docker_context_path=docker_context_path,
+        cache_from=args.cache_from,
+        cache_to=args.cache_to,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build a Docker container")
+    parser.add_argument(
+        "--container-def",
+        type=str,
+        required=True,
+        help=(
+            "String uniquely identifying the container definition. The container "
+            "definition will be fetched from "
+            "docker/dockerfile/Dockerfile.CONTAINER_DEF."
+        ),
+    )
+    parser.add_argument(
+        "--container-id",
+        type=str,
+        required=True,
+        help="String ID to assign to the newly built container",
+    )
+    parser.add_argument(
+        "--build-arg",
+        type=str,
+        default=[],
+        action="append",
+        help=(
+            "Build-time variable(s) to be passed to `docker build`. Each variable "
+            "should be specified as a key-value pair in the form KEY=VALUE. "
+            "The variables should match the ARG instructions in the Dockerfile. "
+            "When passing multiple variables, specify --build-arg multiple times. "
+            "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10"
+        ),
+    )
+    parser.add_argument(
+        "--cache-from",
+        type=str,
+        help="Use an external cache source for the Docker build",
+    )
+    parser.add_argument(
+        "--cache-to",
+        type=str,
+        help="Export layers from the container to an external cache destination",
+    )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    parsed_args = parser.parse_args()
+    main(parsed_args)
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
new file mode 100755
index 000000000000..7d83daec9574
--- /dev/null
+++ b/ops/docker_build.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry).
+## This script provides a convenient wrapper for ops/docker_build.py.
+## Build-time variables (--build-arg) and container definition are fetched from
+## ops/docker/ci_container.yml.
+##
+## Note. This script takes in some inputs via environment variables.
+
+USAGE_DOC=$(
+cat <<-EOF
+Usage: ops/docker_build.sh [container_id]
+
+In addition, the following environment variables should be set.
+ - BRANCH_NAME: Name of the current git branch or pull request (Required) + - USE_DOCKER_CACHE: If set to 1, enable caching +EOF +) + +ECR_LIFECYCLE_RULE=$( +cat <<-EOF +{ + "rules": [ + { + "rulePriority": 1, + "selection": { + "tagStatus": "any", + "countType": "sinceImagePushed", + "countUnit": "days", + "countNumber": 30 + }, + "action": { + "type": "expire" + } + } + ] +} +EOF +) + +set -euo pipefail + +for arg in "BRANCH_NAME" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n\n${USAGE_DOC}" + exit 1 + fi +done + +if [[ "$#" -lt 1 ]] +then + echo "${USAGE_DOC}" + exit 2 +fi +CONTAINER_ID="$1" + +# Fetch CONTAINER_DEF and BUILD_ARGS +source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 + +if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false +then + USE_DOCKER_CACHE=0 +fi + +if [[ ${USE_DOCKER_CACHE} -eq 0 ]] +then + echo "USE_DOCKER_CACHE not set; caching disabled" +else + DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml) + DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" + echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" + # Login for Docker registry + echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \ + "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" + aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \ + | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} +fi + +# Pull pre-built container from the cache +# First try locating one for the particular branch or pull request +CACHE_FROM_CMD="" +IS_CACHED=0 +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}" + IS_CACHED=1 + else + # If there's no pre-built container from the cache, + # use the pre-built container from the master branch. + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master" + echo "Could not find a cached container for the branch ${BRANCH_NAME}." \ + "Using a cached container from the master branch: ${DOCKER_URL}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + IS_CACHED=1 + else + echo "Could not find a cached container for the master branch either." + IS_CACHED=0 + fi + fi + if [[ $IS_CACHED -eq 1 ]] + then + CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" + fi +fi + +# Run Docker build +set -x +python3 ops/docker_build.py \ + --container-def ${CONTAINER_DEF} \ + --container-id ${CONTAINER_ID} \ + ${BUILD_ARGS} \ + --cache-to type=inline \ + ${CACHE_FROM_CMD} +set +x + +# Now cache the new container +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" + docker tag "${CONTAINER_ID}" "${DOCKER_URL}" + + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" + if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} + then + # Repository was created. 
Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ + "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" + echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ + --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin + fi + + echo "docker push --quiet ${DOCKER_URL}" + if ! time docker push --quiet "${DOCKER_URL}" + then + echo "ERROR: could not update Docker cache ${DOCKER_URL}" + exit 1 + fi +fi diff --git a/ops/docker_run.py b/ops/docker_run.py new file mode 100644 index 000000000000..7e61c5a14f39 --- /dev/null +++ b/ops/docker_run.py @@ -0,0 +1,168 @@ +""" +Wrapper script to run a command inside a Docker container +""" + +import argparse +import grp +import itertools +import os +import pathlib +import pwd +import subprocess +import sys +import textwrap + +OPS_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = OPS_DIR.parent +LINEWIDTH = 88 +TEXT_WRAPPER = textwrap.TextWrapper( + width=LINEWIDTH, + initial_indent="", + subsequent_indent=" ", + break_long_words=False, + break_on_hyphens=False, +) + + +def parse_run_args(raw_run_args: str) -> list[str]: + return [x for x in raw_run_args.split() if x] + + +def get_user_ids() -> dict[str, str]: + uid = os.getuid() + gid = os.getgid() + return { + "CI_BUILD_UID": str(uid), + "CI_BUILD_USER": pwd.getpwuid(uid).pw_name, + "CI_BUILD_GID": str(gid), + "CI_BUILD_GROUP": grp.getgrgid(gid).gr_name, + } + + +def fancy_print_cli_args(cli_args: list[str]) -> None: + print( + "=" * LINEWIDTH + + "\n" + + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args))) + + "\n" + + "=" * LINEWIDTH + + "\n", + flush=True, + ) + + +def docker_run( + container_id: str, + command_args: list[str], + *, + use_gpus: bool, + workdir: pathlib.Path, + user_ids: dict[str, str], + extra_args: list[str], +) -> None: + # Command-line arguments to be passed to `docker run` + docker_run_cli_args = ["--rm", "--pid=host"] + + if use_gpus: + docker_run_cli_args.extend(["--gpus", "all"]) + + docker_run_cli_args.extend(["-v", f"{workdir}:/workspace", "-w", "/workspace"]) + docker_run_cli_args.extend( + itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) + ) + docker_run_cli_args.extend(extra_args) + docker_run_cli_args.append(container_id) + docker_run_cli_args.extend(command_args) + + cli_args = ["docker", "run"] + docker_run_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + run_args = parse_run_args(args.run_args) + user_ids = get_user_ids() + + if args.use_gpus: + print("Using NVIDIA GPUs for `docker run`") + if args.interactive: + print("Using interactive mode for `docker run`") + run_args.append("-it") + + docker_run( + args.container_id, + args.command_args, + use_gpus=args.use_gpus, + workdir=args.workdir, + user_ids=user_ids, + extra_args=run_args, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + usage=( + f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] " + "[--workdir WORKDIR] [--run-args RUN_ARGS] -- COMMAND_ARG " + "[COMMAND_ARG ...]" + ), + description="Run tasks inside a Docker container", + ) + parser.add_argument( + "--container-id", + type=str, + required=True, + help="String ID of the container to run.", + ) + parser.add_argument( + "--use-gpus", + action="store_true", + help=( + "Grant the container access to NVIDIA GPUs; 
requires the NVIDIA "
+            "Container Toolkit."
+        ),
+    )
+    parser.add_argument(
+        "--interactive",
+        action="store_true",
+        help=(
+            "Run the container in interactive mode; requires an interactive shell "
+            "(TTY). With this flag, you can use Ctrl-C to interrupt a long-running "
+            "command."
+        ),
+    )
+    parser.add_argument(
+        "--workdir",
+        type=lambda p: pathlib.Path(p).expanduser().resolve(),
+        default=PROJECT_ROOT_DIR,
+        help="Path to working directory; if unset, use the project's root",
+    )
+    parser.add_argument(
+        "--run-args",
+        type=str,
+        default="",
+        help=(
+            "Argument(s) to be passed to `docker run`. When passing multiple "
+            "arguments, use single quotes to wrap them. Example: "
+            "--run-args '--cap-add SYS_PTRACE --shm-size=4g'"
+        ),
+    )
+    parser.add_argument(
+        "command_args",
+        metavar="COMMAND_ARG",
+        type=str,
+        nargs="+",
+        help=(
+            "Argument(s) for the command to execute. NOTE. Make sure to specify "
+            "double-dash (--) to clearly distinguish between the command and the "
+            "preceding parameters. Example: --run-args '--cap-add SYS_PTRACE "
+            "--shm-size=4g' -- ./myprog"
+        ),
+    )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    parsed_args = parser.parse_args()
+    main(parsed_args)
diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh
new file mode 100644
index 000000000000..57be6e14b507
--- /dev/null
+++ b/ops/packer/linux/bootstrap.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+set -euo pipefail
+
+## Install Docker
+# Add Docker's official GPG key:
+sudo install -m 0755 -d /etc/apt/keyrings
+sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+sudo chmod a+r /etc/apt/keyrings/docker.asc
+# Add the repository to Apt sources:
+echo \
+  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+  $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \
+  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+sudo apt-get update
+sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+# Allow users to use Docker without sudo
+sudo usermod -aG docker ubuntu
+
+# Start Docker daemon
+sudo systemctl is-active --quiet docker.service || sudo systemctl start docker.service
+sudo systemctl is-enabled --quiet docker.service || sudo systemctl enable docker.service
+sleep 10  # Docker daemon takes time to come up after installing
+sudo docker info
+
+## Install NVIDIA Container Toolkit
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+
+sleep 10
+sudo docker run --rm --gpus all ubuntu nvidia-smi
+sudo systemctl stop docker
+
+## Install AWS CLI v2
+wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip
+unzip -q awscliv2.zip
+sudo ./aws/install
+rm -rf ./aws/ ./awscliv2.zip
+
+## Install jq and yq
+sudo apt-get update && sudo apt-get install -y jq
+mkdir yq/
+pushd yq/
+wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \
+  tar xz && sudo mv ./yq_linux_amd64 /usr/bin/yq
+popd
+rm -rf yq/
diff --git a/ops/packer/linux/install_drivers.sh b/ops/packer/linux/install_drivers.sh
new file mode 100644
index 000000000000..07309be836a8
--- /dev/null
+++ b/ops/packer/linux/install_drivers.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -euo pipefail
+
+## Install basic tools
+echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
+sudo apt-get update
+sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip
+
+## Install CUDA Toolkit 12.6 (Driver will be installed later)
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install cuda-toolkit-12-6 cuda-drivers-565
+rm cuda-keyring_1.1-1_all.deb
diff --git a/ops/packer/linux/linux.pkr.hcl b/ops/packer/linux/linux.pkr.hcl
new file mode 100644
index 000000000000..c6990894764a
--- /dev/null
+++ b/ops/packer/linux/linux.pkr.hcl
@@ -0,0 +1,79 @@
+packer {
+  required_plugins {
+    amazon = {
+      source  = "github.com/hashicorp/amazon"
+      version = "~> 1"
+    }
+  }
+}
+
+locals {
+  ami_name_prefix = "xgboost-ci"
+  image_name      = "RunsOn worker with Ubuntu 24.04 + CUDA driver"
+  region          = "us-west-2"
+  timestamp       = regex_replace(timestamp(), "[- TZ:]", "")
+  volume_size     = 40
+}
+
+data "amazon-ami" "aws-ubuntu-x64" {
+  filters = {
+    name                = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*"
+    root-device-type    = "ebs"
+    virtualization-type = "hvm"
+  }
+  most_recent = true
+  owners      = ["amazon"]
+}
+
+source "amazon-ebs" "runs-on-linux" {
+  source_ami                  = "${data.amazon-ami.aws-ubuntu-x64.id}"
+  ami_name                    = "${local.ami_name_prefix}-runs-on-linux-${local.timestamp}"
+  ami_description             = "${local.image_name}"
+  ami_regions                 = ["${local.region}"]
+  
ami_virtualization_type = "hvm" + associate_public_ip_address = true + communicator = "ssh" + instance_type = "g4dn.xlarge" + region = "${local.region}" + ssh_timeout = "10m" + ssh_username = "ubuntu" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.sh" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 1 hour until the AMI is ready + delay_seconds = 15 + max_attempts = 240 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-linux"] + + provisioner "shell" { + script = "install_drivers.sh" + pause_after = "30s" + } + + provisioner "shell" { + expect_disconnect = true + inline = ["echo 'Reboot VM'", "sudo reboot"] + } + + provisioner "shell" { + pause_before = "1m0s" + script = "bootstrap.sh" + } +} diff --git a/ops/packer/linux/setup_ssh.sh b/ops/packer/linux/setup_ssh.sh new file mode 100644 index 000000000000..501b4da455f5 --- /dev/null +++ b/ops/packer/linux/setup_ssh.sh @@ -0,0 +1,2 @@ +#!/bin/bash +systemctl start ssh diff --git a/ops/packer/windows/bootstrap.ps1 b/ops/packer/windows/bootstrap.ps1 new file mode 100644 index 000000000000..c67f3b73fb9a --- /dev/null +++ b/ops/packer/windows/bootstrap.ps1 @@ -0,0 +1,73 @@ +## Install packages from Chocolatey + +# jq & yq +Write-Output "Installing jq and yq..." +choco install jq --version=1.7.1 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install yq --version=4.40.2 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# AWS CLI +Write-Output "Installing AWS CLI..." +choco install awscli --version=2.18.11 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Git +Write-Host '>>> Installing Git...' +choco install git --version=2.47.0 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# CMake +Write-Host '>>> Installing CMake 3.30.5...' +choco install cmake --version 3.30.5 --installargs "ADD_CMAKE_TO_PATH=System" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Notepad++ +Write-Host '>>> Installing Notepad++...' +choco install notepadplusplus +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Miniforge3 +Write-Host '>>> Installing Miniforge3...' +choco install miniforge3 --params="'/InstallationType:AllUsers /RegisterPython:1 /D:C:\tools\miniforge3'" +C:\tools\miniforge3\Scripts\conda.exe init --user --system +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +conda config --set auto_activate_base false + +# Java 11 +Write-Host '>>> Installing Java 11...' +choco install openjdk11 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Maven +Write-Host '>>> Installing Maven...' +choco install maven +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# GraphViz +Write-Host '>>> Installing GraphViz...' +choco install graphviz +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Visual Studio 2022 Community +Write-Host '>>> Installing Visual Studio 2022 Community...' 
+choco install visualstudio2022community `
+    --params "--wait --passive --norestart"
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+choco install visualstudio2022-workload-nativedesktop --params `
+    "--wait --passive --norestart --includeOptional"
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# CUDA 12.5
+Write-Host '>>> Installing CUDA 12.5...'
+choco install cuda --version=12.5.1.555
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# R 4.3
+Write-Host '>>> Installing R...'
+choco install r.project --version=4.3.2
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+choco install rtools --version=4.3.5550
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
diff --git a/ops/packer/windows/install_choco.ps1 b/ops/packer/windows/install_choco.ps1
new file mode 100644
index 000000000000..131e8129feaa
--- /dev/null
+++ b/ops/packer/windows/install_choco.ps1
@@ -0,0 +1,14 @@
+## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/InstallChoco.ps1
+## Author: Christopher Horrell (https://github.com/chorrell)
+
+$ErrorActionPreference = "Stop"
+
+# Install Chocolatey
+# See https://chocolatey.org/install#individual
+Set-ExecutionPolicy Bypass -Scope Process -Force
+[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
+Invoke-Expression ((New-Object System.Net.WebClient).DownloadString("https://community.chocolatey.org/install.ps1"))
+
+# Globally Auto confirm every action
+# See: https://docs.chocolatey.org/en-us/faqs#why-do-i-have-to-confirm-packages-now-is-there-a-way-to-remove-this
+choco feature enable -n allowGlobalConfirmation
diff --git a/ops/packer/windows/setup_ssh.ps1 b/ops/packer/windows/setup_ssh.ps1
new file mode 100644
index 000000000000..a7bdee898002
--- /dev/null
+++ b/ops/packer/windows/setup_ssh.ps1
@@ -0,0 +1,58 @@
+
+## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/SetupSsh.ps1
+## Author: Christopher Horrell (https://github.com/chorrell)
+
+# Don't display progress bars
+# See: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_preference_variables?view=powershell-7.3#progresspreference
+$ProgressPreference = "SilentlyContinue"
+$ErrorActionPreference = "Stop"
+
+# Install OpenSSH using Add-WindowsCapability
+# See: https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse?tabs=powershell#install-openssh-for-windows
+
+Write-Host "Installing and starting ssh-agent"
+Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0
+Set-Service -Name ssh-agent -StartupType Automatic
+Start-Service ssh-agent
+
+Write-Host "Installing and starting sshd"
+Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
+Set-Service -Name sshd -StartupType Automatic
+Start-Service sshd
+
+# Confirm that the firewall rule is configured. Setup should create it automatically;
+# create it here if it is missing.
+if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) {
+    Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..."
+    New-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -DisplayName "OpenSSH Server (sshd)" -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22
+} else {
+    Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' already exists."
+}
+
+# Set default shell to PowerShell
+New-ItemProperty -Path "HKLM:\SOFTWARE\OpenSSH" -Name DefaultShell -Value "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -PropertyType String -Force
+
+$keyDownloadScript = Join-Path $env:ProgramData "ssh\download-key.ps1"
+
+@'
+# Download the instance's public key to $env:ProgramData\ssh\administrators_authorized_keys
+$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys"
+
+$keyUrl = "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key"
+Invoke-WebRequest $keyUrl -OutFile $openSSHAuthorizedKeys
+
+# Ensure ACL for administrators_authorized_keys is correct
+# See https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_server_configuration#authorizedkeysfile
+icacls.exe $openSSHAuthorizedKeys /inheritance:r /grant "Administrators:F" /grant "SYSTEM:F"
+'@ | Out-File $keyDownloadScript
+
+# Create Task
+$taskName = "DownloadKey"
+$principal = New-ScheduledTaskPrincipal -UserID "NT AUTHORITY\SYSTEM" -LogonType ServiceAccount -RunLevel Highest
+$action = New-ScheduledTaskAction -Execute "Powershell.exe" -Argument "-NoProfile -File ""$keyDownloadScript"""
+$trigger = New-ScheduledTaskTrigger -AtStartup
+Register-ScheduledTask -Action $action -Trigger $trigger -Principal $principal -TaskName $taskName -Description $taskName
+
+# Fetch key via $keyDownloadScript
+& Powershell.exe -ExecutionPolicy Bypass -File $keyDownloadScript
+
+
diff --git a/ops/packer/windows/sysprep.ps1 b/ops/packer/windows/sysprep.ps1
new file mode 100644
index 000000000000..a0470309f9da
--- /dev/null
+++ b/ops/packer/windows/sysprep.ps1
@@ -0,0 +1,14 @@
+## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/PrepareImage.ps1
+## Author: Christopher Horrell (https://github.com/chorrell)
+
+$ErrorActionPreference = "Stop"
+
+Write-Output "Cleaning up keys"
+$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys"
+Remove-Item -Recurse -Force -Path $openSSHAuthorizedKeys
+
+# Make sure task is enabled
+Enable-ScheduledTask "DownloadKey"
+
+Write-Output "Running Sysprep"
+& "$Env:Programfiles\Amazon\EC2Launch\ec2launch.exe" sysprep
diff --git a/ops/packer/windows/windows.pkr.hcl b/ops/packer/windows/windows.pkr.hcl
new file mode 100644
index 000000000000..4c14b7b75806
--- /dev/null
+++ b/ops/packer/windows/windows.pkr.hcl
@@ -0,0 +1,90 @@
+packer {
+  required_plugins {
+    amazon = {
+      source  = "github.com/hashicorp/amazon"
+      version = "~> 1"
+    }
+    windows-update = {
+      version = "0.15.0"
+      source  = "github.com/rgl/windows-update"
+    }
+  }
+}
+
+locals {
+  ami_name_prefix = "xgboost-ci"
+  image_name      = "RunsOn worker with Windows Server 2022 + ssh + CUDA driver"
+  region          = "us-west-2"
+  timestamp       = regex_replace(timestamp(), "[- TZ:]", "")
+  volume_size     = 120
+}
+
+data "amazon-ami" "aws-windows-x64" {
+  filters = {
+    name                = "Windows_Server-2022-English-Full-Base-*"
+    root-device-type    = "ebs"
+    virtualization-type = "hvm"
+  }
+  most_recent = true
+  owners      = ["amazon"]
+}
+
+source "amazon-ebs" "runs-on-windows" {
+  source_ami                  = "${data.amazon-ami.aws-windows-x64.id}"
+  ami_name                    = "${local.ami_name_prefix}-runs-on-windows-${local.timestamp}"
+  ami_description             = "${local.image_name}"
+  ami_regions                 = ["${local.region}"]
+  ami_virtualization_type     = "hvm"
+  associate_public_ip_address = true
+  communicator                = "ssh"
+  instance_type               = "g4dn.xlarge"
+  region                      = "${local.region}"
+  ssh_timeout                 = "10m"
+  ssh_username                = 
"Administrator" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.ps1" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 2.5 hours until the AMI is ready + delay_seconds = 15 + max_attempts = 600 + } + fast_launch { + enable_fast_launch = true + target_resource_count = 10 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-windows"] + + provisioner "windows-update" {} + + provisioner "powershell" { + script = "install_choco.ps1" + } + + provisioner "windows-restart" { + max_retries = 3 + } + + provisioner "powershell" { + script = "bootstrap.ps1" + } + + provisioner "powershell" { # Sysprep should run the last + script = "sysprep.ps1" + } +} diff --git a/tests/buildkite/cpu_only_pypkg.patch b/ops/patch/cpu_only_pypkg.patch similarity index 100% rename from tests/buildkite/cpu_only_pypkg.patch rename to ops/patch/cpu_only_pypkg.patch diff --git a/tests/buildkite/manylinux2014_warning.patch b/ops/patch/manylinux2014_warning.patch similarity index 100% rename from tests/buildkite/manylinux2014_warning.patch rename to ops/patch/manylinux2014_warning.patch diff --git a/tests/buildkite/remove_nccl_dep.patch b/ops/patch/remove_nccl_dep.patch similarity index 100% rename from tests/buildkite/remove_nccl_dep.patch rename to ops/patch/remove_nccl_dep.patch diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh new file mode 100755 index 000000000000..ff948ca0c77a --- /dev/null +++ b/ops/pipeline/build-cpu-arm64.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +WHEEL_TAG=manylinux_2_28_aarch64 + +echo "--- Build CPU code targeting ARM64" + +echo "--- Build libxgboost from the source" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- ops/script/build_via_cmake.sh \ + --conda-env=aarch64_test \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOL=ON + +echo "--- Run Google Test" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c "cd build && ctest --extra-verbose" + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/"
+python3 ops/script/rename_whl.py \
+  --wheel-path python-package/dist/*.whl \
+  --commit-hash ${GITHUB_SHA} \
+  --platform-tag ${WHEEL_TAG}
+
+echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
+python3 ops/docker_run.py \
+  --container-id xgb-ci.aarch64 \
+  -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
+python3 ops/script/rename_whl.py \
+  --wheel-path wheelhouse/*.whl \
+  --commit-hash ${GITHUB_SHA} \
+  --platform-tag ${WHEEL_TAG}
+mv -v wheelhouse/*.whl python-package/dist/
+
+# Make sure that libgomp.so is vendored in the wheel
+python3 ops/docker_run.py \
+  --container-id xgb-ci.aarch64 \
+  -- bash -c \
+  "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh
new file mode 100755
index 000000000000..dc0572f0ca4d
--- /dev/null
+++ b/ops/pipeline/build-cpu.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -euox pipefail
+
+echo "--- Build CPU code"
+
+# This step is not strictly necessary, but we include it here to ensure that the
+# DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use
+# the configured header build/dmlc/build_config.h instead of
+# include/dmlc/build_config_default.h.
+rm -fv dmlc-core/include/dmlc/build_config_default.h
+
+# Sanitizer tests
+echo "--- Run Google Test with sanitizer enabled"
+# Work around https://github.com/google/sanitizers/issues/1614
+sudo sysctl vm.mmap_rnd_bits=28
+python3 ops/docker_run.py \
+  --container-id xgb-ci.cpu \
+  -- ops/script/build_via_cmake.sh \
+  -DUSE_SANITIZER=ON \
+  -DENABLED_SANITIZERS="address;leak;undefined" \
+  -DCMAKE_BUILD_TYPE=Debug \
+  -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
+python3 ops/docker_run.py \
+  --container-id xgb-ci.cpu \
+  --run-args '-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer
+  -e ASAN_OPTIONS=symbolize=1
+  -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log
+  --cap-add SYS_PTRACE' \
+  -- bash -c \
+  "cd build && ./testxgboost --gtest_filter=-*DeathTest*"
+
+echo "--- Run Google Test"
+python3 ops/docker_run.py \
+  --container-id xgb-ci.cpu \
+  -- ops/script/build_via_cmake.sh \
+  -DCMAKE_PREFIX_PATH=/opt/grpc \
+  -DPLUGIN_FEDERATED=ON
+python3 ops/docker_run.py \
+  --container-id xgb-ci.cpu \
+  -- bash -c "cd build && ctest --extra-verbose"
diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh
new file mode 100755
index 000000000000..479c9a1b1a28
--- /dev/null
+++ b/ops/pipeline/build-cuda-with-rmm.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+## Build XGBoost with CUDA + RMM support
+
+set -euo pipefail
+
+if [[ -z "${GITHUB_SHA:-}" ]]
+then
+  echo "Make sure to set environment variable GITHUB_SHA"
+  exit 1
+fi
+
+if [[ "$#" -lt 1 ]]
+then
+  echo "Usage: $0 [container_id]"
+  exit 1
+fi
+container_id="$1"
+
+source ops/pipeline/classify-git-branch.sh
+
+set -x
+
+WHEEL_TAG=manylinux_2_28_x86_64
+
+echo "--- Build with CUDA + RMM"
+
+if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+then
+  arch_flag="-DGPU_COMPUTE_VER=75"
+else
+  arch_flag=""
+fi
+
+echo "--- Build libxgboost from the source"
+python3 ops/docker_run.py \
+  --container-id "${container_id}" \
+  -- ops/script/build_via_cmake.sh \
+  -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \
+  -DUSE_CUDA=ON \
+  -DUSE_OPENMP=ON \
+  -DHIDE_CXX_SYMBOLS=ON \
+  -DPLUGIN_FEDERATED=ON \
+  -DPLUGIN_RMM=ON \
+  -DUSE_NCCL=ON \
+  -DUSE_NCCL_LIB_PATH=ON \
+  -DNCCL_INCLUDE_DIR=/usr/include \
+  -DUSE_DLOPEN_NCCL=ON \
+  ${arch_flag}
+
+echo 
"--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id "${container_id}" \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" +python3 ops/script/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.${WHEEL_TAG} \ + -- auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.${WHEEL_TAG} \ + -- bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh new file mode 100755 index 000000000000..49475c01c69e --- /dev/null +++ b/ops/pipeline/build-cuda.sh @@ -0,0 +1,85 @@ +#!/bin/bash +## Build XGBoost with CUDA + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +WHEEL_TAG=manylinux_2_28_x86_64 + +source ops/pipeline/classify-git-branch.sh + +echo "--- Build with CUDA" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +echo "--- Build libxgboost from the source" +set -x +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/script/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/"
+python3 ops/script/rename_whl.py \
+  --wheel-path python-package/dist/*.whl \
+  --commit-hash ${GITHUB_SHA} \
+  --platform-tag ${WHEEL_TAG}
+
+echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
+python3 ops/docker_run.py \
+  --container-id xgb-ci.manylinux_2_28_x86_64 \
+  -- auditwheel repair \
+  --plat ${WHEEL_TAG} python-package/dist/*.whl
+python3 ops/script/rename_whl.py \
+  --wheel-path wheelhouse/*.whl \
+  --commit-hash ${GITHUB_SHA} \
+  --platform-tag ${WHEEL_TAG}
+mv -v wheelhouse/*.whl python-package/dist/
+# Make sure that libgomp.so is vendored in the wheel
+python3 ops/docker_run.py \
+  --container-id xgb-ci.manylinux_2_28_x86_64 \
+  -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
+
+# Generate the meta info which includes xgboost version and the commit info
+python3 ops/docker_run.py \
+  --container-id xgb-ci.gpu_build_rockylinux8 \
+  -- python ops/script/format_wheel_meta.py \
+    --wheel-path python-package/dist/*.whl \
+    --commit-hash ${GITHUB_SHA} \
+    --platform-tag ${WHEEL_TAG} \
+    --meta-path python-package/dist/
+
+echo "--- Upload Python wheel"
+if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
+then
+  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
+    --acl public-read --no-progress
+  aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
+    --acl public-read --no-progress
+fi
diff --git a/tests/ci_build/build_r_pkg_with_cuda.sh b/ops/pipeline/build-gpu-rpkg-impl.sh
similarity index 73%
rename from tests/ci_build/build_r_pkg_with_cuda.sh
rename to ops/pipeline/build-gpu-rpkg-impl.sh
index 78a2afc1cdf7..2815b8f448f1 100755
--- a/tests/ci_build/build_r_pkg_with_cuda.sh
+++ b/ops/pipeline/build-gpu-rpkg-impl.sh
@@ -1,8 +1,12 @@
 #!/bin/bash
-set -e
-set -x
-
-if [ "$#" -ne 1 ]
+## Build XGBoost R package with GPU support and package it in a tarball.
+## Users will be able to install it without having CTK installed
+## (only a compatible NVIDIA driver is needed).
+
+set -euo pipefail
+
+if [[ "$#" -ne 1 ]]
 then
   echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]"
   exit 1
@@ -10,7 +14,7 @@ fi
 
 commit_hash="$1"
 
-python tests/ci_build/test_r_package.py --task=pack
+python3 ops/script/test_r_package.py --task=pack
 mv xgboost/ xgboost_rpack/
 
 mkdir build
diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh
new file mode 100755
index 000000000000..d1384ef766a6
--- /dev/null
+++ b/ops/pipeline/build-gpu-rpkg.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -euox pipefail
+
+if [[ -z "${GITHUB_SHA:-}" ]]
+then
+  echo "Make sure to set environment variable GITHUB_SHA"
+  exit 1
+fi
+
+echo "--- Build XGBoost R package with CUDA"
+python3 ops/docker_run.py \
+  --container-id xgb-ci.gpu_build_r_rockylinux8 \
+  -- ops/pipeline/build-gpu-rpkg-impl.sh \
+  ${GITHUB_SHA}
diff --git a/ops/pipeline/build-jvm-doc-impl.sh b/ops/pipeline/build-jvm-doc-impl.sh
new file mode 100755
index 000000000000..4e95f284e25c
--- /dev/null
+++ b/ops/pipeline/build-jvm-doc-impl.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+## Build docs for the JVM packages and package them in a tarball.
+## Note: this script assumes that the user has already built libxgboost4j.so
+## and placed it in the lib/ directory.
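+##
+## Illustrative usage (assumes libxgboost4j.so was copied into lib/ beforehand):
+##   bash ops/pipeline/build-jvm-doc-impl.sh master
+## This produces master.tar.bz2 containing javadocs/ and scaladocs/.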
+
+if [[ $# -ne 1 ]]
+then
+  echo "Usage: $0 [branch name]"
+  exit 1
+fi
+
+set -euo pipefail
+
+branch_name=$1
+
+# Copy in libxgboost4j.so
+mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/
+cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/
+
+cd jvm-packages/
+# Install JVM packages in local Maven repository
+mvn --no-transfer-progress install -Pdocs
+# Build Scaladocs
+mvn --no-transfer-progress scala:doc -Pdocs
+# Build Javadocs
+mvn --no-transfer-progress javadoc:javadoc -Pdocs
+
+# Package JVM docs in a tarball
+mkdir -p tmp/scaladocs
+cp -rv xgboost4j/target/reports/apidocs/ ./tmp/javadocs/
+cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/
+cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/
+cp -rv xgboost4j-spark-gpu/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark-gpu/
+cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/
+
+cd tmp
+tar cvjf ${branch_name}.tar.bz2 javadocs/ scaladocs/
+mv ${branch_name}.tar.bz2 ..
+cd ..
+rm -rfv tmp/
+
+set +x
+set +e
diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh
new file mode 100755
index 000000000000..00fdac7a1353
--- /dev/null
+++ b/ops/pipeline/build-jvm-doc.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+## Build docs for the JVM packages and package them in a tarball.
+## Note: this script assumes that the user has already built libxgboost4j.so
+## and placed it in the lib/ directory.
+
+set -euox pipefail
+
+echo "--- Build JVM packages doc"
+
+if [[ -z ${BRANCH_NAME:-} ]]
+then
+  echo "Make sure to define environment variable BRANCH_NAME."
+  exit 2
+fi
+
+if [[ ! -f lib/libxgboost4j.so ]]
+then
+  echo "Must place libxgboost4j.so in lib/ first"
+  exit 2
+fi
+
+python3 ops/docker_run.py \
+  --container-id xgb-ci.jvm_gpu_build \
+  -- ops/pipeline/build-jvm-doc-impl.sh ${BRANCH_NAME}
diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh
new file mode 100755
index 000000000000..7656a3d2f188
--- /dev/null
+++ b/ops/pipeline/build-jvm-gpu.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+## Build libxgboost4j.so with CUDA
+
+set -euo pipefail
+
+source ops/pipeline/classify-git-branch.sh
+
+echo "--- Build libxgboost4j.so with CUDA"
+
+if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+then
+  arch_flag="-DGPU_COMPUTE_VER=75"
+else
+  arch_flag=""
+fi
+
+COMMAND=$(
+cat <<-EOF
+cd build-gpu/ && \
+cmake .. 
-DCMAKE_PREFIX_PATH=/workspace/cccl -GNinja -DUSE_CUDA=ON -DUSE_NCCL=ON \ + -DJVM_BINDINGS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ${arch_flag} && \ + ninja +EOF +) + +set -x +mkdir -p build-gpu/ +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet --depth 1 +python3 ops/docker_run.py \ + --container-id xgb-ci.jvm_gpu_build \ + -- bash -c "${COMMAND}" diff --git a/tests/buildkite/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh old mode 100644 new mode 100755 similarity index 50% rename from tests/buildkite/build-jvm-macos-m1.sh rename to ops/pipeline/build-jvm-macos-apple-silicon.sh index 1d2e5e8703bc..cfba35d0f96a --- a/tests/buildkite/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -1,8 +1,7 @@ #!/bin/bash +## Build libxgboost4j.dylib targeting MacOS (Apple Silicon) -set -euo pipefail - -source tests/buildkite/conftest.sh +set -euox pipefail # Display system info echo "--- Display system information" @@ -12,6 +11,8 @@ sysctl -n machdep.cpu.brand_string uname -m set +x +brew install ninja libomp + # Build XGBoost4J binary echo "--- Build libxgboost4j.dylib" set -x @@ -24,18 +25,3 @@ popd rm -rf build otool -L lib/libxgboost.dylib set +x - -echo "--- Upload libxgboost4j.dylib" -set -x -pushd lib -libname=libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib -mv -v libxgboost4j.dylib ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd -set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh new file mode 100755 index 000000000000..5e73b03b7f6e --- /dev/null +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -0,0 +1,26 @@ +#!/bin/bash +## Build libxgboost4j.dylib targeting MacOS (Intel) + +set -euox pipefail + +# Display system info +echo "--- Display system information" +set -x +system_profiler SPSoftwareDataType +sysctl -n machdep.cpu.brand_string +uname -m +set +x + +brew install ninja libomp + +# Build XGBoost4J binary +echo "--- Build libxgboost4j.dylib" +set -x +mkdir build +pushd build +export JAVA_HOME=$(/usr/libexec/java_home) +cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 +ninja -v +popd +rm -rf build +otool -L lib/libxgboost.dylib diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh new file mode 100755 index 000000000000..e69dd3682b90 --- /dev/null +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Build libxgboost4j.so targeting glibc 2.17 systems + +set -euox pipefail + +if [[ $# -ne 1 ]] +then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch=$1 + +image="xgb-ci.manylinux2014_${arch}" + +# Build XGBoost4J binary +echo "--- Build libxgboost4j.so (targeting glibc 2.17)" +set -x +mkdir build +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ + "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" +ldd lib/libxgboost4j.so +objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh new file mode 100755 index 000000000000..a8f5af8bc3cd --- /dev/null +++ b/ops/pipeline/build-manylinux2014.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch="$1" + +WHEEL_TAG="manylinux2014_${arch}" +image="xgb-ci.${WHEEL_TAG}" + +python_bin="/opt/python/cp310-cp310/bin/python" + +echo "--- Build binary wheel for ${WHEEL_TAG}" +# Patch to add warning about manylinux2014 variant +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/manylinux2014_warning.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" +git checkout python-package/pyproject.toml python-package/xgboost/core.py + # discard the patch + +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +rm -rf python-package/dist/ +mkdir python-package/dist/ +mv -v wheelhouse/*.whl python-package/dist/ + +echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" +# Patch to rename pkg to xgboost-cpu +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/cpu_only_pypkg.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" +git checkout python-package/pyproject.toml # discard the patch + +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/xgboost_cpu-*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +rm -v python-package/dist/xgboost_cpu-*.whl +mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ diff --git a/tests/ci_build/build_python_wheels.sh b/ops/pipeline/build-python-wheels-macos.sh old mode 100644 new mode 100755 similarity index 94% rename from tests/ci_build/build_python_wheels.sh rename to ops/pipeline/build-python-wheels-macos.sh index d9927905cf83..697514c0c3ad --- a/tests/ci_build/build_python_wheels.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -1,7 +1,6 @@ #!/bin/bash -set -e -set -x +set -euox pipefail if [[ $# -ne 2 ]]; then echo "Usage: $0 [platform_id] [commit ID]" @@ -31,7 +30,6 @@ if [[ "$platform_id" == macosx_* ]]; then # Set up environment variables to configure cibuildwheel export CIBW_BUILD=cp${cpython_ver}-${platform_id} export CIBW_ARCHS=${cibw_archs} - export CIBW_ENVIRONMENT=${setup_env_var} export CIBW_TEST_SKIP='*-macosx_arm64' export CIBW_BUILD_VERBOSITY=3 else @@ -44,7 +42,7 @@ export CIBW_REPAIR_WHEEL_COMMAND_MACOS="delocate-wheel --require-archs {delocate python -m pip install cibuildwheel python -m cibuildwheel python-package --output-dir wheelhouse -python tests/ci_build/rename_whl.py \ +python ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${commit_id} \ --platform-tag ${wheel_tag} diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh new file mode 100755 index 000000000000..ed95ba3368ab --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -0,0 +1,79 @@ +#!/bin/bash +## Build and test JVM packages. +## Companion script for build-test-jvm-packages.sh. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) + - USE_CUDA: Set to 1 to enable CUDA + - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided + libxgboost4j.so. (Usually Maven will invoke create_jni.py to + build it from scratch.) When using this option, make sure to + place libxgboost4j.so in lib/ directory. +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +# Set Scala version +if [[ "${SCALA_VERSION}" == "2.12" || "${SCALA_VERSION}" == "2.13" ]] +then + python ops/script/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts +else + echo "Error: SCALA_VERSION must be either 2.12 or 2.13" + exit 2 +fi + +# If SKIP_NATIVE_BUILD is set, copy in libxgboost4j.so from lib/ +# Also copy in other files needed for testing. (Usually create_jni.py would perform this +# step, but we need to do it manually here.) 
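+#
+# Illustrative invocation (values are assumptions; the wrapper script
+# build-test-jvm-packages.sh passes these via `docker run -e`):
+#   SCALA_VERSION=2.13 SKIP_NATIVE_BUILD=1 bash ops/pipeline/build-test-jvm-packages-impl.sh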
+if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + bash ops/script/inject_jvm_lib.sh +fi + +cd jvm-packages/ + +# Ensure that XGBoost4J-Spark is compatible with multiple versions of Spark +if [[ "${USE_CUDA:-}" != "1" && "${SCALA_VERSION}" == "2.12" ]] +then + for spark_version in 3.1.3 3.2.4 3.3.4 3.4.3 + do + mvn --no-transfer-progress clean package -Dspark.version=${spark_version} \ + -pl xgboost4j,xgboost4j-spark + done +fi + +set +x +mvn_options="" +if [[ "${USE_CUDA:-}" == "1" ]] +then + mvn_options="${mvn_options} -Pgpu" +fi +if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + mvn_options="${mvn_options} -Dskip.native.build=true" +fi +set -x + +mvn --no-transfer-progress clean install ${mvn_options} + +# Integration tests +if [[ "${USE_CUDA:-}" != "1" ]] +then + mvn --no-transfer-progress test -pl xgboost4j-example +fi diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh new file mode 100755 index 000000000000..d04cc3510de5 --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -0,0 +1,28 @@ +#!/bin/bash +## Build and test JVM packages. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +python3 ops/docker_run.py --container-id xgb-ci.jvm \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION}" \ + -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/build-test-sycl.sh b/ops/pipeline/build-test-sycl.sh new file mode 100755 index 000000000000..f3b651b18cf9 --- /dev/null +++ b/ops/pipeline/build-test-sycl.sh @@ -0,0 +1,33 @@ +#!/bin/bash +## Build and test oneAPI + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gtest,pytest}" + exit 1 +fi + +suite="$1" + +mkdir build +pushd build +cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja +ninja +popd + +case "$suite" in + gtest) + ./build/testxgboost + ;; + pytest) + cd python-package + python --version + pip install -v . + cd .. + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + ;; +esac diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 new file mode 100644 index 000000000000..76cc955059b8 --- /dev/null +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -0,0 +1,46 @@ +$ErrorActionPreference = "Stop" + +. ops/pipeline/enforce-ci.ps1 + +Write-Host "--- Build libxgboost on Windows with CUDA" + +nvcc --version +if ( $is_release_branch -eq 0 ) { + $arch_flag = "-DGPU_COMPUTE_VER=75" +} else { + $arch_flag = "" +} + +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +mkdir build +cd build +cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` + -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ` + -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" ${arch_flag} +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +cmake --build . 
--config Release -- /m /nodeReuse:false ` + "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Build binary wheel" +cd ../python-package +conda activate +pip install --user -v "pip>=23" +pip --version +pip wheel --no-deps -v . --wheel-dir dist/ +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +python ../ops/script/rename_whl.py ` + --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` + --commit-hash $Env:GITHUB_SHA ` + --platform-tag win_amd64 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Upload Python wheel" +cd .. +if ( $is_release_branch -eq 1 ) { + aws s3 cp (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) ` + s3://xgboost-nightly-builds/$Env:BRANCH_NAME/ --acl public-read --no-progress + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +} diff --git a/ops/pipeline/classify-git-branch.sh b/ops/pipeline/classify-git-branch.sh new file mode 100755 index 000000000000..3d9a2348f23e --- /dev/null +++ b/ops/pipeline/classify-git-branch.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Detect whether the current git branch is a pull request or a release branch + +set -euo pipefail + +if [[ -n ${GITHUB_BASE_REF:-} ]] +then + is_pull_request=1 +else + is_pull_request=0 +fi + +if [[ ${BRANCH_NAME:-} == "master" || ${BRANCH_NAME:-} == "release_"* || ${BRANCH_NAME:-} == "federated-secure" ]] +then + is_release_branch=1 + enforce_daily_budget=0 +else + is_release_branch=0 + enforce_daily_budget=1 +fi + +if [[ -n ${DISABLE_RELEASE:-} ]] +then + is_release_branch=0 +fi diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh new file mode 100755 index 000000000000..e9c09112a4bd --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -0,0 +1,39 @@ +#!/bin/bash +## Deploy JVM packages to S3 bucket +## Companion script for ops/pipeline/deploy-jvm-packages.sh + +set -euox pipefail + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {cpu,gpu} [scala_version]" + exit 1 +fi + +variant="$1" +scala_version="$2" +maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" + +case "$variant" in + cpu) + # CPU variant + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh + pushd jvm-packages + mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} + popd + ;; + gpu) + # GPU variant + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh + pushd jvm-packages + mvn --no-transfer-progress install -Pgpu ${maven_options} + mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} + popd + ;; + *) + echo "Unrecognized argument: $variant" + exit 2 + ;; +esac diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh new file mode 100755 index 000000000000..e821f334b9d2 --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## Deploy JVM packages to S3 bucket + +set -euox pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 3 ]] +then + echo "Usage: $0 {cpu,gpu} [container_id] [scala_version]" + exit 1 +fi + +variant="$1" +container_id="$2" +scala_version="$3" + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" + python3 ops/docker_run.py 
--container-id "${container_id}" \ + -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" "${scala_version}" +fi diff --git a/ops/pipeline/enforce-ci.ps1 b/ops/pipeline/enforce-ci.ps1 new file mode 100644 index 000000000000..0528472be6cb --- /dev/null +++ b/ops/pipeline/enforce-ci.ps1 @@ -0,0 +1,28 @@ +## Ensure that a script is running inside the CI. +## Usage: . ops/pipeline/enforce-ci.ps1 + +if ( -Not $Env:GITHUB_ACTION ) { + $script_name = (Split-Path -Path $PSCommandPath -Leaf) + Write-Host "$script_name is not meant to run locally; it should run inside GitHub Actions." + Write-Host "Please inspect the content of $script_name and locate the desired command manually." + exit 1 +} + +if ( -Not $Env:BRANCH_NAME ) { + Write-Host "Make sure to define environment variable BRANCH_NAME." + exit 2 +} + +if ( $Env:GITHUB_BASE_REF ) { + $is_pull_request = 1 +} else { + $is_pull_request = 0 +} + +if ( ($Env:BRANCH_NAME -eq "master") -or ($Env:BRANCH_NAME -match "release_.+") ) { + $is_release_branch = 1 + $enforce_daily_budget = 0 +} else { + $is_release_branch = 0 + $enforce_daily_budget = 1 +} diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh new file mode 100755 index 000000000000..1e853a5ea266 --- /dev/null +++ b/ops/pipeline/enforce-ci.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +## Ensure that a script is running inside the CI. +## Usage: source ops/pipeline/enforce-ci.sh + +set -euo pipefail + +if [[ -z ${GITHUB_ACTION:-} ]] +then + echo "$0 is not meant to run locally; it should run inside GitHub Actions." + echo "Please inspect the content of $0 and locate the desired command manually." + exit 1 +fi + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." + exit 2 +fi + +source ops/pipeline/classify-git-branch.sh diff --git a/ops/pipeline/publish-artifact.sh b/ops/pipeline/publish-artifact.sh new file mode 100755 index 000000000000..adcb3c521d2a --- /dev/null +++ b/ops/pipeline/publish-artifact.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +## Publish artifacts in an S3 bucket +## Meant to be used inside GitHub Actions + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ $# -ne 2 ]] +then + echo "Usage: $0 [artifact] [s3_url]" + exit 1 +fi + +artifact="$1" +s3_url="$2" + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + echo "aws s3 cp ${artifact} ${s3_url} --acl public-read --no-progress" + aws s3 cp "${artifact}" "${s3_url}" --acl public-read --no-progress +fi diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh new file mode 100755 index 000000000000..676f302009ce --- /dev/null +++ b/ops/pipeline/run-clang-tidy.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -euox pipefail + +echo "--- Run clang-tidy" + +python3 ops/docker_run.py \ + --container-id xgb-ci.clang_tidy \ + -- python3 ops/script/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/stash-artifacts.ps1 b/ops/pipeline/stash-artifacts.ps1 new file mode 100644 index 000000000000..9b9989bf376d --- /dev/null +++ b/ops/pipeline/stash-artifacts.ps1 @@ -0,0 +1,49 @@ +[CmdletBinding()] +Param( + [Parameter( + Mandatory=$true, + Position=0 + )][string]$command, + [Parameter( + Mandatory=$true, + Position=1 + )][string]$remote_prefix, + [Parameter( + Mandatory=$true, + Position=2, + ValueFromRemainingArguments=$true + )][string[]]$artifacts +) + +## Convenience wrapper for ops/pipeline/stash-artifacts.py +## Meant to be used inside GitHub Actions + +$ErrorActionPreference = "Stop" + +. 
ops/pipeline/enforce-ci.ps1 + +foreach ($env in "GITHUB_REPOSITORY", "GITHUB_RUN_ID", "RUNS_ON_S3_BUCKET_CACHE") { + $val = [Environment]::GetEnvironmentVariable($env) + if ($val -eq $null) { + Write-Host "Error: $env must be set." + exit 1 + } +} + +$artifact_stash_prefix = "cache/${Env:GITHUB_REPOSITORY}/stash/${Env:GITHUB_RUN_ID}" + +conda activate + +Write-Host @" +python ops/pipeline/stash-artifacts.py ` + --command "${command}" ` + --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` + --prefix "${artifact_stash_prefix}/${remote_prefix}" ` + -- $artifacts +"@ +python ops/pipeline/stash-artifacts.py ` + --command "${command}" ` + --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` + --prefix "${artifact_stash_prefix}/${remote_prefix}" ` + -- $artifacts +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/pipeline/stash-artifacts.py b/ops/pipeline/stash-artifacts.py new file mode 100644 index 000000000000..151e187513da --- /dev/null +++ b/ops/pipeline/stash-artifacts.py @@ -0,0 +1,144 @@ +""" +Stash an artifact in an S3 bucket for later use + +Note. This script takes in all inputs via environment variables + except the path to the artifact(s). +""" + +import argparse +import os +import subprocess +from pathlib import Path +from urllib.parse import SplitResult, urlsplit, urlunsplit + + +def resolve(x: Path) -> Path: + return x.expanduser().resolve() + + +def path_equals(a: Path, b: Path) -> bool: + return resolve(a) == resolve(b) + + +def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str: + filename = artifact.name + relative_path = resolve(artifact).relative_to(Path.cwd()) + if resolve(artifact.parent) == resolve(Path.cwd()): + full_prefix = prefix + else: + full_prefix = f"{prefix}/{str(relative_path.parent)}" + return f"s3://{s3_bucket}/{full_prefix}/{filename}" + + +def aws_s3_upload(src: Path, dest: str) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download(src: str, dest: Path) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: + parsed_src = urlsplit(src) + src_dir = urlunsplit( + SplitResult( + scheme="s3", + netloc=parsed_src.netloc, + path=os.path.dirname(parsed_src.path), + query="", + fragment="", + ) + ) + dest_dir = dest.parent + src_glob = os.path.basename(parsed_src.path) + cli_args = [ + "aws", + "s3", + "cp", + "--recursive", + "--no-progress", + "--exclude", + "'*'", + "--include", + src_glob, + src_dir, + str(dest_dir), + ] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def upload(args: argparse.Namespace) -> None: + print(f"Stashing artifacts to prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) + aws_s3_upload(artifact_path, s3_url) + + +def download(args: argparse.Namespace) -> None: + print(f"Unstashing artifacts from prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + print(f"mkdir -p {str(artifact_path.parent)}") + artifact_path.parent.mkdir(parents=True, exist_ok=True) + s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) + if "*" in artifact: + aws_s3_download_with_wildcard(s3_url, 
artifact_path) + else: + aws_s3_download(s3_url, artifact_path) + + +if __name__ == "__main__": + # Ensure that the current working directory is the project root + if not (Path.cwd() / "ops").is_dir() or not path_equals( + Path(__file__).parent.parent, Path.cwd() / "ops" + ): + x = Path(__file__).name + raise RuntimeError(f"Script {x} must be run at the project's root directory") + + parser = argparse.ArgumentParser() + parser.add_argument( + "--command", + type=str, + choices=["stash", "unstash"], + required=True, + help="Whether to stash or unstash the artifact", + ) + parser.add_argument( + "--s3-bucket", + type=str, + required=True, + help="Name of the S3 bucket to store the artifact", + ) + parser.add_argument( + "--prefix", + type=str, + required=True, + help=( + "Where the artifact would be stored. The artifact will be stored in " + "s3://[s3-bucket]/[prefix]." + ), + ) + parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") + parsed_args = parser.parse_args() + if parsed_args.command == "stash": + upload(parsed_args) + elif parsed_args.command == "unstash": + download(parsed_args) diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh new file mode 100755 index 000000000000..98c9695c4227 --- /dev/null +++ b/ops/pipeline/stash-artifacts.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## Convenience wrapper for ops/pipeline/stash-artifacts.py +## Meant to be used inside GitHub Actions + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 3 ]] +then + echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" + exit 1 +fi + +command="$1" +remote_prefix="$2" +shift 2 + +for arg in "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" +do + if [[ -z "${!arg:-}" ]] + then + echo "Error: $arg must be set." + exit 2 + fi +done + +artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" + +set -x +python3 ops/pipeline/stash-artifacts.py \ + --command "${command}" \ + --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ + --prefix "${artifact_stash_prefix}/${remote_prefix}" \ + -- "$@" diff --git a/ops/pipeline/test-c-api-demo.sh b/ops/pipeline/test-c-api-demo.sh new file mode 100755 index 000000000000..9a44c8c46fd9 --- /dev/null +++ b/ops/pipeline/test-c-api-demo.sh @@ -0,0 +1,39 @@ +#!/bin/bash +## Test C API demos + +set -euox pipefail + +# Build and install XGBoost static library (libxgboost.a) +mkdir build +pushd build +cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja +ninja -v install +popd + +# Build and run C API demo with static library +pushd demo/c-api/ +mkdir build-c-api-demo +pushd build-c-api-demo +cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +ninja -v +ctest +popd +rm -rf ./build-c-api-demo +popd + +# Build and install XGBoost shared library (libxgboost.so) +pushd build +cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja \ + -DPLUGIN_FEDERATED=ON +ninja -v install +popd + +# Build and run C API demo with shared library +mkdir demo/c-api/build-c-api-demo +pushd demo/c-api/build-c-api-demo +cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+ninja -v
+ctest
+popd
+./ops/script/verify_link.sh ./demo/c-api/build-c-api-demo/basic/api-demo
+./ops/script/verify_link.sh ./demo/c-api/build-c-api-demo/external-memory/external-memory-demo
diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh
new file mode 100755
index 000000000000..9a0cd4743c18
--- /dev/null
+++ b/ops/pipeline/test-cpp-gpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -euox pipefail
+
+if [[ "$#" -lt 1 ]]
+then
+  echo "Usage: $0 {gpu,gpu-rmm,mgpu}"
+  exit 1
+fi
+arg=$1
+
+case "${arg}" in
+  gpu)
+    echo "--- Run Google Tests, using a single GPU"
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      -- nvidia-smi
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      -- build/testxgboost
+    ;;
+
+  gpu-rmm)
+    echo "--- Run Google Tests, using a single GPU, RMM enabled"
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      -- nvidia-smi
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      -- build/testxgboost --use-rmm-pool
+    ;;
+
+  mgpu)
+    echo "--- Run Google Tests, using multiple GPUs"
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      -- nvidia-smi
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      --run-args='--shm-size=4g' \
+      -- build/testxgboost --gtest_filter=*MGPU*
+    ;;
+
+  *)
+    echo "Unrecognized arg: ${arg}"
+    exit 2
+    ;;
+esac
diff --git a/ops/pipeline/test-freebsd.sh b/ops/pipeline/test-freebsd.sh
new file mode 100755
index 000000000000..f9ed61e9e2b8
--- /dev/null
+++ b/ops/pipeline/test-freebsd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+## Run tests on FreeBSD
+
+set -euox pipefail
+
+mkdir build
+cd build
+cmake .. -GNinja -DGOOGLE_TEST=ON
+ninja -v
+./testxgboost
diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh
new file mode 100755
index 000000000000..380db97c787c
--- /dev/null
+++ b/ops/pipeline/test-jvm-gpu.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+## Test JVM packages with CUDA. Note: this script assumes that
+## the user has already built libxgboost4j.so with CUDA support
+## and placed it in the lib/ directory.
+
+## Note. This script takes in all inputs via environment variables.
+
+INPUT_DOC=$(
+cat <<-EOF
+Inputs
+  - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required)
+EOF
+)
+
+set -euo pipefail
+
+for arg in "SCALA_VERSION"
+do
+  if [[ -z "${!arg:-}" ]]
+  then
+    echo -e "Error: $arg must be set.\n${INPUT_DOC}"
+    exit 1
+  fi
+done
+
+set -x
+
+python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \
+  -- nvidia-smi
+python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \
+  --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --shm-size=4g --privileged" \
+  -- ops/pipeline/build-test-jvm-packages-impl.sh
diff --git a/ops/pipeline/test-python-macos.sh b/ops/pipeline/test-python-macos.sh
new file mode 100755
index 000000000000..63b5690d1312
--- /dev/null
+++ b/ops/pipeline/test-python-macos.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+## Test XGBoost Python wheel on macOS
+
+set -euox pipefail
+
+brew install ninja
+
+mkdir build
+pushd build
+# Set prefix to use the OpenMP library from the Conda env.
+# See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228
+# to learn why we don't use libomp from Homebrew.
+cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON
+ninja
+popd
+
+cd python-package
+python --version
+pip install -v .
+
+cd ..
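The test scripts above funnel every command through ops/docker_run.py. As a rough local equivalent of one such invocation, a sketch only (the /workspace mount point and the mapping of --use-gpus to Docker's --gpus flag are assumptions about the wrapper, not taken from this patch):

# Hypothetical expansion of:
#   python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
#     --run-args='--shm-size=4g' -- build/testxgboost --gtest_filter=*MGPU*
docker run --rm --gpus all --shm-size=4g \
  -v "$PWD:/workspace" -w /workspace \
  xgb-ci.gpu build/testxgboost --gtest_filter=*MGPU*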
+pytest -s -v -rxXs --durations=0 ./tests/python +pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask diff --git a/ops/pipeline/test-python-sdist.sh b/ops/pipeline/test-python-sdist.sh new file mode 100755 index 000000000000..d6b71597380e --- /dev/null +++ b/ops/pipeline/test-python-sdist.sh @@ -0,0 +1,11 @@ +#!/bin/bash +## Test installing Python XGBoost from source distribution + +set -euox pipefail + +cd python-package +python --version +python -m build --sdist +pip install -v ./dist/xgboost-*.tar.gz +cd .. +python -c 'import xgboost' diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh new file mode 100755 index 000000000000..75bfa5fbaffb --- /dev/null +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -0,0 +1,74 @@ +#!/bin/bash +## Companion script for ops/pipeline/test-python-wheel.sh + +set -eo pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64}" + exit 1 +fi + +suite="$1" + +# Cannot set -u before Conda env activation +case "$suite" in + gpu|mgpu) + source activate gpu_test + ;; + cpu) + source activate linux_cpu_test + ;; + cpu-arm64) + source activate aarch64_test + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac + +set -xu + +export PYSPARK_DRIVER_PYTHON=$(which python) +export PYSPARK_PYTHON=$(which python) +export SPARK_TESTING=1 + +pip install -v ./python-package/dist/*.whl + +case "$suite" in + gpu) + echo "-- Run Python tests, using a single GPU" + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + ;; + mgpu) + echo "-- Run Python tests, using multiple GPUs" + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_federated + ;; + cpu) + echo "-- Run Python tests (CPU)" + export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 + pytest -v -s -rxXs --fulltrace --durations=0 tests/python + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated + ;; + cpu-arm64) + echo "-- Run Python tests (CPU, ARM64)" + pytest -v -s -rxXs --fulltrace --durations=0 \ + tests/python/test_basic.py tests/python/test_basic_models.py \ + tests/python/test_model_compatibility.py + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh new file mode 100755 index 000000000000..b4dd59b7cb0e --- /dev/null +++ b/ops/pipeline/test-python-wheel.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Test XGBoost Python wheel on the Linux platform + +set -euo pipefail + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [container_id]" + exit 1 +fi + +suite="$1" +container_id="$2" + +if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]] +then + gpu_option="--use-gpus" +else + gpu_option="" +fi + +set -x +python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ + --run-args='--shm-size=4g --privileged' \ + -- bash 
ops/pipeline/test-python-wheel-impl.sh "${suite}" diff --git a/ops/pipeline/test-python-with-sysprefix.sh b/ops/pipeline/test-python-with-sysprefix.sh new file mode 100755 index 000000000000..9ee918b112f4 --- /dev/null +++ b/ops/pipeline/test-python-with-sysprefix.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## Test if Python XGBoost can be configured to use libxgboost.so from the system prefix + +set -euox pipefail + +sudo apt-get update && sudo apt-get install -y ninja-build + +mkdir build +pushd build +cmake .. -GNinja +ninja +popd + +# Copy libxgboost.so to system prefix +cp -v lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" + +# Now configure Python XGBoost to use libxgboost.so from the system prefix +cd python-package +pip install virtualenv +virtualenv venv +source venv/bin/activate && \ + pip install -v . --config-settings use_system_libxgboost=True && \ + python -c 'import xgboost' diff --git a/ops/pipeline/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 new file mode 100644 index 000000000000..4af3bee2cffc --- /dev/null +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -0,0 +1,26 @@ +$ErrorActionPreference = "Stop" + +Write-Host "--- Test XGBoost on Windows with CUDA" + +nvcc --version + +Write-Host "--- Run Google Tests" +build/testxgboost.exe +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Set up Python env" +conda activate +$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) +mamba env create -n ${env_name} --file=ops/conda_env/win64_test.yml +conda activate ${env_name} +python -m pip install ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Run Python tests" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +Write-Host "--- Run Python tests with GPU" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` + tests/python-gpu +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh new file mode 100755 index 000000000000..00a571584ea4 --- /dev/null +++ b/ops/script/build_via_cmake.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -euo pipefail + +if [[ "$#" -lt 1 ]] +then + conda_env="" +else + conda_env="$1" +fi + +if [[ "${conda_env}" == --conda-env=* ]] +then + conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) + echo "Activating Conda environment ${conda_env}" + shift 1 + cmake_args="$@" + + # Workaround for file permission error + if [[ -n ${CI_BUILD_UID:-} ]] + then + gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs + fi + + # Don't activate Conda env if it's already activated + if [[ -z ${CONDA_PREFIX:-} ]] + then + source activate ${conda_env} + fi + cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" +else + cmake_args="$@" + cmake_prefix_flag='' +fi + +rm -rf build +mkdir build +cd build +# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until +# https://github.com/dmlc/xgboost/issues/10400 is fixed +set -x +cmake .. ${cmake_args} \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -GNinja \ + ${cmake_prefix_flag} \ + -DHIDE_CXX_SYMBOLS=ON \ + -DBUILD_DEPRECATED_CLI=ON +ninja clean +time ninja -v +cd .. 
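As a usage note for build_via_cmake.sh: the first argument may be a --conda-env=NAME flag, and every remaining argument is passed straight to CMake. A hedged sketch of two invocations (aarch64_test is an env name appearing elsewhere in this patch; the CMake flag is illustrative):

# Plain build, no Conda environment:
bash ops/script/build_via_cmake.sh -DUSE_OPENMP=ON
# Build inside a named Conda environment (the env must exist in the CI image):
bash ops/script/build_via_cmake.sh --conda-env=aarch64_test -DUSE_OPENMP=ON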
+set +x diff --git a/dev/change_scala_version.py b/ops/script/change_scala_version.py similarity index 93% rename from dev/change_scala_version.py rename to ops/script/change_scala_version.py index c8a9b54ccf91..ed475a1f9582 100644 --- a/dev/change_scala_version.py +++ b/ops/script/change_scala_version.py @@ -4,7 +4,7 @@ import shutil -def main(args): +def main(args: argparse.Namespace) -> None: if args.scala_version == "2.12": scala_ver = "2.12" scala_patchver = "2.12.18" @@ -20,6 +20,9 @@ def main(args): if target.is_dir(): print(f"Removing {target}...") shutil.rmtree(target) + for target in pathlib.Path("jvm-packages/").glob("**/*.so"): + print(f"Removing {target}...") + target.unlink() # Update pom.xml for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"): diff --git a/tests/ci_build/change_version.py b/ops/script/change_version.py similarity index 100% rename from tests/ci_build/change_version.py rename to ops/script/change_version.py diff --git a/tests/ci_build/format_wheel_meta.py b/ops/script/format_wheel_meta.py similarity index 92% rename from tests/ci_build/format_wheel_meta.py rename to ops/script/format_wheel_meta.py index 9e7bad907687..a7def879905e 100644 --- a/tests/ci_build/format_wheel_meta.py +++ b/ops/script/format_wheel_meta.py @@ -2,18 +2,19 @@ Script to generate meta.json to store metadata for a nightly build of XGBoost Python package. """ + +import argparse import json import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") if not wheel_path.is_file(): raise ValueError(f"Path {wheel_path} is not a valid file") - wheel_dir, wheel_name = wheel_path.parent, wheel_path.name + wheel_name = wheel_path.name meta_path = pathlib.Path(args.meta_path) if not meta_path.exists(): @@ -36,7 +37,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format meta.json encoding the latest nightly version of the Python wheel" ) parser.add_argument( diff --git a/ops/script/inject_jvm_lib.sh b/ops/script/inject_jvm_lib.sh new file mode 100755 index 000000000000..82584aeaca92 --- /dev/null +++ b/ops/script/inject_jvm_lib.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Inject lib/libxgboost4j.so into JVM packages. +# This script is useful when the user opts to set skip.native.build=true +# option in the JVM package build. When this option is set, the JVM package +# build will not build libxgboost4j.so; instead it will expect to find the +# library in jvm-packages/xgboost4j/src/main/resources/lib/{os}/{arch}/. +# This script will ensure that libxgboost4j.so is copied to the correct +# location. + +set -euox pipefail + +echo "Using externally provided libxgboost4j.so. Locating one from lib/..." 
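The comment block above defines the contract: when skip.native.build=true is set, the Maven build looks for a prebuilt libxgboost4j.so under src/main/resources instead of compiling one. A minimal sketch of the intended call sequence, mirroring the flags used by deploy-jvm-packages-impl.sh earlier in this patch (the CMake step is an assumption about how lib/libxgboost4j.so gets produced, based on the JVM build scripts removed below):

# Produce lib/libxgboost4j.so first, e.g.:
#   cmake -B build -GNinja -DJVM_BINDINGS=ON && cmake --build build
bash ops/script/inject_jvm_lib.sh
cd jvm-packages
mvn install -DskipTests -Dmaven.test.skip=true -Dskip.native.build=true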
+mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +mkdir -p jvm-packages/xgboost4j/src/test/resources +mkdir -p jvm-packages/xgboost4j-spark/src/test/resources +mkdir -p jvm-packages/xgboost4j-spark-gpu/src/test/resources + +# Generate machine.txt.* files from the CLI regression demo +# TODO(hcho3): Remove once CLI is removed +pushd demo/CLI/regression +python3 mapfeat.py +python3 mknfold.py machine.txt 1 +popd + +cp -v demo/data/agaricus.* \ + jvm-packages/xgboost4j/src/test/resources +cp -v demo/CLI/regression/machine.txt.t* demo/data/agaricus.* \ + jvm-packages/xgboost4j-spark/src/test/resources +cp -v demo/data/veterans_lung_cancer.csv \ + jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv \ + jvm-packages/xgboost4j-spark-gpu/src/test/resources diff --git a/tests/ci_build/lint_cmake.sh b/ops/script/lint_cmake.sh old mode 100644 new mode 100755 similarity index 94% rename from tests/ci_build/lint_cmake.sh rename to ops/script/lint_cmake.sh index d67ecd0844ed..55aeb20e8fb2 --- a/tests/ci_build/lint_cmake.sh +++ b/ops/script/lint_cmake.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e +set -euo pipefail cmake_files=$( find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \ diff --git a/tests/ci_build/lint_cpp.py b/ops/script/lint_cpp.py similarity index 86% rename from tests/ci_build/lint_cpp.py rename to ops/script/lint_cpp.py index d4775d6b6b3e..2d00b219ceab 100644 --- a/tests/ci_build/lint_cpp.py +++ b/ops/script/lint_cpp.py @@ -2,6 +2,7 @@ import os import re import sys +from typing import TextIO import cpplint from cpplint import _cpplint_state @@ -9,7 +10,7 @@ CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"]) -def filepath_enumerate(paths): +def filepath_enumerate(paths: list[str]) -> list[str]: """Enumerate the file paths of all subfiles of the list of paths""" out = [] for path in paths: @@ -22,7 +23,7 @@ def filepath_enumerate(paths): return out -def get_header_guard_dmlc(filename): +def get_header_guard_dmlc(filename: str) -> str: """Get Header Guard Convention for DMLC Projects. 
For headers in include, directly use the path @@ -54,11 +55,10 @@ def get_header_guard_dmlc(filename): class Lint: - def __init__(self): + def __init__(self) -> None: self.project_name = "xgboost" - self.cpp_header_map = {} - self.cpp_src_map = {} - self.python_map = {} + self.cpp_header_map: dict[str, dict[str, int]] = {} + self.cpp_src_map: dict[str, dict[str, int]] = {} self.pylint_cats = set(["error", "warning", "convention", "refactor"]) # setup cpp lint @@ -78,7 +78,7 @@ def __init__(self): cpplint._SetCountingStyle("toplevel") cpplint._line_length = 100 - def process_cpp(self, path, suffix): + def process_cpp(self, path: str, suffix: str) -> None: """Process a cpp file.""" _cpplint_state.ResetErrorCounts() cpplint.ProcessFile(str(path), _cpplint_state.verbose_level) @@ -91,7 +91,9 @@ def process_cpp(self, path, suffix): self.cpp_src_map[str(path)] = errors @staticmethod - def _print_summary_map(strm, result_map, ftype): + def _print_summary_map( + strm: TextIO, result_map: dict[str, dict[str, int]], ftype: str + ) -> int: """Print summary of certain result map.""" if len(result_map) == 0: return 0 @@ -105,7 +107,7 @@ def _print_summary_map(strm, result_map, ftype): ) return len(result_map) - npass - def print_summary(self, strm): + def print_summary(self, strm: TextIO) -> int: """Print summary of lint.""" nerr = 0 nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header") @@ -122,7 +124,7 @@ def print_summary(self, strm): cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc -def process(fname, allow_type): +def process(fname: str, allow_type: list[str]) -> None: """Process a file.""" fname = str(fname) arr = fname.rsplit(".", 1) @@ -132,13 +134,19 @@ def process(fname, allow_type): _HELPER.process_cpp(fname, arr[-1]) -def main(): +def main() -> None: parser = argparse.ArgumentParser(description="run cpp lint") parser.add_argument( "path", nargs="*", help="Path to traverse", - default=["src", "include", os.path.join("R-package", "src"), "python-package", "plugin/sycl"], + default=[ + "src", + "include", + os.path.join("R-package", "src"), + "python-package", + "plugin/sycl", + ], ) parser.add_argument( "--exclude_path", @@ -149,7 +157,7 @@ def main(): args = parser.parse_args() excluded_paths = filepath_enumerate(args.exclude_path) - allow_type = [] + allow_type: list[str] = [] allow_type += CXX_SUFFIX for path in args.path: diff --git a/tests/ci_build/lint_python.py b/ops/script/lint_python.py similarity index 95% rename from tests/ci_build/lint_python.py rename to ops/script/lint_python.py index d0ef625fa008..f418fbf1075f 100644 --- a/tests/ci_build/lint_python.py +++ b/ops/script/lint_python.py @@ -16,8 +16,6 @@ class LintersPaths: BLACK = ( # core "python-package/", - # CI - "tests/ci_build/tidy.py", # tests "tests/python/test_config.py", "tests/python/test_callback.py", @@ -72,10 +70,7 @@ class LintersPaths: "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/", ) ISORT = ( @@ -85,12 +80,13 @@ class LintersPaths: "tests/test_distributed/", "tests/python/", "tests/python-gpu/", - "tests/ci_build/", # demo "demo/", # misc "dev/", "doc/", + # CI + "ops/", ) MYPY = ( @@ -132,11 +128,7 @@ class LintersPaths: "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/tidy.py", - "tests/ci_build/lint_python.py", - 
"tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/", ) diff --git a/tests/ci_build/lint_r.R b/ops/script/lint_r.R similarity index 100% rename from tests/ci_build/lint_r.R rename to ops/script/lint_r.R diff --git a/tests/ci_build/rename_whl.py b/ops/script/rename_whl.py similarity index 95% rename from tests/ci_build/rename_whl.py rename to ops/script/rename_whl.py index 500196190b3d..d4467720c738 100644 --- a/tests/ci_build/rename_whl.py +++ b/ops/script/rename_whl.py @@ -1,8 +1,8 @@ +import argparse import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") @@ -43,7 +43,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format a Python wheel's name using the git commit hash and platform tag" ) parser.add_argument( diff --git a/tests/ci_build/tidy.py b/ops/script/run_clang_tidy.py similarity index 97% rename from tests/ci_build/tidy.py rename to ops/script/run_clang_tidy.py index 13bbedc0b4b5..dca5d1069598 100755 --- a/tests/ci_build/tidy.py +++ b/ops/script/run_clang_tidy.py @@ -19,7 +19,9 @@ def call(args: list[str]) -> tuple[int, int, str, list[str]]: # `workspace` is a name used in the CI container. Normally we should keep the dir # as `xgboost`. matched = re.search( - "(workspace|xgboost)/.*(src|tests|include)/.*warning:", error_msg, re.MULTILINE + "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", + error_msg, + re.MULTILINE, ) if matched is None: @@ -265,7 +267,7 @@ def test_tidy(args: argparse.Namespace) -> None: """ root_path = os.path.abspath(os.path.curdir) tidy_file = os.path.join(root_path, ".clang-tidy") - test_file_path = os.path.join(root_path, "tests", "ci_build", "test_tidy.cc") + test_file_path = os.path.join(root_path, "ops", "script", "test_tidy.cc") tidy_config = "--config-file=" + tidy_file if not args.tidy_version: @@ -274,8 +276,8 @@ def test_tidy(args: argparse.Namespace) -> None: tidy = "clang-tidy-" + str(args.tidy_version) cmd = [tidy, tidy_config, test_file_path] (proc_code, tidy_status, error_msg, _) = call(cmd) - assert proc_code == 0 - assert tidy_status == 1 + if proc_code != 0 or tidy_status != 1: + raise RuntimeError(error_msg) print("clang-tidy is working.") diff --git a/tests/ci_build/test_r_package.py b/ops/script/test_r_package.py similarity index 99% rename from tests/ci_build/test_r_package.py rename to ops/script/test_r_package.py index 5ca7fa69b21a..3ce886c1bc41 100644 --- a/tests/ci_build/test_r_package.py +++ b/ops/script/test_r_package.py @@ -42,7 +42,7 @@ def pkgroot(path: str) -> None: else: would_remove = output.stdout.decode("utf-8").strip().split("\n") - if would_remove and not all(f.find("tests/ci_build") != -1 for f in would_remove): + if would_remove and not all(f.find("ops") != -1 for f in would_remove): raise ValueError( "\n".join(would_remove) + "\nPlease cleanup the working git repository." 
) diff --git a/tests/ci_build/test_tidy.cc b/ops/script/test_tidy.cc similarity index 100% rename from tests/ci_build/test_tidy.cc rename to ops/script/test_tidy.cc diff --git a/tests/ci_build/test_utils.py b/ops/script/test_utils.py similarity index 100% rename from tests/ci_build/test_utils.py rename to ops/script/test_utils.py diff --git a/tests/buildkite/update-rapids.sh b/ops/script/update_rapids.sh similarity index 50% rename from tests/buildkite/update-rapids.sh rename to ops/script/update_rapids.sh index f6a2675bdfa9..d7958ce70d86 100755 --- a/tests/buildkite/update-rapids.sh +++ b/ops/script/update_rapids.sh @@ -7,7 +7,10 @@ echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION" DEV_RAPIDS_VERSION=$(date +%Y-%m-%d -d "20${LATEST_RAPIDS_VERSION//./-}-01 + 2 month" | cut -c3-7 | tr - .) echo "DEV_RAPIDS_VERSION = $DEV_RAPIDS_VERSION" -PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +OPS_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")/.." ; pwd -P ) +CONTAINER_YAML="$OPS_PATH/docker/ci_container.yml" -sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh -sed -i "s/^DEV_RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/DEV_RAPIDS_VERSION=${DEV_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh +sed -i "s/\&rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&rapids_version \"${LATEST_RAPIDS_VERSION}\"/" \ + "$CONTAINER_YAML" +sed -i "s/\&dev_rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&dev_rapids_version \"${DEV_RAPIDS_VERSION}\"/" \ + "$CONTAINER_YAML" diff --git a/tests/ci_build/verify_link.sh b/ops/script/verify_link.sh similarity index 100% rename from tests/ci_build/verify_link.sh rename to ops/script/verify_link.sh diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 01e81b16ee0b..608a535cd8cb 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,11 +7,6 @@ namespace dh { PinnedMemory::PinnedMemory() { - // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. - // See https://github.com/dmlc/xgboost/issues/10933 - this->impl_.emplace(); - return; - #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh deleted file mode 100755 index aa8f572483a3..000000000000 --- a/tests/buildkite/build-containers.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euo pipefail -set -x - -if [ "$#" -lt 1 ] -then - echo "Usage: $0 [container to build]" - exit 1 -fi -container=$1 - -source tests/buildkite/conftest.sh - -echo "--- Build container ${container}" - -BUILD_ARGS="" - -case "${container}" in - cpu) - ;; - - gpu|gpu_build_rockylinux8) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - ;; - - gpu_dev_ver) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$DEV_RAPIDS_VERSION" - ;; - - jvm_gpu_build) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - ;; - - *) - echo "Unrecognized container ID: ${container}" - exit 2 - ;; -esac - -# Run a no-op command. 
This will simply build the container and push it to the private registry -tests/ci_build/ci_build.sh ${container} ${BUILD_ARGS} bash diff --git a/tests/buildkite/build-cpu-arm64.sh b/tests/buildkite/build-cpu-arm64.sh deleted file mode 100755 index 8b3847ed58b9..000000000000 --- a/tests/buildkite/build-cpu-arm64.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -WHEEL_TAG=manylinux_2_28_aarch64 - -echo "--- Build CPU code targeting ARM64" - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh aarch64" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOL=ON -echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" - -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -$command_wrapper bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload "python-package/dist/*.whl" -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi - -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh deleted file mode 100755 index 11679d644de1..000000000000 --- a/tests/buildkite/build-cpu.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Build CPU code" - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh cpu" - -$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h - # This step is not necessary, but here we include it, to ensure that - # DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use - # the configured header build/dmlc/build_config.h instead of - # include/dmlc/build_config_default.h. 
-echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ - -DPLUGIN_FEDERATED=ON -echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost - -# Sanitizer test -echo "--- Run Google Test with sanitizer enabled" -$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \ - -DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \ - -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ -CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "` - `"-e ASAN_OPTIONS=symbolize=1 "` - `"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "` - `"--cap-add SYS_PTRACE" \ - $command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "` - `"--extra-verbose" diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh deleted file mode 100755 index 189c67cba449..000000000000 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ "$#" -lt 1 ] -then - mode=stable - exit 1 -else - mode=$1 -fi - -WHEEL_TAG=manylinux_2_28_x86_64 - -source tests/buildkite/conftest.sh - - -case "${mode}" in - stable) - container_tag='gpu_build_rockylinux8' - rapids_version=$RAPIDS_VERSION - ;; - - dev) - container_tag='gpu_dev_ver' - rapids_version=$DEV_RAPIDS_VERSION - ;; - - *) - echo "Unrecognized mode ID: ${mode}" - exit 2 - ;; -esac - -echo "--- Build with CUDA ${CUDA_VERSION} with RMM" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -command_wrapper="tests/ci_build/ci_build.sh $container_tag --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$rapids_version" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ - --acl public-read --no-progress -fi - -echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh deleted file mode 100755 index 03d2cc8a6a24..000000000000 --- a/tests/buildkite/build-cuda.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -WHEEL_TAG=manylinux_2_28_x86_64 - -source tests/buildkite/conftest.sh - -echo "--- Build with CUDA ${CUDA_VERSION}" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -command_wrapper="tests/ci_build/ci_build.sh gpu_build_rockylinux8 --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress - - # Generate the meta info which includes xgboost version and the commit info - $command_wrapper python tests/ci_build/format_wheel_meta.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} \ - --meta-path python-package/dist/ - aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi -echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-gpu-rpkg.sh b/tests/buildkite/build-gpu-rpkg.sh deleted file mode 100755 index 83bcd9eb9c7b..000000000000 --- a/tests/buildkite/build-gpu-rpkg.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build XGBoost R package with CUDA" - -tests/ci_build/ci_build.sh gpu_build_r_rockylinux8 \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg R_VERSION_ARG=${R_VERSION} \ - tests/ci_build/build_r_pkg_with_cuda.sh \ - ${BUILDKITE_COMMIT} - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Upload R tarball" - aws s3 cp xgboost_r_gpu_linux_*.tar.gz s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-jvm-doc.sh b/tests/buildkite/build-jvm-doc.sh deleted file mode 100755 index d168eb8cc58d..000000000000 --- a/tests/buildkite/build-jvm-doc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build JVM packages doc" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Upload JVM packages doc" - aws s3 cp jvm-packages/${BRANCH_NAME}.tar.bz2 \ - s3://xgboost-docs/${BRANCH_NAME}.tar.bz2 --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh b/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh deleted file mode 100644 index e7fec780b956..000000000000 --- a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_aarch64" - -# Build XGBoost4J binary -echo "--- Build libxgboost4j.so (targeting glibc 2.17)" -set -x -mkdir build -$command_wrapper bash -c \ - "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" -ldd lib/libxgboost4j.so -objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_arm64_${BUILDKITE_COMMIT}.so -mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh b/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh deleted file mode 100644 index 46a819a016d3..000000000000 --- a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_x86_64" - -# Build XGBoost4J binary -echo "--- Build libxgboost4j.so (targeting glibc 2.17)" -set -x -mkdir build -$command_wrapper bash -c \ - "cd build && cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON && ninja -v" -ldd lib/libxgboost4j.so -objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_x86_64_${BUILDKITE_COMMIT}.so -mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh deleted file mode 100755 index 76ffafbcfdd7..000000000000 --- a/tests/buildkite/build-jvm-packages-gpu.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with CUDA" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -tests/ci_build/ci_build.sh jvm_gpu_build --use-gpus \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag} diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh deleted file mode 100755 index da4d1e9d8c8a..000000000000 --- a/tests/buildkite/build-jvm-packages.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with Scala 2.12" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} - -echo "--- Build and test XGBoost JVM packages with Scala 2.13" - -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} "" "" "true" diff --git a/tests/buildkite/build-manylinux2014.sh b/tests/buildkite/build-manylinux2014.sh deleted file mode 100755 index 426d32b5c361..000000000000 --- a/tests/buildkite/build-manylinux2014.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ $# -ne 1 ]; then - echo "Usage: $0 {x86_64,aarch64}" - exit 1 -fi - -arch=$1 - -source tests/buildkite/conftest.sh - -WHEEL_TAG="manylinux2014_${arch}" -command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}" 
-python_bin="/opt/python/cp310-cp310/bin/python" - -echo "--- Build binary wheel for ${WHEEL_TAG}" -# Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" -git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -rf python-package/dist/ -mkdir python-package/dist/ -mv -v wheelhouse/*.whl python-package/dist/ - -echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" -# Patch to rename pkg to xgboost-cpu -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/cpu_only_pypkg.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" -git checkout python-package/pyproject.toml # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/xgboost_cpu-*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -v python-package/dist/xgboost_cpu-*.whl -mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ - -echo "--- Upload Python wheel" -for wheel in python-package/dist/*.whl -do - buildkite-agent artifact upload "${wheel}" -done -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - for wheel in python-package/dist/*.whl - do - aws s3 cp "${wheel}" s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress - done -fi diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1 deleted file mode 100644 index 9114d3237751..000000000000 --- a/tests/buildkite/build-win64-gpu.ps1 +++ /dev/null @@ -1,55 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Build libxgboost on Windows with CUDA" - -nvcc --version -if ( $is_release_branch -eq 0 ) { - $arch_flag = "-DGPU_COMPUTE_VER=75" -} else { - $arch_flag = "" -} -mkdir build -cd build -cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` - -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ${arch_flag} -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -cmake --build . --config Release -- /m /nodeReuse:false ` - "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Build binary wheel" -cd ../python-package -conda activate -& pip install --user -v "pip>=23" -& pip --version -& pip wheel --no-deps -v . --wheel-dir dist/ -Get-ChildItem . -Filter dist/*.whl | -Foreach-Object { - & python ../tests/ci_build/rename_whl.py ` - --wheel-path $_.FullName ` - --commit-hash $Env:BUILDKITE_COMMIT ` - --platform-tag win_amd64 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Upload Python wheel" -cd .. -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & buildkite-agent artifact upload python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} -if ( $is_release_branch -eq 1 ) { - Get-ChildItem . 
-Filter python-package/dist/*.whl | - Foreach-Object { - & aws s3 cp python-package/dist/$_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ ` - --acl public-read --no-progress - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - } -} - -Write-Host "--- Stash C++ test executables" -& buildkite-agent artifact upload build/testxgboost.exe -& buildkite-agent artifact upload xgboost.exe diff --git a/tests/buildkite/conftest.ps1 b/tests/buildkite/conftest.ps1 deleted file mode 100644 index bd623caf0c03..000000000000 --- a/tests/buildkite/conftest.ps1 +++ /dev/null @@ -1,13 +0,0 @@ -if ( $Env:BUILDKITE_PULL_REQUEST -and ($Env:BUILDKITE_PULL_REQUEST -ne "false") ) { - $is_pull_request = 1 -} else { - $is_pull_request = 0 -} - -if ( ($Env:BUILDKITE_BRANCH -eq "master") -or ($Env:BUILDKITE_BRANCH -match "release_.+") ) { - $is_release_branch = 1 - $enforce_daily_budget = 0 -} else { - $is_release_branch = 0 - $enforce_daily_budget = 1 -} diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh deleted file mode 100755 index 12f4c07ac6c9..000000000000 --- a/tests/buildkite/conftest.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -function get_aws_secret { - if [[ $# -ne 1 ]] - then - echo "Usage: get_aws_secret [Name of secret]" - return 1 - fi - aws secretsmanager get-secret-value --secret-id $1 --output text --region us-west-2 --query SecretString -} - -function set_buildkite_env_vars_in_container { - # Pass all Buildkite-specific env vars to Docker containers. - # This is to be used with tests/ci_build/ci_build.sh - export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "` - `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "` - `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "` - `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL" -} - -set -x - -CUDA_VERSION=12.4.1 -NCCL_VERSION=2.23.4-1 -RAPIDS_VERSION=24.10 -DEV_RAPIDS_VERSION=24.12 -SPARK_VERSION=3.5.1 -JDK_VERSION=8 -R_VERSION=4.3.2 - -if [[ -z ${BUILDKITE:-} ]] -then - echo "$0 is not meant to run locally; it should run inside BuildKite." - echo "Please inspect the content of $0 and locate the desired command manually." 
- exit 1 -fi - -if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]] -then - is_pull_request=1 - BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST -else - is_pull_request=0 - BRANCH_NAME=$BUILDKITE_BRANCH -fi -export BRANCH_NAME=${BRANCH_NAME//\//-} - -if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] -then - is_release_branch=1 - enforce_daily_budget=0 -else - is_release_branch=0 - enforce_daily_budget=1 -fi - -if [[ -n ${DISABLE_RELEASE:-} ]] -then - is_release_branch=0 -fi - -set +x diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh deleted file mode 100755 index 812a6c5cafec..000000000000 --- a/tests/buildkite/deploy-jvm-packages.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" - tests/ci_build/ci_build.sh jvm_gpu_build \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION} -fi diff --git a/tests/buildkite/enforce_daily_budget.py b/tests/buildkite/enforce_daily_budget.py deleted file mode 100644 index af1b1ce484b8..000000000000 --- a/tests/buildkite/enforce_daily_budget.py +++ /dev/null @@ -1,14 +0,0 @@ -import json -import argparse - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--response", type=str, required=True) - args = parser.parse_args() - with open(args.response, "r") as f: - payload = f.read() - response = json.loads(payload) - if response["approved"]: - print(f"Testing approved. Reason: {response['reason']}") - else: - raise RuntimeError(f"Testing rejected. Reason: {response['reason']}") diff --git a/tests/buildkite/enforce_daily_budget.sh b/tests/buildkite/enforce_daily_budget.sh deleted file mode 100755 index 8212f07c1b24..000000000000 --- a/tests/buildkite/enforce_daily_budget.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Enforce daily budget" - -source tests/buildkite/conftest.sh - -if [[ $enforce_daily_budget == 0 ]] -then - echo "Automatically approving all test jobs for trunk branches" -else - aws lambda invoke --function-name XGBoostCICostWatcher --invocation-type RequestResponse --region us-west-2 response.json - python3 tests/buildkite/enforce_daily_budget.py --response response.json -fi diff --git a/tests/buildkite/infrastructure/README.md b/tests/buildkite/infrastructure/README.md deleted file mode 100644 index cc3e552e70ff..000000000000 --- a/tests/buildkite/infrastructure/README.md +++ /dev/null @@ -1,106 +0,0 @@ -BuildKite CI Infrastructure -=========================== - -# Worker image builder (`worker-image-pipeline/`) - -Use EC2 Image Builder to build machine images in a deterministic fashion. -The machine images are used to initialize workers in the CI/CD pipelines. - -## Editing bootstrap scripts - -Currently, we create two pipelines for machine images: one for Linux workers and another -for Windows workers. -You can edit the bootstrap scripts to change how the worker machines are initialized. 
- -* `linux-amd64-gpu-bootstrap.yml`: Bootstrap script for Linux worker machines -* `windows-gpu-bootstrap.yml`: Bootstrap script for Windows worker machines - -## Creating and running Image Builder pipelines - -Run the following commands to create and run pipelines in EC2 Image Builder service: -```bash -python worker-image-pipeline/create_worker_image_pipelines.py --aws-region us-west-2 -python worker-image-pipeline/run_pipelines.py --aws-region us-west-2 -``` -Go to the AWS CloudFormation console and verify the existence of two CloudFormation stacks: -* `buildkite-windows-gpu-worker` -* `buildkite-linux-amd64-gpu-worker` - -Then go to the EC2 Image Builder console to check the status of the image builds. You may -want to inspect the log output should a build fails. -Once the new machine images are done building, see the next section to deploy the new -images to the worker machines. - -# Elastic CI Stack for AWS (`aws-stack-creator/`) - -Use EC2 Autoscaling groups to launch worker machines in EC2. BuildKite periodically sends -messages to the Autoscaling groups to increase or decrease the number of workers according -to the number of outstanding testing jobs. - -## Deploy an updated CI stack with new machine images - -First, edit `aws-stack-creator/metadata.py` to update the `AMI_ID` fields: -```python -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": "...", - }, - "linux-amd64-mgpu": { - "us-west-2": "...", - }, - "windows-gpu": { - "us-west-2": "...", - }, - "windows-cpu": { - "us-west-2": "...", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "...", - }, - "pipeline-loader": { - "us-west-2": "...", - }, - "linux-arm64-cpu": { - "us-west-2": "...", - }, -} -``` -AMI IDs uniquely identify the machine images in the EC2 service. 
-Go to the EC2 Image Builder console to find the AMI IDs for the new machine images
-(see the previous section), and update the following fields:
-
-* `AMI_ID["linux-amd64-gpu"]["us-west-2"]`:
-  Use the latest output from the `buildkite-linux-amd64-gpu-worker` pipeline
-* `AMI_ID["linux-amd64-mgpu"]["us-west-2"]`:
-  Should be identical to `AMI_ID["linux-amd64-gpu"]["us-west-2"]`
-* `AMI_ID["windows-gpu"]["us-west-2"]`:
-  Use the latest output from the `buildkite-windows-gpu-worker` pipeline
-* `AMI_ID["windows-cpu"]["us-west-2"]`:
-  Should be identical to `AMI_ID["windows-gpu"]["us-west-2"]`
-
-Next, visit https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
-to look up the AMI IDs for the following fields:
-
-* `AMI_ID["linux-amd64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field
-  `Mappings/AWSRegion2AMI/us-west-2/linuxamd64`
-* `AMI_ID["pipeline-loader"]["us-west-2"]`:
-  Should be identical to `AMI_ID["linux-amd64-cpu"]["us-west-2"]`
-* `AMI_ID["linux-arm64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field
-  `Mappings/AWSRegion2AMI/us-west-2/linuxarm64`
-
-Finally, run the following command to deploy the new machine images:
-```bash
-python aws-stack-creator/create_stack.py --aws-region us-west-2 --agent-token AGENT_TOKEN
-```
-Go to the AWS CloudFormation console and verify the existence of the following
-CloudFormation stacks:
-* `buildkite-pipeline-loader-autoscaling-group`
-* `buildkite-linux-amd64-cpu-autoscaling-group`
-* `buildkite-linux-amd64-gpu-autoscaling-group`
-* `buildkite-linux-amd64-mgpu-autoscaling-group`
-* `buildkite-linux-arm64-cpu-autoscaling-group`
-* `buildkite-windows-cpu-autoscaling-group`
-* `buildkite-windows-gpu-autoscaling-group`
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
deleted file mode 100644
index 7f15b1fbcd4f..000000000000
--- a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
+++ /dev/null
@@ -1,32 +0,0 @@
----
-AWSTemplateFormatVersion: "2010-09-09"
-Description: "Buildkite agent's IAM policy"
-
-Resources:
-  BuildkiteAgentManagedPolicy:
-    Type: AWS::IAM::ManagedPolicy
-    Properties:
-      PolicyDocument:
-        {
-          "Version": "2012-10-17",
-          "Statement": [
-            {
-              "Effect": "Allow",
-              "Action": [
-                "s3:*",
-                "s3-object-lambda:*"
-              ],
-              "Resource": "*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": "lambda:InvokeFunction",
-              "Resource": "*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": "secretsmanager:GetSecretValue",
-              "Resource": "*"
-            }
-          ]
-        }
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
deleted file mode 100644
index 8f8db348a073..000000000000
--- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import argparse
-import copy
-import os
-import re
-import sys
-
-import boto3
-import botocore
-from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
-
-current_dir = os.path.dirname(__file__)
-sys.path.append(os.path.join(current_dir, ".."))
-
-from common_blocks.utils import create_or_update_stack, wait
-
-TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
-
-
-def get_availability_zones(*, aws_region):
-    client = boto3.client("ec2", region_name=aws_region)
-    r = client.describe_availability_zones(
-        Filters=[
-            {"Name": "region-name", "Values": [aws_region]},
-            {"Name": "zone-type", "Values": ["availability-zone"]},
"Values": ["availability-zone"]}, - ] - ) - return sorted([x["ZoneName"] for x in r["AvailabilityZones"]]) - - -def get_default_vpc(*, aws_region): - ec2 = boto3.resource("ec2", region_name=aws_region) - default_vpc_id = None - for x in ec2.vpcs.filter(Filters=[{"Name": "is-default", "Values": ["true"]}]): - return x - - # Create default VPC if not exist - client = boto3.client("ec2", region_name=aws_region) - r = client.create_default_vpc() - default_vpc_id = r["Vpc"]["VpcId"] - - return ec2.Vpc(default_vpc_id) - - -def format_params(args, *, stack_id, agent_iam_policy): - default_vpc = get_default_vpc(aws_region=args.aws_region) - azs = get_availability_zones(aws_region=args.aws_region) - # For each of the first two availability zones (AZs), choose the default subnet - subnets = [ - x.id - for x in default_vpc.subnets.filter( - Filters=[ - {"Name": "default-for-az", "Values": ["true"]}, - {"Name": "availability-zone", "Values": azs[:2]}, - ] - ) - ] - assert len(subnets) == 2 - - params = copy.deepcopy(STACK_PARAMS[stack_id]) - params["ImageId"] = AMI_ID[stack_id][args.aws_region] - params["BuildkiteQueue"] = stack_id - params["CostAllocationTagValue"] = f"buildkite-{stack_id}" - params["BuildkiteAgentToken"] = args.agent_token - params["VpcId"] = default_vpc.id - params["Subnets"] = ",".join(subnets) - params["ManagedPolicyARNs"] = agent_iam_policy - params.update(COMMON_STACK_PARAMS) - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-autoscaling-group" - - -def create_agent_iam_policy(args, *, client): - policy_stack_name = "buildkite-agent-iam-policy" - print(f"Creating stack {policy_stack_name} for agent IAM policy...") - with open( - os.path.join(current_dir, "agent-iam-policy-template.yml"), - encoding="utf-8", - ) as f: - policy_template = f.read() - promise = create_or_update_stack( - args, client=client, stack_name=policy_stack_name, template_body=policy_template - ) - wait(promise, client=client) - - cf = boto3.resource("cloudformation", region_name=args.aws_region) - policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy") - return policy.physical_resource_id - - -def main(args): - client = boto3.client("cloudformation", region_name=args.aws_region) - - agent_iam_policy = create_agent_iam_policy(args, client=client) - - promises = [] - - for stack_id in AMI_ID: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating elastic CI stack {stack_id_full}...") - - params = format_params( - args, stack_id=stack_id, agent_iam_policy=agent_iam_policy - ) - - promise = create_or_update_stack( - args, - client=client, - stack_name=stack_id_full, - template_url=TEMPLATE_URL, - params=params, - ) - promises.append(promise) - print(f"CI stack {stack_id_full} is in progress in the background") - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - parser.add_argument("--agent-token", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py deleted file mode 100644 index 5012aa738854..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ /dev/null @@ -1,114 +0,0 @@ -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": 
"ami-0b4079c15bbbd0faf", - }, - "linux-amd64-mgpu": { - "us-west-2": "ami-0b4079c15bbbd0faf", - }, - "windows-gpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - "windows-cpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "pipeline-loader": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "linux-arm64-cpu": { - "us-west-2": "ami-0dbf1f9da54222f21", - }, -} - -STACK_PARAMS = { - "linux-amd64-gpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-mgpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.12xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "1", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-gpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "g4dn.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-cpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "c5a.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c5a.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "16", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "pipeline-loader": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "t3a.micro", - "AgentsPerInstance": "1", - "MinSize": "2", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-arm64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c6g.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, -} - -COMMON_STACK_PARAMS = { - "BuildkiteAgentTimestampLines": "false", - "BuildkiteWindowsAdministrator": "true", - "AssociatePublicIpAddress": "true", - "ScaleOutForWaitingJobs": "false", - "EnableCostAllocationTags": "true", - "CostAllocationTagName": "CreatedBy", - "ECRAccessPolicy": "full", - "EnableSecretsPlugin": "false", - "EnableECRPlugin": "false", - "EnableDockerLoginPlugin": "false", - "EnableDockerUserNamespaceRemap": "false", - "BuildkiteAgentExperiments": "normalised-upload-paths,resolve-commit-after-checkout", -} diff --git a/tests/buildkite/infrastructure/common_blocks/utils.py b/tests/buildkite/infrastructure/common_blocks/utils.py deleted file mode 100644 index 27a0835e8dc0..000000000000 --- a/tests/buildkite/infrastructure/common_blocks/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import re - -import boto3 -import botocore - - -def stack_exists(args, *, stack_name): - client = boto3.client("cloudformation", region_name=args.aws_region) - waiter = client.get_waiter("stack_exists") - try: - waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1}) - return True - except botocore.exceptions.WaiterError as e: - 
return False - - -def create_or_update_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - - if stack_exists(args, stack_name=stack_name): - print(f"Stack {stack_name} already exists. Updating...") - try: - response = client.update_stack(**kwargs) - return {"StackName": stack_name, "Action": "update"} - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "ValidationError" and re.search( - "No updates are to be performed", e.response["Error"]["Message"] - ): - print(f"No update was made to {stack_name}") - return {"StackName": stack_name, "Action": "noop"} - else: - raise e - else: - kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False}) - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def replace_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - """Delete an existing stack and create a new stack with identical name""" - - if not stack_exists(args, stack_name=stack_name): - raise ValueError(f"Stack {stack_name} does not exist") - r = client.delete_stack(StackName=stack_name) - delete_waiter = client.get_waiter("stack_delete_complete") - delete_waiter.wait(StackName=stack_name) - - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - "OnFailure": "ROLLBACK", - "EnableTerminationProtection": False, - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def wait(promise, *, client): - stack_name = promise["StackName"] - print(f"Waiting for {stack_name}...") - if promise["Action"] == "create": - waiter = client.get_waiter("stack_create_complete") - waiter.wait(StackName=stack_name) - print(f"Finished creating stack {stack_name}") - elif promise["Action"] == "update": - waiter = client.get_waiter("stack_update_complete") - waiter.wait(StackName=stack_name) - print(f"Finished updating stack {stack_name}") - elif promise["Action"] != "noop": - raise ValueError(f"Invalid promise {promise}") diff --git a/tests/buildkite/infrastructure/requirements.txt b/tests/buildkite/infrastructure/requirements.txt deleted file mode 100644 index 3ce271ebbdd6..000000000000 --- a/tests/buildkite/infrastructure/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -boto3 -cfn_tools diff --git a/tests/buildkite/infrastructure/service-user/create_service_user.py b/tests/buildkite/infrastructure/service-user/create_service_user.py deleted file mode 100644 index ba08779bd159..000000000000 --- a/tests/buildkite/infrastructure/service-user/create_service_user.py +++ /dev/null @@ -1,44 +0,0 @@ -import argparse -import os - -import boto3 - -current_dir = os.path.dirname(__file__) - - -def main(args): - with open( - os.path.join(current_dir, "service-user-template.yml"), encoding="utf-8" - ) as f: - service_user_template = f.read() - - stack_id = "buildkite-elastic-ci-stack-service-user" - - print("Create a new IAM user with suitable 
permissions...")
-    client = boto3.client("cloudformation", region_name=args.aws_region)
-    response = client.create_stack(
-        StackName=stack_id,
-        TemplateBody=service_user_template,
-        Capabilities=[
-            "CAPABILITY_IAM",
-            "CAPABILITY_NAMED_IAM",
-        ],
-        Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}],
-    )
-    waiter = client.get_waiter("stack_create_complete")
-    waiter.wait(StackName=stack_id)
-    user = boto3.resource("iam", region_name=args.aws_region).User(args.user_name)
-    key_pair = user.create_access_key_pair()
-    print("Finished creating an IAM user with suitable permissions.")
-    print(f"Access Key ID: {key_pair.access_key_id}")
-    print(f"Secret Access Key: {key_pair.secret_access_key}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--aws-region", type=str, required=True)
-    parser.add_argument(
-        "--user-name", type=str, default="buildkite-elastic-ci-stack-user"
-    )
-    args = parser.parse_args()
-    main(args)
diff --git a/tests/buildkite/infrastructure/service-user/service-user-template.yml b/tests/buildkite/infrastructure/service-user/service-user-template.yml
deleted file mode 100644
index 2077cfe7b148..000000000000
--- a/tests/buildkite/infrastructure/service-user/service-user-template.yml
+++ /dev/null
@@ -1,349 +0,0 @@
----
-AWSTemplateFormatVersion: "2010-09-09"
-Description: "Buildkite Elastic CI Stack CloudFormation service user"
-
-Parameters:
-  UserName:
-    Type: String
-    Default: buildkite-elastic-ci-stack-user
-    Description: Name of user to create
-
-Outputs:
-  UserNameOutput:
-    Value: !Ref CloudFormationServiceUser
-  UserArnOutput:
-    Value: !GetAtt CloudFormationServiceUser.Arn
-
-Resources:
-  CloudFormationServiceUser:
-    Type: AWS::IAM::User
-    Properties:
-      ManagedPolicyArns:
-        - !Ref SubstackCrudPolicy
-        - !Ref CrudPolicy
-        - !Ref ImageBuilderPolicy
-      UserName: !Ref UserName
-
-  SubstackCrudPolicy:
-    Type: AWS::IAM::ManagedPolicy
-    Properties:
-      PolicyDocument:
-        {
-          "Version": "2012-10-17",
-          "Statement": [
-            {
-              "Effect": "Allow",
-              "Action": "cloudformation:*",
-              "Resource": "*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": [
-                "serverlessrepo:GetApplication",
-                "serverlessrepo:GetCloudFormationTemplate",
-                "serverlessrepo:CreateCloudFormationTemplate"
-              ],
-              "Resource": "*"
-            }
-          ]
-        }
-
-  CrudPolicy:
-    Type: AWS::IAM::ManagedPolicy
-    Properties:
-      PolicyDocument:
-        {
-          "Version": "2012-10-17",
-          "Statement": [
-            {
-              "Effect": "Allow",
-              "Action": [
-                "ec2:DescribeAccountAttributes",
-                "ec2:DescribeAvailabilityZones",
-                "ec2:DescribeInstances",
-                "ec2:DescribeInternetGateways",
-                "ec2:DescribeLaunchTemplateVersions",
-                "ec2:DescribeLaunchTemplates",
-                "ec2:DescribeNetworkInterfaces",
-                "ec2:DescribeRouteTables",
-                "ec2:DescribeSecurityGroups",
-                "ec2:DescribeSubnets",
-                "ec2:DescribeVpcs",
-                "ec2:CreateTags"
-              ],
-              "Resource": "*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": [
-                "ec2:CreateInternetGateway",
-                "ec2:AttachInternetGateway",
-                "ec2:DetachInternetGateway",
-                "ec2:DeleteInternetGateway"
-              ],
-              "Resource": "arn:aws:ec2:*:*:internet-gateway/*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": [
-                "ec2:CreateLaunchTemplate",
-                "ec2:CreateLaunchTemplateVersion",
-                "ec2:DeleteLaunchTemplate"
-              ],
-              "Resource": "arn:aws:ec2:*:*:launch-template/*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": [
-                "ec2:AssociateRouteTable",
-                "ec2:DisassociateRouteTable",
-                "ec2:CreateRoute",
-                "ec2:CreateRouteTable",
-                "ec2:DeleteRoute",
-                "ec2:DeleteRouteTable"
-              ],
-              "Resource": "arn:aws:ec2:*:*:route-table/*"
-            },
-            {
"Effect": "Allow", - "Action": [ - "ec2:AuthorizeSecurityGroupIngress", - "ec2:RevokeSecurityGroupIngress", - "ec2:CreateSecurityGroup", - "ec2:DeleteSecurityGroup" - ], - "Resource": "arn:aws:ec2:*:*:security-group/*" - }, - { - "Effect": "Allow", - "Action": "ec2:RunInstances", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateSubnet", - "ec2:DeleteSubnet", - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:subnet/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateVpc", - "ec2:CreateSecurityGroup", - "ec2:ModifyVpcAttribute", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:CreateSubnet", - "ec2:CreateRouteTable", - "ec2:DeleteVpc" - ], - "Resource": "arn:aws:ec2:*:*:vpc/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateDefaultVpc", - "ec2:CreateDefaultSubnet" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:CreateInstanceProfile", - "iam:GetInstanceProfile", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:DeleteInstanceProfile" - ], - "Resource": "arn:aws:iam::*:instance-profile/*" - }, - { - "Effect": "Allow", - "Action": [ - "kms:DescribeKey", - "kms:CreateGrant", - "kms:Decrypt", - "kms:Encrypt" - ], - "Resource": "arn:aws:kms:*:*:key/*" - }, - { - "Effect": "Allow", - "Action": [ - "lambda:CreateFunction", - "lambda:GetFunction", - "lambda:GetFunctionCodeSigningConfig", - "lambda:AddPermission", - "lambda:RemovePermission", - "lambda:DeleteFunction", - "lambda:InvokeFunction", - "lambda:TagResource" - ], - "Resource": "arn:aws:lambda:*:*:function:*" - }, - { - "Effect": "Allow", - "Action": [ - "logs:CreateLogGroup", - "logs:PutRetentionPolicy", - "logs:DeleteLogGroup" - ], - "Resource": "arn:aws:logs:*:*:log-group:*" - }, - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:CreateBucket", - "s3:PutBucketAcl", - "s3:PutBucketLogging", - "s3:PutBucketTagging", - "s3:PutBucketVersioning" - ], - "Resource": "arn:aws:s3:::*" - }, - { - "Effect": "Allow", - "Action": [ - "ssm:GetParameter", - "ssm:PutParameter", - "ssm:DeleteParameter" - ], - "Resource": "arn:aws:ssm:*:*:parameter/*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:ListPolicies", - "iam:ListInstanceProfiles", - "iam:ListRoles", - "iam:ListPolicyVersions", - "iam:ListRolePolicies", - "iam:ListAttachedRolePolicies", - "iam:ListInstanceProfileTags", - "iam:ListRoleTags", - "iam:ListInstanceProfilesForRole", - "iam:GetPolicyVersion", - "iam:GetPolicy", - "iam:GetInstanceProfile", - "iam:GetRole", - "iam:GetRolePolicy", - "iam:TagPolicy", - "iam:UntagPolicy", - "iam:TagInstanceProfile", - "iam:UntagInstanceProfile", - "iam:TagRole", - "iam:UntagRole", - "iam:CreateRole", - "iam:PassRole", - "iam:DeleteRole", - "iam:UpdateRoleDescription", - "iam:UpdateRole", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:CreateInstanceProfile", - "iam:DeleteInstanceProfile", - "iam:DetachRolePolicy", - "iam:SetDefaultPolicyVersion", - "iam:AttachRolePolicy", - "iam:UpdateAssumeRolePolicy", - "iam:PutRolePermissionsBoundary", - "iam:DeleteRolePermissionsBoundary", - "iam:CreatePolicy", - "iam:DeletePolicyVersion", - "iam:DeletePolicy", - "iam:PutRolePolicy", - "iam:DeleteRolePolicy" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "autoscaling:DescribeLifecycleHookTypes", - "autoscaling:DescribeTerminationPolicyTypes", - "autoscaling:DescribePolicies", - "autoscaling:DescribeWarmPool", - 
"autoscaling:DescribeScalingActivities", - "autoscaling:DescribeScalingProcessTypes", - "autoscaling:DescribeScheduledActions", - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeAutoScalingInstances", - "autoscaling:DescribeLifecycleHooks", - "autoscaling:SetDesiredCapacity", - "autoscaling:PutLifecycleHook", - "autoscaling:DeleteLifecycleHook", - "autoscaling:SetInstanceProtection", - "autoscaling:CreateAutoScalingGroup", - "autoscaling:EnableMetricsCollection", - "autoscaling:UpdateAutoScalingGroup", - "autoscaling:DeleteAutoScalingGroup", - "autoscaling:PutScalingPolicy", - "autoscaling:DeletePolicy", - "autoscaling:BatchPutScheduledUpdateGroupAction", - "autoscaling:PutScheduledUpdateGroupAction", - "autoscaling:DeleteScheduledAction", - "autoscaling:PutWarmPool", - "autoscaling:DeleteWarmPool", - "autoscaling:TerminateInstanceInAutoScalingGroup", - "autoscaling:AttachInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "events:DescribeRule", - "events:PutRule", - "events:PutTargets", - "events:RemoveTargets", - "events:DeleteRule" - ], - "Resource": "arn:aws:events:*:*:rule/*" - } - ] - } - - ImageBuilderPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "imagebuilder:CreateComponent", - "imagebuilder:GetComponent", - "imagebuilder:DeleteComponent", - "imagebuilder:CreateImageRecipe", - "imagebuilder:GetImageRecipe", - "imagebuilder:DeleteImageRecipe", - "imagebuilder:CreateImagePipeline", - "imagebuilder:GetImagePipeline", - "imagebuilder:DeleteImagePipeline", - "imagebuilder:CreateInfrastructureConfiguration", - "imagebuilder:GetInfrastructureConfiguration", - "imagebuilder:DeleteInfrastructureConfiguration", - "imagebuilder:CreateDistributionConfiguration", - "imagebuilder:GetDistributionConfiguration", - "imagebuilder:DeleteDistributionConfiguration", - "imagebuilder:TagResource", - "imagebuilder:StartImagePipelineExecution", - "ec2:DescribeImages", - "ec2:DescribeSnapshots", - "ec2:DescribeRegions", - "ec2:DescribeVolumes", - "ec2:DescribeKeyPairs", - "ec2:DescribeInstanceTypeOfferings" - ], - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py deleted file mode 100644 index 8051b991da51..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -import copy -import json -import os -import sys -from urllib.request import urlopen - -import boto3 -import cfn_flip -from metadata import IMAGE_PARAMS - -current_dir = os.path.dirname(__file__) -sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import replace_stack, wait - -BUILDKITE_CF_TEMPLATE_URL = ( - "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" -) - - -def format_params(*, stack_id, aws_region, ami_mapping): - params = copy.deepcopy(IMAGE_PARAMS[stack_id]) - with open( - os.path.join(current_dir, params["BootstrapScript"]), - encoding="utf-8", - ) as f: - bootstrap_script = f.read() - params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]] - params["BootstrapScript"] = bootstrap_script - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_ami_mapping(): - with urlopen(BUILDKITE_CF_TEMPLATE_URL) as response: - 
buildkite_cf_template = response.read().decode("utf-8") - cfn_obj = json.loads(cfn_flip.to_json(buildkite_cf_template)) - return cfn_obj["Mappings"]["AWSRegion2AMI"] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-worker" - - -def main(args): - with open( - os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"), - encoding="utf-8", - ) as f: - ec2_image_pipeline_template = f.read() - - ami_mapping = get_ami_mapping() - - client = boto3.client("cloudformation", region_name=args.aws_region) - promises = [] - - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating EC2 image builder stack {stack_id_full}...") - - params = format_params( - stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping - ) - - promise = replace_stack( - args, - client=client, - stack_name=stack_id_full, - template_body=ec2_image_pipeline_template, - params=params, - ) - promises.append(promise) - print( - f"EC2 image builder stack {stack_id_full} is in progress in the background" - ) - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml deleted file mode 100644 index 8d3bafa72f08..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml +++ /dev/null @@ -1,108 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "EC2 Image Builder pipelines to build workers" - -Parameters: - BaseImageId: - Type: String - Description: Base AMI to build a new image on top of. - - BootstrapScript: - Type: String - Description: Content of AMI customization script - - InstanceType: - Type: String - Description: Instance type for the Image Builder instances. - - InstanceOperatingSystem: - Type: String - Description: The operating system to run on the instance - AllowedValues: - - Linux - - Windows - Default: "Linux" - - VolumeSize: - Type: Number - Description: Size of EBS volume, in GiBs - -Conditions: - IsInstanceWindows: - !Equals [ !Ref InstanceOperatingSystem, "Windows" ] - -Resources: - # IAM role for the image builder instance - InstanceRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: "Allow" - Principal: - Service: "ec2.amazonaws.com" - Action: "sts:AssumeRole" - ManagedPolicyArns: - - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - - arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder - - arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess - - InstanceProfile: - Type: AWS::IAM::InstanceProfile - Properties: - Roles: - - !Ref InstanceRole - - # Component that runs the bootstrap script - BootstrapComponent: - Type: AWS::ImageBuilder::Component - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Platform: !Ref InstanceOperatingSystem - Version: "1.0.0" - Description: Execute a bootstrap script. 
- Data: !Ref BootstrapScript - - Recipe: - Type: AWS::ImageBuilder::ImageRecipe - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Components: - - ComponentArn: !Ref BootstrapComponent - ParentImage: !Ref BaseImageId - BlockDeviceMappings: - - DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"] - Ebs: - DeleteOnTermination: true - Encrypted: false - VolumeSize: !Ref VolumeSize - VolumeType: gp2 - Version: "1.0.0" - - Infrastructure: - Type: AWS::ImageBuilder::InfrastructureConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - InstanceProfileName: !Ref InstanceProfile - InstanceTypes: - - !Ref InstanceType - TerminateInstanceOnFailure: true - - # Copy to this region only - Distribution: - Type: AWS::ImageBuilder::DistributionConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Distributions: - - Region: !Ref AWS::Region - AmiDistributionConfiguration: {} - - # Composition of the above elements - Pipeline: - Type: AWS::ImageBuilder::ImagePipeline - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - DistributionConfigurationArn: !Ref Distribution - ImageRecipeArn: !Ref Recipe - InfrastructureConfigurationArn: !Ref Infrastructure diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml deleted file mode 100644 index 88403911cbc6..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: BuildKiteLinuxAMD64GPUBootstrap -description: Set up worker image for linux-amd64-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecuteBash - inputs: - commands: - - | - yum groupinstall -y "Development tools" - yum install -y kernel-devel-$(uname -r) - dnf install -y kernel-modules-extra - aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ . 
- chmod +x NVIDIA-Linux-x86_64*.run - ./NVIDIA-Linux-x86_64*.run --silent - - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo - yum install -y nvidia-container-toolkit - yum clean expire-cache - nvidia-ctk runtime configure --runtime=docker - systemctl restart docker diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py deleted file mode 100644 index 37100209fe2e..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py +++ /dev/null @@ -1,18 +0,0 @@ -IMAGE_PARAMS = { - "linux-amd64-gpu": { - "BaseImageId": "linuxamd64", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "linux-amd64-gpu-bootstrap.yml", - "InstanceType": "g4dn.xlarge", - "InstanceOperatingSystem": "Linux", - "VolumeSize": "40", # in GiBs - }, - "windows-gpu": { - "BaseImageId": "windows", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "windows-gpu-bootstrap.yml", - "InstanceType": "g4dn.2xlarge", - "InstanceOperatingSystem": "Windows", - "VolumeSize": "120", # in GiBs - }, -} diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py deleted file mode 100644 index 9edb8b1a7c24..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py +++ /dev/null @@ -1,22 +0,0 @@ -import argparse - -import boto3 -from create_worker_image_pipelines import get_full_stack_id -from metadata import IMAGE_PARAMS - - -def main(args): - cf = boto3.resource("cloudformation", region_name=args.aws_region) - builder_client = boto3.client("imagebuilder", region_name=args.aws_region) - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id - print(f"Running pipeline {pipeline_arn} to generate a new AMI...") - r = builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml deleted file mode 100644 index 0348e28c8709..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: BuildKiteWindowsGPUBootstrap -description: Set up worker image for windows-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecutePowerShell - inputs: - commands: - - | - $ErrorActionPreference = "Stop" - - choco --version - choco feature enable -n=allowGlobalConfirmation - - # CMake 3.29.2 - Write-Host '>>> Installing CMake 3.29.2...' - choco install cmake --version 3.29.2 --installargs "ADD_CMAKE_TO_PATH=System" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Notepad++ - Write-Host '>>> Installing Notepad++...' - choco install notepadplusplus - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Mambaforge - Write-Host '>>> Installing Mambaforge...' 
- choco install mambaforge /RegisterPython:1 /D:C:\tools\mambaforge - C:\tools\mambaforge\Scripts\conda.exe init --user --system - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - . "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - conda config --set auto_activate_base false - - # Install Java 11 - Write-Host '>>> Installing Java 11...' - choco install openjdk11 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Maven - Write-Host '>>> Installing Maven...' - choco install maven - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install GraphViz - Write-Host '>>> Installing GraphViz...' - choco install graphviz - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Visual Studio 2022 Community - Write-Host '>>> Installing Visual Studio 2022 Community...' - choco install visualstudio2022community ` - --params "--wait --passive --norestart" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install visualstudio2022-workload-nativedesktop --params ` - "--wait --passive --norestart --includeOptional" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install CUDA 12.4 - Write-Host '>>> Installing CUDA 12.4...' - choco install cuda --version=12.4.1.551 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install R - Write-Host '>>> Installing R...' - choco install r.project --version=4.3.2 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install rtools --version=4.3.5550 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/pipeline-mac-m1.yml b/tests/buildkite/pipeline-mac-m1.yml deleted file mode 100644 index 57b1b1d12010..000000000000 --- a/tests/buildkite/pipeline-mac-m1.yml +++ /dev/null @@ -1,13 +0,0 @@ -steps: - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - - label: ":macos: Build libxgboost4j.dylib for MacOS M1" - command: "tests/buildkite/build-jvm-macos-m1.sh" - key: mac-m1-jvm - agents: - queue: mac-mini-m1 - - label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11" - command: "tests/buildkite/test-macos-m1-clang11.sh" - key: mac-m1-appleclang11 - agents: - queue: mac-mini-m1 diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml deleted file mode 100644 index cbb573c3682c..000000000000 --- a/tests/buildkite/pipeline-mgpu.yml +++ /dev/null @@ -1,48 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh jvm_gpu_build" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build and test JVM 
packages with CUDA" - command: "tests/buildkite/build-jvm-packages-gpu.sh" - key: build-jvm-packages-gpu - agents: - queue: linux-amd64-mgpu - - wait - #### -------- TEST -------- - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-mgpu.sh" - key: test-cpp-mgpu - agents: - queue: linux-amd64-mgpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml deleted file mode 100644 index 4d84f93a54d4..000000000000 --- a/tests/buildkite/pipeline-nightly.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Nightly CI pipeline, to test against dev versions of dependencies - -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 - USE_DEPS_DEV_VER: "1" - # Use dev versions of RAPIDS and other dependencies -steps: - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh gpu_dev_ver" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Build CUDA + RMM Nightly" - command: "tests/buildkite/build-cuda-with-rmm.sh dev" - key: build-cuda-rmm-nightly - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml deleted file mode 100644 index 83a61981e716..000000000000 --- a/tests/buildkite/pipeline-win64.yml +++ /dev/null @@ -1,24 +0,0 @@ -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- BUILD -------- - - label: ":windows: Build XGBoost for Windows with CUDA" - command: "tests/buildkite/build-win64-gpu.ps1" - key: build-win64-gpu - agents: - queue: windows-cpu - - - wait - - #### -------- TEST -------- - - label: ":windows: Test XGBoost on Windows" - command: "tests/buildkite/test-win64-gpu.ps1" - key: test-win64-gpu - agents: - queue: windows-gpu diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml deleted file mode 100644 index 6c1df33b84dd..000000000000 --- a/tests/buildkite/pipeline.yml +++ /dev/null @@ -1,113 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - 
"tests/buildkite/build-containers.sh cpu" - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Run clang-tidy" - command: "tests/buildkite/run-clang-tidy.sh" - key: run-clang-tidy - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU" - command: "tests/buildkite/build-cpu.sh" - key: build-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU ARM64 + manylinux_2_28_aarch64 wheel" - command: "tests/buildkite/build-cpu-arm64.sh" - key: build-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Build CUDA + manylinux_2_28_x86_64 wheel" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build CUDA with RMM" - command: "tests/buildkite/build-cuda-with-rmm.sh stable" - key: build-cuda-with-rmm - agents: - queue: linux-amd64-cpu - - label: ":console: Build R package with CUDA" - command: "tests/buildkite/build-gpu-rpkg.sh" - key: build-gpu-rpkg - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM packages" - timeout_in_minutes: 30 - command: "tests/buildkite/build-jvm-packages.sh" - key: build-jvm-packages - agents: - queue: linux-amd64-cpu - - label: ":console: Build libxgboost4j.so for Linux ARM64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh" - key: build-jvm-linux-arm64-manylinux2014 - agents: - queue: linux-arm64-cpu - - label: ":console: Build libxgboost4j.so for Linux x86_64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh" - key: build-jvm-linux-x86_64-manylinux2014 - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM package doc" - command: "tests/buildkite/build-jvm-doc.sh" - key: build-jvm-doc - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014.sh x86_64" - key: build-manylinux2014-x86_64 - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014.sh aarch64" - key: build-manylinux2014-aarch64 - agents: - queue: linux-arm64-cpu - - wait - #### -------- TEST -------- - - label: ":console: Test Python package, CPU" - command: "tests/buildkite/test-python-cpu.sh" - key: test-python-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Test Python package, CPU ARM64" - command: "tests/buildkite/test-python-cpu-arm64.sh" - key: test-python-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-gpu.sh" - key: test-cpp-gpu - agents: - queue: linux-amd64-gpu - - wait - #### -------- DEPLOY JVM -------- - - label: ":console: Deploy JVM packages" - command: "tests/buildkite/deploy-jvm-packages.sh" - key: deploy-jvm-packages - agents: - queue: linux-amd64-cpu diff --git a/tests/buildkite/run-clang-tidy.sh b/tests/buildkite/run-clang-tidy.sh deleted file mode 100755 index 95ff010c20f1..000000000000 --- a/tests/buildkite/run-clang-tidy.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Run clang-tidy" - -source tests/buildkite/conftest.sh - -tests/ci_build/ci_build.sh 
clang_tidy \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - python3 tests/ci_build/tidy.py --cuda-archs 75 diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh deleted file mode 100755 index d7197db2efce..000000000000 --- a/tests/buildkite/test-cpp-gpu.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Run Google Tests with CUDA, using a GPU" -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost - -echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" -rm -rfv build/ -buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --use-rmm-pool diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh deleted file mode 100755 index 65614b191d04..000000000000 --- a/tests/buildkite/test-cpp-mgpu.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -echo "--- Run Google Tests with CUDA, using multiple GPUs" -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/buildkite/test-macos-m1-clang11.sh b/tests/buildkite/test-macos-m1-clang11.sh deleted file mode 100755 index 6824cb7b14b4..000000000000 --- a/tests/buildkite/test-macos-m1-clang11.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Display system info -echo "--- Display system information" -set -x -system_profiler SPSoftwareDataType -sysctl -n machdep.cpu.brand_string -uname -m -set +x - -# Ensure that XGBoost can be built with Clang 11 -echo "--- Build and Test XGBoost with MacOS M1, Clang 11" -set -x -LLVM11_PATH=$(brew --prefix llvm\@11) -mkdir build -pushd build -cmake .. -GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \ - -DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \ - -DUSE_DMLC_GTEST=ON -ninja -v -./testxgboost diff --git a/tests/buildkite/test-python-cpu-arm64.sh b/tests/buildkite/test-python-cpu-arm64.sh deleted file mode 100755 index 68a428034073..000000000000 --- a/tests/buildkite/test-python-cpu-arm64.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Test Python CPU ARM64" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64 -buildkite-agent artifact download "xgboost" . 
--step build-cpu-arm64 -chmod +x ./xgboost -tests/ci_build/ci_build.sh aarch64 tests/ci_build/test_python.sh cpu-arm64 diff --git a/tests/buildkite/test-python-cpu.sh b/tests/buildkite/test-python-cpu.sh deleted file mode 100755 index 6c53dc2821bc..000000000000 --- a/tests/buildkite/test-python-cpu.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Test CPU code in Python env" - -source tests/buildkite/conftest.sh - -mkdir -pv python-package/dist -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "xgboost" . --step build-cpu -chmod +x ./xgboost - -export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/cpu) -set_buildkite_env_vars_in_container -tests/ci_build/ci_build.sh cpu tests/ci_build/test_python.sh cpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh deleted file mode 100755 index d7bd729a2e01..000000000000 --- a/tests/buildkite/test-python-gpu.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -source tests/buildkite/conftest.sh - -echo "--- Fetch build artifacts" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -if [[ -z "${USE_DEPS_DEV_VER-}" ]] -then - container_tag='gpu' - rapids_version=${RAPIDS_VERSION} -else - container_tag='gpu_dev_ver' - rapids_version=${DEV_RAPIDS_VERSION} -fi - -command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=${rapids_version} --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" - -# Run specified test suite -case "$suite" in - gpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, single GPU" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - mgpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, 4 GPUs" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - *) - echo "Usage: $0 {gpu|mgpu} [extra args to pass to pytest]" - exit 1 - ;; -esac diff --git a/tests/buildkite/test-win64-gpu.ps1 b/tests/buildkite/test-win64-gpu.ps1 deleted file mode 100644 index 95a51b50228d..000000000000 --- a/tests/buildkite/test-win64-gpu.ps1 +++ /dev/null @@ -1,39 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Test XGBoost on Windows with CUDA" - -New-Item python-package/dist -ItemType Directory -ea 0 -New-Item build -ItemType Directory -ea 0 -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "build/testxgboost.exe" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "xgboost.exe" . 
--step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -nvcc --version - -Write-Host "--- Run Google Tests" -& build/testxgboost.exe -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Set up Python env" -conda activate -$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) -mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml -conda activate ${env_name} -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & python -m pip install python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Run Python tests" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -Write-Host "--- Run Python tests with GPU" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` - tests/python-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver deleted file mode 100644 index d23c5e83c2c7..000000000000 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ /dev/null @@ -1,54 +0,0 @@ -# Container to test XGBoost against dev versions of dependencies - -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG - # Should be first 4 digits of the dev version (e.g. 24.06) -ARG NCCL_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH - -# Create new Conda environment with dev versions of cuDF, Dask, and cuPy -RUN \ - export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ - python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ - "nccl>=${NCCL_SHORT_VER}" \ - dask \ - "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ - numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ - python-kubernetes urllib3 graphviz hypothesis loky \ - "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n gpu_test pip install buildkite-test-collector - -ENV GOSU_VERSION=1.10 -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 b/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 
deleted file mode 100644 index 52baff43bb6f..000000000000 --- a/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_aarch64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 b/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 deleted file mode 100644 index 578b85618776..000000000000 --- a/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel ninja-build - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/build_jvm_doc.sh b/tests/ci_build/build_jvm_doc.sh deleted file mode 100755 index 01a91dd629b5..000000000000 --- a/tests/ci_build/build_jvm_doc.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -if [ $# -ne 1 ]; then - echo "Usage: $0 [branch name]" - exit 1 -fi - -set -e -set -x - -rm -rf build/ -cd jvm-packages - -branch_name=$1 - -# Install JVM packages in local Maven repository -mvn --no-transfer-progress install -DskipTests -# Build Scaladocs -mvn --no-transfer-progress scala:doc -DskipTests -# Build Javadocs -mvn --no-transfer-progress javadoc:javadoc -DskipTests - -# Package JVM docs in a tarball -mkdir -p tmp/scaladocs -cp -rv xgboost4j/target/reports/apidocs/ ./tmp/javadocs/ -cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ -cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ -cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/ - -cd tmp -tar cvjf ${branch_name}.tar.bz2 javadocs/ scaladocs/ -mv ${branch_name}.tar.bz2 .. -cd .. -rm -rfv tmp/ - -set +x -set +e diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh deleted file mode 100755 index 99681f5ca43c..000000000000 --- a/tests/ci_build/build_jvm_packages.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -set -e -set -x - -spark_version=$1 -use_cuda=$2 -gpu_arch=$3 -use_scala213=$4 - -gpu_options="" -if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then - gpu_options="$use_cuda -Pgpu" -fi - -rm -rf build/ -cd jvm-packages - -if [ "x$gpu_arch" != "x" ]; then - export GPU_ARCH_FLAG=$gpu_arch -fi - -# Purge artifacts and set correct Scala version -pushd .. 
-if [ "x$use_scala213" != "x" ]; then - python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts -else - python dev/change_scala_version.py --scala-version 2.12 --purge-artifacts -fi -popd - -# Build and test XGBoost4j-spark against different spark versions only for CPU and scala=2.12 -if [ "x$gpu_options" == "x" ] && [ "x$use_scala213" == "x" ]; then - mvn --no-transfer-progress clean package -Dspark.version=3.1.3 -pl xgboost4j,xgboost4j-spark - mvn --no-transfer-progress clean package -Dspark.version=3.2.4 -pl xgboost4j,xgboost4j-spark - mvn --no-transfer-progress clean package -Dspark.version=3.3.4 -pl xgboost4j,xgboost4j-spark - mvn --no-transfer-progress clean package -Dspark.version=3.4.3 -pl xgboost4j,xgboost4j-spark -fi - -mvn --no-transfer-progress clean install -Dspark.version=${spark_version} $gpu_options - -# Integration tests -if [ "x$use_cuda" == "x" ]; then - mvn --no-transfer-progress test -pl xgboost4j-example -fi - -set +x -set +e diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh deleted file mode 100755 index 3238c41e1bcb..000000000000 --- a/tests/ci_build/build_via_cmake.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash -set -e - -if [[ "$1" == --conda-env=* ]] -then - conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -) - echo "Activating Conda environment ${conda_env}" - shift 1 - cmake_args="$@" - - # Workaround for file permission error - if [[ -n $CI_BUILD_UID ]] - then - gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs - fi - - source activate ${conda_env} - cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" -else - cmake_args="$@" - cmake_prefix_flag='' -fi - -rm -rf build -mkdir build -cd build -# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until -# https://github.com/dmlc/xgboost/issues/10400 is fixed -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON -ninja clean -time ninja -v -cd .. diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh deleted file mode 100755 index a2f2d6063160..000000000000 --- a/tests/ci_build/ci_build.sh +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env bash -# -# Execute command within a docker container -# -# Usage: ci_build.sh [--use-gpus] -# [--dockerfile ] [-it] -# [--build-arg ] -# -# CONTAINER_TYPE: Type of the docker container used the run the build: e.g., -# (cpu | gpu) -# -# --use-gpus: Whether to grant the container access to NVIDIA GPUs. -# -# DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If -# this optional value is not supplied (via the --dockerfile -# flag), will use Dockerfile.CONTAINER_TYPE in default -# -# BUILD_ARG: (Optional) an argument to be passed to docker build -# -# COMMAND: Command to be executed in the docker container -# -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Get the command line arguments. 
diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh
deleted file mode 100755
index 3238c41e1bcb..000000000000
--- a/tests/ci_build/build_via_cmake.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-if [[ "$1" == --conda-env=* ]]
-then
-  conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -)
-  echo "Activating Conda environment ${conda_env}"
-  shift 1
-  cmake_args="$@"
-
-  # Workaround for file permission error
-  if [[ -n $CI_BUILD_UID ]]
-  then
-    gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs
-  fi
-
-  source activate ${conda_env}
-  cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX"
-else
-  cmake_args="$@"
-  cmake_prefix_flag=''
-fi
-
-rm -rf build
-mkdir build
-cd build
-# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until
-# https://github.com/dmlc/xgboost/issues/10400 is fixed
-cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON
-ninja clean
-time ninja -v
-cd ..
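Every argument after the script name was forwarded verbatim to cmake, with an optional leading --conda-env flag; sketches of both modes (the environment name is an example, not mandated by the script):

    # Plain build; all arguments go straight to cmake:
    bash tests/ci_build/build_via_cmake.sh -DUSE_CUDA=OFF

    # Build inside a named Conda env; the flag must come first:
    bash tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON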
"$OSTYPE" == "darwin"* ]]; then - USER_IDS="-e CI_BUILD_UID=$( id -u ) -e CI_BUILD_GID=$( id -g ) -e CI_BUILD_USER=$( id -un ) -e CI_BUILD_GROUP=$( id -gn ) -e CI_BUILD_HOME=${WORKSPACE}" -fi - -# Print arguments. -cat <=1.4.1 -- pandas -- matplotlib -- dask -- distributed -- python-graphviz -- pytest -- jsonschema -- hypothesis -- python-graphviz -- pip -- py-ubjson -- loky -- pyarrow diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh deleted file mode 100755 index 2cb108c8bc6f..000000000000 --- a/tests/ci_build/deploy_jvm_packages.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -set -e -set -x - -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" - exit 1 -fi - -spark_version=$1 - -cd jvm-packages -rm -rf $(find . -name target) -rm -rf ../build/ - -## Deploy JVM packages to xgboost-maven-repo - -# Scala 2.12, CPU variant -mvn --no-transfer-progress deploy -Pdefault,release-to-s3 -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn clean -mvn clean -Pdefault,release-to-s3 - -# Scala 2.12, GPU variant -mvn --no-transfer-progress install -Pgpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true - -# Scala 2.13, CPU variant -pushd .. -python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts -popd -mvn --no-transfer-progress deploy -Pdefault,release-to-s3 -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn clean -mvn clean -Pdefault,release-to-s3 - -# Scala 2.13, GPU variant -mvn --no-transfer-progress install -Pgpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true - -set +x -set +e diff --git a/tests/ci_build/jenkins_tools.Groovy b/tests/ci_build/jenkins_tools.Groovy deleted file mode 100644 index 1bc2574c6ac0..000000000000 --- a/tests/ci_build/jenkins_tools.Groovy +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/groovy -// -*- mode: groovy -*- - -/* Utility functions for Jenkins */ - -// Command to run command inside a docker container -dockerRun = 'tests/ci_build/ci_build.sh' - - -/** - * Creates cmake and make builds - */ -def buildFactory(buildName, conf, restricted, build_func) { - def os = conf["os"] - def device = conf["withGpu"] ? (conf["multiGpu"] ? "mgpu" : "gpu") : "cpu" - def restricted_flag = restricted ? "restricted" : "unrestricted" - def nodeReq = "${os} && ${device} && ${restricted_flag}" - def dockerTarget = conf["withGpu"] ? "gpu" : "cpu" - [ ("${buildName}") : { build_func("${buildName}", conf, nodeReq, dockerTarget) } - ] -} - -def cmakeOptions(conf) { - return ([ - conf["withGpu"] ? '-DUSE_CUDA=ON' : '-DUSE_CUDA=OFF', - conf["withNccl"] ? '-DUSE_NCCL=ON' : '-DUSE_NCCL=OFF', - conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : ''] - ).join(" ") -} - -def getBuildName(conf) { - def gpuLabel = conf['withGpu'] ? ( (conf['multiGpu'] ? "_mgpu" : "") + "_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu" - def ompLabel = conf['withOmp'] ? 
"_omp" : "" - def pyLabel = "_py${conf['pythonVersion']}" - return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}" -} - -return this diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh deleted file mode 100755 index a1a023046e5b..000000000000 --- a/tests/ci_build/test_python.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -set -e - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -# Install XGBoost Python package -function install_xgboost { - wheel_found=0 - pip install --upgrade pip --user - for file in python-package/dist/*.whl - do - if [ -e "${file}" ] - then - pip install --user "${file}" - wheel_found=1 - break # need just one - fi - done - if [ "$wheel_found" -eq 0 ] - then - pushd . - cd python-package - pip install --user -v . - popd - fi -} - -function setup_pyspark_envs { - export PYSPARK_DRIVER_PYTHON=`which python` - export PYSPARK_PYTHON=`which python` - export SPARK_TESTING=1 -} - -function unset_pyspark_envs { - unset PYSPARK_DRIVER_PYTHON - unset PYSPARK_PYTHON - unset SPARK_TESTING -} - -function uninstall_xgboost { - pip uninstall -y xgboost -} - -# Run specified test suite -case "$suite" in - gpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - mgpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu) - source activate linux_cpu_test - set -x - install_xgboost - export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu-arm64) - source activate aarch64_test - set -x - install_xgboost - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - *) - echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [extra args to pass to pytest]" - exit 1 - ;; -esac diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 97ee39b31a1e..6f4c34edfa9f 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -32,9 +32,6 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); - if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { - GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); - } 
diff --git a/tests/ci_build/jenkins_tools.Groovy b/tests/ci_build/jenkins_tools.Groovy
deleted file mode 100644
index 1bc2574c6ac0..000000000000
--- a/tests/ci_build/jenkins_tools.Groovy
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/groovy
-// -*- mode: groovy -*-
-
-/* Utility functions for Jenkins */
-
-// Command to run command inside a docker container
-dockerRun = 'tests/ci_build/ci_build.sh'
-
-
-/**
- * Creates cmake and make builds
- */
-def buildFactory(buildName, conf, restricted, build_func) {
-  def os = conf["os"]
-  def device = conf["withGpu"] ? (conf["multiGpu"] ? "mgpu" : "gpu") : "cpu"
-  def restricted_flag = restricted ? "restricted" : "unrestricted"
-  def nodeReq = "${os} && ${device} && ${restricted_flag}"
-  def dockerTarget = conf["withGpu"] ? "gpu" : "cpu"
-  [ ("${buildName}") : { build_func("${buildName}", conf, nodeReq, dockerTarget) }
-  ]
-}
-
-def cmakeOptions(conf) {
-  return ([
-    conf["withGpu"] ? '-DUSE_CUDA=ON' : '-DUSE_CUDA=OFF',
-    conf["withNccl"] ? '-DUSE_NCCL=ON' : '-DUSE_NCCL=OFF',
-    conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : '']
-  ).join(" ")
-}
-
-def getBuildName(conf) {
-  def gpuLabel = conf['withGpu'] ? ( (conf['multiGpu'] ? "_mgpu" : "") + "_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu"
-  def ompLabel = conf['withOmp'] ? "_omp" : ""
-  def pyLabel = "_py${conf['pythonVersion']}"
-  return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
-}
-
-return this
diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh
deleted file mode 100755
index a1a023046e5b..000000000000
--- a/tests/ci_build/test_python.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-set -e
-
-if [ "$#" -lt 1 ]
-then
-  suite=''
-  args=''
-else
-  suite=$1
-  shift 1
-  args="$@"
-fi
-
-# Install XGBoost Python package
-function install_xgboost {
-  wheel_found=0
-  pip install --upgrade pip --user
-  for file in python-package/dist/*.whl
-  do
-    if [ -e "${file}" ]
-    then
-      pip install --user "${file}"
-      wheel_found=1
-      break  # need just one
-    fi
-  done
-  if [ "$wheel_found" -eq 0 ]
-  then
-    pushd .
-    cd python-package
-    pip install --user -v .
-    popd
-  fi
-}
-
-function setup_pyspark_envs {
-  export PYSPARK_DRIVER_PYTHON=`which python`
-  export PYSPARK_PYTHON=`which python`
-  export SPARK_TESTING=1
-}
-
-function unset_pyspark_envs {
-  unset PYSPARK_DRIVER_PYTHON
-  unset PYSPARK_PYTHON
-  unset SPARK_TESTING
-}
-
-function uninstall_xgboost {
-  pip uninstall -y xgboost
-}
-
-# Run specified test suite
-case "$suite" in
-  gpu)
-    source activate gpu_test
-    set -x
-    install_xgboost
-    setup_pyspark_envs
-    python -c 'from cupy.cuda import jitify; jitify._init_module()'
-    pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
-    unset_pyspark_envs
-    uninstall_xgboost
-    set +x
-    ;;
-
-  mgpu)
-    source activate gpu_test
-    set -x
-    install_xgboost
-    setup_pyspark_envs
-    python -c 'from cupy.cuda import jitify; jitify._init_module()'
-    pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
-    pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_dask
-    pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_spark
-    pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_federated
-    unset_pyspark_envs
-    uninstall_xgboost
-    set +x
-    ;;
-
-  cpu)
-    source activate linux_cpu_test
-    set -x
-    install_xgboost
-    export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
-    setup_pyspark_envs
-    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
-    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_dask
-    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_spark
-    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_federated
-    unset_pyspark_envs
-    uninstall_xgboost
-    set +x
-    ;;
-
-  cpu-arm64)
-    source activate aarch64_test
-    set -x
-    install_xgboost
-    setup_pyspark_envs
-    pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
-    unset_pyspark_envs
-    uninstall_xgboost
-    set +x
-    ;;
-
-  *)
-    echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [extra args to pass to pytest]"
-    exit 1
-    ;;
-esac
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index 97ee39b31a1e..6f4c34edfa9f 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -32,9 +32,6 @@ class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
  public:
   void Run() {
     auto type = this->GetParam();
-    if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
-      GTEST_SKIP_("Host numa might require special system capabilities, skipping for now.");
-    }
     detail::GrowOnlyVirtualMemVec vec{type};
     auto prop = xgboost::cudr::MakeAllocProp(type);
     auto gran = xgboost::cudr::GetAllocGranularity(&prop);
@@ -114,7 +111,15 @@ TEST(TestVirtualMem, Version) {
   xgboost::curt::DrVersion(&major, &minor);
   LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
   PinnedMemory pinned;
+#if defined(xgboost_IS_WIN)
   ASSERT_FALSE(pinned.IsVm());
+#else  // defined(xgboost_IS_WIN)
+  if (major >= 12 && minor >= 5) {
+    ASSERT_TRUE(pinned.IsVm());
+  } else {
+    ASSERT_FALSE(pinned.IsVm());
+  }
+#endif  // defined(xgboost_IS_WIN)
 }
 
 TEST(AtomitFetch, Max) {
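To exercise just the tests touched by this hunk, the Google Test binary can be filtered by name; this assumes the default out-of-source build directory and a build configured with -DGOOGLE_TEST=ON, as in build_via_cmake.sh above:

    ./build/testxgboost --gtest_filter='*TestVirtualMem*'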