From da850865ec1c416231fb6539c8df227c33409ddc Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Fri, 20 Sep 2024 16:06:56 +0200
Subject: [PATCH] chore(gpu): add file to run full tests on H100 from workflow
 only

---
 .github/workflows/gpu_full_h100_tests.yml | 157 ++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 .github/workflows/gpu_full_h100_tests.yml

diff --git a/.github/workflows/gpu_full_h100_tests.yml b/.github/workflows/gpu_full_h100_tests.yml
new file mode 100644
index 0000000000..43cfb58533
--- /dev/null
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -0,0 +1,157 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Full tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+
+on:
+  workflow_dispatch:
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-h100-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:
+    name: CUDA H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"