microsoft
diff --git a/‎.github/workflows/codeql.yml
+2-8 b/‎.github/workflows/codeql.yml
+2-8
diff --git a/‎.github/workflows/integration-test-backup.yml
+3-12 b/‎.github/workflows/integration-test-backup.yml
+3-12
diff --git a/‎.github/workflows/lint.yml
+6-6 b/‎.github/workflows/lint.yml
+6-6
diff --git a/‎.github/workflows/ut-backup.yml
+3-16 b/‎.github/workflows/ut-backup.yml
+3-16
diff --git a/‎CMakeLists.txt
+1-1 b/‎CMakeLists.txt
+1-1
diff --git a/‎README.md
+10-26 b/‎README.md
+10-26
diff --git a/‎docker/base-cuda12.1.dockerfile
+8-3 b/‎docker/base-cuda12.1.dockerfile
+8-3
diff --git a/‎docker/dev-cuda11.8.dockerfile
+28 b/‎docker/dev-cuda11.8.dockerfile
+28
diff --git a/‎docker/dev-cuda12.1.dockerfile
+27 b/‎docker/dev-cuda12.1.dockerfile
+27
diff --git a/‎include/mscclpp/core.hpp
+4-1 b/‎include/mscclpp/core.hpp
+4-1
diff --git a/‎pyproject.toml
+1-1 b/‎pyproject.toml
+1-1
diff --git a/‎python/CMakeLists.txt
+2-3 b/‎python/CMakeLists.txt
+2-3
diff --git a/‎python/mscclpp/__init__.py
+3 b/‎python/mscclpp/__init__.py
+3
@@ -12,7 +12,7 @@ jobs:
     name: Analyze
     runs-on: 'ubuntu-latest'
     container:
-      image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
+      image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
 
     permissions:
       actions: read
@@ -27,7 +27,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Check disk space
       run: |
@@ -38,12 +38,6 @@ jobs:
       with:
         languages: ${{ matrix.language }}
 
-    - name: Install cmake
-      run: |
-        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
-        sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
-
     - name: Dubious ownership exception
       run: |
         git config --global --add safe.directory /__w/mscclpp/mscclpp
 
@@ -4,7 +4,7 @@ on: workflow_dispatch
 
 jobs:
   IntegrationTest:
-    runs-on: self-hosted
+    runs-on: [ self-hosted, A100 ]
     defaults:
       run:
         shell: bash
@@ -13,22 +13,17 @@ jobs:
         cuda: [ cuda11.8, cuda12.1 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
-      - name: Install CMake
-        run: |
-          curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-          tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
-
       - name: Build
         run: |
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
 
       - name: Lock GPU clock frequency
@@ -41,7 +36,6 @@ jobs:
       - name: Run mscclpp AllGather test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -50,13 +44,11 @@ jobs:
       - name: Run mscclpp SendRecv test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
 
       - name: Run mscclpp AllReduce test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -68,7 +60,6 @@ jobs:
       - name: Run mscclpp AllToAll test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
 
 
@@ -11,7 +11,7 @@ jobs:
 
     steps:
     - name: Check out Git repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Install ClangFormat
       run: |
@@ -28,25 +28,25 @@ jobs:
 
     steps:
       - name: Check out Git repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3
 
       - name: Install Python dependencies
-        run: python3.8 -m pip install black
+        run: python3 -m pip install black
 
       - name: Run black
-        run: python3.8 -m black --check --config pyproject.toml .
+        run: python3 -m black --check --config pyproject.toml .
 
   spelling:
     runs-on: ubuntu-20.04
 
     steps:
     - name: Check out Git repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Download misspell
       run: |
 
@@ -4,7 +4,7 @@ on: workflow_dispatch
 
 jobs:
   UnitTest:
-    runs-on: self-hosted
+    runs-on: [ self-hosted, A100 ]
     defaults:
       run:
         shell: bash
@@ -14,7 +14,7 @@ jobs:
         cuda: [ cuda11.8, cuda12.1 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
@@ -23,10 +23,8 @@ jobs:
 
       - name: Build
         run: |
-          curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-          tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
         working-directory: ${{ github.workspace }}
 
@@ -36,31 +34,20 @@ jobs:
           for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
             sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
           done
-        working-directory: ${{ github.workspace }}
 
       - name: UnitTests
         run: |
           ./build/test/unit_tests
-        working-directory: ${{ github.workspace }}
 
       - name: MpUnitTests
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
           mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
           mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
-        working-directory: ${{ github.workspace }}
 
       - name: PyTests
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           cd build && make pylib-copy
-          if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
-            python3 -m pip install -r ../python/test/requirements_cu11.txt
-          else
-            python3 -m pip install -r ../python/test/requirements_cu12.txt
-          fi
           mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
-        working-directory: ${{ github.workspace }}
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 set(MSCCLPP_MAJOR "0")
-set(MSCCLPP_MINOR "2")
+set(MSCCLPP_MINOR "3")
 set(MSCCLPP_PATCH "0")
 
 set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
 
@@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a
 
 * **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
 
-## Key Features (v0.2)
+## Key Features (v0.3)
 
-MSCCL++ v0.2 supports the following features.
+MSCCL++ v0.3 supports the following features.
 
 ### In-Kernel Communication Interfaces
 
@@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans
 
 Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
 
-## Status & Roadmap
+### New in MSCCL++ v0.3 (Latest Release)
+* Update interfaces
+* Add Python bindings and interfaces
+* Add Python unit tests
+* Add more configurable parameters
+* Add a new single-node AllReduce kernel
+* Fix bugs
 
-MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.
-
-### MSCCL++ v0.4 (TBU)
-* Automatic task scheduler
-* Dynamic performance tuning
-
-### MSCCL++ v0.3 (TBU)
-* Tile-based communication: efficient transport of 2D data patches (tiles)
-* GPU computation interfaces
-
-### MSCCL++ v0.2 (Latest Release)
-* Basic communication functionalities and new interfaces
-    - GPU-side communication interfaces
-    - Host-side helpers: bootstrap, communicator, and proxy
-    - Supports both NVLink and InfiniBand
-    - Supports both in-SM copy and DMA/RDMA
-* Communication performance optimization
-    - Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
-* Development pipeline
-* Documentation
-
-### MSCCL++ v0.1
-* Proof-of-concept, preliminary interfaces
+For details, see https://github.com/microsoft/mscclpp/issues/89.
 
 ## Contributing
 
 
@@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
 
 ENV DEBIAN_FRONTEND=noninteractive
 
-RUN apt-get update && \
+RUN rm -rf /opt/nvidia
+
+RUN apt-get clean && \
+    apt-get update && \
     apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
@@ -47,8 +50,10 @@ RUN cd /tmp && \
     cd .. && \
     rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
 
-ENV PATH="${PATH}:/usr/local/mpi/bin" \
-    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
+ENV PATH="/usr/local/mpi/bin:${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
 
 RUN echo PATH="${PATH}" > /etc/environment && \
     echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+ENTRYPOINT []
@@ -0,0 +1,28 @@
+FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+
+LABEL maintainer="MSCCL++"
+LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
+
+ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
+    CMAKE_VERSION="3.26.4"
+
+ADD . ${MSCCLPP_SRC_DIR}
+WORKDIR ${MSCCLPP_SRC_DIR}
+
+# Install cmake ${CMAKE_VERSION}: fetch the release tarball, unpack it under /usr/local, then delete the tarball
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
+    rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Install pytest & dependencies from the CUDA 11 test requirements file
+RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
+
+# Set PATH; '>' truncates /etc/environment, so re-append LD_LIBRARY_PATH (NOTE(review): assumes the base image sets it - confirm)
+RUN echo PATH="${PATH}" > /etc/environment && echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+# Cleanup: step out of the source directory, then remove the copy added above
+WORKDIR /
+RUN rm -rf ${MSCCLPP_SRC_DIR}
@@ -0,0 +1,27 @@
+FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+
+LABEL maintainer="MSCCL++"
+LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
+
+ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
+    CMAKE_VERSION="3.26.4"
+
+ADD . ${MSCCLPP_SRC_DIR}
+WORKDIR ${MSCCLPP_SRC_DIR}
+
+# Install cmake ${CMAKE_VERSION}: fetch the release tarball, unpack it under /usr/local, then delete the tarball
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Install pytest & dependencies from the CUDA 12 test requirements file
+RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
+
+# Set PATH; '>' truncates /etc/environment, so re-append the LD_LIBRARY_PATH entry that the base image wrote there
+RUN echo PATH="${PATH}" > /etc/environment && echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+# Cleanup: step out of the source directory, then remove the copy added above
+WORKDIR /
+RUN rm -rf ${MSCCLPP_SRC_DIR}
@@ -5,7 +5,7 @@
 #define MSCCLPP_CORE_HPP_
 
 #define MSCCLPP_MAJOR 0
-#define MSCCLPP_MINOR 2
+#define MSCCLPP_MINOR 3
 #define MSCCLPP_PATCH 0
 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
 
@@ -24,6 +24,9 @@ namespace mscclpp {
 /// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process.
 using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
 
+/// Return a version string.
+std::string version();
+
 /// Base class for bootstraps.
 class Bootstrap {
  public:
 
@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "mscclpp"
-version = "0.2.0"
+version = "0.3.0"
 
 [tool.scikit-build]
 cmake.minimum-version = "3.25.0"
 
@@ -7,11 +7,10 @@ add_subdirectory(test)
 add_custom_target(pylib-copy)
 add_custom_command(TARGET pylib-copy POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
-        ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
+        ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
         ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
-        ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
+        ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
         ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
     COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
 )
-
@@ -18,8 +18,11 @@
     TcpBootstrap,
     Transport,
     TransportFlags,
+    version,
 )
 
+__version__ = version()
+
 
 def get_include():
     """Return the directory that contains the MSCCL++ headers."""