Skip to content

Commit 8c0f9e8

Browse files
authored
v0.3.0 (#171)
1 parent 11ac824 commit 8c0f9e8

21 files changed

+133
-162
lines changed

.github/workflows/codeql.yml

+2-8
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
name: Analyze
1313
runs-on: 'ubuntu-latest'
1414
container:
15-
image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
15+
image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
1616

1717
permissions:
1818
actions: read
@@ -27,7 +27,7 @@ jobs:
2727

2828
steps:
2929
- name: Checkout repository
30-
uses: actions/checkout@v3
30+
uses: actions/checkout@v4
3131

3232
- name: Check disk space
3333
run: |
@@ -38,12 +38,6 @@ jobs:
3838
with:
3939
languages: ${{ matrix.language }}
4040

41-
- name: Install cmake
42-
run: |
43-
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
44-
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
45-
sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
46-
4741
- name: Dubious ownership exception
4842
run: |
4943
git config --global --add safe.directory /__w/mscclpp/mscclpp

.github/workflows/integration-test-backup.yml

+3-12
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on: workflow_dispatch
44

55
jobs:
66
IntegrationTest:
7-
runs-on: self-hosted
7+
runs-on: [ self-hosted, A100 ]
88
defaults:
99
run:
1010
shell: bash
@@ -13,22 +13,17 @@ jobs:
1313
cuda: [ cuda11.8, cuda12.1 ]
1414

1515
container:
16-
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
16+
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
1717
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
1818

1919
steps:
2020
- name: Checkout
2121
uses: actions/checkout@v4
2222

23-
- name: Install CMake
24-
run: |
25-
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
26-
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
27-
2823
- name: Build
2924
run: |
3025
mkdir build && cd build
31-
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
26+
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
3227
make -j
3328
3429
- name: Lock GPU clock frequency
@@ -41,7 +36,6 @@ jobs:
4136
- name: Run mscclpp AllGather test
4237
run: |
4338
set -e
44-
export PATH=/usr/local/mpi/bin:$PATH
4539
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
4640
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
4741
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -50,13 +44,11 @@ jobs:
5044
- name: Run mscclpp SendRecv test
5145
run: |
5246
set -e
53-
export PATH=/usr/local/mpi/bin:$PATH
5447
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
5548
5649
- name: Run mscclpp AllReduce test
5750
run: |
5851
set -e
59-
export PATH=/usr/local/mpi/bin:$PATH
6052
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
6153
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
6254
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -68,7 +60,6 @@ jobs:
6860
- name: Run mscclpp AllToAll test
6961
run: |
7062
set -e
71-
export PATH=/usr/local/mpi/bin:$PATH
7263
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
7364
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
7465

.github/workflows/lint.yml

+6-6
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111

1212
steps:
1313
- name: Check out Git repository
14-
uses: actions/checkout@v3
14+
uses: actions/checkout@v4
1515

1616
- name: Install ClangFormat
1717
run: |
@@ -28,25 +28,25 @@ jobs:
2828

2929
steps:
3030
- name: Check out Git repository
31-
uses: actions/checkout@v3
31+
uses: actions/checkout@v4
3232

3333
- name: Set up Python
3434
uses: actions/setup-python@v4
3535
with:
36-
python-version: 3.8
36+
python-version: 3
3737

3838
- name: Install Python dependencies
39-
run: python3.8 -m pip install black
39+
run: python3 -m pip install black
4040

4141
- name: Run black
42-
run: python3.8 -m black --check --config pyproject.toml .
42+
run: python3 -m black --check --config pyproject.toml .
4343

4444
spelling:
4545
runs-on: ubuntu-20.04
4646

4747
steps:
4848
- name: Check out Git repository
49-
uses: actions/checkout@v3
49+
uses: actions/checkout@v4
5050

5151
- name: Download misspell
5252
run: |

.github/workflows/ut-backup.yml

+3-16
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on: workflow_dispatch
44

55
jobs:
66
UnitTest:
7-
runs-on: self-hosted
7+
runs-on: [ self-hosted, A100 ]
88
defaults:
99
run:
1010
shell: bash
@@ -14,7 +14,7 @@ jobs:
1414
cuda: [ cuda11.8, cuda12.1 ]
1515

1616
container:
17-
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
17+
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
1818
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
1919

2020
steps:
@@ -23,10 +23,8 @@ jobs:
2323

2424
- name: Build
2525
run: |
26-
curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
27-
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
2826
mkdir build && cd build
29-
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
27+
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
3028
make -j
3129
working-directory: ${{ github.workspace }}
3230

@@ -36,31 +34,20 @@ jobs:
3634
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
3735
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
3836
done
39-
working-directory: ${{ github.workspace }}
4037
4138
- name: UnitTests
4239
run: |
4340
./build/test/unit_tests
44-
working-directory: ${{ github.workspace }}
4541
4642
- name: MpUnitTests
4743
run: |
4844
set -e
49-
export PATH=/usr/local/mpi/bin:$PATH
5045
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
5146
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
5247
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
53-
working-directory: ${{ github.workspace }}
5448
5549
- name: PyTests
5650
run: |
5751
set -e
58-
export PATH=/usr/local/mpi/bin:$PATH
5952
cd build && make pylib-copy
60-
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
61-
python3 -m pip install -r ../python/test/requirements_cu11.txt
62-
else
63-
python3 -m pip install -r ../python/test/requirements_cu12.txt
64-
fi
6553
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
66-
working-directory: ${{ github.workspace }}

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Licensed under the MIT license.
33

44
set(MSCCLPP_MAJOR "0")
5-
set(MSCCLPP_MINOR "2")
5+
set(MSCCLPP_MINOR "3")
66
set(MSCCLPP_PATCH "0")
77

88
set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})

README.md

+10-26
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a
1818

1919
* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logic, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
2020

21-
## Key Features (v0.2)
21+
## Key Features (v0.3)
2222

23-
MSCCL++ v0.2 supports the following features.
23+
MSCCL++ v0.3 supports the following features.
2424

2525
### In-Kernel Communication Interfaces
2626

@@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans
124124

125125
Most of the key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
126126

127-
## Status & Roadmap
127+
### New in MSCCL++ v0.3 (Latest Release)
128+
* Update interfaces
129+
* Add Python bindings and interfaces
130+
* Add Python unit tests
131+
* Add more configurable parameters
132+
* Add a new single-node AllReduce kernel
133+
* Fix bugs
128134

129-
MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.
130-
131-
### MSCCL++ v0.4 (TBU)
132-
* Automatic task scheduler
133-
* Dynamic performance tuning
134-
135-
### MSCCL++ v0.3 (TBU)
136-
* Tile-based communication: efficient transport of 2D data patches (tiles)
137-
* GPU computation interfaces
138-
139-
### MSCCL++ v0.2 (Latest Release)
140-
* Basic communication functionalities and new interfaces
141-
- GPU-side communication interfaces
142-
- Host-side helpers: bootstrap, communicator, and proxy
143-
- Supports both NVLink and InfiniBand
144-
- Supports both in-SM copy and DMA/RDMA
145-
* Communication performance optimization
146-
- Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
147-
* Development pipeline
148-
* Documentation
149-
150-
### MSCCL++ v0.1
151-
* Proof-of-concept, preliminary interfaces
135+
See details at https://github.com/microsoft/mscclpp/issues/89.
152136

153137
## Contributing
154138

docker/base-cuda12.1.dockerfile

+8-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
55

66
ENV DEBIAN_FRONTEND=noninteractive
77

8-
RUN apt-get update && \
8+
RUN rm -rf /opt/nvidia
9+
10+
RUN apt-get clean && \
11+
apt-get update && \
912
apt-get install -y --no-install-recommends \
1013
build-essential \
1114
ca-certificates \
@@ -47,8 +50,10 @@ RUN cd /tmp && \
4750
cd .. && \
4851
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
4952

50-
ENV PATH="${PATH}:/usr/local/mpi/bin" \
51-
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
53+
ENV PATH="/usr/local/mpi/bin:${PATH}" \
54+
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
5255

5356
RUN echo PATH="${PATH}" > /etc/environment && \
5457
echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
58+
59+
ENTRYPOINT []

docker/dev-cuda11.8.dockerfile

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
2+
3+
LABEL maintainer="MSCCL++"
4+
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
5+
6+
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
7+
CMAKE_VERSION="3.26.4"
8+
9+
ADD . ${MSCCLPP_SRC_DIR}
10+
WORKDIR ${MSCCLPP_SRC_DIR}
11+
12+
# Install cmake 3.26.4
13+
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
14+
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
15+
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
16+
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
17+
rm -rf ${CMAKE_HOME}.tar.gz
18+
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
19+
20+
# Install pytest & dependencies
21+
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
22+
23+
# Set PATH
24+
RUN echo PATH="${PATH}" > /etc/environment
25+
26+
# Cleanup
27+
WORKDIR /
28+
RUN rm -rf ${MSCCLPP_SRC_DIR}

docker/dev-cuda12.1.dockerfile

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
2+
3+
LABEL maintainer="MSCCL++"
4+
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
5+
6+
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
7+
CMAKE_VERSION="3.26.4"
8+
9+
ADD . ${MSCCLPP_SRC_DIR}
10+
WORKDIR ${MSCCLPP_SRC_DIR}
11+
12+
# Install cmake 3.26.4
13+
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
14+
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
15+
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
16+
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local
17+
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
18+
19+
# Install pytest & dependencies
20+
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
21+
22+
# Set PATH
23+
RUN echo PATH="${PATH}" > /etc/environment
24+
25+
# Cleanup
26+
WORKDIR /
27+
RUN rm -rf ${MSCCLPP_SRC_DIR}

include/mscclpp/core.hpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#define MSCCLPP_CORE_HPP_
66

77
#define MSCCLPP_MAJOR 0
8-
#define MSCCLPP_MINOR 2
8+
#define MSCCLPP_MINOR 3
99
#define MSCCLPP_PATCH 0
1010
#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
1111

@@ -24,6 +24,9 @@ namespace mscclpp {
2424
/// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process.
2525
using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
2626

27+
/// Return a version string.
28+
std::string version();
29+
2730
/// Base class for bootstraps.
2831
class Bootstrap {
2932
public:

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
77

88
[project]
99
name = "mscclpp"
10-
version = "0.2.0"
10+
version = "0.3.0"
1111

1212
[tool.scikit-build]
1313
cmake.minimum-version = "3.25.0"

python/CMakeLists.txt

+2-3
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,10 @@ add_subdirectory(test)
77
add_custom_target(pylib-copy)
88
add_custom_command(TARGET pylib-copy POST_BUILD
99
COMMAND ${CMAKE_COMMAND} -E copy_if_different
10-
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
10+
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
1111
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
1212
COMMAND ${CMAKE_COMMAND} -E copy_if_different
13-
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
13+
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
1414
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
1515
COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
1616
)
17-

python/mscclpp/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@
1818
TcpBootstrap,
1919
Transport,
2020
TransportFlags,
21+
version,
2122
)
2223

24+
__version__ = version()
25+
2326

2427
def get_include():
2528
"""Return the directory that contains the MSCCL++ headers."""

0 commit comments

Comments
 (0)