Update README.md #47

Open: wants to merge 9 commits into base: master
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "third_party/oneCCL"]
path = third_party/oneCCL
url = https://github.com/oneapi-src/oneCCL/
url = https://github.com/oneapi-src/oneCCL.git
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -23,6 +23,8 @@ set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true)

option(USE_SYSTEM_ONECCL "Use oneCCL library in system" OFF)

option(BUILD_NO_ONECCL_PACKAGE "Build with oneCCL excluded" OFF)

# Find the Torch lib
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
58 changes: 57 additions & 1 deletion README.md
@@ -10,14 +10,34 @@ This repository holds PyTorch bindings maintained by Intel for the Intel® oneAP

The `oneccl_bindings_for_pytorch` module implements the PyTorch C10D ProcessGroup API and can be dynamically loaded as an external ProcessGroup. It currently works only on Linux.

## Capability

The table below shows which functions are available for use with CPU / Intel dGPU tensors.

| | CPU | GPU |
| :--------------- | :---: | :---: |
| `send` | × | × |
| `recv` | × | × |
| `broadcast` | √ | √ |
| `all_reduce` | √ | √ |
| `reduce` | √ | √ |
| `all_gather` | √ | √ |
| `gather` | √ | √ |
| `scatter` | × | × |
| `reduce_scatter` | × | × |
| `all_to_all` | √ | √ |
| `barrier` | √ | √ |
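
For scripting, the support matrix above can be mirrored as a small lookup table. This is a sketch; the `SUPPORTED` dict and `is_supported` helper are illustrative, not part of the package API:

```python
# Support matrix for oneccl_bindings_for_pytorch collectives,
# transcribed from the table above. True means the collective is
# implemented for that device type.
SUPPORTED = {
    "send":           {"cpu": False, "gpu": False},
    "recv":           {"cpu": False, "gpu": False},
    "broadcast":      {"cpu": True,  "gpu": True},
    "all_reduce":     {"cpu": True,  "gpu": True},
    "reduce":         {"cpu": True,  "gpu": True},
    "all_gather":     {"cpu": True,  "gpu": True},
    "gather":         {"cpu": True,  "gpu": True},
    "scatter":        {"cpu": False, "gpu": False},
    "reduce_scatter": {"cpu": False, "gpu": False},
    "all_to_all":     {"cpu": True,  "gpu": True},
    "barrier":        {"cpu": True,  "gpu": True},
}

def is_supported(op: str, device: str) -> bool:
    """Return True if collective `op` is implemented for `device` ('cpu' or 'gpu')."""
    return SUPPORTED.get(op, {}).get(device, False)
```

A guard like `if not is_supported("scatter", "gpu"): ...` can then fail fast before a distributed job hits an unimplemented collective.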


## PyTorch API Align

We recommend using Anaconda as the Python package management system. The table below lists the corresponding branches (tags) of `oneccl_bindings_for_pytorch` and the supported PyTorch versions.

| `torch` | `oneccl_bindings_for_pytorch` |
| :-------------------------------------------------------------: | :-----------------------------------------------------------------------: |
| `master` | `master` |
| [v1.13](https://github.com/pytorch/pytorch/tree/v1.13) | [ccl_torch1.13](https://github.com/intel/torch-ccl/tree/ccl_torch1.13) |
| [v1.13.0](https://github.com/pytorch/pytorch/tree/v1.13.0) | [ccl_torch1.13.100](https://github.com/intel/torch-ccl/tree/ccl_torch1.13.100) |
| [v1.13.0](https://github.com/pytorch/pytorch/tree/v1.13.0) | [ccl_torch1.13](https://github.com/intel/torch-ccl/tree/ccl_torch1.13) |
| [v1.12.1](https://github.com/pytorch/pytorch/tree/v1.12.1) | [ccl_torch1.12.100](https://github.com/intel/torch-ccl/tree/ccl_torch1.12.100) |
| [v1.12.0](https://github.com/pytorch/pytorch/tree/v1.12.0) | [ccl_torch1.12](https://github.com/intel/torch-ccl/tree/ccl_torch1.12) |
| [v1.11.0](https://github.com/pytorch/pytorch/tree/v1.11.0) | [ccl_torch1.11](https://github.com/intel/torch-ccl/tree/ccl_torch1.11) |
@@ -36,6 +56,27 @@ The usage details can be found in the README of corresponding branch. The follow

- PyTorch v1.13.0

## Build Option List

The following build options are supported in Intel® oneCCL Bindings for PyTorch*.

| Build Option | Default Value | Description |
| :---------------------------------: | :------------: | :--------------------------------------------------------------------------------------------------------: |
| COMPUTE_BACKEND | | Set the oneCCL `COMPUTE_BACKEND`; set to `dpcpp` to use the DPC++ compiler and enable support for Intel XPU |
| CCL_PACKAGE_NAME | oneccl-bind-pt | Set the wheel name |
| ONECCL_BINDINGS_FOR_PYTORCH_BACKEND | cpu | Set the backend |
| CCL_SHA_VERSION | False | Append the git HEAD SHA to the wheel name |
| BUILD_NO_ONECCL_PACKAGE | False | Package the wheel without the oneCCL library |
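
The build options are plain environment variables, so they can be combined on one command line. A hypothetical invocation that builds a wheel with a SHA-tagged name and no bundled oneCCL might look like:

```shell
# Sketch: CCL_SHA_VERSION appends the git HEAD SHA to the wheel name,
# BUILD_NO_ONECCL_PACKAGE leaves the oneCCL library out of the package.
CCL_SHA_VERSION=True BUILD_NO_ONECCL_PACKAGE=True python setup.py bdist_wheel
```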

## Launch Option List

The following launch options are supported in Intel® oneCCL Bindings for PyTorch*.

| Launch Option | Default Value | Description |
| :--------------------------------------: | :-----------: | :----------------------------------------------------------------: |
| ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE | 0 | Set the verbosity level of oneccl_bindings_for_pytorch |
| ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB | 0 | Set to 1 to make oneccl_bindings_for_pytorch wait for GDB to attach |
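
These launch options are ordinary environment variables and can be set inline when starting a job. A sketch (the `mpirun` launcher and `example.py` script are assumptions, not part of the package):

```shell
# Sketch: enable verbose binding logs for a two-rank run
ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE=1 mpirun -n 2 python example.py
```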

## Installation

### Install from Source
@@ -51,24 +92,39 @@ The usage details can be found in the README of corresponding branch. The follow
2. Install `oneccl_bindings_for_pytorch`

```bash
# for CPU Backend Only
python setup.py install
# use DPC++ Compiler to enable support for Intel XPU
BUILD_NO_ONECCL_PACKAGE=ON COMPUTE_BACKEND=dpcpp python setup.py install
```

**Note:** To run torch-ccl without the oneCCL library bundled in the wheel, make sure oneCCL is installed via the oneAPI Base Toolkit (https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html#base-kit) and source its environment:

```bash
source $basekit_root/ccl/latest/env/vars.sh
```

### Install PreBuilt Wheel

Wheel files are available for the following Python versions.

| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 |
| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: |
| 1.13.100 | | √ | √ | √ | √ |
| 1.13 | | √ | √ | √ | √ |
| 1.12.100 | | √ | √ | √ | √ |
| 1.12.0 | | √ | √ | √ | √ |
| 1.11.0 | | √ | √ | √ | √ |
| 1.10.0 | √ | √ | √ | √ | |

Installation for CPU:
```bash
python -m pip install oneccl_bind_pt==1.13 -f https://developer.intel.com/ipex-whl-stable-cpu
```
Installation for GPU:
```bash
python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-xpu
```
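
After installation, a quick sanity check (assuming the wheel installed correctly) is to import the module and print its version:

```shell
python -c "import oneccl_bindings_for_pytorch as ccl_bind; print(ccl_bind.__version__)"
```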
## Usage

example.py
9 changes: 7 additions & 2 deletions cmake/Modules/FindoneCCL.cmake
@@ -23,7 +23,7 @@ IF (USE_SYSTEM_ONECCL)
set(oneapi_root_hint $ENV{INTELONEAPIROOT})
endif()

IF(COMPUTE_BACKEND STREQUAL "dpcpp_level_zero")
IF(COMPUTE_BACKEND STREQUAL "dpcpp")
SET(CCL_CONFIGURATION "cpu_gpu_dpcpp")
ELSE()
SET(CCL_CONFIGURATION "cpu_icc")
@@ -34,7 +34,12 @@ IF (USE_SYSTEM_ONECCL)
ELSE()
SET(ONECCL_ROOT "${PROJECT_SOURCE_DIR}/third_party/oneCCL")

ADD_SUBDIRECTORY(${ONECCL_ROOT})
IF(BUILD_NO_ONECCL_PACKAGE)
ADD_SUBDIRECTORY(${ONECCL_ROOT} oneCCL EXCLUDE_FROM_ALL)
ELSE()
ADD_SUBDIRECTORY(${ONECCL_ROOT})
ENDIF()

IF(NOT TARGET ccl)
MESSAGE(FATAL_ERROR "Failed to find oneCCL target")
ENDIF()
6 changes: 4 additions & 2 deletions demo/demo.py
@@ -39,7 +39,7 @@ def forward(self, input):
device = 'cpu' #"xpu:{}".format(dist.get_rank())
model = Model().to(device)
if dist.get_world_size() > 1:
model = DDP(model, device_ids=[device] if device is not 'cpu' else None)
model = DDP(model, device_ids=[device] if (device != 'cpu') else None)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss().to(device)
@@ -55,7 +55,9 @@ def forward(self, input):
L = loss_fn(res, labels)
# backward
print("Runing backward: {} on device {}".format(i, device))
L.backward()
with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as prof:
L.backward()
print(prof)
# update
print("Runing optim: {} on device {}".format(i, device))
optimizer.step()
31 changes: 21 additions & 10 deletions oneccl_bindings_for_pytorch/__init__.py
@@ -3,24 +3,35 @@
import warnings
import torch


cwd = os.path.dirname(os.path.abspath(__file__))
os.environ['CCL_ROOT'] = cwd
FI_PROVIDER_PATH = os.path.join(cwd, "lib/prov")
os.environ['FI_PROVIDER_PATH'] = FI_PROVIDER_PATH
if not os.path.exists(os.path.join(cwd, "version.py")):
raise RuntimeError("oneccl_bindings_for_pytorch is not installed!")


def set_env_default(env, key, value):
new_value = env.get(key, value)
env[key] = new_value


if os.environ.get("CCL_ROOT") is None:
# set the default oneCCL and MPI library path
set_env_default(os.environ, 'CCL_ROOT', cwd)

FI_PROVIDER_PATH = os.path.join(cwd, "lib/prov")
set_env_default(os.environ, 'FI_PROVIDER_PATH', FI_PROVIDER_PATH)


from .version import __version__, git_version
from . import _C as ccl_lib

if hasattr(torch, 'xpu'):
if torch.xpu.is_available():
try:
# load the CCL/XPU library
import ctypes
my_c_library = ctypes.cdll.LoadLibrary(os.path.join(cwd, "lib/liboneccl_bindings_for_pytorch_xpu.so"))
except OSError:
print("Warning: Cannot load xpu CCL. CCL doesn't work for XPU device")
try:
# load the CCL/XPU library
import ctypes
my_c_library = ctypes.cdll.LoadLibrary(os.path.join(cwd, "lib/liboneccl_bindings_for_pytorch_xpu.so"))
except OSError:
print("Warning: Cannot load xpu CCL. CCL doesn't work for XPU device")

__all__ = []
__all__ += [name for name in dir(ccl_lib)
9 changes: 9 additions & 0 deletions oneccl_bindings_for_pytorch/csrc/init.cpp
@@ -42,10 +42,19 @@
#include <pybind11/chrono.h>
#include <pybind11/cast.h>

#include <torch/version.h>
#if TORCH_VERSION_MINOR >= 13
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/Types.hpp>
#include <torch/csrc/distributed/c10d/Utils.hpp>
#else
#include <c10d/ProcessGroup.hpp>
#include <c10d/Store.hpp>
#include <c10d/Types.hpp>
#include <c10d/Utils.hpp>
#endif

#include <ProcessGroupCCL.hpp>

namespace py = pybind11;
13 changes: 0 additions & 13 deletions patches/Update_oneCCL.patch

This file was deleted.

2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
torch>=1.10.0
setuptools
22 changes: 4 additions & 18 deletions setup.py
@@ -49,8 +49,8 @@ def create_version():
if sha != 'Unknown':
version += '+' + sha[:7]

if os.environ.get("COMPUTE_BACKEND") == "dpcpp_level_zero":
backend = "xpu"
if os.environ.get("COMPUTE_BACKEND") == "dpcpp":
backend = "gpu"
else:
backend = os.environ.get("ONECCL_BINDINGS_FOR_PYTORCH_BACKEND", "cpu")

@@ -78,12 +78,6 @@ def run(self):
"""
cmake_extensions = [ext for ext in self.extensions if isinstance(ext, CMakeExtension)]
for ext in cmake_extensions:
try:
# temp patch the oneCCL code
check_call(["git", "apply", "./patches/Update_oneCCL.patch"], cwd=CWD)
except:
# ignore patch fail
pass
self.build_cmake(ext)

self.extensions = [ext for ext in self.extensions if not isinstance(ext, CMakeExtension)]
@@ -123,7 +117,7 @@ def build_cmake(self, extension: CMakeExtension):

runtime = 'gcc'
if 'COMPUTE_BACKEND' in os.environ:
if os.environ['COMPUTE_BACKEND'] == 'dpcpp_level_zero':
if os.environ['COMPUTE_BACKEND'] == 'dpcpp':
runtime = 'dpcpp'
build_options['COMPUTE_BACKEND'] = os.environ['COMPUTE_BACKEND']
import intel_extension_for_pytorch
@@ -138,7 +132,7 @@ def build_cmake(self, extension: CMakeExtension):
build_args = ['-j', str(os.cpu_count())]
check_call(['make', 'oneccl_bindings_for_pytorch'] + build_args, cwd=str(build_dir))
if 'COMPUTE_BACKEND' in os.environ:
if os.environ['COMPUTE_BACKEND'] == 'dpcpp_level_zero':
if os.environ['COMPUTE_BACKEND'] == 'dpcpp':
check_call(['make', 'oneccl_bindings_for_pytorch_xpu'] + build_args, cwd=str(build_dir))
check_call(['make', 'install'], cwd=str(build_dir))

@@ -148,14 +142,6 @@ def run(self):
import glob
import re

if os.path.isfile(os.path.join(CWD, "third_party/oneCCL", "README.md")):
try:
check_call(["git", "reset", "--hard"], cwd=os.path.join(CWD, "third_party/oneCCL"))
except Exception as e:
print("=" * 64 + "\nWARNNING!\n" + "=" * 64)
print(e)
print("=" * 64)

with open('.gitignore', 'r') as f:
ignores = f.read()
pat = re.compile(r'^#( BEGIN NOT-CLEAN-FILES )?')
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
@@ -8,7 +8,7 @@ target_compile_options(oneccl_bindings_for_pytorch PUBLIC -Wall
-Wno-sign-compare
-Wno-unused-function)

if(COMPUTE_BACKEND STREQUAL "dpcpp_level_zero")
if(COMPUTE_BACKEND STREQUAL "dpcpp")
add_subdirectory(./gpu)
endif()

27 changes: 23 additions & 4 deletions src/ProcessGroupCCL.cpp
@@ -92,12 +92,13 @@ ProcessGroupCCL::AsyncWorkCCL::AsyncWorkCCL(std::vector<std::vector<at::Tensor>>
// Profiler: Pass nullptr as profilingTitle to parent constructor to
// replace default profiler implementation with async version that reports
// correct timestamps for work that is asynchronously executed.
: C10D_Work(rank, opType, profilingTitle, inputTensors),
: C10D_Work(rank, opType, nullptr, inputTensors),
outputTensors_(std::move(outputTensors)),
future_(createFutureAsOutput(outputTensors)) {
// if (profilingTitle != nullptr) {
if (profilingTitle != nullptr) {
// recordAsyncWorkProfilingInfo(profilingTitle, inputTensors);
// }
// TODO: for cpu async profiling report.
}
}

c10::intrusive_ptr<c10::ivalue::Future> ProcessGroupCCL::AsyncWorkCCL::getFuture() {
@@ -243,7 +244,12 @@ c10::intrusive_ptr<C10D_Work> ProcessGroupCCL::_allgather_base(
at::Tensor& inputTensor,
const AllgatherOptions& opts)
{
TORCH_CHECK(false, "ProcessGroupCCL does not support _allgather_base");
std::vector<c10::IValue> tensor_param;
format_tensors_param(tensor_param, inputTensor);
format_tensors_param(tensor_param, outputTensor);
RECORD_FUNCTION("oneccl_bindings_for_pytorch::_allgather_base", tensor_param);
auto work = DispatchStub::_allgather_base(outputTensor, inputTensor, opts, *this);
return work;
}

c10::intrusive_ptr<C10D_Work> ProcessGroupCCL::allgather_coalesced(
@@ -290,6 +296,19 @@ c10::intrusive_ptr<C10D_Work> ProcessGroupCCL::reduce_scatter(
TORCH_CHECK(false, "ProcessGroupCCL does not support reduce_scatter");
}

c10::intrusive_ptr<C10D_Work> ProcessGroupCCL::_reduce_scatter_base(
at::Tensor& outputTensor,
at::Tensor& inputTensor,
const ReduceScatterOptions& opts)
{
std::vector<c10::IValue> tensor_param;
format_tensors_param(tensor_param, inputTensor);
format_tensors_param(tensor_param, outputTensor);
RECORD_FUNCTION("oneccl_bindings_for_pytorch::_reduce_scatter_base", tensor_param);
auto work = DispatchStub::_reduce_scatter_base(outputTensor, inputTensor, opts, *this);
return work;
}

c10::intrusive_ptr<C10D_Work> ProcessGroupCCL::alltoall_base(
at::Tensor& outputTensor,
at::Tensor& inputTensor,