Skip to content

Commit e94dfac

Browse files
committed
Implement ONNXRuntime backend
1 parent 43069b6 commit e94dfac

17 files changed

+1113
-33
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ out.txt
4949
# For clion IDE
5050
.idea
5151

52+
# For vscode
53+
.vscode
54+
5255
# For cmake
5356
CMakeCache.txt
5457
CMakeFiles/

cpp/CMakeLists.txt

Lines changed: 103 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,25 +22,9 @@ if(NOT WIN32)
2222
set(ColorBoldRed "${ColorRed}${ColorBold}")
2323
endif()
2424

25-
#--------------------------- CMAKE VARIABLES (partly for Cmake GUI) ----------------------------------------------------
26-
27-
set(USE_BACKEND CACHE STRING "Neural net backend")
28-
string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
29-
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA OPENCL EIGEN)
30-
31-
set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
32-
set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
33-
set(Boost_USE_STATIC_LIBS_ON 0 CACHE BOOL "Compile against boost statically instead of dynamically")
34-
set(USE_AVX2 0 CACHE BOOL "Compile with AVX2")
35-
set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 29. Compiling with this will use more memory and slow down KataGo, even when playing on boards of size 19.")
36-
37-
#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------
38-
39-
message(STATUS "Building 'katago' executable for GTP engine and other tools.")
40-
if(USE_BACKEND STREQUAL "CUDA")
41-
message(STATUS "-DUSE_BACKEND=CUDA, using CUDA backend.")
42-
25+
#--------------------------- CUDA MACRO -------------------------------------------------------------------------------
4326

27+
macro(CONFIGURE_CUDA)
4428
# Ensure dynamic cuda linking (Versions prior to 3.17)
4529
if (${CMAKE_VERSION} VERSION_LESS "3.17")
4630
set(CMAKE_CUDA_FLAGS "" CACHE STRING "")
@@ -145,6 +129,26 @@ if(USE_BACKEND STREQUAL "CUDA")
145129
"
146130
)
147131
endif()
132+
endmacro()
133+
134+
#--------------------------- CMAKE VARIABLES (partly for Cmake GUI) ----------------------------------------------------
135+
136+
set(USE_BACKEND CACHE STRING "Neural net backend")
137+
string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
138+
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA OPENCL EIGEN ONNXRUNTIME)
139+
140+
set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
141+
set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
142+
set(Boost_USE_STATIC_LIBS_ON 0 CACHE BOOL "Compile against boost statically instead of dynamically")
143+
set(USE_AVX2 0 CACHE BOOL "Compile with AVX2")
144+
set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 29. Compiling with this will use more memory and slow down KataGo, even when playing on boards of size 19.")
145+
146+
#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------
147+
148+
message(STATUS "Building 'katago' executable for GTP engine and other tools.")
149+
if(USE_BACKEND STREQUAL "CUDA")
150+
message(STATUS "-DUSE_BACKEND=CUDA, using CUDA backend.")
151+
configure_cuda()
148152
elseif(USE_BACKEND STREQUAL "OPENCL")
149153
message(STATUS "-DUSE_BACKEND=OPENCL, using OpenCL backend.")
150154
set(NEURALNET_BACKEND_SOURCES
@@ -161,8 +165,28 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
161165
set(NEURALNET_BACKEND_SOURCES
162166
neuralnet/eigenbackend.cpp
163167
)
168+
elseif(USE_BACKEND STREQUAL "ONNXRUNTIME")
169+
message(STATUS "-DUSE_BACKEND=ONNXRUNTIME, using ONNXRuntime backend.")
170+
set(ORT_CUDA 0 CACHE BOOL "Use CUDA execution provider for ONNXRuntime.")
171+
set(ORT_TENSORRT 0 CACHE BOOL "Use TensorRT execution provider for ONNXRuntime.")
172+
set(ORT_DIRECTML 0 CACHE BOOL "Use DirectML execution provider for ONNXRuntime.")
173+
set(ORT_MIGRAPHX 0 CACHE BOOL "Use MIGraphX execution provider for ONNXRuntime.")
174+
if(ORT_CUDA OR ORT_TENSORRT)
175+
configure_cuda()
176+
endif()
177+
if(ORT_MIGRAPHX)
178+
set(NEURALNET_BACKEND_SOURCES
179+
neuralnet/ortbackend.cpp
180+
neuralnet/openclhelpers.cpp
181+
)
182+
else()
183+
set(NEURALNET_BACKEND_SOURCES
184+
neuralnet/ortbackend.cpp
185+
)
186+
endif()
187+
164188
elseif(USE_BACKEND STREQUAL "")
165-
message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
189+
message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=ONNXRUNTIME or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
166190
set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp)
167191
else()
168192
message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND})
@@ -327,6 +351,66 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
327351
endif()
328352
endif()
329353
endif()
354+
elseif(USE_BACKEND STREQUAL "ONNXRUNTIME")
355+
target_compile_definitions(katago PRIVATE USE_ONNXRUNTIME_BACKEND)
356+
set(ORT_LIB_DIR CACHE STRING "ONNXRuntime library location")
357+
set(ORT_INCLUDE_DIR CACHE STRING "ONNXRuntime header files location")
358+
message(STATUS "ORT_LIB_DIR: " ${ORT_LIB_DIR})
359+
message(STATUS "ORT_INCLUDE_DIR: " ${ORT_INCLUDE_DIR})
360+
include_directories(${ORT_INCLUDE_DIR})
361+
if(EXISTS ${ORT_INCLUDE_DIR}/core/session)
362+
include_directories(${ORT_INCLUDE_DIR}/core/session)
363+
endif()
364+
if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/cpu)
365+
include_directories(${ORT_INCLUDE_DIR}/core/providers/cpu)
366+
endif()
367+
find_library(ORT_LIBRARY NAMES onnxruntime PATHS ${ORT_LIB_DIR})
368+
if(NOT ORT_LIBRARY)
369+
message(FATAL_ERROR "Could not find onnxruntime")
370+
endif()
371+
target_link_libraries(katago ${ORT_LIBRARY})
372+
if(ORT_CUDA)
373+
target_compile_definitions(katago PRIVATE USE_ORT_CUDA)
374+
endif()
375+
if(ORT_TENSORRT)
376+
target_compile_definitions(katago PRIVATE USE_ORT_TENSORRT)
377+
set(TENSORRT_LIB_DIR CACHE STRING "TensorRT library location")
378+
set(TENSORRT_INCLUDE_DIR CACHE STRING "TensorRT header file location")
379+
include_directories(${TENSORRT_INCLUDE_DIR})
380+
find_library(TENSORRT_LIBRARY NAMES nvinfer PATHS ${TENSORRT_LIB_DIR})
381+
if(NOT TENSORRT_LIBRARY)
382+
message(FATAL_ERROR "Could not find nvinfer")
383+
endif()
384+
target_link_libraries(katago ${TENSORRT_LIBRARY})
385+
if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/tensorrt)
386+
include_directories(${ORT_INCLUDE_DIR}/core/providers/tensorrt)
387+
endif()
388+
endif()
389+
if(ORT_CUDA OR ORT_TENSORRT)
390+
find_package(CUDA REQUIRED)
391+
find_path(CUDNN_INCLUDE_DIR cudnn.h HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES cuda/include include)
392+
if((NOT CUDNN_INCLUDE_DIR))
393+
message(ERROR "${ColorBoldRed} cudnn.h was NOT found, specify CUDNN_INCLUDE_DIR to indicate where it is. ${ColorReset}")
394+
endif()
395+
find_library(CUDNN_LIBRARY libcudnn.so PATHS /usr/local/cuda/lib64 /opt/cuda/lib64)
396+
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR}) #SYSTEM is for suppressing some compiler warnings in thrust libraries
397+
target_link_libraries(katago ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES})
398+
if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/cuda)
399+
include_directories(${ORT_INCLUDE_DIR}/core/providers/cuda)
400+
endif()
401+
endif()
402+
if(ORT_DIRECTML)
403+
target_compile_definitions(katago PRIVATE USE_ORT_DIRECTML)
404+
if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/directml)
405+
include_directories(${ORT_INCLUDE_DIR}/core/providers/directml)
406+
endif()
407+
endif()
408+
if(ORT_MIGRAPHX)
409+
target_compile_definitions(katago PRIVATE USE_ORT_MIGRAPHX)
410+
if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/migraphx)
411+
include_directories(${ORT_INCLUDE_DIR}/core/providers/migraphx)
412+
endif()
413+
endif()
330414
endif()
331415

332416
if(USE_BIGGER_BOARDS_EXPENSIVE)

cpp/command/benchmark.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,10 @@ int MainCmds::benchmark(int argc, const char* const* argv) {
209209
cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. RTX2080), "
210210
<< "using the Cuda version of KataGo instead may give a mild performance boost." << endl;
211211
#endif
212+
#ifdef USE_ONNXRUNTIME_BACKEND
213+
cout << "You are currently using the ONNXRuntime version of KataGo with "
214+
<< nnEval->getOnnxRuntimeExecutionProvider() << " execution provider." << endl;
215+
#endif
212216
#ifdef USE_EIGEN_BACKEND
213217
cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl;
214218
#endif
@@ -564,6 +568,7 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm
564568
int64_t configMaxPlayouts = ((int64_t)1) << 50;
565569
double configMaxTime = 1e20;
566570
double configMaxPonderTime = -1.0;
571+
string configOnnxRuntimeExecutionProvider;
567572
vector<int> configDeviceIdxs;
568573
int configNNCacheSizePowerOfTwo = 20;
569574
int configNNMutexPoolSizePowerOfTwo = 16;
@@ -693,6 +698,41 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm
693698
});
694699
}
695700

701+
#ifdef USE_ONNXRUNTIME_BACKEND
702+
cout << endl;
703+
cout << "=========================================================================" << endl;
704+
cout << "ONNXRUNTIME EXECUTION PROVIDER" << endl;
705+
706+
{
707+
vector<string> executionProviders;
708+
#ifdef USE_ORT_CUDA
709+
executionProviders.push_back("CUDA");
710+
#endif
711+
#ifdef USE_ORT_TENSORRT
712+
executionProviders.push_back("TensorRT");
713+
#endif
714+
#ifdef USE_ORT_DIRECTML
715+
executionProviders.push_back("DirectML");
716+
#endif
717+
#ifdef USE_ORT_MIGRAPHX
718+
executionProviders.push_back("MIGraphX");
719+
#endif
720+
721+
cout << endl;
722+
cout << "Available ONNXRuntime execution providers:" << endl;
723+
for(const auto provider: executionProviders) {
724+
cout << provider << " ";
725+
}
726+
cout << endl << endl;
727+
728+
string prompt = "Specify an execution provider for ONNXRuntime. Leave blank to use the first available provider.\n";
729+
promptAndParseInput(prompt, [&](const string& line) {
730+
if(line == "") configOnnxRuntimeExecutionProvider = executionProviders[0];
731+
else configOnnxRuntimeExecutionProvider = line;
732+
});
733+
}
734+
#endif
735+
696736
cout << endl;
697737
cout << "=========================================================================" << endl;
698738
cout << "GPUS AND RAM" << endl;
@@ -701,7 +741,11 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm
701741
{
702742
cout << endl;
703743
cout << "Finding available GPU-like devices..." << endl;
744+
#ifndef USE_ONNXRUNTIME_BACKEND
704745
NeuralNet::printDevices();
746+
#else
747+
NeuralNet::printDevices(configOnnxRuntimeExecutionProvider);
748+
#endif
705749
cout << endl;
706750

707751
string prompt =
@@ -789,6 +833,7 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm
789833
configMaxPlayouts,
790834
configMaxTime,
791835
configMaxPonderTime,
836+
configOnnxRuntimeExecutionProvider,
792837
configDeviceIdxs,
793838
configNNCacheSizePowerOfTwo,
794839
configNNMutexPoolSizePowerOfTwo,

cpp/core/rand.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ void Rand::init()
248248
int* heapVal = new int[1];
249249
size_t stackAddr = (size_t)(&stackVal);
250250
size_t heapAddr = (size_t)(heapVal);
251-
delete heapVal;
251+
delete[] heapVal;
252252
s += "|";
253253
s += Global::uint64ToHexString((uint64_t)stackAddr);
254254
s += Global::uint64ToHexString((uint64_t)heapAddr);

cpp/neuralnet/cudabackend.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2584,6 +2584,8 @@ ComputeContext* NeuralNet::createComputeContext(
25842584
int nnXLen,
25852585
int nnYLen,
25862586
const string& openCLTunerFile,
2587+
const string& onnxOptModelFile,
2588+
const string& onnxRuntimeExecutionProvider,
25872589
const string& homeDataDirOverride,
25882590
bool openCLReTunePerBoardSize,
25892591
enabled_t useFP16Mode,
@@ -2593,6 +2595,8 @@ ComputeContext* NeuralNet::createComputeContext(
25932595
(void)gpuIdxs;
25942596
(void)logger;
25952597
(void)openCLTunerFile;
2598+
(void)onnxOptModelFile;
2599+
(void)onnxRuntimeExecutionProvider;
25962600
(void)homeDataDirOverride;
25972601
(void)openCLReTunePerBoardSize;
25982602
(void)loadedModel;

cpp/neuralnet/dummybackend.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ ComputeContext* NeuralNet::createComputeContext(
1919
int nnXLen,
2020
int nnYLen,
2121
const string& openCLTunerFile,
22+
const string& onnxOptModelFile,
23+
const string& onnxRuntimeExecutionProvider,
2224
const string& homeDataDirOverride,
2325
bool openCLReTunePerBoardSize,
2426
enabled_t useFP16Mode,
@@ -30,6 +32,8 @@ ComputeContext* NeuralNet::createComputeContext(
3032
(void)nnXLen;
3133
(void)nnYLen;
3234
(void)openCLTunerFile;
35+
(void)onnxOptModelFile;
36+
(void)onnxRuntimeExecutionProvider;
3337
(void)homeDataDirOverride;
3438
(void)openCLReTunePerBoardSize;
3539
(void)useFP16Mode;

cpp/neuralnet/eigenbackend.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,6 +1429,8 @@ ComputeContext* NeuralNet::createComputeContext(
14291429
int nnXLen,
14301430
int nnYLen,
14311431
const string& openCLTunerFile,
1432+
const string& onnxOptModelFile,
1433+
const string& onnxRuntimeExecutionProvider,
14321434
const string& homeDataDirOverride,
14331435
bool openCLReTunePerBoardSize,
14341436
enabled_t useFP16Mode,
@@ -1438,6 +1440,8 @@ ComputeContext* NeuralNet::createComputeContext(
14381440
(void)gpuIdxs;
14391441
(void)logger;
14401442
(void)openCLTunerFile;
1443+
(void)onnxOptModelFile;
1444+
(void)onnxRuntimeExecutionProvider;
14411445
(void)homeDataDirOverride;
14421446
(void)openCLReTunePerBoardSize;
14431447

cpp/neuralnet/nneval.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ NNEvaluator::NNEvaluator(
6666
int nnMutexPoolSizePowerofTwo,
6767
bool skipNeuralNet,
6868
const string& openCLTunerFile,
69+
const string& onnxOptModelFile,
70+
const string& onnxRuntimeExecutionProvider,
6971
const string& homeDataDirOverride,
7072
bool openCLReTunePerBoardSize,
7173
enabled_t useFP16Mode,
@@ -83,6 +85,7 @@ NNEvaluator::NNEvaluator(
8385
requireExactNNLen(rExactNNLen),
8486
policySize(NNPos::getPolicySize(xLen,yLen)),
8587
inputsUseNHWC(iUseNHWC),
88+
ortExecutionProvider(onnxRuntimeExecutionProvider),
8689
usingFP16Mode(useFP16Mode),
8790
usingNHWCMode(useNHWCMode),
8891
numThreads(numThr),
@@ -145,8 +148,8 @@ NNEvaluator::NNEvaluator(
145148
inputsVersion = NNModelVersion::getInputsVersion(modelVersion);
146149
computeContext = NeuralNet::createComputeContext(
147150
gpuIdxs,logger,nnXLen,nnYLen,
148-
openCLTunerFile,homeDataDirOverride,openCLReTunePerBoardSize,
149-
usingFP16Mode,usingNHWCMode,loadedModel
151+
openCLTunerFile,onnxOptModelFile,onnxRuntimeExecutionProvider,
152+
homeDataDirOverride,openCLReTunePerBoardSize,usingFP16Mode,usingNHWCMode,loadedModel
150153
);
151154
}
152155
else {
@@ -224,6 +227,9 @@ int NNEvaluator::getNNXLen() const {
224227
int NNEvaluator::getNNYLen() const {
225228
return nnYLen;
226229
}
230+
string NNEvaluator::getOnnxRuntimeExecutionProvider() const{
231+
return ortExecutionProvider;
232+
}
227233
enabled_t NNEvaluator::getUsingFP16Mode() const {
228234
return usingFP16Mode;
229235
}

cpp/neuralnet/nneval.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ class NNEvaluator {
8989
int nnMutexPoolSizePowerofTwo,
9090
bool debugSkipNeuralNet,
9191
const std::string& openCLTunerFile,
92+
const std::string& onnxOptModelFile,
93+
const std::string& onnxRuntimeExecutionProvider,
9294
const std::string& homeDataDirOverride,
9395
bool openCLReTunePerBoardSize,
9496
enabled_t useFP16Mode,
@@ -113,6 +115,7 @@ class NNEvaluator {
113115
int getNumServerThreads() const;
114116
int getNNXLen() const;
115117
int getNNYLen() const;
118+
std::string getOnnxRuntimeExecutionProvider() const;
116119
enabled_t getUsingFP16Mode() const;
117120
enabled_t getUsingNHWCMode() const;
118121

@@ -172,6 +175,7 @@ class NNEvaluator {
172175
const bool requireExactNNLen;
173176
const int policySize;
174177
const bool inputsUseNHWC;
178+
const std::string ortExecutionProvider;
175179
const enabled_t usingFP16Mode;
176180
const enabled_t usingNHWCMode;
177181
int numThreads;

cpp/neuralnet/nninterface.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ namespace NeuralNet {
3636

3737
//Print available backend devices
3838
void printDevices();
39+
void printDevices(const std::string& ortExecutionProvider);
40+
#if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT)
41+
void printCUDADevices();
42+
#endif
43+
#ifdef USE_ORT_DIRECTML
44+
void printDirectMLDevices();
45+
#endif
46+
#ifdef USE_ORT_MIGRAPHX
47+
void printOpenCLDevices();
48+
#endif
3949

4050
// Model I/O -----------------------------------------------------------------
4151

@@ -59,6 +69,8 @@ namespace NeuralNet {
5969
int nnXLen,
6070
int nnYLen,
6171
const std::string& openCLTunerFile,
72+
const std::string& onnxOptModelFile,
73+
const std::string& onnxRuntimeExecutionProvider,
6274
const std::string& homeDataDirOverride,
6375
bool openCLReTunePerBoardSize,
6476
enabled_t useFP16Mode,

0 commit comments

Comments (0)