Fix nightly CI pipeline to generate ROCm 4.2 wheels and add ROCm 4.3.1 wheels (microsoft#9101)

Suffian Khan · web-flow · commit 47888392abd6 · 2021-09-19T23:36:03.000-07:00
* make work for both rocm 4.2 and rocm 4.3.1

* fix rocm 4.3.1 docker image reference

* fix CUDA_VERSION to ROCM_VERSION

* fix ReduceConsts conflict def

* add ifdef to miopen_common.h as well

* trailing ws
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -1591,6 +1591,39 @@ if (onnxruntime_USE_ROCM)
   if (onnxruntime_USE_CUDA)
     message(FATAL_ERROR "ROCM does not support build with CUDA!")
   endif()
+
+  # Locate the ROCm installation: use the ROCM_PATH environment variable when set, else /opt/rocm.
+  if(NOT DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH /opt/rocm)
+  else()
+    set(ROCM_PATH $ENV{ROCM_PATH})
+  endif()
+
+  # replicate strategy used by pytorch to get ROCM_VERSION
+  # https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173
+  file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW)
+  # The dots are written as "\\." so the regex engine receives "\." (a literal dot); a single
+  # backslash is consumed by CMake's quoted-argument escaping and would leave "." (any character).
+  # The input is quoted so it always reaches string() as exactly one argument.
+  string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH "${ROCM_VERSION_DEV_RAW}")
+  if(ROCM_VERSION_DEV_MATCH)
+    set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
+    set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
+    set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
+    set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
+    math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
+  else()
+    # Without a parsed version, ROCM_VERSION would be defined empty and break "#if ROCM_VERSION" checks.
+    message(FATAL_ERROR "Unable to parse ROCm version from ${ROCM_PATH}/.info/version-dev: '${ROCM_VERSION_DEV_RAW}'")
+  endif()
+  message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n")
+  message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
+  message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
+  message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
+  message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
+  message("ROCM_VERSION_DEV_INT:   ${ROCM_VERSION_DEV_INT}")
+  # Expose the version to C/C++ sources as an integer: major*10000 + minor*100 + patch.
+  add_definitions(-DROCM_VERSION=${ROCM_VERSION_DEV_INT})
 endif()
 
 if (onnxruntime_USE_TVM)
diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc
@@ -88,23 +88,28 @@ const float Consts<half>::Zero = 0;
 
 const float Consts<half>::One = 1;
 
-// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+#if ROCM_VERSION >= 40300
+template <>
+const float ReduceConsts<half>::One = 1;
+
+template <>
+const float ReduceConsts<half>::Zero = 0;
+#else
+// Up until ROCm 4.2, miopenReduceTensor() required alpha/beta to be the same data
 // type as the input type. This differs from cudnnReduceTensor() and other
 // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
-//
-// NOTE: this workaround can be removed in ROCm 4.3:
-//       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
 template <>
 const half ReduceConsts<half>::One = 1.f;
 
 template <>
-const float ReduceConsts<float>::One = 1;
+const half ReduceConsts<half>::Zero = 0.f;
+#endif
 
 template <>
-const double ReduceConsts<double>::One = 1;
+const float ReduceConsts<float>::One = 1;
 
 template <>
-const half ReduceConsts<half>::Zero = 0.f;
+const double ReduceConsts<double>::One = 1;
 
 template <>
 const float ReduceConsts<float>::Zero = 0;
diff --git a/onnxruntime/core/providers/rocm/miopen_common.h b/onnxruntime/core/providers/rocm/miopen_common.h
@@ -46,18 +46,23 @@ struct Consts<half> {
   static const float One;
 };
 
-// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
-// type as the input type. This differs from cudnnReduceTensor() and other
-// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
-//
-// NOTE: this workaround can be removed in ROCm 4.3:
-//       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
 template <typename ElemType>
 struct ReduceConsts {
   static const ElemType Zero;
   static const ElemType One;
 };
 
+#if ROCM_VERSION >= 40300
+// Up until ROCm 4.2 miopenReduceTensor() required alpha/beta to be the same data
+// type as the input type. From ROCm 4.3 on, alpha/beta are float when the input type
+// is half (float16), as with cudnnReduceTensor() and other MIOpen/cuDNN APIs.
+template <>
+struct ReduceConsts<half> {
+  static const float Zero;
+  static const float One;
+};
+#endif
+
 inline double ClampMiopenBatchNormEpsilon(double epsilon) {
   if (epsilon < MIOPEN_BN_MIN_EPSILON) {
     if (MIOPEN_BN_MIN_EPSILON - epsilon > FLT_EPSILON)
diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@@ -186,14 +186,9 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
   else
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
 
-  // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
-  // type as the input type. This differs from cudnnReduceTensor() and other
-  // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
-  //
-  // NOTE: this workaround can be removed in ROCm 4.3:
-  //       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
-  const auto one = Consts<float>::One;
-  const auto zero = Consts<float>::Zero;
+  const auto one = ReduceConsts<HipT>::One;
+  const auto zero = ReduceConsts<HipT>::Zero;
+
   MiopenTensor input_tensor;
   MiopenTensor output_tensor;
   ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@@ -515,14 +510,9 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
   }
 
-  // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
-  // type as the input type. This differs from cudnnReduceTensor() and other
-  // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
-  //
-  // NOTE: this workaround can be removed in ROCm 4.3:
-  //       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
-  const float one = Consts<float>::One; 
-  const float zero = Consts<float>::Zero; 
+  const auto one = ReduceConsts<HipT>::One;
+  const auto zero = ReduceConsts<HipT>::Zero;
+
   MiopenTensor input_tensor;
   MiopenTensor output_tensor;
   ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py
@@ -286,6 +286,10 @@ def hipify(src_file_path, dst_file_path):
 
         # CUFFT -> HIPFFT
         s = s.replace('CUFFT', 'HIPFFT')
+
+        # Undo the cases where the hipify steps above went too far.
+        s = s.replace('ROCM_VERSION', 'CUDA_VERSION')  # semantically different meanings, cannot hipify
+
     with open(dst_file_path, 'w') as f:
         f.write(s)
 
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -296,20 +296,20 @@ stages:
       steps:
       - template: get-docker-image-steps.yml
         parameters:
-          Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
+          Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_2
           Context: tools/ci_build/github/linux/docker
           DockerBuildArgs: >-
-            --build-arg TORCH_VERSION=1.8.1
+            --build-arg TORCH_VERSION=1.9.0
             --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur
             --build-arg BUILD_UID=$(id -u)
             --network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64
             --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-10/root
             --build-arg PREPEND_PATH=/opt/rh/devtoolset-10/root/usr/bin:
             --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
-          Repository: onnxruntimetrainingrocmbuild-torch1.8.1
+          Repository: onnxruntimetrainingrocmbuild-torch1.9.0-rocm4.2
       - template: get-docker-image-steps.yml
         parameters:
-          Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
+          Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_3_1
           Context: tools/ci_build/github/linux/docker
           DockerBuildArgs: >-
             --build-arg TORCH_VERSION=1.9.0
@@ -319,7 +319,7 @@ stages:
             --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-10/root
             --build-arg PREPEND_PATH=/opt/rh/devtoolset-10/root/usr/bin:
             --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
-          Repository: onnxruntimetrainingrocmbuild-torch1.9.0
+          Repository: onnxruntimetrainingrocmbuild-torch1.9.0-rocm4.3.1
 
     - job: ROCM_training_wheels
       timeoutInMinutes: 180
@@ -331,30 +331,38 @@ stages:
       - ROCm_build_environment
       strategy:
         matrix:
-          Python36 Torch181:
+          Python36 Torch190 Rocm42:
             PythonVersion: '3.6'
-            TorchVersion: '1.8.1'
-          Python37 Torch181:
+            TorchVersion: '1.9.0'
+            RocmVersion: '4.2'
+          Python37 Torch190 Rocm42:
             PythonVersion: '3.7'
-            TorchVersion: '1.8.1'
-          Python38 Torch181:
+            TorchVersion: '1.9.0'
+            RocmVersion: '4.2'
+          Python38 Torch190 Rocm42:
             PythonVersion: '3.8'
-            TorchVersion: '1.8.1'
-          Python39 Torch181:
+            TorchVersion: '1.9.0'
+            RocmVersion: '4.2'
+          Python39 Torch190 Rocm42:
             PythonVersion: '3.9'
-            TorchVersion: '1.8.1'
-          Python36 Torch190:
+            TorchVersion: '1.9.0'
+            RocmVersion: '4.2'
+          Python36 Torch190 Rocm431:
             PythonVersion: '3.6'
             TorchVersion: '1.9.0'
-          Python37 Torch190:
+            RocmVersion: '4.3.1'
+          Python37 Torch190 Rocm431:
             PythonVersion: '3.7'
             TorchVersion: '1.9.0'
-          Python38 Torch190:
+            RocmVersion: '4.3.1'
+          Python38 Torch190 Rocm431:
             PythonVersion: '3.8'
             TorchVersion: '1.9.0'
-          Python39 Torch190:
+            RocmVersion: '4.3.1'
+          Python39 Torch190 Rocm431:
             PythonVersion: '3.9'
             TorchVersion: '1.9.0'
+            RocmVersion: '4.3.1'
       steps:
 
       - checkout: self
@@ -380,11 +388,11 @@ stages:
               -e NIGHTLY_BUILD \
               -e BUILD_BUILDNUMBER \
               --user onnxruntimedev \
-              onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
+              onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
                 /onnxruntime_src/tools/ci_build/build.py \
                   --config Release \
                   --use_rocm \
-                    --rocm_version=4.2 \
+                    --rocm_version=$(RocmVersion) \
                     --rocm_home=/opt/rocm \
                     --nccl_home=/opt/rocm \
                   --update \
@@ -435,7 +443,7 @@ stages:
             -e NIGHTLY_BUILD \
             -e BUILD_BUILDNUMBER \
             --user onnxruntimedev \
-            onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
+            onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
                /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
         displayName: 'Run onnxruntime unit tests (in container)'
 
@@ -458,7 +466,7 @@ stages:
             -e NIGHTLY_BUILD \
             -e BUILD_BUILDNUMBER \
             --user onnxruntimedev \
-            onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
+            onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
               orttraining/tools/ci_test/run_batch_size_test.py \
                 --binary_dir /build/Release \
                 --model_root training_e2e_test_data/models \
@@ -485,7 +493,7 @@ stages:
             -e NIGHTLY_BUILD \
             -e BUILD_BUILDNUMBER \
             --user onnxruntimedev \
-            onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
+            onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
               orttraining/tools/ci_test/run_bert_perf_test.py \
                 --binary_dir /build/Release \
                 --model_root training_e2e_test_data/models \
@@ -513,7 +521,7 @@ stages:
             -e NIGHTLY_BUILD \
             -e BUILD_BUILDNUMBER \
             --user onnxruntimedev \
-            onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
+            onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
               orttraining/tools/ci_test/run_convergence_test.py \
                 --binary_dir /build/Release \
                 --model_root training_e2e_test_data/models \
@@ -552,7 +560,7 @@ stages:
               -e NIGHTLY_BUILD \
               -e BUILD_BUILDNUMBER \
               -e PythonManylinuxDir=$(PythonManylinuxdir) \
-              onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
+              onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
                 /onnxruntime_src/tools/ci_build/github/pai/wrap_rocm_python_doc_publisher.sh
           workingDirectory: $(Build.SourcesDirectory)
 
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_2
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_3_1 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_3_1