Skip to content

Commit 4788839

Browse files
author
Suffian Khan
authored
Fix nightly CI pipeline to generate ROCm 4.2 wheels and add ROCm 4.3.1 wheels (microsoft#9101)
* make work for both rocm 4.2 and rocm 4.3.1 * fix rocm 4.3.1 docker image reference * fix CUDA_VERSION to ROCM_VERSION * fix ReduceConsts conflict def * add ifdef to miopen_common.h as well * trailing ws
1 parent 23e9c0a commit 4788839

File tree

8 files changed

+293
-53
lines changed

8 files changed

+293
-53
lines changed

cmake/CMakeLists.txt

+25
Original file line numberDiff line numberDiff line change
@@ -1591,6 +1591,31 @@ if (onnxruntime_USE_ROCM)
15911591
if (onnxruntime_USE_CUDA)
15921592
message(FATAL_ERROR "ROCM does not support build with CUDA!")
15931593
endif()
1594+
1595+
# Resolve the ROCm installation root: the ROCM_PATH environment variable wins
# when it is defined; otherwise fall back to the /opt/rocm location.
if(DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH $ENV{ROCM_PATH})
else()
  set(ROCM_PATH /opt/rocm)
endif()
1600+
1601+
# replicate strategy used by pytorch to get ROCM_VERSION
1602+
# https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173
1603+
# Read "<major>.<minor>.<patch>-<suffix>" from ${ROCM_PATH}/.info/version-dev and
# split it into ROCM_VERSION_DEV_MAJOR / _MINOR / _PATCH, the dotted string
# ROCM_VERSION_DEV, and the integer ROCM_VERSION_DEV_INT
# (major * 10000 + minor * 100 + patch). Configuration stops with an error when
# the file is missing or its content does not have the expected shape, because
# an empty ROCM_VERSION_DEV_INT would otherwise be passed on to the compiler.
set(ROCM_VERSION_DEV_FILE "${ROCM_PATH}/.info/version-dev")
if(NOT EXISTS "${ROCM_VERSION_DEV_FILE}")
  message(FATAL_ERROR "Cannot determine the ROCm version: ${ROCM_VERSION_DEV_FILE} does not exist. Set the ROCM_PATH environment variable to the ROCm installation root.")
endif()

file(READ "${ROCM_VERSION_DEV_FILE}" ROCM_VERSION_DEV_RAW)
# Drop surrounding whitespace (e.g. a trailing newline) before matching.
string(STRIP "${ROCM_VERSION_DEV_RAW}" ROCM_VERSION_DEV_STRIPPED)

# "\\." reaches the regex engine as "\." (a literal dot). A single backslash is
# consumed by CMake's own argument parser, leaving "." which matches any character.
# The input is quoted so that empty content is still passed as one argument.
string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH "${ROCM_VERSION_DEV_STRIPPED}")
if(ROCM_VERSION_DEV_MATCH)
  set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
  set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
  set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
  set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
  math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
else()
  message(FATAL_ERROR "Cannot parse the ROCm version from ${ROCM_VERSION_DEV_FILE}: expected <major>.<minor>.<patch>-<suffix>, got '${ROCM_VERSION_DEV_STRIPPED}'.")
endif()
1612+
# Report the detected ROCm version components, one per line.
message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n")
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
foreach(rocm_version_dev_part MAJOR MINOR PATCH INT)
  message("ROCM_VERSION_DEV_${rocm_version_dev_part}: ${ROCM_VERSION_DEV_${rocm_version_dev_part}}")
endforeach()

# Hand the integer-encoded version to the compiler as the ROCM_VERSION macro.
add_definitions(-DROCM_VERSION=${ROCM_VERSION_DEV_INT})
15941619
endif()
15951620

15961621
if (onnxruntime_USE_TVM)

onnxruntime/core/providers/rocm/miopen_common.cc

+12-7
Original file line numberDiff line numberDiff line change
@@ -88,23 +88,28 @@ const float Consts<half>::Zero = 0;
8888

8989
const float Consts<half>::One = 1;
9090

91-
// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
91+
#if ROCM_VERSION >= 40300
92+
template <>
93+
const float ReduceConsts<half>::One = 1;
94+
95+
template <>
96+
const float ReduceConsts<half>::Zero = 0;
97+
#else
98+
// Up until ROCm 4.2, miopenReduceTensor() required alpha/beta to be the same data
9299
// type as the input type. This differs from cudnnReduceTensor() and other
93100
// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
94-
//
95-
// NOTE: this workaround can be removed in ROCm 4.3:
96-
// https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
97101
template <>
98102
const half ReduceConsts<half>::One = 1.f;
99103

100104
template <>
101-
const float ReduceConsts<float>::One = 1;
105+
const half ReduceConsts<half>::Zero = 0.f;
106+
#endif
102107

103108
template <>
104-
const double ReduceConsts<double>::One = 1;
109+
const float ReduceConsts<float>::One = 1;
105110

106111
template <>
107-
const half ReduceConsts<half>::Zero = 0.f;
112+
const double ReduceConsts<double>::One = 1;
108113

109114
template <>
110115
const float ReduceConsts<float>::Zero = 0;

onnxruntime/core/providers/rocm/miopen_common.h

+11-6
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,23 @@ struct Consts<half> {
4646
static const float One;
4747
};
4848

49-
// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
50-
// type as the input type. This differs from cudnnReduceTensor() and other
51-
// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
52-
//
53-
// NOTE: this workaround can be removed in ROCm 4.3:
54-
// https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
5549
template <typename ElemType>
5650
struct ReduceConsts {
5751
static const ElemType Zero;
5852
static const ElemType One;
5953
};
6054

55+
#if ROCM_VERSION >= 40300
// When building against ROCm 4.3 or newer, the half (float16) reduction
// constants are declared as float, matching cudnnReduceTensor() and the other
// MIOpen/cuDNN APIs where alpha/beta are float for half input. Up through
// ROCm 4.2, miopenReduceTensor() required alpha/beta to have the same data type
// as the input, so no specialization is declared for those versions.
template <>
struct ReduceConsts<half> {
  static const float One;
  static const float Zero;
};
#endif
65+
6166
inline double ClampMiopenBatchNormEpsilon(double epsilon) {
6267
if (epsilon < MIOPEN_BN_MIN_EPSILON) {
6368
if (MIOPEN_BN_MIN_EPSILON - epsilon > FLT_EPSILON)

onnxruntime/core/providers/rocm/reduction/reduction_ops.cc

+6-16
Original file line numberDiff line numberDiff line change
@@ -186,14 +186,9 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
186186
else
187187
ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
188188

189-
// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
190-
// type as the input type. This differs from cudnnReduceTensor() and other
191-
// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
192-
//
193-
// NOTE: this workaround can be removed in ROCm 4.3:
194-
// https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
195-
const auto one = Consts<float>::One;
196-
const auto zero = Consts<float>::Zero;
189+
const auto one = ReduceConsts<HipT>::One;
190+
const auto zero = ReduceConsts<HipT>::Zero;
191+
197192
MiopenTensor input_tensor;
198193
MiopenTensor output_tensor;
199194
ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@@ -515,14 +510,9 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
515510
ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
516511
}
517512

518-
// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
519-
// type as the input type. This differs from cudnnReduceTensor() and other
520-
// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
521-
//
522-
// NOTE: this workaround can be removed in ROCm 4.3:
523-
// https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
524-
const float one = Consts<float>::One;
525-
const float zero = Consts<float>::Zero;
513+
const auto one = ReduceConsts<HipT>::One;
514+
const auto zero = ReduceConsts<HipT>::Zero;
515+
526516
MiopenTensor input_tensor;
527517
MiopenTensor output_tensor;
528518
ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));

tools/ci_build/amd_hipify.py

+4
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,10 @@ def hipify(src_file_path, dst_file_path):
286286

287287
# CUFFT -> HIPFFT
288288
s = s.replace('CUFFT', 'HIPFFT')
289+
290+
# Undo the cases where the hipify steps above went too far.
291+
s = s.replace('ROCM_VERSION', 'CUDA_VERSION') # semantically different meanings, cannot hipify
292+
289293
with open(dst_file_path, 'w') as f:
290294
f.write(s)
291295

tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml

+32-24
Original file line numberDiff line numberDiff line change
@@ -296,20 +296,20 @@ stages:
296296
steps:
297297
- template: get-docker-image-steps.yml
298298
parameters:
299-
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
299+
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_2
300300
Context: tools/ci_build/github/linux/docker
301301
DockerBuildArgs: >-
302-
--build-arg TORCH_VERSION=1.8.1
302+
--build-arg TORCH_VERSION=1.9.0
303303
--build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur
304304
--build-arg BUILD_UID=$(id -u)
305305
--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64
306306
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-10/root
307307
--build-arg PREPEND_PATH=/opt/rh/devtoolset-10/root/usr/bin:
308308
--build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
309-
Repository: onnxruntimetrainingrocmbuild-torch1.8.1
309+
Repository: onnxruntimetrainingrocmbuild-torch1.9.0-rocm4.2
310310
- template: get-docker-image-steps.yml
311311
parameters:
312-
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
312+
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm4_3_1
313313
Context: tools/ci_build/github/linux/docker
314314
DockerBuildArgs: >-
315315
--build-arg TORCH_VERSION=1.9.0
@@ -319,7 +319,7 @@ stages:
319319
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-10/root
320320
--build-arg PREPEND_PATH=/opt/rh/devtoolset-10/root/usr/bin:
321321
--build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
322-
Repository: onnxruntimetrainingrocmbuild-torch1.9.0
322+
Repository: onnxruntimetrainingrocmbuild-torch1.9.0-rocm4.3.1
323323

324324
- job: ROCM_training_wheels
325325
timeoutInMinutes: 180
@@ -331,30 +331,38 @@ stages:
331331
- ROCm_build_environment
332332
strategy:
333333
matrix:
334-
Python36 Torch181:
334+
Python36 Torch190 Rocm42:
335335
PythonVersion: '3.6'
336-
TorchVersion: '1.8.1'
337-
Python37 Torch181:
336+
TorchVersion: '1.9.0'
337+
RocmVersion: '4.2'
338+
Python37 Torch190 Rocm42:
338339
PythonVersion: '3.7'
339-
TorchVersion: '1.8.1'
340-
Python38 Torch181:
340+
TorchVersion: '1.9.0'
341+
RocmVersion: '4.2'
342+
Python38 Torch190 Rocm42:
341343
PythonVersion: '3.8'
342-
TorchVersion: '1.8.1'
343-
Python39 Torch181:
344+
TorchVersion: '1.9.0'
345+
RocmVersion: '4.2'
346+
Python39 Torch190 Rocm42:
344347
PythonVersion: '3.9'
345-
TorchVersion: '1.8.1'
346-
Python36 Torch190:
348+
TorchVersion: '1.9.0'
349+
RocmVersion: '4.2'
350+
Python36 Torch190 Rocm431:
347351
PythonVersion: '3.6'
348352
TorchVersion: '1.9.0'
349-
Python37 Torch190:
353+
RocmVersion: '4.3.1'
354+
Python37 Torch190 Rocm431:
350355
PythonVersion: '3.7'
351356
TorchVersion: '1.9.0'
352-
Python38 Torch190:
357+
RocmVersion: '4.3.1'
358+
Python38 Torch190 Rocm431:
353359
PythonVersion: '3.8'
354360
TorchVersion: '1.9.0'
355-
Python39 Torch190:
361+
RocmVersion: '4.3.1'
362+
Python39 Torch190 Rocm431:
356363
PythonVersion: '3.9'
357364
TorchVersion: '1.9.0'
365+
RocmVersion: '4.3.1'
358366
steps:
359367

360368
- checkout: self
@@ -380,11 +388,11 @@ stages:
380388
-e NIGHTLY_BUILD \
381389
-e BUILD_BUILDNUMBER \
382390
--user onnxruntimedev \
383-
onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
391+
onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
384392
/onnxruntime_src/tools/ci_build/build.py \
385393
--config Release \
386394
--use_rocm \
387-
--rocm_version=4.2 \
395+
--rocm_version=$(RocmVersion) \
388396
--rocm_home=/opt/rocm \
389397
--nccl_home=/opt/rocm \
390398
--update \
@@ -435,7 +443,7 @@ stages:
435443
-e NIGHTLY_BUILD \
436444
-e BUILD_BUILDNUMBER \
437445
--user onnxruntimedev \
438-
onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
446+
onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
439447
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
440448
displayName: 'Run onnxruntime unit tests (in container)'
441449
@@ -458,7 +466,7 @@ stages:
458466
-e NIGHTLY_BUILD \
459467
-e BUILD_BUILDNUMBER \
460468
--user onnxruntimedev \
461-
onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
469+
onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
462470
orttraining/tools/ci_test/run_batch_size_test.py \
463471
--binary_dir /build/Release \
464472
--model_root training_e2e_test_data/models \
@@ -485,7 +493,7 @@ stages:
485493
-e NIGHTLY_BUILD \
486494
-e BUILD_BUILDNUMBER \
487495
--user onnxruntimedev \
488-
onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
496+
onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
489497
orttraining/tools/ci_test/run_bert_perf_test.py \
490498
--binary_dir /build/Release \
491499
--model_root training_e2e_test_data/models \
@@ -513,7 +521,7 @@ stages:
513521
-e NIGHTLY_BUILD \
514522
-e BUILD_BUILDNUMBER \
515523
--user onnxruntimedev \
516-
onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
524+
onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
517525
orttraining/tools/ci_test/run_convergence_test.py \
518526
--binary_dir /build/Release \
519527
--model_root training_e2e_test_data/models \
@@ -552,7 +560,7 @@ stages:
552560
-e NIGHTLY_BUILD \
553561
-e BUILD_BUILDNUMBER \
554562
-e PythonManylinuxDir=$(PythonManylinuxdir) \
555-
onnxruntimetrainingrocmbuild-torch$(TorchVersion) \
563+
onnxruntimetrainingrocmbuild-torch$(TorchVersion)-rocm$(RocmVersion) \
556564
/onnxruntime_src/tools/ci_build/github/pai/wrap_rocm_python_doc_publisher.sh
557565
workingDirectory: $(Build.SourcesDirectory)
558566

0 commit comments

Comments
 (0)