Skip to content

Commit 96bb4b1

Browse files
liqunfu, pranavsharma, Thiago Crepaldi, baijumeswani
authored Jul 1, 2021
Liqun/havenka/rel 1.8.1 round3 (microsoft#8246)
* Revert the cuda algo finding change as this causes a significant memory bloat. (microsoft#8181)
* Revert the cuda algo finding change as this causes a significant memory bloat.
* Address PR comment
* Make pipelines support torch1.8.1 and torch1.9.0 (microsoft#8084)
* Add post-install command to build PyTorch CPP extensions from within onnxruntime package (microsoft#8027)

  ORTModule requires two PyTorch CPP extensions that are currently JIT compiled. The runtime compilation can cause issues in some environments without all build requirements or in environments with multiple instances of ORTModule running in parallel.

  This PR creates a custom command to compile such extensions, which must be manually executed before ORTModule is executed for the first time. When users try to use ORTModule before the extensions are compiled, an error with instructions is raised.

  PyTorch CPP Extensions for ORTModule can be compiled by running:
  python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install

  A full build environment is needed for this.
* Patch orttraining-ortmodule pipeline with latest fix on master
* add cuda version to build config
* lib path
* .
* .
* .
* .
* .
* .
* .
* .
* .
* .
* .
* Remove auto doc gen

Co-authored-by: Pranav Sharma <prs@microsoft.com>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Co-authored-by: Baiju Meswani <bmeswani@microsoft.com>
1 parent 6057515 commit 96bb4b1

7 files changed

+21
-94
lines changed
 

‎tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-torch181-cuda111.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ stages:
66
build_py_parameters: --enable_training --update --build
77
torch_version: '1.8.1'
88
cuda_version: '11.1'
9-
gcc_version: 9
9+
gcc_version: 8
1010
cmake_cuda_architectures: 37;50;52;60;61;70;75;80
1111
docker_file: Dockerfile.manylinux2014_training_cuda11_1
1212
agent_pool: Onnxruntime-Linux-GPU

‎tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-torch190-cuda111.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ stages:
66
build_py_parameters: --enable_training --update --build
77
torch_version: '1.9.0'
88
cuda_version: '11.1'
9-
gcc_version: 9
9+
gcc_version: 8
1010
cmake_cuda_architectures: 37;50;52;60;61;70;75;80
1111
docker_file: Dockerfile.manylinux2014_training_cuda11_1
1212
agent_pool: Onnxruntime-Linux-GPU

‎tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml

+6-56
Original file line number | Diff line number | Diff line change
@@ -117,32 +117,6 @@ stages:
117117
Contents: 'Release/dist/*.whl'
118118
TargetFolder: '$(Build.ArtifactStagingDirectory)'
119119

120-
- task: CmdLine@2
121-
displayName: 'Build Python Documentation'
122-
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
123-
inputs:
124-
script: |
125-
mkdir -p $HOME/.onnx
126-
docker run --rm \
127-
--volume /data/onnx:/data/onnx:ro \
128-
--volume $(Build.SourcesDirectory):/onnxruntime_src \
129-
--volume $(Build.BinariesDirectory):/build \
130-
--volume /data/models:/build/models:ro \
131-
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
132-
-e NIGHTLY_BUILD \
133-
-e BUILD_BUILDNUMBER \
134-
onnxruntimecpubuild \
135-
bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/*.whl && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release " ;
136-
workingDirectory: $(Build.SourcesDirectory)
137-
138-
- task: CopyFiles@2
139-
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
140-
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
141-
inputs:
142-
SourceFolder: '$(Build.BinariesDirectory)/docs/inference/html'
143-
Contents: '**'
144-
TargetFolder: '$(Build.ArtifactStagingDirectory)/inference_html_doc'
145-
146120
- task: PublishBuildArtifacts@1
147121
displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
148122
inputs:
@@ -329,7 +303,7 @@ stages:
329303
render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}')
330304
echo "##vso[task.setvariable variable=render]$render_gid"
331305
displayName: 'Find video and render gid to be mapped into container'
332-
306+
333307
- script: |-
334308
echo "video=$video"
335309
echo "render=$render"
@@ -354,7 +328,7 @@ stages:
354328
onnxruntimetrainingrocmbuild \
355329
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
356330
displayName: 'Run onnxruntime unit tests (in container)'
357-
331+
358332
- script: |-
359333
docker run --rm \
360334
--device=/dev/kfd \
@@ -381,7 +355,7 @@ stages:
381355
--gpu_sku MI100_32G
382356
displayName: 'Run C++ BERT-L batch size test (in container)'
383357
condition: succeededOrFailed() # ensure all tests are run
384-
358+
385359
- script: |-
386360
docker run --rm \
387361
--device=/dev/kfd \
@@ -409,7 +383,7 @@ stages:
409383
--gpu_sku MI100_32G
410384
displayName: 'Run C++ BERT-L performance test (in container)'
411385
condition: succeededOrFailed() # ensure all tests are run
412-
386+
413387
- script: |-
414388
docker run --rm \
415389
--device=/dev/kfd \
@@ -437,38 +411,14 @@ stages:
437411
--gpu_sku MI100_32G
438412
displayName: 'Run C++ BERT-L convergence test (in container)'
439413
condition: succeededOrFailed() # ensure all tests are run
440-
414+
441415
- task: CopyFiles@2
442416
displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
443417
inputs:
444418
SourceFolder: '$(Build.BinariesDirectory)'
445419
Contents: 'Release/dist/*.whl'
446420
TargetFolder: '$(Build.ArtifactStagingDirectory)'
447421

448-
- task: CmdLine@2
449-
displayName: 'Build Python Documentation'
450-
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
451-
inputs:
452-
script: |
453-
mkdir -p $HOME/.onnx
454-
docker run --rm \
455-
--volume $(Build.SourcesDirectory):/onnxruntime_src \
456-
--volume $(Build.BinariesDirectory):/build \
457-
-e NIGHTLY_BUILD \
458-
-e BUILD_BUILDNUMBER \
459-
--entrypoint /bin/bash \
460-
onnxruntimetrainingrocmbuild \
461-
/onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release
462-
workingDirectory: $(Build.SourcesDirectory)
463-
464-
- task: CopyFiles@2
465-
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
466-
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
467-
inputs:
468-
SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
469-
Contents: '**'
470-
TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'
471-
472422
- task: PublishBuildArtifacts@1
473423
displayName: 'Upload Rocm wheel as build artifact'
474424
inputs:
@@ -737,7 +687,7 @@ stages:
737687
displayName: 'Publish Artifact: ONNXRuntime python wheel'
738688
inputs:
739689
ArtifactName: onnxruntime_gpu
740-
690+
741691
- task: DeleteFiles@1
742692
displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo'
743693
condition: and (succeeded(), eq(variables['PythonVersion'], '3.7'))

‎tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml

+6-34
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ parameters:
2424
displayName: >
2525
gcc_version.
2626
type: number
27-
27+
2828
- name: docker_file
2929
displayName: >
3030
docker_file.
@@ -87,9 +87,9 @@ stages:
8787
--build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
8888
--build-arg BUILD_UID=$(id -u)
8989
--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64
90-
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root
91-
--build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin:
92-
--build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64
90+
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root
91+
--build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin:
92+
--build-arg LD_LIBRARY_PATH_ARG=$(PythonManylinuxLibDir):/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64
9393
Repository: onnxruntimetraininggpubuild
9494

9595
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
@@ -128,7 +128,7 @@ stages:
128128
--build_wheel \
129129
--enable_onnx_tests \
130130
${{ parameters.build_py_parameters }} \
131-
--cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' \
131+
--cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=$(PythonManylinuxLibDir) CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' \
132132
--use_cuda --cuda_version=$(CudaVersion) --cuda_home=/usr/local/cuda-$(CudaVersion) --cudnn_home=/usr/local/cuda-$(CudaVersion) ;
133133
workingDirectory: $(Build.SourcesDirectory)
134134

@@ -159,34 +159,6 @@ stages:
159159
Contents: 'Release/dist/*.whl'
160160
TargetFolder: '$(Build.ArtifactStagingDirectory)'
161161

162-
- task: CmdLine@2
163-
displayName: 'Build Python Documentation'
164-
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
165-
inputs:
166-
script: |
167-
mkdir -p $HOME/.onnx
168-
docker run --rm \
169-
--gpus all \
170-
-e NVIDIA_VISIBLE_DEVICES=all \
171-
--volume /data/onnx:/data/onnx:ro \
172-
--volume $(Build.SourcesDirectory):/onnxruntime_src \
173-
--volume $(Build.BinariesDirectory):/build \
174-
--volume /data/models:/build/models:ro \
175-
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
176-
-e NIGHTLY_BUILD \
177-
-e BUILD_BUILDNUMBER \
178-
onnxruntimetraininggpubuild \
179-
bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/*.whl && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release " ;
180-
workingDirectory: $(Build.SourcesDirectory)
181-
182-
- task: CopyFiles@2
183-
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
184-
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
185-
inputs:
186-
SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
187-
Contents: '**'
188-
TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'
189-
190162
- task: PublishBuildArtifacts@1
191163
displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
192164
inputs:
@@ -207,7 +179,7 @@ stages:
207179
--account_key $(orttrainingpackagestorageaccountkey) \
208180
--container_name '$web'
209181
condition: succeededOrFailed()
210-
displayName:
182+
displayName:
211183

212184
- template: component-governance-component-detection-steps.yml
213185
parameters:

‎tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml

+5
Original file line number | Diff line number | Diff line change
@@ -13,26 +13,31 @@ steps:
1313
variables = {
1414
"PythonManylinuxDir": "/opt/python/cp35-cp35m",
1515
"PythonManylinuxIncludeDir": "/opt/python/cp35-cp35m/include/python3.5m",
16+
"PythonManylinuxLibDir": "/opt/python/cp35-cp35m/lib/python3.5",
1617
}
1718
elif version == "3.6":
1819
variables = {
1920
"PythonManylinuxDir": "/opt/python/cp36-cp36m",
2021
"PythonManylinuxIncludeDir": "/opt/python/cp36-cp36m/include/python3.6m",
22+
"PythonManylinuxLibDir": "/opt/python/cp36-cp36m/lib/python3.6",
2123
}
2224
elif version == "3.7":
2325
variables = {
2426
"PythonManylinuxDir": "/opt/python/cp37-cp37m",
2527
"PythonManylinuxIncludeDir": "/opt/python/cp37-cp37m/include/python3.7m",
28+
"PythonManylinuxLibDir": "/opt/python/cp37-cp37m/lib/python3.7",
2629
}
2730
elif version == "3.8":
2831
variables = {
2932
"PythonManylinuxDir": "/opt/python/cp38-cp38",
3033
"PythonManylinuxIncludeDir": "/opt/python/cp38-cp38/include/python3.8",
34+
"PythonManylinuxLibDir": "/opt/python/cp38-cp38/lib/python3.8",
3135
}
3236
elif version == "3.9":
3337
variables = {
3438
"PythonManylinuxDir": "/opt/python/cp39-cp39",
3539
"PythonManylinuxIncludeDir": "/opt/python/cp39-cp39/include/python3.9",
40+
"PythonManylinuxLibDir": "/opt/python/cp39-cp39/lib/python3.9",
3641
}
3742
else:
3843
raise ValueError("Unsupported Python version: '{}'".format(version))

‎tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2

+1-1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ ENV LANG en_US.UTF-8
1515
ENV LANGUAGE en_US.UTF-8
1616
ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
1717
ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
18-
ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
18+
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH_ARG}:$DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
1919
ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
2020

2121
COPY manylinux2014_build_scripts /manylinux2014_build_scripts

‎tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1

+1-1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ ENV LANG en_US.UTF-8
1515
ENV LANGUAGE en_US.UTF-8
1616
ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
1717
ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
18-
ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
18+
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH_ARG}:$DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
1919
ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
2020

2121
COPY manylinux2014_build_scripts /manylinux2014_build_scripts

0 commit comments

Comments
 (0)
Please sign in to comment.