Liqun/havenka/rel 1.8.1 round3 (microsoft#8246)

liqunfu · pranavsharma · Thiago Crepaldi · web-flow · commit 96bb4b1ce83e · 2021-06-30T23:02:16.000-07:00
* Revert the cuda algo finding change as this causes a significant memory bloat. (microsoft#8181) * Revert the cuda algo finding change as this causes a significant memory bloat. * Address PR comment * Make pipelines to support torch1.8.1 and torch1.9.0 (microsoft#8084) * Add post-install command to build PyTorch CPP extensions from within onnxruntime package (microsoft#8027) ORTModule requires two PyTorch CPP extensions that are currently JIT compiled. The runtime compilation can cause issues in some environments without all build requirements or in environments with multiple instances of ORTModule running in parallel. This PR creates a custom command to compile such extensions that must be manually executed before ORTModule is executed for the first time. When users try to use ORTModule before the extensions are compiled, an error with instructions is raised. PyTorch CPP Extensions for ORTModule can be compiled by running: python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install A full build environment is needed for this. * Patch orttraining-ortmodule pipeline with latest fix on master * add cuda version to build config * lib path * . * . * . * . * . * . * . * . * . * . * . * Remove auto doc gen Co-authored-by: Pranav Sharma <prs@microsoft.com> Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> Co-authored-by: Baiju Meswani <bmeswani@microsoft.com>
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-torch181-cuda111.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-torch181-cuda111.yml
@@ -6,7 +6,7 @@ stages:
     build_py_parameters: --enable_training --update --build
     torch_version: '1.8.1'
     cuda_version: '11.1'
-    gcc_version: 9
+    gcc_version: 8
     cmake_cuda_architectures: 37;50;52;60;61;70;75;80
     docker_file: Dockerfile.manylinux2014_training_cuda11_1
     agent_pool: Onnxruntime-Linux-GPU
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-torch190-cuda111.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-torch190-cuda111.yml
@@ -6,7 +6,7 @@ stages:
     build_py_parameters: --enable_training --update --build
     torch_version: '1.9.0'
     cuda_version: '11.1'
-    gcc_version: 9
+    gcc_version: 8
     cmake_cuda_architectures: 37;50;52;60;61;70;75;80
     docker_file: Dockerfile.manylinux2014_training_cuda11_1
     agent_pool: Onnxruntime-Linux-GPU
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -117,32 +117,6 @@ stages:
           Contents: 'Release/dist/*.whl'
           TargetFolder: '$(Build.ArtifactStagingDirectory)'
 
-      - task: CmdLine@2
-        displayName: 'Build Python Documentation'
-        condition: and(succeeded(), ne(variables['PythonVersion'], '3.9'))  # tensorflow not available on python 3.9
-        inputs:
-          script: |
-            mkdir -p $HOME/.onnx
-            docker run --rm \
-              --volume /data/onnx:/data/onnx:ro \
-              --volume $(Build.SourcesDirectory):/onnxruntime_src \
-              --volume $(Build.BinariesDirectory):/build \
-              --volume /data/models:/build/models:ro \
-              --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-              -e NIGHTLY_BUILD \
-              -e BUILD_BUILDNUMBER \
-              onnxruntimecpubuild \
-                bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/*.whl && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release " ;
-          workingDirectory: $(Build.SourcesDirectory)
-
-      - task: CopyFiles@2
-        displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
-        condition: and(succeeded(), ne(variables['PythonVersion'], '3.9'))  # tensorflow not available on python 3.9
-        inputs:
-          SourceFolder: '$(Build.BinariesDirectory)/docs/inference/html'
-          Contents: '**'
-          TargetFolder: '$(Build.ArtifactStagingDirectory)/inference_html_doc'
-
       - task: PublishBuildArtifacts@1
         displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
         inputs:
@@ -329,7 +303,7 @@ stages:
           render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}')
           echo "##vso[task.setvariable variable=render]$render_gid"
         displayName: 'Find video and render gid to be mapped into container'
- 
+
       - script: |-
           echo "video=$video"
           echo "render=$render"
@@ -354,7 +328,7 @@ stages:
             onnxruntimetrainingrocmbuild \
                /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
         displayName: 'Run onnxruntime unit tests (in container)'
-      
+
       - script: |-
           docker run --rm \
             --device=/dev/kfd \
@@ -381,7 +355,7 @@ stages:
                 --gpu_sku MI100_32G
         displayName: 'Run C++ BERT-L batch size test (in container)'
         condition: succeededOrFailed() # ensure all tests are run
-      
+
       - script: |-
           docker run --rm \
             --device=/dev/kfd \
@@ -409,7 +383,7 @@ stages:
                 --gpu_sku MI100_32G
         displayName: 'Run C++ BERT-L performance test (in container)'
         condition: succeededOrFailed() # ensure all tests are run
-      
+
       - script: |-
           docker run --rm \
             --device=/dev/kfd \
@@ -437,38 +411,14 @@ stages:
                 --gpu_sku MI100_32G
         displayName: 'Run C++ BERT-L convergence test (in container)'
         condition: succeededOrFailed() # ensure all tests are run
-      
+
       - task: CopyFiles@2
         displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
         inputs:
           SourceFolder: '$(Build.BinariesDirectory)'
           Contents: 'Release/dist/*.whl'
           TargetFolder: '$(Build.ArtifactStagingDirectory)'
 
-      - task: CmdLine@2
-        displayName: 'Build Python Documentation'
-        condition: and(succeeded(), ne(variables['PythonVersion'], '3.9'))  # tensorflow not available on python 3.9
-        inputs:
-          script: |
-            mkdir -p $HOME/.onnx
-            docker run --rm \
-              --volume $(Build.SourcesDirectory):/onnxruntime_src \
-              --volume $(Build.BinariesDirectory):/build \
-              -e NIGHTLY_BUILD \
-              -e BUILD_BUILDNUMBER \
-              --entrypoint /bin/bash \
-              onnxruntimetrainingrocmbuild \
-                /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release
-          workingDirectory: $(Build.SourcesDirectory)
-
-      - task: CopyFiles@2
-        displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
-        condition: and(succeeded(), ne(variables['PythonVersion'], '3.9'))  # tensorflow not available on python 3.9
-        inputs:
-          SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
-          Contents: '**'
-          TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'
-
       - task: PublishBuildArtifacts@1
         displayName: 'Upload Rocm wheel as build artifact'
         inputs:
@@ -737,7 +687,7 @@ stages:
         displayName: 'Publish Artifact: ONNXRuntime python wheel'
         inputs:
           ArtifactName: onnxruntime_gpu
-      
+
       - task: DeleteFiles@1
         displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo'
         condition: and (succeeded(), eq(variables['PythonVersion'], '3.7'))
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml
@@ -24,7 +24,7 @@ parameters:
   displayName: >
     gcc_version.
   type: number
-  
+
 - name: docker_file
   displayName: >
     docker_file.
@@ -87,9 +87,9 @@ stages:
             --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
             --build-arg BUILD_UID=$(id -u)
             --network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64
-            --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root 
-            --build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin: 
-            --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64
+            --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root
+            --build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin:
+            --build-arg LD_LIBRARY_PATH_ARG=$(PythonManylinuxLibDir):/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64
           Repository: onnxruntimetraininggpubuild
 
       - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
@@ -128,7 +128,7 @@ stages:
                   --build_wheel \
                   --enable_onnx_tests \
                   ${{ parameters.build_py_parameters }} \
-                  --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' \
+                  --cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=$(PythonManylinuxLibDir) CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' \
                   --use_cuda --cuda_version=$(CudaVersion) --cuda_home=/usr/local/cuda-$(CudaVersion) --cudnn_home=/usr/local/cuda-$(CudaVersion) ;
           workingDirectory: $(Build.SourcesDirectory)
 
@@ -159,34 +159,6 @@ stages:
           Contents: 'Release/dist/*.whl'
           TargetFolder: '$(Build.ArtifactStagingDirectory)'
 
-      - task: CmdLine@2
-        displayName: 'Build Python Documentation'
-        condition: and(succeeded(), ne(variables['PythonVersion'], '3.9'))  # tensorflow not available on python 3.9
-        inputs:
-          script: |
-            mkdir -p $HOME/.onnx
-            docker run --rm \
-              --gpus all \
-              -e NVIDIA_VISIBLE_DEVICES=all \
-              --volume /data/onnx:/data/onnx:ro \
-              --volume $(Build.SourcesDirectory):/onnxruntime_src \
-              --volume $(Build.BinariesDirectory):/build \
-              --volume /data/models:/build/models:ro \
-              --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-              -e NIGHTLY_BUILD \
-              -e BUILD_BUILDNUMBER \
-              onnxruntimetraininggpubuild \
-                bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/*.whl && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release " ;
-          workingDirectory: $(Build.SourcesDirectory)
-
-      - task: CopyFiles@2
-        displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
-        condition: and(succeeded(), ne(variables['PythonVersion'], '3.9'))  # tensorflow not available on python 3.9
-        inputs:
-          SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
-          Contents: '**'
-          TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'
-
       - task: PublishBuildArtifacts@1
         displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
         inputs:
@@ -207,7 +179,7 @@ stages:
                 --account_key $(orttrainingpackagestorageaccountkey) \
                 --container_name '$web'
           condition: succeededOrFailed()
-          displayName: 
+          displayName:
 
       - template: component-governance-component-detection-steps.yml
         parameters:
diff --git a/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml
@@ -13,26 +13,31 @@ steps:
         variables = {
           "PythonManylinuxDir": "/opt/python/cp35-cp35m",
           "PythonManylinuxIncludeDir": "/opt/python/cp35-cp35m/include/python3.5m",
+          "PythonManylinuxLibDir":  "/opt/python/cp35-cp35m/lib/python3.5",
         }
       elif version == "3.6":
         variables = {
           "PythonManylinuxDir": "/opt/python/cp36-cp36m",
           "PythonManylinuxIncludeDir": "/opt/python/cp36-cp36m/include/python3.6m",
+          "PythonManylinuxLibDir":  "/opt/python/cp36-cp36m/lib/python3.6",
         }
       elif version == "3.7":
         variables = {
           "PythonManylinuxDir": "/opt/python/cp37-cp37m",
           "PythonManylinuxIncludeDir": "/opt/python/cp37-cp37m/include/python3.7m",
+          "PythonManylinuxLibDir":  "/opt/python/cp37-cp37m/lib/python3.7",
         }
       elif version == "3.8":
         variables = {
           "PythonManylinuxDir": "/opt/python/cp38-cp38",
           "PythonManylinuxIncludeDir": "/opt/python/cp38-cp38/include/python3.8",
+          "PythonManylinuxLibDir":  "/opt/python/cp38-cp38/lib/python3.8",
         }
       elif version == "3.9":
         variables = {
           "PythonManylinuxDir": "/opt/python/cp39-cp39",
           "PythonManylinuxIncludeDir": "/opt/python/cp39-cp39/include/python3.9",
+          "PythonManylinuxLibDir":  "/opt/python/cp39-cp39/lib/python3.9",
         }
       else:
         raise ValueError("Unsupported Python version: '{}'".format(version))
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2
@@ -15,7 +15,7 @@ ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
 ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
 ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
-ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
+ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH_ARG}:$DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
 ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
 
 COPY manylinux2014_build_scripts /manylinux2014_build_scripts
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1
@@ -15,7 +15,7 @@ ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
 ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
 ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
-ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
+ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH_ARG}:$DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
 ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
 
 COPY manylinux2014_build_scripts /manylinux2014_build_scripts