From 572d54f91791448c015e74a4f1d6923b77b79795 Mon Sep 17 00:00:00 2001 From: Rajeev Rao Date: Wed, 18 Dec 2019 13:36:46 -0800 Subject: [PATCH] TensorRT Open Source Release v7.0 Release Notes: https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/tensorrt-7.html Signed-off-by: Rajeev Rao --- .gitignore | 1 + .gitmodules | 2 +- CMakeLists.txt | 4 +- README.md | 87 +- VERSION | 2 +- ...{centos-7.Dockerfile => centos.Dockerfile} | 26 +- docker/jetpack_files/.gitkeep | 0 docker/ubuntu-18.04.Dockerfile | 48 - ...erfile => ubuntu-cross-aarch64.Dockerfile} | 44 +- ...ntu-16.04.Dockerfile => ubuntu.Dockerfile} | 35 +- include/NvCaffeParser.h | 18 +- include/NvInfer.h | 1154 +++++++++++++++-- include/NvInferRuntime.h | 66 +- include/NvInferRuntimeCommon.h | 17 +- include/NvInferVersion.h | 10 +- include/NvOnnxParser.h | 20 +- include/NvOnnxParserRuntime.h | 79 -- include/NvUffParser.h | 10 +- parsers/onnx | 2 +- plugin/batchedNMSPlugin/README.md | 3 +- .../embLayerNormPlugin/embLayerNormPlugin.cu | 2 +- plugin/fcPlugin/fcPlugin.cu | 20 +- .../instanceNormalizationPlugin.cpp | 17 +- .../instanceNormalizationPlugin.h | 2 +- plugin/specialSlicePlugin/README.md | 4 +- samples/CMakeSamplesTemplate.txt | 5 +- samples/common/BatchStream.h | 2 +- samples/common/EntropyCalibrator.h | 2 +- samples/common/argsParser.h | 6 +- samples/common/buffers.h | 1 + samples/common/common.h | 4 +- samples/common/logging.h | 2 +- samples/common/sampleConfig.h | 9 + samples/common/sampleDevice.h | 75 +- samples/common/sampleEngines.cpp | 91 +- samples/common/sampleInference.cpp | 384 ++++-- samples/common/sampleInference.h | 6 +- samples/common/sampleOptions.cpp | 176 +-- samples/common/sampleOptions.h | 5 +- samples/common/sampleReporting.cpp | 315 ++++- samples/common/sampleReporting.h | 132 +- samples/common/sampleUtils.h | 330 ++++- samples/common/windows/getopt.c | 193 +-- samples/common/windows/getopt.h | 7 +- samples/opensource/sampleCharRNN/README.md | 6 +- .../sampleCharRNN/sampleCharRNN.cpp | 468 +++++-- .../opensource/sampleDynamicReshape/README.md | 23 +- samples/opensource/sampleINT8/README.md | 4 +- samples/opensource/sampleINT8/sampleINT8.cpp | 18 +- samples/opensource/sampleINT8API/README.md | 24 +- .../sampleINT8API/sampleINT8API.cpp | 118 +- samples/opensource/sampleMNISTAPI/README.md | 4 +- .../sampleMNISTAPI/sampleMNISTAPI.cpp | 12 +- samples/opensource/sampleMovieLens/README.md | 10 +- samples/opensource/sampleNMT/trtUtil.cpp | 1 + .../opensource/samplePlugin/samplePlugin.cpp | 10 +- samples/opensource/sampleSSD/batchPrepare.py | 2 +- samples/opensource/sampleSSD/sampleSSD.cpp | 4 +- .../sampleUffFasterRCNN.cpp | 6 +- .../sampleUffMaskRCNN/sampleUffMaskRCNN.cpp | 9 +- samples/opensource/trtexec/CMakeLists.txt | 1 - samples/opensource/trtexec/README.md | 38 +- samples/opensource/trtexec/giexec | 2 +- samples/opensource/trtexec/prn_utils.py | 89 ++ samples/opensource/trtexec/profiler.py | 104 ++ samples/opensource/trtexec/tracer.py | 135 ++ samples/opensource/trtexec/trtexec.cpp | 43 +- scripts/stubify.sh | 48 + 68 files changed, 3558 insertions(+), 1039 deletions(-) rename docker/{centos-7.Dockerfile => centos.Dockerfile} (68%) create mode 100644 docker/jetpack_files/.gitkeep delete mode 100644 docker/ubuntu-18.04.Dockerfile rename docker/{ubuntu-18.04-cross-aarch64.Dockerfile => ubuntu-cross-aarch64.Dockerfile} (79%) rename docker/{ubuntu-16.04.Dockerfile => ubuntu.Dockerfile} (62%) delete mode 100644 include/NvOnnxParserRuntime.h create mode 100644 samples/opensource/trtexec/prn_utils.py 
create mode 100755 samples/opensource/trtexec/profiler.py create mode 100755 samples/opensource/trtexec/tracer.py create mode 100755 scripts/stubify.sh diff --git a/.gitignore b/.gitignore index 567609b1..7eb1c34a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ build/ +docker/jetpack_files/* diff --git a/.gitmodules b/.gitmodules index a4bea916..19c5e52e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "parsers/onnx"] path = parsers/onnx url = https://github.com/onnx/onnx-tensorrt.git - branch = master + branch = 7.0 [submodule "third_party/protobuf"] path = third_party/protobuf url = https://github.com/protocolbuffers/protobuf.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d940cd2..b6b189c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,8 +107,8 @@ endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-deprecated-declarations") ################################### DEPENDENCIES ########################################## -set(DEFAULT_CUDA_VERSION 10.1) -set(DEFAULT_CUDNN_VERSION 7.5) +set(DEFAULT_CUDA_VERSION 10.2) +set(DEFAULT_CUDNN_VERSION 7.6) set(DEFAULT_PROTOBUF_VERSION 3.0.0) set(DEFAULT_PROTOBUF_INTERNAL_VERSION 10.0) set(DEFAULT_CUB_VERSION 1.8.0) diff --git a/README.md b/README.md index 2ea6afeb..0834d90b 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,8 @@ To build the TensorRT OSS components, ensure you meet the following package requ * [CUDA](https://developer.nvidia.com/cuda-toolkit) * Recommended versions: - * [cuda-10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) + cuDNN-7.6 + * [cuda-10.2](https://developer.nvidia.com/cuda-10.2-download-archive-base) + cuDNN-7.6 * [cuda-10.0](https://developer.nvidia.com/cuda-10.0-download-archive) + cuDNN-7.6 - * [cuda-9.0](https://developer.nvidia.com/cuda-90-download-archive) + cuDNN 7.6 * [GNU Make](https://ftp.gnu.org/gnu/make/) >= v4.1 @@ -33,6 +32,9 @@ To build the TensorRT OSS components, ensure you meet the following package requ * Essential libraries and utilities * [Git](https://git-scm.com/downloads), [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/), [Wget](https://www.gnu.org/software/wget/faq.html#download), [Zlib](https://zlib.net/) +* Cross compilation for Jetson platforms requires JetPack's host component installation + * [JetPack](https://developer.nvidia.com/embedded/jetpack) >= 4.2 + **Optional Packages** * Containerized builds @@ -45,11 +47,12 @@ To build the TensorRT OSS components, ensure you meet the following package requ **TensorRT Release** -* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v6.0.1 +* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v7.0 + NOTE: Along with the TensorRT OSS components, the following source packages will also be downloaded, and they are not required to be installed on the system. -- [ONNX-TensorRT](https://github.com/onnx/onnx-tensorrt) v6.0 +- [ONNX-TensorRT](https://github.com/onnx/onnx-tensorrt) v7.0 - [CUB](http://nvlabs.github.io/cub/) v1.8.0 - [Protobuf](https://github.com/protocolbuffers/protobuf.git) v3.8.x @@ -59,7 +62,7 @@ NOTE: Along with the TensorRT OSS components, the following source packages will 1. #### Download TensorRT OSS sources. 
```bash - git clone -b master https://github.com/nvidia/TensorRT TensorRT + git clone -b release/7.0 https://github.com/nvidia/TensorRT TensorRT cd TensorRT git submodule update --init --recursive export TRT_SOURCE=`pwd` @@ -67,30 +70,40 @@ NOTE: Along with the TensorRT OSS components, the following source packages will 2. #### Download the TensorRT binary release. - To build the TensorRT OSS, obtain the corresponding TensorRT 6.0.1 binary release from [NVidia Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-download). For a list of key features, known and fixed issues, see the [TensorRT 6.0.1 Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/index.html). + To build the TensorRT OSS, obtain the corresponding TensorRT 7.0 binary release from [NVidia Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-7x-download). For a list of key features, known and fixed issues, refer to the [TensorRT 7.0 Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/tensorrt-7.html#tensorrt-7). - **Example: Ubuntu 18.04 with cuda-10.1** + **Example: Ubuntu 18.04 with cuda-10.2** - Download and extract the *TensorRT 6.0.1.5 GA for Ubuntu 18.04 and CUDA 10.1 tar package* + Download and extract the latest *TensorRT 7.0 GA package for Ubuntu 18.04 and CUDA 10.2* ```bash cd ~/Downloads - # Download TensorRT-6.0.1.5.Ubuntu-18.04.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz - tar -xvzf TensorRT-6.0.1.5.Ubuntu-18.04.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz - export TRT_RELEASE=`pwd`/TensorRT-6.0.1.5 + # Download TensorRT-7.0.0.11.Ubuntu-18.04.x86_64-gnu.cuda-10.2.cudnn7.6.tar.gz + tar -xvzf TensorRT-7.0.0.11.Ubuntu-18.04.x86_64-gnu.cuda-10.2.cudnn7.6.tar.gz + export TRT_RELEASE=`pwd`/TensorRT-7.0.0.11 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_RELEASE/lib ``` - **Example: CentOS/RedHat 7 with cuda-9.0** + **Example: CentOS/RedHat 7 with cuda-10.0** - Download and extract the *TensorRT 6.0.1.5 GA for CentOS/RedHat 7 and CUDA 9.0 tar package* + Download and extract the *TensorRT 7.0 GA for CentOS/RedHat 7 and CUDA 10.0 tar package* ```bash cd ~/Downloads - # Download TensorRT-6.0.1.5.Red-Hat.x86_64-gnu.cuda-9.0.cudnn7.6.tar.gz - tar -xvzf TensorRT-6.0.1.5.Red-Hat.x86_64-gnu.cuda-9.0.cudnn7.6.tar.gz - export TRT_RELEASE=`pwd`/TensorRT-6.0.1.5 + # Download TensorRT-7.0.0.11.CentOS-7.6.x86_64-gnu.cuda-10.0.cudnn7.6.tar.gz + tar -xvzf TensorRT-7.0.0.11.CentOS-7.6.x86_64-gnu.cuda-10.0.cudnn7.6.tar.gz + export TRT_RELEASE=`pwd`/TensorRT-7.0.0.11 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_RELEASE/lib ``` +3. #### Download JetPack packages for cross-compilation. [OPTIONAL] + +Using the SDK Manager, download the host components of the PDK version or JetPack specified in the name of the Dockerfile. To do this: + +1. [**SDK Manager Step 01**] Log into the SDK Manager +2. [**SDK Manager Step 01**] Select the correct platform and Target OS system (corresponding to the name of the Dockerfile you are building, e.g. Jetson AGX Xavier, `Linux Jetpack 4.2.1`), then click `Continue` +3. [**SDK Manager Step 02**] Under `Download & Install Options`, note or change the download folder and select **Download now. Install later.**, then agree to the license terms and click `Continue` + +You should now have all the files needed to build the container. Move them into the `docker/jetpack_files` folder. + ## Setting Up The Build Environment * Install the *System Packages* list of components in the *Prerequisites* section.
@@ -99,24 +112,30 @@ NOTE: Along with the TensorRT OSS components, the following source packages will 1. #### Generate the TensorRT build container. - The docker container can be built using the included Dockerfile. The build container is configured with the environment and packages required for building TensorRT OSS. + The docker container can be built using the included Dockerfile. The build container is configured with the environment and packages required for building TensorRT OSS. - **Example: Ubuntu 18.04 with cuda-10.1** + **Example: Ubuntu 18.04 with cuda-10.2** - ```bash - docker build -f docker/ubuntu-18.04.Dockerfile --build-arg CUDA_VERSION=10.1 --tag=tensorrt . - ``` + ```bash + docker build -f docker/ubuntu.Dockerfile --build-arg UBUNTU_VERSION=18.04 --build-arg CUDA_VERSION=10.2 --tag=tensorrt-ubuntu . + ``` - **Example: CentOS/RedHat 7 with cuda-9.0** + **Example: CentOS/RedHat 7 with cuda-10.0** - ```bash - docker build -f docker/centos-7.Dockerfile --build-arg CUDA_VERSION=9.0 --tag=tensorrt . - ``` + ```bash + docker build -f docker/centos.Dockerfile --build-arg CENTOS_VERSION=7 --build-arg CUDA_VERSION=10.0 --tag=tensorrt-centos . + ``` + + **Example: Cross compile for JetPack 4.2.1 with cuda-10.0** + ```bash + docker build -f docker/ubuntu-cross-aarch64.Dockerfile --build-arg UBUNTU_VERSION=18.04 --build-arg CUDA_VERSION=10.0 --tag tensorrt-ubuntu-aarch64 . + ` + ``` 2. #### Launch the TensorRT build container. ```bash - docker run -v $TRT_RELEASE:/tensorrt -v $TRT_SOURCE:/workspace/TensorRT -it tensorrt:latest + docker run -v $TRT_RELEASE:/tensorrt -v $TRT_SOURCE:/workspace/TensorRT -it tensorrt-ubuntu:latest ``` > NOTE: To run TensorRT/CUDA programs within the build container, install [nvidia-docker](#prerequisites). Replace the `docker run` command with `nvidia-docker run` or `docker run --runtime=nvidia`. @@ -128,17 +147,15 @@ NOTE: Along with the TensorRT OSS components, the following source packages will ```bash cd $TRT_SOURCE - mkdir -p build && cd build + mkdir -p build && cd build cmake .. -DTRT_LIB_DIR=$TRT_RELEASE/lib -DTRT_BIN_DIR=`pwd`/out make -j$(nproc) ``` > NOTE: - > 1. The default CUDA version used by CMake is 10.1. To override this, for example to 9.0, append `-DCUDA_VERSION=9.0` to the cmake command. + > 1. The default CUDA version used by CMake is 10.2. To override this, for example to 10.0, append `-DCUDA_VERSION=10.0` to the cmake command. > 2. Samples may fail to link on CentOS7. To work around this create the following symbolic link: - > ```bash - > ln -s $TRT_BIN_DIR/libnvinfer_plugin.so $TRT_BIN_DIR/libnvinfer_plugin.so.6 - > ``` + > `ln -s $TRT_BIN_DIR/libnvinfer_plugin.so $TRT_BIN_DIR/libnvinfer_plugin.so.7` The required CMake arguments are: @@ -150,9 +167,9 @@ NOTE: Along with the TensorRT OSS components, the following source packages will - `CMAKE_BUILD_TYPE`: Specify if binaries generated are for release or debug (contain debug symbols). Values consists of [`Release`] | `Debug` - - `CUDA_VERISON`: The version of CUDA to target, for example [`10.1`]. + - `CUDA_VERISON`: The version of CUDA to target, for example [`10.2`]. - - `CUDNN_VERSION`: The version of cuDNN to target, for example [`7.5`]. + - `CUDNN_VERSION`: The version of cuDNN to target, for example [`7.6`]. - `PROTOBUF_VERSION`: The version of Protobuf to use, for example [`3.8.x`]. Note: Changing this will not configure CMake to use a system version of Protobuf, it will configure CMake to download and try building that version. 
@@ -222,5 +239,5 @@ NOTE: Along with the TensorRT OSS components, the following source packages will ## Known Issues -#### TensorRT 6.0.1 -* See [Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/index.html). +#### TensorRT 7.0 +* See [Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/tensorrt-7.html#tensorrt-7). diff --git a/VERSION b/VERSION index 8cd21902..dbdef6f9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.0.1.5 +7.0.0.11 diff --git a/docker/centos-7.Dockerfile b/docker/centos.Dockerfile similarity index 68% rename from docker/centos-7.Dockerfile rename to docker/centos.Dockerfile index ea31ba0f..e1a4c545 100644 --- a/docker/centos-7.Dockerfile +++ b/docker/centos.Dockerfile @@ -12,11 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=10.1 -FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-cudnn7-devel-centos7 +ARG CUDA_VERSION=10.2 +ARG CENTOS_VERSION=7 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn7-devel-centos${CENTOS_VERSION} LABEL maintainer="NVIDIA CORPORATION" +ARG uid=1000 +ARG gid=1000 +RUN groupadd -r -f -g ${gid} trtuser && useradd -r -u ${uid} -g ${gid} -ms /bin/bash trtuser +RUN usermod -aG wheel trtuser +RUN echo 'trtuser:nvidia' | chpasswd +RUN mkdir -p /workspace && chown trtuser /workspace # Install requried libraries RUN yum -y install \ libcurl4-openssl-dev \ @@ -26,6 +33,10 @@ RUN yum -y install \ pkg-config \ python3 \ python3-pip \ + python3-dev \ + python3-setuptools \ + python3-wheel \ + sudo \ make RUN cd /usr/local/bin &&\ @@ -33,17 +44,16 @@ RUN cd /usr/local/bin &&\ ln -s /usr/bin/pip3 pip # Install Cmake -RUN cd /tmp &&\ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh &&\ - chmod +x cmake-3.14.4-Linux-x86_64.sh &&\ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license &&\ +RUN cd /tmp && \ + wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ + chmod +x cmake-3.14.4-Linux-x86_64.sh && \ + ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ rm ./cmake-3.14.4-Linux-x86_64.sh # Set environment and working directory ENV TRT_RELEASE /tensorrt -ENV TRT_LIB_DIR $TRT_RELEASE/lib ENV TRT_SOURCE /workspace/TensorRT -ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$TRT_LIB_DIR WORKDIR /workspace +USER trtuser RUN ["/bin/bash"] diff --git a/docker/jetpack_files/.gitkeep b/docker/jetpack_files/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/docker/ubuntu-18.04.Dockerfile b/docker/ubuntu-18.04.Dockerfile deleted file mode 100644 index ec37f196..00000000 --- a/docker/ubuntu-18.04.Dockerfile +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -ARG CUDA_VERSION=10.1 -FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-cudnn7-devel-ubuntu18.04 - -LABEL maintainer="NVIDIA CORPORATION" - -# Install requried libraries -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev \ - wget \ - zlib1g-dev \ - git \ - pkg-config \ - python3 \ - python3-pip - -RUN cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip - -# Install Cmake -RUN cd /tmp &&\ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh &&\ - chmod +x cmake-3.14.4-Linux-x86_64.sh &&\ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license &&\ - rm ./cmake-3.14.4-Linux-x86_64.sh - -# Set environment and working directory -ENV TRT_RELEASE /tensorrt -ENV TRT_LIB_DIR $TRT_RELEASE/lib -ENV TRT_SOURCE /workspace/TensorRT -ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$TRT_LIB_DIR -WORKDIR /workspace - -RUN ["/bin/bash"] diff --git a/docker/ubuntu-18.04-cross-aarch64.Dockerfile b/docker/ubuntu-cross-aarch64.Dockerfile similarity index 79% rename from docker/ubuntu-18.04-cross-aarch64.Dockerfile rename to docker/ubuntu-cross-aarch64.Dockerfile index 9b540247..0376a0d4 100644 --- a/docker/ubuntu-18.04-cross-aarch64.Dockerfile +++ b/docker/ubuntu-cross-aarch64.Dockerfile @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -#ARG CUDA_VERSION=10.1 -#FROM ubuntu:18.04 -FROM nvidia/cuda:10.0-devel-ubuntu18.04 +ARG CUDA_VERSION=10.0 +ARG UBUNTU_VERSION=18.04 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} LABEL maintainer="NVIDIA CORPORATION" +ARG uid=1000 +ARG gid=1000 +RUN groupadd -r -f -g ${gid} trtuser && useradd -r -u ${uid} -g ${gid} -ms /bin/bash trtuser +RUN usermod -aG sudo trtuser +RUN echo 'trtuser:nvidia' | chpasswd +RUN mkdir -p /workspace && chown trtuser /workspace # Install requried libraries +RUN apt-get update && apt-get install -y software-properties-common +RUN add-apt-repository ppa:ubuntu-toolchain-r/test RUN apt-get update && apt-get install -y --no-install-recommends \ libcurl4-openssl-dev \ wget \ @@ -26,20 +34,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ pkg-config \ python3 \ - python3-pip + python3-pip \ + python3-dev \ + python3-setuptools \ + python3-wheel \ + sudo \ + ssh \ + pbzip2 \ + pv \ + bzip2 \ + unzip RUN cd /usr/local/bin &&\ ln -s /usr/bin/python3 python &&\ ln -s /usr/bin/pip3 pip # Install Cmake -RUN cd /tmp &&\ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh &&\ - chmod +x cmake-3.14.4-Linux-x86_64.sh &&\ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license &&\ +RUN cd /tmp && \ + wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ + chmod +x cmake-3.14.4-Linux-x86_64.sh && \ + ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ rm ./cmake-3.14.4-Linux-x86_64.sh - + COPY docker/jetpack_files /pdk_files COPY scripts/stubify.sh /pdk_files @@ -57,7 +74,7 @@ RUN dpkg -x /pdk_files/libcudnn7_7.5.0.56-1+cuda10.0_arm64.deb /pdk_files/cudnn && ln -s libcudnn.so.7 libcudnn.so \ && cd /pdk_files/cudnn \ && ln -s usr/include/aarch64-linux-gnu include \ - && ln -s usr/lib/aarch64-linux-gnu lib \ + && ln -s usr/lib/aarch64-linux-gnu lib \ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/cudnn.h # Unpack libnvinfer @@ -69,9 +86,9 @@ RUN 
dpkg -x /pdk_files/libnvinfer6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorr && dpkg -x /pdk_files/libnvinfer-plugin6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ && dpkg -x /pdk_files/libnvinfer-plugin-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ && dpkg -x /pdk_files/libnvonnxparsers6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ - && dpkg -x /pdk_files/libnvonnxparsers-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt + && dpkg -x /pdk_files/libnvonnxparsers-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt -# create stub libraries +# create stub libraries RUN cd /pdk_files/tensorrt \ && ln -s usr/include/aarch64-linux-gnu include \ && ln -s usr/lib/aarch64-linux-gnu lib \ @@ -84,9 +101,8 @@ RUN cd /pdk_files/tensorrt \ # Set environment and working directory ENV TRT_RELEASE /pdk_files/tensorrt -ENV TRT_LIB_DIR $TRT_RELEASE/lib ENV TRT_SOURCE /workspace/TensorRT -ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$TRT_LIB_DIR WORKDIR /workspace +USER trtuser RUN ["/bin/bash"] diff --git a/docker/ubuntu-16.04.Dockerfile b/docker/ubuntu.Dockerfile similarity index 62% rename from docker/ubuntu-16.04.Dockerfile rename to docker/ubuntu.Dockerfile index bb3fcc7f..e29f0245 100644 --- a/docker/ubuntu-16.04.Dockerfile +++ b/docker/ubuntu.Dockerfile @@ -12,12 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=10.1 -FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-cudnn7-devel-ubuntu16.04 +ARG CUDA_VERSION=10.2 +ARG UBUNTU_VERSION=18.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn7-devel-ubuntu${UBUNTU_VERSION} LABEL maintainer="NVIDIA CORPORATION" +ARG uid=1000 +ARG gid=1000 +RUN groupadd -r -f -g ${gid} trtuser && useradd -r -u ${uid} -g ${gid} -ms /bin/bash trtuser +RUN usermod -aG sudo trtuser +RUN echo 'trtuser:nvidia' | chpasswd +RUN mkdir -p /workspace && chown trtuser /workspace # Install requried libraries +RUN apt-get update && apt-get install -y software-properties-common +RUN add-apt-repository ppa:ubuntu-toolchain-r/test RUN apt-get update && apt-get install -y --no-install-recommends \ libcurl4-openssl-dev \ wget \ @@ -25,24 +34,32 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ pkg-config \ python3 \ - python3-pip + python3-pip \ + python3-dev \ + python3-setuptools \ + python3-wheel \ + sudo \ + ssh \ + pbzip2 \ + pv \ + bzip2 \ + unzip RUN cd /usr/local/bin &&\ ln -s /usr/bin/python3 python &&\ ln -s /usr/bin/pip3 pip # Install Cmake -RUN cd /tmp &&\ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh &&\ - chmod +x cmake-3.14.4-Linux-x86_64.sh &&\ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license &&\ +RUN cd /tmp && \ + wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ + chmod +x cmake-3.14.4-Linux-x86_64.sh && \ + ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ rm ./cmake-3.14.4-Linux-x86_64.sh # Set environment and working directory ENV TRT_RELEASE /tensorrt -ENV TRT_LIB_DIR $TRT_RELEASE/lib ENV TRT_SOURCE /workspace/TensorRT -ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$TRT_LIB_DIR WORKDIR /workspace +USER trtuser RUN ["/bin/bash"] diff --git a/include/NvCaffeParser.h b/include/NvCaffeParser.h index eaecf1d6..2bbd8f63 100644 --- a/include/NvCaffeParser.h +++ b/include/NvCaffeParser.h @@ -97,6 +97,8 @@ class IPluginFactory //! \param nbWeights Number of weights. //! 
virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) TRTNOEXCEPT = 0; + + virtual ~IPluginFactory() {} }; //! @@ -144,6 +146,8 @@ class IPluginFactoryV2 //! \param libNamespace Library Namespace associated with the plugin object //! virtual nvinfer1::IPluginV2* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights, const char* libNamespace = "") TRTNOEXCEPT = 0; + + virtual ~IPluginFactoryV2() {} }; //! //! \class ICaffeParser @@ -259,14 +263,14 @@ class ICaffeParser //! \brief Set the ErrorRecorder for this interface //! //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. - //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if //! a recorder has been registered. - //! + //! //! \param recorder The error recorder to register with this interface. - // + //! //! \see getErrorRecorder - //! + //! virtual void setErrorRecorder(nvinfer1::IErrorRecorder* recorder) TRTNOEXCEPT = 0; //! @@ -297,7 +301,11 @@ TENSORRTAPI ICaffeParser* createCaffeParser() TRTNOEXCEPT; //! \note No part of the protocol buffers library can be used after this function is called. //! TENSORRTAPI void shutdownProtobufLibrary() TRTNOEXCEPT; -} +} // namespace nvcaffeparser1 +//! +//! Internal C entry point for creating ICaffeParser. +//! @private +//! extern "C" TENSORRTAPI void* createNvCaffeParser_INTERNAL(); #endif diff --git a/include/NvInfer.h b/include/NvInfer.h index 808d8dbd..6363bc60 100644 --- a/include/NvInfer.h +++ b/include/NvInfer.h @@ -403,13 +403,19 @@ enum class LayerType : int kSLICE = 24, //!< Slice layer. kSHAPE = 25, //!< Shape layer. kPARAMETRIC_RELU = 26, //!< Parametric ReLU layer. - kRESIZE = 27 //!< Resize Layer. + kRESIZE = 27, //!< Resize Layer. + kTRIP_LIMIT = 28, //!< Loop Trip limit layer + kRECURRENCE = 29, //!< Loop Recurrence layer + kITERATOR = 30, //!< Loop Iterator layer + kLOOP_OUTPUT = 31, //!< Loop output layer + kSELECT = 32, //!< Select layer. + kFILL = 33 //!< Fill layer }; template <> constexpr inline int EnumMax() { - return 28; + return 34; } //!< Maximum number of elements in LayerType enum. \see LayerType //! @@ -511,7 +517,7 @@ class ITensor //! //! \return maximal absolute value of the dynamic range, -1.0f if no dynamic range is set. //! - //! \deprecated This interface is superceded by getDynamicRangeMin and getDynamicRangeMax. + //! \deprecated This interface is superseded by getDynamicRangeMin and getDynamicRangeMax. //! TRT_DEPRECATED virtual float getDynamicRange() const TRTNOEXCEPT = 0; @@ -700,7 +706,7 @@ class ILayer //! \param index The index of the input tensor. //! //! \return The input tensor, or nullptr if the index is out of range or the tensor is optional - //! (\ref IRNNLayer and \ref IRNNv2Layer). + //! (\ref ISliceLayer, \ref IRNNLayer and \ref IRNNv2Layer). //! virtual ITensor* getInput(int index) const TRTNOEXCEPT = 0; @@ -718,12 +724,15 @@ class ILayer virtual ITensor* getOutput(int index) const TRTNOEXCEPT = 0; //! - //! \brief replace an input of this layer with a specific tensor + //! \brief Replace an input of this layer with a specific tensor //! - //! Except of IShuffleLayer and ISliceLayer, this method cannot change the number of inputs to a layer. + //! 
\param index the index of the input to modify. + //! \param tensor the new input tensor + // + //! Except for IShuffleLayer, ISliceLayer, IResizeLayer and ILoopOutputLayer, this method cannot change the number of inputs to a layer. //! The index argument must be less than the value of getNbInputs(). //! - //! See comments for IShuffleLayer::setInput() and ISliceLayer::setInput() for their special behavior. + //! See overloaded setInput() comments for the layers special behavior. //! //! \param index the index of the input to modify. //! \param tensor the new input tensor @@ -779,8 +788,9 @@ class ILayer //! given type. If it is not set, TensorRT will select output type based on layer computational precision. TensorRT //! could still choose non-conforming output type based on fastest implementation. Use BuilderFlag::kSTRICT_TYPES to //! force choose requested output type. In case layer precision is not specified, output type would depend on - //! choosen implementation based on performance considerations and the flags specified to the builder. Note that - //! this method cannot be used to set the data type of the second output tensor of the topK layer. The data type of + //! chosen implementation based on performance considerations and the flags specified to the builder. + //! + //! This method cannot be used to set the data type of the second output tensor of the TopK layer. The data type of //! the second output tensor of the topK layer is always Int32. Also the output type of all layers that are shape //! operations must be DataType::kINT32, and all attempts to set the output type to some other data type will be //! ignored except for issuing an error message. @@ -838,19 +848,230 @@ class ILayer //! \brief Enumerates the modes of padding to perform in convolution, deconvolution and pooling layer, //! padding mode takes precedence if setPaddingMode() and setPrePadding() are also used. //! -//! kEXPLICIT* padding is to use explicit padding. -//! kSAME* padding is to implicitly calculate padding to keep output dim to be the "same" with input dim. For -//! convolution and pooling, output dim is ceil(input dim, stride), for deconvolution it is inverse, then use -//! the output dim to calculate padding size. kCAFFE* padding is symmetric padding. +//! There are three padding styles, EXPLICIT, SAME, and CAFFE, with each style having two variants. +//! The EXPLICIT and CAFFE styles determine if the final sampling location is used or not. +//! The SAME style determine if the asymmetry in the padding is on the pre or post padding. +//! +//! \code +//! Shorthand: +//! I = dimensions of input image. +//! B = prePadding, before the image data. For deconvolution, prePadding is set before output. +//! A = postPadding, after the image data. For deconvolution, postPadding is set after output. +//! P = delta between input and output +//! S = stride +//! F = filter +//! O = output +//! D = dilation +//! M = I + B + A ; The image data plus any padding +//! E = F - S ; The discarded remainder on the right border +//! K = 1 + D * (E - 1) +//! \endcode +//! +//! Formulas for Convolution: +//! - EXPLICIT_ROUND_DOWN: +//! \code +//! O = floor((M - K) / S) +//! \endcode +//! - CAFFE_ROUND_DOWN: +//! \code +//! O = floor((I + B * 2 - K) / S) +//! \endcode +//! - EXPLICIT_ROUND_UP: +//! \code +//! O = ceil((M - K) / S) +//! \endcode +//! - CAFFE_ROUND_UP: +//! \code +//! O = ceil((I + B * 2 - K) / S) +//! \endcode +//! - SAME_UPPER: +//! \code +//! P = (I - ceil(I / S)) +//! B = floor(P / 2) +//! A = P - B +//! 
\endcode +//! - SAME_LOWER: +//! \code +//! P = (I - ceil(I / S)) +//! A = floor(P / 2) +//! B = P - A +//! \endcode +//! +//! Formulas for Deconvolution: +//! - EXPLICIT_ROUND_DOWN: +//! - CAFFE_ROUND_DOWN: +//! - EXPLICIT_ROUND_UP: +//! - CAFFE_ROUND_UP: +//! \code +//! O = (I - 1) * S + K - (B + A) +//! \endcode +//! - SAME_UPPER: +//! \code +//! O = min(I * S, (I - 1) * S + K) +//! P = max(K - S, 0) +//! B = floor(P / 2) +//! A = P - B +//! \endcode +//! - SAME_LOWER: +//! \code +//! O = min(I * S, (I - 1) * S + K) +//! P = max(K - S, 0) +//! A = floor(P / 2) +//! B = P - A +//! \endcode +//! +//! Formulas for Pooling: +//! - EXPLICIT_ROUND_DOWN: +//! \code +//! O = floor((M - E) / S) +//! \endcode +//! - EXPLICIT_ROUND_UP: +//! \code +//! O = ceil((M - E) / S) +//! \endcode +//! - SAME_UPPER: +//! \code +//! O = ceil(I / S) +//! P = (I - ceil(I / S)) +//! B = floor(P / 2) +//! A = P - B +//! \endcode +//! - SAME_LOWER: +//! \code +//! O = ceil(I / S) +//! P = (I - ceil(I / S)) +//! A = floor(P / 2) +//! B = P - A +//! \endcode +//! - CAFFE_ROUND_DOWN: +//! \code +//! EXPLICIT_ROUND_DOWN - ((EXPLICIT_ROUND_DOWN - 1) * S >= I + B) +//! \endcode +//! - CAFFE_ROUND_UP: +//! \code +//! EXPLICIT_ROUND_UP - ((EXPLICIT_ROUND_UP - 1) * S >= I + B) +//! \endcode +//! +//! Pooling Example 1: +//! \code +//! Given I = {6, 6}, B = {3, 3}, A = {2, 2}, S = {2, 2}, F = {3, 3}. What is O? +//! \endcode +//! +//! - EXPLICIT_ROUND_DOWN: +//! \code +//! Computation: +//! M = {6, 6} + {3, 3} + {2, 2} ==> {11, 11} +//! E = {3, 3} - {2, 2} ==> {1, 1} +//! O ==> floor(({11, 11} - {1, 1}) / {2, 2}) +//! ==> floor({10, 10} / {2, 2}) +//! ==> floor({5, 5}) +//! ==> {5, 5} +//! \endcode +//! - EXPLICIT_ROUND_UP: +//! \code +//! Computation: +//! M = {6, 6} + {3, 3} + {2, 2} ==> {11, 11} +//! E = {3, 3} - {2, 2} ==> {1, 1} +//! O ==> ceil(({11, 11} - {1, 1}) / {2, 2}) +//! ==> ceil({10, 10} / {2, 2}) +//! ==> ceil({5, 5}) +//! ==> {5, 5} +//! \endcode +//! The sample points are {0, 2, 4, 6, 8} in each dimension. +//! +//! - SAME_UPPER: +//! \code +//! Computation: +//! I = {6, 6} +//! S = {2, 2} +//! O = {3, 3} +//! P = ({6, 6} - ceil({6, 6} / {2, 2})) +//! ==> ({6, 6} - {3, 3}) +//! ==> {3, 3} +//! B = floor({3, 3} / {2, 2}) +//! ==> {1, 1} +//! A = {3, 3} - {1, 1} +//! ==> {2, 2} +//! \endcode +//! - SAME_LOWER: +//! \code +//! Computation: +//! I = {6, 6} +//! S = {2, 2} +//! O = {6, 6} +//! P = ({6, 6} - ceil({6, 6} / {2, 2})) +//! ==> ({6, 6} - {3, 3}) +//! ==> {3, 3} +//! B = floor({3, 3} / {2, 2}) +//! ==> {1, 1} +//! A = {3, 3} - {1, 1} +//! ==> {2, 2} +//! \endcode +//! The sample pointers are {0, 2, 4} in each dimension. +//! SAMPLE_UPPER has {pad, O0, O1, O2, pad, pad} in output in each dimension. +//! SAMPLE_LOWER has {pad, pad, O0, O1, O2, pad} in output in each dimension. +//! +//! Pooling Example 2: +//! \code +//! Given I = {6, 6}, B = {3, 3}, A = {3, 3}, S = {2, 2}, F = {3, 3}. What is O? +//! \endcode +//! +//! - CAFFE_ROUND_DOWN: +//! \code +//! Computation: +//! M = {6, 6} + {3, 3} + {3, 3} ==> {12, 12} +//! E = {3, 3} - {2, 2} ==> {1, 1} +//! EXPLICIT_ROUND_DOWN = floor(M - E, S) ==> {5, 5} +//! +//! DIFF = (((EXPLICIT_ROUND_DOWN - 1) * S >= I + B) ? {1, 1} : {0, 0}) +//! ==> ({5, 5} - {1, 1}) * {2, 2} >= {6, 6} + {3, 3} ? {1, 1} : {0,0} +//! ==> {0, 0} +//! O ==> EXPLICIT_ROUND_DOWN - DIFF +//! ==> {5, 5} - {0, 0} +//! ==> {5, 5} +//! \endcode +//! - CAFFE_ROUND_UP: +//! \code +//! Computation: +//! M = {6, 6} + {3, 3} + {3, 3} ==> {12, 12} +//! E = {3, 3} - {2, 2} ==> {1, 1} +//! 
EXPLICIT_ROUND_UP = CEIL(M - E, S) ==> {6, 6} +//! +//! DIFF = (((EXPLICIT_ROUND_UP - 1) * S >= I + B) ? {1, 1} : {0, 0}) +//! ==> ({6, 6} - {1, 1}) * {2, 2} >= {6, 6} + {3, 3} ? {1, 1} : {0,0} +//! ==> {1, 1} +//! O ==> EXPLICIT_ROUND_UP - DIFF +//! ==> {6, 6} - {1, 1} +//! ==> {5, 5} +//! \endcode +//! +//! The sample points are {0, 2, 4, 6, 8} in each dimension.
+//! CAFFE_ROUND_DOWN and CAFFE_ROUND_UP each have two restrictions on their use with pooling operations. +//! Violating either restriction causes getDimensions to return an empty dimension and the network +//! to be rejected at validation time.
+//! For more information on original reference code, see +//! https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp +//! +//! - Restriction 1: +//! \code +//! CAFFE_ROUND_DOWN: B >= F is an error if (B - S) < F +//! CAFFE_ROUND_UP: (B + S) >= (F + 1) is an error if B < (F + 1) +//! \endcode +//! +//! - Restriction 2: +//! \code +//! CAFFE_ROUND_DOWN: (B - S) >= F is an error if B >= F +//! CAFFE_ROUND_UP: B >= (F + 1) is an error if (B + S) >= (F + 1) +//! \endcode //! enum class PaddingMode : int { kEXPLICIT_ROUND_DOWN = 0, //!< Use explicit padding, rounding output size down. kEXPLICIT_ROUND_UP = 1, //!< Use explicit padding, rounding output size up. - kSAME_UPPER = 2, //!< Use SAME padding with prePadding <= postPadding. + kSAME_UPPER = 2, //!< Use SAME padding, with prePadding <= postPadding. kSAME_LOWER = 3, //!< Use SAME padding, with prePadding >= postPadding. - kCAFFE_ROUND_DOWN = 4, //!< Use CAFFE padding, rounding output size down. - kCAFFE_ROUND_UP = 5 //!< Use CAFFE padding, rounding output size up. + kCAFFE_ROUND_DOWN = 4, //!< Use CAFFE padding, rounding output size down, uses prePadding value. + kCAFFE_ROUND_UP = 5 //!< Use CAFFE padding, rounding output size up, uses prePadding value. }; template <> @@ -881,14 +1102,18 @@ class IConvolutionLayer : public ILayer //! //! \see getKernelSize() //! - virtual void setKernelSize(DimsHW kernelSize) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setKernelSizeNd. + //! + TRT_DEPRECATED virtual void setKernelSize(DimsHW kernelSize) TRTNOEXCEPT = 0; //! //! \brief Get the HW kernel size of the convolution. //! //! \see setKernelSize() //! - virtual DimsHW getKernelSize() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getKernelSizeNd. + //! + TRT_DEPRECATED virtual DimsHW getKernelSize() const TRTNOEXCEPT = 0; //! //! \brief Set the number of output maps for the convolution. @@ -915,12 +1140,16 @@ class IConvolutionLayer : public ILayer //! //! \see getStride() //! - virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setStrideNd. + //! + TRT_DEPRECATED virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; //! //! \brief Get the stride of the convolution. //! - virtual DimsHW getStride() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getStrideNd. + //! + TRT_DEPRECATED virtual DimsHW getStride() const TRTNOEXCEPT = 0; //! //! \brief Set the padding of the convolution. @@ -934,14 +1163,18 @@ class IConvolutionLayer : public ILayer //! //! \see getPadding() //! - virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setPaddingNd + //! + TRT_DEPRECATED virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding of the convolution. If the padding is asymmetric, the pre-padding is returned. //! //! \see setPadding() //! - virtual DimsHW getPadding() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getPaddingNd + //! + TRT_DEPRECATED virtual DimsHW getPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the number of groups for a convolution. @@ -1009,14 +1242,18 @@ class IConvolutionLayer : public ILayer //! //! \see getDilation() //! - virtual void setDilation(DimsHW dilation) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setDilationNd + //! + TRT_DEPRECATED virtual void setDilation(DimsHW dilation) TRTNOEXCEPT = 0; //! //! \brief Get the dilation for a convolution. //! //! \see setDilation() //! - virtual DimsHW getDilation() const TRTNOEXCEPT = 0; + //! 
\deprecated Superseded by getDilationNd + //! + TRT_DEPRECATED virtual DimsHW getDilation() const TRTNOEXCEPT = 0; protected: virtual ~IConvolutionLayer() {} @@ -1087,7 +1324,7 @@ class IConvolutionLayer : public ILayer //! //! If executing this layer on DLA, only support 2D kernel size, both height and width of kernel size must be in the range [1,16]. //! - //! \see getKernelSizeNd() setKernelSize() getKernelSize() + //! \see getKernelSizeNd() //! virtual void setKernelSizeNd(Dims kernelSize) TRTNOEXCEPT = 0; @@ -1154,6 +1391,24 @@ class IConvolutionLayer : public ILayer //! \see setDilation() //! virtual Dims getDilationNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + //! + //! For a convolution layer, the values 0-2 are valid. The value 1 override kernel weights, and the value + //! 2 override bias weights. Conversely, this input tensor can be overridden via appropriate set call. + //! The indices are as follows: + //! + //! Index | Description + //! 0 | The input activation tensor. + //! 1 | The kernel weights tensor. + //! 2 | The bias weights tensor. + //! + //! If this function is called with a value greater than 0, then the function getNbInputs() changes + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; }; //! \class IFullyConnectedLayer @@ -1236,6 +1491,25 @@ class IFullyConnectedLayer : public ILayer protected: virtual ~IFullyConnectedLayer() {} + +public: + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + //! + //! For a fulyconnected layer, the values 0-2 are valid. The value 1 override kernel weights, and the value + //! 2 override bias weights. Conversely, this input tensor can be overridden via appropriate set call. + //! The indices are as follows: + //! + //! Index | Description + //! 0 | The input activation tensor. + //! 1 | The kernel weights tensor. + //! 2 | The bias weights tensor. + //! + //! If this function is called with a value greater than 0, then the function getNbInputs() changes + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; }; //! @@ -1357,14 +1631,18 @@ class IPoolingLayer : public ILayer //! //! \see getWindowSize() //! - virtual void setWindowSize(DimsHW windowSize) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setWindowSizeNd. + //! + TRT_DEPRECATED virtual void setWindowSize(DimsHW windowSize) TRTNOEXCEPT = 0; //! //! \brief Get the window size for pooling. //! //! \see setWindowSize() //! - virtual DimsHW getWindowSize() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getWindowSizeNd. + //! + TRT_DEPRECATED virtual DimsHW getWindowSize() const TRTNOEXCEPT = 0; //! //! \brief Set the stride for pooling. @@ -1375,14 +1653,18 @@ class IPoolingLayer : public ILayer //! //! \see getStride() //! - virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setStrideNd + //! + TRT_DEPRECATED virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; //! //! \brief Get the stride for pooling. //! //! \see setStride() //! - virtual DimsHW getStride() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getStrideNd + //! + TRT_DEPRECATED virtual DimsHW getStride() const TRTNOEXCEPT = 0; //! //! \brief Set the padding for pooling. 
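As an illustration of the migration implied by these deprecations, the following is a minimal sketch only (not part of the patch): the function name `configureConv`, the layer pointer, and the dimension values are hypothetical, and it simply swaps the deprecated `DimsHW` accessors for their `Nd` counterparts declared in these hunks.

```cpp
#include "NvInfer.h"

// Sketch: prefer the Nd setters over the now-deprecated DimsHW overloads.
// `conv` is a hypothetical IConvolutionLayer* obtained elsewhere, e.g. from addConvolutionNd().
void configureConv(nvinfer1::IConvolutionLayer* conv)
{
    conv->setKernelSizeNd(nvinfer1::Dims2{3, 3}); // was setKernelSize(DimsHW{3, 3})
    conv->setStrideNd(nvinfer1::Dims2{2, 2});     // was setStride(DimsHW{2, 2})
    conv->setPaddingNd(nvinfer1::Dims2{1, 1});    // was setPadding(DimsHW{1, 1})
    conv->setDilationNd(nvinfer1::Dims2{1, 1});   // was setDilation(DimsHW{1, 1})
}
```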
@@ -1393,7 +1675,9 @@ class IPoolingLayer : public ILayer //! //! \see getPadding() //! - virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setPaddingNd + //! + TRT_DEPRECATED virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding for pooling. @@ -1402,7 +1686,9 @@ class IPoolingLayer : public ILayer //! //! \see setPadding() //! - virtual DimsHW getPadding() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getPaddingNd + //! + TRT_DEPRECATED virtual DimsHW getPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the blending factor for the max_average_blend mode: @@ -1690,7 +1976,7 @@ class IScaleLayer : public ILayer virtual void setMode(ScaleMode mode) TRTNOEXCEPT = 0; //! - //! \brief Set the scale mode. + //! \brief Get the scale mode. //! //! \see setMode() //! @@ -1866,14 +2152,18 @@ class IDeconvolutionLayer : public ILayer //! //! \see getKernelSize() //! - virtual void setKernelSize(DimsHW kernelSize) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setKernelSizeNd + //! + TRT_DEPRECATED virtual void setKernelSize(DimsHW kernelSize) TRTNOEXCEPT = 0; //! //! \brief Get the HW kernel size of the deconvolution. //! //! \see setKernelSize() //! - virtual DimsHW getKernelSize() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getKernelSizeNd + //! + TRT_DEPRECATED virtual DimsHW getKernelSize() const TRTNOEXCEPT = 0; //! //! \brief Set the number of output feature maps for the deconvolution. @@ -1898,14 +2188,18 @@ class IDeconvolutionLayer : public ILayer //! //! \see setStride() //! - virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setStrideNd + //! + TRT_DEPRECATED virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; //! //! \brief Get the stride of the deconvolution. //! //! Default: (1,1) //! - virtual DimsHW getStride() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getStrideNd + //! + TRT_DEPRECATED virtual DimsHW getStride() const TRTNOEXCEPT = 0; //! //! \brief Set the padding of the deconvolution. @@ -1920,14 +2214,20 @@ class IDeconvolutionLayer : public ILayer //! //! \see getPadding() //! - virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setPaddingNd + //! + TRT_DEPRECATED virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding of the deconvolution. //! + //! Default: (0, 0) + //! //! \see setPadding() //! - virtual DimsHW getPadding() const TRTNOEXCEPT = 0; // padding defaults to 0 + //! \deprecated Superseded by getPaddingNd + //! + TRT_DEPRECATED virtual DimsHW getPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the number of groups for a deconvolution. @@ -2040,6 +2340,7 @@ class IDeconvolutionLayer : public ILayer //! Default: kEXPLICIT_ROUND_DOWN //! //! \see getPaddingMode() + //! virtual void setPaddingMode(PaddingMode paddingMode) TRTNOEXCEPT = 0; //! @@ -2048,6 +2349,7 @@ class IDeconvolutionLayer : public ILayer //! Default: kEXPLICIT_ROUND_DOWN //! //! \see setPaddingMode() + //! virtual PaddingMode getPaddingMode() const TRTNOEXCEPT = 0; //! @@ -2106,6 +2408,25 @@ class IDeconvolutionLayer : public ILayer //! \see setPaddingNd() //! virtual Dims getPaddingNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + //! + //! For a deconvolution layer, the values 0-2 are valid. 
The value 1 override kernel weights, and the value + //! 2 override bias weights. Conversely, this input tensor can be overridden via appropriate set call. + //! The indices are as follows: + //! + //! Index | Description + //! 0 | The input activation tensor. + //! 1 | The kernel weights tensor. + //! 2 | The bias weights tensor. + //! + //! If this function is called with a value greater than 0, then the function getNbInputs() changes + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; }; //! @@ -2124,13 +2445,19 @@ enum class ElementWiseOperation : int kSUB = 4, //!< Substract the second element from the first. kDIV = 5, //!< Divide the first element by the second. kPOW = 6, //!< The first element to the power of the second element. - kFLOOR_DIV = 7 //!< Floor division of the first element by the second. + kFLOOR_DIV = 7,//!< Floor division of the first element by the second. + kAND = 8, //!< Logical AND of two elements. + kOR = 9, //!< Logical OR of two elements. + kXOR = 10, //!< Logical XOR of two elements. + kEQUAL = 11, //!< Check if two elements are equal. + kGREATER = 12, //!< Check if element in first tensor is greater than corresponding element in second tensor. + kLESS = 13 //!< Check if element in first tensor is less than corresponding element in second tensor. }; template <> constexpr inline int EnumMax() { - return 8; + return 14; } //!< Maximum number of elements in ElementWiseOperation enum. \see ElementWiseOperation //! @@ -2156,7 +2483,7 @@ class IElementWiseLayer : public ILayer //! //! \see getBiasWeights() //! - virtual void setOperation(ElementWiseOperation type) TRTNOEXCEPT = 0; + virtual void setOperation(ElementWiseOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the binary operation for the layer. @@ -2355,7 +2682,9 @@ constexpr inline int EnumMax() //! //! \brief A RNN layer in a network definition. //! -//! This layer applies an RNN operation on the inputs. +//! This layer applies an RNN operation on the inputs. This layer only works with networks that +//! that have an implicit batch dimension. For dynamic shapes and explicit batch dimension networks, +//! use IRNNv2Layer. //! //! \deprecated This interface is superseded by IRNNv2Layer. //! @@ -2889,7 +3218,9 @@ class IRNNv2Layer : public ILayer //! //! \brief Application-implemented interface to compute layer output sizes. //! -class IOutputDimensionsFormula +//! \deprecated IOutputDimensionsFormula has been superseded by PaddingMode. +//! +class TRT_DEPRECATED IOutputDimensionsFormula { public: //! @@ -2987,13 +3318,15 @@ enum class UnaryOperation : int kACOSH = 15, //!< Inverse hyperbolic cosine. kATANH = 16, //!< Inverse hyperbolic tangent. kCEIL = 17, //!< Ceiling. - kFLOOR = 18 //!< Floor. + kFLOOR = 18, //!< Floor. + kERF = 19, //!< Gauss error function. + kNOT = 20 //!< Logical NOT. }; template <> constexpr inline int EnumMax() { - return 19; + return 21; } //!< Maximum number of elements in UnaryOperation enum. \see UnaryOperation //! @@ -3047,7 +3380,7 @@ constexpr inline int EnumMax() //! //! \class IReduceLayer //! -//! \brief Layer that represents a reduction operator. +//! \brief Layer that represents a reduction operator across Shape, Int32, Float, and Half tensors. //! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! @@ -3120,14 +3453,18 @@ class IPaddingLayer : public ILayer //! //! \see getPrePadding //! - virtual void setPrePadding(DimsHW padding) TRTNOEXCEPT = 0; + //! 
\deprecated Superseded by setPrePaddingNd + //! + TRT_DEPRECATED virtual void setPrePadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding that is applied at the start of the tensor. //! //! \see setPrePadding //! - virtual DimsHW getPrePadding() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getPrePaddingNd + //! + TRT_DEPRECATED virtual DimsHW getPrePadding() const TRTNOEXCEPT = 0; //! //! \brief Set the padding that is applied at the end of the tensor. @@ -3136,17 +3473,62 @@ class IPaddingLayer : public ILayer //! //! \see getPostPadding //! - virtual void setPostPadding(DimsHW padding) TRTNOEXCEPT = 0; + //! \deprecated Superseded by setPostPaddingNd + //! + TRT_DEPRECATED virtual void setPostPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding that is applied at the end of the tensor. //! //! \see setPostPadding //! - virtual DimsHW getPostPadding() const TRTNOEXCEPT = 0; + //! \deprecated Superseded by getPostPaddingNd + //! + TRT_DEPRECATED virtual DimsHW getPostPadding() const TRTNOEXCEPT = 0; protected: virtual ~IPaddingLayer() {} + +public: + //! + //! \brief Set the padding that is applied at the start of the tensor. + //! + //! Negative padding results in trimming the edge by the specified amount. + //! + //! \warning Only 2 dimensionsional padding is currently supported. + //! + //! \see getPrePaddingNd + //! + virtual void setPrePaddingNd(Dims padding) TRTNOEXCEPT = 0; + + //! + //! \brief Get the padding that is applied at the start of the tensor. + //! + //! \warning Only 2 dimensionsional padding is currently supported. + //! + //! \see setPrePaddingNd + //! + virtual Dims getPrePaddingNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the padding that is applied at the end of the tensor. + //! + //! Negative padding results in trimming the edge by the specified amount + //! + //! \warning Only 2 dimensionsional padding is currently supported. + //! + //! \see getPostPaddingNd + //! + virtual void setPostPaddingNd(Dims padding) TRTNOEXCEPT = 0; + + //! + //! \brief Get the padding that is applied at the end of the tensor. + //! + //! \warning Only 2 dimensionsional padding is currently supported. + //! + //! \see setPostPaddingNd + //! + virtual Dims getPostPaddingNd() const TRTNOEXCEPT = 0; }; struct Permutation @@ -3213,9 +3595,7 @@ class IShuffleLayer : public ILayer //! //! The product of the new dimensions must be equal to the product of the old. //! - //! If there is a second input, i.e. reshape dimensions are dynamic, - //! calling setReshapeDimensions() is an error and does not update - //! the dimensions. + //! If the second input is set, it is reset to null. //! virtual void setReshapeDimensions(Dims dimensions) TRTNOEXCEPT = 0; @@ -3224,18 +3604,30 @@ class IShuffleLayer : public ILayer //! //! \return The reshaped dimensions. //! - //! If there is a second input, returns Dims with nbDims == -1. + //! If a second input is present and non-null, or setReshapeDimensions has + //! not yet been called, this function returns Dims with nbDims == -1. //! virtual Dims getReshapeDimensions() const TRTNOEXCEPT = 0; //! - //! \brief Relaxes ILayer::setInput to allow appending a second input. + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + // + //! Sets the input tensor for the given index. The index must be 0 for a static shuffle layer. + //! 
A static shuffle layer is converted to a dynamic shuffle layer by calling setInput with an index 1. + //! A dynamic shuffle layer cannot be converted back to a static shuffle layer. + //! + //! For a dynamic shuffle layer, the values 0 and 1 are valid. + //! The indices in the dynamic case are as follows: //! - //! Like ILayer::setInput, but additionally works if index==1, nbInputs()==1, and - //! there is no implicit batch dimension, in which case nbInputs() changes to 2. + //! Index | Description + //! 0 | Data or Shape tensor to be shuffled. + //! 1 | The dimensions for the reshape operation, as a 1D shape tensor. //! - //! When there is a 2nd input, the reshapeDimensions are taken from it, overriding - //! the dimensions supplied by setReshapeDimensions. + //! If this function is called with a value 1, then the function getNbInputs() changes + //! from returning 1 to 2. //! void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; @@ -3266,27 +3658,48 @@ class IShuffleLayer : public ILayer virtual ~IShuffleLayer() {} }; +//! +//! \brief Controls how ISliceLayer handles out of bounds coordinates. +//! +//! \see ISliceLayer +//! +enum class SliceMode : int +{ + kDEFAULT = 0, //!< Fail with error when the coordinates are out of bounds. This is the default. + kWRAP = 1, //!< Coordinates wrap around periodically. +}; + +template <> +constexpr inline int EnumMax() +{ + return 2; +} //!< Maximum number of elements in SliceMode enum. \see SliceMode + //! //! \brief Slices an input tensor into an output tensor based on the offset and strides. //! //! The slice layer has two variants, static and dynamic. Static slice specifies the start, size, and stride -//! dimensions at layer create time via Dims and can use the get/set accessor functions of the ISliceLayer. Dynamic -//! slice specifies the start and size dimensions at layer create time via ITensors and uses ILayer::setTensor to -//! set the optional stride parameter after layer construction. -//! An application can determine if the ISliceLayer is dynamic or static based on if there are 3 or 4 inputs(Dynamic) -//! or 1 input(Static). When working on a shape tensor, a dynamic slace layer must have start, size, and stride -//! specified at build time. -//! -//! The slice layer selects for each dimension a start location from within the input tensor, and given the -//! specified stride, copies strided elements to the output tensor. Start, Size, and Stride shape tensors must be -//! DataType::kINT32. -//! -//! For example using slice on a data tensor: -//! input = {{0, 1}, {2, 3}, {4, 5}} +//! dimensions at layer creation time via Dims and can use the get/set accessor functions of the ISliceLayer. +//! Dynamic slice specifies one or more of start, size or stride as ITensors, by using ILayer::setTensor to add +//! a second, third, or fourth input respectively. The corresponding Dims are used if an input +//! is missing or null. +//! +//! An application can determine if the ISliceLayer has a dynamic output shape based on whether +//! the size input (third input) is present and non-null. +//! +//! The slice layer selects for each dimension a start location from within the input tensor, and +//! copies elements to the output tensor using the specified stride across the input tensor. +//! Start, size, and stride tensors must be 1D shape tensors if not specified via Dims. +//! +//! Furthermore, if the slice layer must produce a shape tensor, then start, size, and stride must be +//! build time constants, i.e. 
as static Dims, or be computable by constant folding. +//! +//! For example using slice on a tensor: +//! input = {{0, 2, 4}, {1, 3, 5}} //! start = {1, 0} //! size = {1, 2} //! stride = {1, 2} -//! output = {1, 5} +//! output = {{1, 5}} //! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! @@ -3298,8 +3711,7 @@ class ISliceLayer : public ILayer //! //! \param start The start offset to read data from the input tensor. //! - //! If the SliceLayer is using dynamic inputs for the start parameter, calling setStart() results in an error - //! and does not update the dimensions. + //! If the second input is set, it is reset to null. //! //! \see getStart //! @@ -3310,8 +3722,8 @@ class ISliceLayer : public ILayer //! //! \return The start offset, or an invalid Dims structure. //! - //! If the SliceLayer is using dynamic inputs for the start parameter, this function returns an invalid - //! Dims structure. + //! If the second input is present and non-null, + //! this function returns a Dims with nbDims = -1. //! //! \see setStart //! @@ -3322,8 +3734,7 @@ class ISliceLayer : public ILayer //! //! \param size The dimensions of the output slice. //! - //! If the SliceLayer is using dynamic inputs for the size parameter, calling setSize() results in an error - //! and does not update the dimensions. + //! If the third input is set, it is reset to null. //! //! \see getSize //! @@ -3334,8 +3745,8 @@ class ISliceLayer : public ILayer //! //! \return The output dimension, or an invalid Dims structure. //! - //! If the SliceLayer is using dynamic inputs for the size parameter, this function returns an invalid - //! Dims structure. + //! If the third input is present and non-null, + //! this function returns a Dims with nbDims = -1. //! //! \see setSize //! @@ -3346,8 +3757,7 @@ class ISliceLayer : public ILayer //! //! \param stride The dimensions of the stride to compute the values to store in the output slice. //! - //! If the SliceLayer is using dynamic inputs for the stride parameter, calling setSlice() results in an error - //! and does not update the dimensions. + //! If the fourth input is set, it is reset to null. //! //! \see getStride //! @@ -3358,39 +3768,45 @@ class ISliceLayer : public ILayer //! //! \return The slicing stride, or an invalid Dims structure. //! - //! If the SliceLayer is using dynamic inputs for the stride parameter, this function returns a invalid - //! Dims structure. + //! If the fourth input is present and non-null, + //! this function returns a Dims with nbDims = -1. //! //! \see setStride //! virtual Dims getStride() const TRTNOEXCEPT = 0; //! - //! \brief replace an input of this layer with a specific tensor. + //! \brief Set the slice mode. + //! + //! \see getMode() + //! + virtual void setMode(SliceMode mode) TRTNOEXCEPT = 0; + + //! + //! \brief Get the slice mode. + //! + //! \see setMode() + //! + virtual SliceMode getMode() const TRTNOEXCEPT = 0; + + //! + //! \brief Append or replace an input of this layer with a specific tensor //! //! \param index the index of the input to modify. //! \param tensor the new input tensor //! - //! Sets the input tensor for the given index. The index must be 0 for a static slice layer. - //! A static slice layer is converted to a dynamic slice layer by calling setInput with an index > 0. - //! A dynamic slice layer cannot be converted back to a static slice layer. - //! - //! For a dynamic slice layer, the values 0-3 are valid. 
If an index > 0 is specified, all values between - //! index 0 and that index must be dynamic tensors. The values larger than index can use static dimensions. - //! For example, if an index of two is specified, the stride tensor can be set via setStride, but the start tensor - //! must be specified via setInput as both size and start are converted to dynamic tensors. - //! The indices in the dynamic case are as follows: + //! For a slice layer, the values 0-3 are valid. The values 1-3 override start, size or stride + //! dimensions, respectively. Conversely, this input tensor can be overridden via appropriate set call. + //! The indices are as follows: //! //! Index | Description //! 0 | Data or Shape tensor to be sliced. - //! 1 | The start tensor to begin slicing, N-dimensional for Data, and 1-D for Shape. - //! 2 | The size tensor of the resulting slice, N-dimensional for Data, and 1-D for Shape. - //! 3 | The stride of the slicing operation, N-dimensional for Data, and 1-D for Shape. + //! 1 | The start tensor to begin slicing, as a 1D shape tensor. + //! 2 | The size tensor of the resulting slice, as a 1D shape tensor. + //! 3 | The stride of the slicing operation, as a 1D shape tensor. //! //! If this function is called with a value greater than 0, then the function getNbInputs() changes - //! from returning 1 to index + 1. When converting from static to dynamic slice layer, - //! all unset tensors, between 1 and index + 1, are initialized to nullptr. It is an error to attempt to build - //! a network that has any nullptr inputs. + //! from returning 1 to index + 1. //! void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; @@ -3694,7 +4110,7 @@ class IParametricReLULayer : public ILayer //! enum class ResizeMode : int { - kNEAREST = 0, // N-D (0 < N <= 8) nearest neighbor resizing. + kNEAREST = 0, // ND (0 < N <= 8) nearest neighbor resizing. kLINEAR = 1 // Can handle linear (1D), bilinear (2D), and trilinear (3D) resizing. }; @@ -3708,11 +4124,11 @@ constexpr inline int EnumMax() //! //! \brief A resize layer in a network definition. //! -//! Resize layer can be used for resizing a N-D tensor. +//! Resize layer can be used for resizing a ND tensor. //! //! Resize layer currently supports the following configurations: -//! - ResizeMode::kNEAREST - resizes innermost `m` dimensions of N-D, where 0 < m <= min(8, N) and N > 0 -//! - ResizeMode::kLINEAR - resizes innermost `m` dimensions of N-D, where 0 < m <= min(3, N) and N > 0 +//! - ResizeMode::kNEAREST - resizes innermost `m` dimensions of ND, where 0 < m <= min(8, N) and N > 0 +//! - ResizeMode::kLINEAR - resizes innermost `m` dimensions of ND, where 0 < m <= min(3, N) and N > 0 //! //! Default resize mode is ResizeMode::kNEAREST. //! Resize layer provides two ways to resize tensor dimensions. @@ -3822,19 +4238,24 @@ class IResizeLayer : public ILayer virtual bool getAlignCorners() const TRTNOEXCEPT = 0; //! - //! \brief Relaxes ILayer::setInput to allow appending a second input. + //! \brief Append or replace an input of this layer with a specific tensor //! //! \param index the index of the input to modify. - //! \param tensor the new input tensor. + //! \param tensor the new input tensor + //! + //! Sets the input tensor for the given index. The index must be 0 for a static resize layer. + //! A static resize layer is converted to a dynamic resize layer by calling setInput with an index 1. + //! A dynamic resize layer cannot be converted back to a static resize layer. //! - //! 
Like ILayer::setInput, but additionally works if index == 1 and nbInputs == 1, and - //! there is no implicit batch dimension, in which case nbInputs() changes to 2. - //! Once such additional input is set, resize layer works in dynamic mode. + //! For a dynamic resize layer, the values 0 and 1 are valid. + //! The indices in the dynamic case are as follows: //! - //! When index == 1 and nbInputs == 1, the output dimensions are used from - //! the input tensor, overriding the dimensions supplied by setOutputDimensions. + //! Index | Description + //! 0 | Data or Shape tensor to be resized. + //! 1 | The output dimensions, as a 1D shape tensor. //! - //! \warning tensor must be a shape tensor. + //! If this function is called with a value 1, then the function getNbInputs() changes + //! from returning 1 to 2. //! void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; @@ -3842,6 +4263,405 @@ class IResizeLayer : public ILayer virtual ~IResizeLayer() {} }; +//! Enum that describes kinds of loop outputs. +enum class LoopOutput : int +{ + //! Output value is value of tensor for last iteration. + kLAST_VALUE = 0, + + //! Output value is concatenation of values of tensor for each iteration, in forward order. + kCONCATENATE = 1, + + //! Output value is concatenation of values of tensor for each iteration, in reverse order. + kREVERSE = 2 +}; + +template <> +constexpr inline int EnumMax() +{ + return 3; +} //!< Maximum number of elements in LoopOutput enum. \see DataType + +//! Enum that describes kinds of trip limits. +enum class TripLimit : int +{ + // Tensor is scalar of type kINT32 that contains the trip count. + kCOUNT = 0, + + // Tensor is a scalar of type kBOOL. Loop terminates when value is false. + kWHILE = 1 +}; + +template <> +constexpr inline int EnumMax() +{ + return 2; +} //!< Maximum number of elements in TripLimit enum. \see DataType + +class ILoop; + +class ILoopBoundaryLayer : public ILayer +{ +public: + //! Return pointer to ILoop associated with this boundary layer. + virtual ILoop* getLoop() const noexcept = 0; +}; + +class IRecurrenceLayer : public ILoopBoundaryLayer +{ +public: + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + // + //! Sets the input tensor for the given index. + //! + //! For a recurrence layer, the values 0 and 1 are valid. + //! The indices are as follows: + //! + //! Index | Description + //! 0 | The initial value of the output tensor. The value must come from outside the loop. + //! 1 | The next value of the output tensor. The value usually comes from inside the loop, and must have the same dimensions as input 0. + //! + //! If this function is called with a value 1, then the function getNbInputs() changes + //! from returning 1 to 2. + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; +}; + +//! +//! An ILoopOutputLayer is the sole way to get output from a loop. +//! +//! The first input tensor must be defined inside the loop; the output tensor is outside the loop. +//! The second input tensor, if present, must be defined outside the loop. +//! +//! If getLoopOutput() is kLAST_VALUE, a single input must be provided, +//! and that input must from a IRecurrenceLayer in the same loop. +//! +//! If getLoopOutput() is kCONCATENATE or kREVERSE, a second input must be provided. +//! The second input must be a scalar “shape tensor”, defined before the loop commences, +//! 
that specifies the concatenation length of the output. +//! +//! The output tensor has j more dimensions than the input tensor, where +//! j == 0 if getLoopOutput() is kLAST_VALUE +//! j == 1 if getLoopOutput() is kCONCATENATE or kREVERSE. +//! +class ILoopOutputLayer : public ILoopBoundaryLayer +{ +public: + virtual LoopOutput getLoopOutput() const noexcept = 0; + + //! + //! \brief Set where to insert the contenation axis. Ignored if getLoopOutput() is kLAST_VALUE. + //! + //! For example, if the input tensor has dimensions [b,c,d], + //! and getLoopOutput() is kCONCATENATE, the output has four dimensions. + //! Let a be the value of the second input. + //! setAxis(0) causes the output to have dimensions [a,b,c,d]. + //! setAxis(1) causes the output to have dimensions [b,a,c,d]. + //! setAxis(2) causes the output to have dimensions [b,c,a,d]. + //! setAxis(3) causes the output to have dimensions [b,c,d,a]. + //! Default is axis is 0. + //! + virtual void setAxis(int axis) noexcept = 0; + + //! Get axis being concatenated over. + virtual int getAxis() const noexcept = 0; + + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + // + //! Sets the input tensor for the given index. The index must be 0 for a kLAST_VALUE loop output layer. + //! Loop output layer is converted to a kCONCATENATE or kREVERSE loop output layer by calling setInput with an index 1. + //! A kCONCATENATE or kREVERSE loop output layer cannot be converted back to a kLAST_VALUE loop output layer. + //! + //! For a kCONCATENATE or kREVERSE loop output layer, the values 0 and 1 are valid. + //! The indices in the kCONCATENATE or kREVERSE cases are as follows: + //! + //! Index | Description + //! 0 | Contribution to the output tensor. The contribution must come from inside the loop. + //! 1 | The concatenation length scalar value, must come from outside the loop, as a 0D shape tensor. + //! + //! If this function is called with a value 1, then the function getNbInputs() changes + //! from returning 1 to 2. + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; +}; + +class ITripLimitLayer : public ILoopBoundaryLayer +{ +public: + virtual TripLimit getTripLimit() const noexcept = 0; +}; + +class IIteratorLayer : public ILoopBoundaryLayer +{ +public: + //! Set axis to iterate over. + virtual void setAxis(int axis) noexcept = 0; + + //! Get axis being iterated over. + virtual int getAxis() const noexcept = 0; + + //! For reverse=false, the layer is equivalent to addGather(tensor, I, 0) where I is a + //! scalar tensor containing the loop iteration number. + //! For reverse=true, the layer is equivalent to addGather(tensor, M-1-I, 0) where M is the trip count + //! computed from TripLimits of kind kCOUNT. + //! The default is reverse=false. + virtual void setReverse(bool reverse) noexcept = 0; + + //! True if and only if reversing input. + virtual bool getReverse() const noexcept = 0; +}; + +//! +//! Helper for creating a recurrent subgraph. +//! +class ILoop +{ +public: + //! + //! \brief Create a recurrence layer for this loop with initialValue as its first input. + //! + //! IRecurrenceLayer requires exactly two inputs. The 2nd input must be added, via method IRecurrenceLayer::setInput(1,...) + //! before an Engine can be built. + // + //! + virtual IRecurrenceLayer* addRecurrence(ITensor& initialValue) noexcept = 0; + + //! + //! 
\brief Add a trip-count limiter, based on the given tensor. + //! + //! There may be at most one kCOUNT and one kWHILE limiter for a loop. + //! When both trip limits exist, the loop exits when the + //! count is reached or condition is falsified. + //! It is an error to not add at least one trip limiter. + //! + //! For kTRIP_LIMIT, the input tensor must be available before the loop starts. + //! + //! For kWHILE, the input tensor must be the output of a subgraph that contains + //! only layers that are not ITripLimitLayer, IIteratorLayer or ILoopOutputLayer. + //! Any IRecurrenceLayers in the subgraph must belong to the same loop as the + //! ITripLimitLayer. A trivial example of this rule is that the input to the kWHILE + //! is the output of an IRecurrenceLayer for the same loop. + //! + virtual ITripLimitLayer* addTripLimit(ITensor& tensor, TripLimit limit) noexcept = 0; + + //! + //! \brief Return layer that subscripts tensor by loop iteration. + //! + //! For reverse=false, this is equivalent to addGather(tensor, I, 0) where I is a + //! scalar tensor containing the loop iteration number. + //! For reverse=true, this is equivalent to addGather(tensor, M-1-I, 0) where M is the trip count + //! computed from TripLimits of kind kCOUNT. + //! + virtual IIteratorLayer* addIterator(ITensor& tensor, int axis = 0, bool reverse = false) noexcept = 0; + + //! \brief Make an output for this loop, based on the given tensor. + //! + //! axis is the axis for concatenation (if using outputKind of kCONCATENATE or kREVERSE). + //! + //! If outputKind is kCONCATENATE or kREVERSE, a second input specifying the + //! concatenation dimension must be added via method ILoopOutputLayer::setInput. + //! + virtual ILoopOutputLayer* addLoopOutput(ITensor& tensor, LoopOutput outputKind, int axis = 0) noexcept = 0; + + //! + //! \brief Set the name of the loop. + //! + //! The name is used in error diagnostics. + //! This method copies the name string. + //! + //! \see getName() + //! + virtual void setName(const char* name) noexcept = 0; + + //! + //! \brief Return the name of the loop. + //! + //! \see setName() + //! + virtual const char* getName() const noexcept = 0; + +protected: + virtual ~ILoop() {} +}; + +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class ISelectLayer : public ILayer +{ +protected: + virtual ~ISelectLayer() {} +}; + +//! +//! \enum FillOperation +//! +//! \brief Enumerates the tensor fill operations that may performed by a fill layer. +//! +//! \see IFillLayer +//! +enum class FillOperation : int +{ + kLINSPACE = 0, //!< Generate evenly spaced numbers over a specified interval. + kRANDOM_UNIFORM = 1 //!< Generate a tensor with random values drawn from a uniform distribution. +}; + +template <> +constexpr inline int EnumMax() +{ + return 2; +} //!< Maximum number of elements in FillOperation enum. \see FillOperation + +//! +//! \brief Generate an output tensor with specified mode. +//! +//! The fill layer has two variants, static and dynamic. Static fill specifies its parameters +//! at layer creation time via Dims and the get/set accessor functions of the IFillLayer. +//! Dynamic fill specifies one or more of its parameters as ITensors, by using ILayer::setTensor to add +//! a corresponding input. The corresponding static parameter is used if an input is missing or null. +//! +//! The shape of the output is specified by the parameter \p Dimension, or if non-null and present, +//! 
the first input, which must be a 1D shape tensor. Thus an application can determine if the +//! IFillLayer has a dynamic output shape based on whether it has a non-null first input. +//! +//! Alpha and Beta are treated differently based on the Fill Operation specified. See details in +//! IFillLayer::setAlpha(), IFillLayer::setBeta(), and IFillLayer::setInput(). +//! +//! \see FillOperation +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +class IFillLayer : public ILayer +{ +public: + //! + //! \brief Set the output tensor's dimensions. + //! + //! \param dimensions The output tensor's dimensions. + //! + //! If the first input is set, it is reset to null. + //! + //! \see getDimensions + // + virtual void setDimensions(Dims dimensions) noexcept = 0; + + //! + //! \brief Get the output tensor's dimensions. + //! + //! \return The output tensor's dimensions, or an invalid Dims structure. + //! + //! If the first input is present and non-null, + //! this function returns a Dims with nbDims = -1. + //! + //! \see setDimensions + //! + virtual Dims getDimensions() const noexcept = 0; + + //! + //! \brief Set the fill operation for the layer. + //! + //! \see getOperation(), FillOperation + //! + virtual void setOperation(FillOperation op) noexcept = 0; + + //! + //! \brief Get the fill operation for the layer. + //! + //! \see setOperation(), FillOperation + //! + virtual FillOperation getOperation() const noexcept = 0; + + //! + //! \brief Set the alpha parameter. + //! + //! \param alpha has different meanings for each operator: + //! + //! Operation | Usage + //! kLINSPACE | the start value; + //! kRANDOMUNIFORM | the minimum value; + //! + //! If the second input is set, it is reset to null. + //! + //! \see getAlpha + // + virtual void setAlpha(double alpha) noexcept = 0; + + //! + //! \brief Get the value of alpha parameter. + //! + //! \return A double value of alpha. + //! + //! If the second input is present and non-null, + //! this function returns a Dims with nbDims = -1. + //! + //! \see setAlpha + //! + virtual double getAlpha() const noexcept = 0; + + //! + //! \brief Set the beta parameter. + //! + //! \param beta has different meanings for each operator: + //! + //! Operation | Usage + //! kLINSPACE | the delta value; + //! kRANDOMUNIFORM | the maximal value; + //! + //! If the third input is set, it is reset to null. + //! + //! \see getBeta + //! + virtual void setBeta(double beta) noexcept = 0; + + //! + //! \brief Get the value of beta parameter. + //! + //! \return A double value of beta. + //! + //! If the third input is present and non-null, + //! this function returns a Dims with nbDims = -1. + //! + //! \see setBeta + //! + virtual double getBeta() const noexcept = 0; + + //! + //! \brief replace an input of this layer with a specific tensor. + //! + //! \param index the index of the input to set. + //! \param tensor the new input tensor + //! + //! Index | Description for kLINSPACE + //! 0 | Shape tensor, represents the output tensor's dimensions. + //! 1 | Start, a scalar, represents the start value. + //! 2 | Delta, a 1D tensor, length equals to shape tensor's nbDims, represents the delta value for each dimension. + //! + //! Index | Description for kRANDOM_UNIFORM + //! 0 | Shape tensor, represents the output tensor's dimensions. + //! 1 | Minimum, a scalar, represents the minimum random value. + //! 2 | Maximum, a scalar, represents the maximal random value. + //! + //! 
Using the corresponding setter resets the input to null. + //! + //! If either inputs 1 or 2, is non-null, then both must be non-null and have the same data type. + //! + //! If this function is called for an index greater or equal to getNbInputs(), + //! then afterwards getNbInputs() returns index + 1, and any missing intervening + //! inputs are set to null. + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; + +protected: + virtual ~IFillLayer() {} +}; + //! //! \class INetworkDefinition //! @@ -3877,8 +4697,14 @@ class INetworkDefinition //! be specified at runtime. Input tensors with such a wildcard must have a corresponding entry in the //! IOptimizationProfiles indicating the permitted extrema, and the input dimensions must be set by //! IExecutionContext::setBindingDimensions. Different IExecutionContext instances can have different dimensions. - //! Wildcard dimensions are only supported for EngineCapability::kDEFAULT with DeviceType::kGPU. They are not - //! supported in safety contexts or on the DLA. + //! Wildcard dimensions are only supported for EngineCapability::kDEFAULT. They are not + //! supported in safety contexts. DLA does not support Wildcard dimensions in {C, H, W} dimensions. + //! + //! Tensor dimensions are specified independent of format. For example, if a + //! tensor is formatted in "NHWC" or a vectorized format, the dimensions are + //! still specified in the order{N, C, H, W}. For 2D images with a channel + //! dimension, the last three dimensions are always {C,H,W}. For 3D images + //! with a channel dimension, the last four dimensions are always {C,D,H,W}. //! //! \param name The name of the tensor. //! \param type The type of the data held in the tensor. @@ -3917,7 +4743,9 @@ class INetworkDefinition //! //! \return The new convolution layer, or nullptr if it could not be created. //! - virtual IConvolutionLayer* addConvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, + //! \deprecated Superseded by addConvolutionNd + //! + TRT_DEPRECATED virtual IConvolutionLayer* addConvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! @@ -3966,7 +4794,10 @@ class INetworkDefinition //! //! \return The new pooling layer, or nullptr if it could not be created. //! - virtual IPoolingLayer* addPooling(ITensor& input, PoolingType type, DimsHW windowSize) TRTNOEXCEPT = 0; + //! \deprecated Superseded than addPoolingNd + //! + TRT_DEPRECATED virtual IPoolingLayer* addPooling( + ITensor& input, PoolingType type, DimsHW windowSize) TRTNOEXCEPT = 0; //! //! \brief Add a LRN layer to the network. @@ -4045,7 +4876,9 @@ class INetworkDefinition //! //! \return The new deconvolution layer, or nullptr if it could not be created. //! - virtual IDeconvolutionLayer* addDeconvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, + //! \deprecated Superseded by addDeconvolutionNd + //! + TRT_DEPRECATED virtual IDeconvolutionLayer* addDeconvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! @@ -4123,7 +4956,7 @@ class INetworkDefinition //! //! \see IRNNLayer //! - //! \warning RNN inputs do not support wildcard dimensions or explicit batch size networks. + //! \warning This layer does not support wildcard dimensions or explicit batch size networks. //! \warning Int32 tensors are not valid input tensors. //! //! \return The new RNN layer, or nullptr if it could not be created. 
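The addInput documentation above introduces wildcard (-1) dimensions, which are only usable with an explicit-batch network and an optimization profile that bounds every wildcard. The following is a minimal sketch of that workflow, not part of the patch itself; the tensor name "input", the 3x224x224 image shape, the batch bounds, and the workspace size are all illustrative, and gLogger is assumed to be an existing ILogger implementation.

    #include "NvInfer.h"
    using namespace nvinfer1;

    ICudaEngine* buildWithDynamicBatch(ILogger& gLogger)
    {
        IBuilder* builder = createInferBuilder(gLogger);
        const auto flags = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
        INetworkDefinition* network = builder->createNetworkV2(flags);

        // Wildcard (-1) in the batch dimension: the concrete value is chosen at runtime.
        ITensor* input = network->addInput("input", DataType::kFLOAT, Dims4{-1, 3, 224, 224});

        // ... add layers and mark at least one network output here ...

        // Every wildcard dimension needs kMIN/kOPT/kMAX bounds in an optimization profile.
        IOptimizationProfile* profile = builder->createOptimizationProfile();
        profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{1, 3, 224, 224});
        profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{8, 3, 224, 224});
        profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{32, 3, 224, 224});

        IBuilderConfig* config = builder->createBuilderConfig();
        config->addOptimizationProfile(profile);
        config->setMaxWorkspaceSize(1U << 28);

        ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
        network->destroy();
        config->destroy();
        builder->destroy();
        return engine;
    }

At runtime the chosen batch size is then supplied through IExecutionContext::setBindingDimensions, as described for the runtime API further below.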
@@ -4174,7 +5007,10 @@ class INetworkDefinition //! //! \return The new padding layer, or nullptr if it could not be created. //! - virtual IPaddingLayer* addPadding(ITensor& input, DimsHW prePadding, DimsHW postPadding) TRTNOEXCEPT = 0; + //! \deprecated Superseded by addPaddingNd. + //! + TRT_DEPRECATED virtual IPaddingLayer* addPadding( + ITensor& input, DimsHW prePadding, DimsHW postPadding) TRTNOEXCEPT = 0; //! //! \brief Add a shuffle layer to the network. @@ -4356,6 +5192,10 @@ class INetworkDefinition //! \param keepDimensions The boolean that specifies whether or not to keep the reduced dimensions in the //! output of the layer. //! + //! The reduce layer works by performing an operation specified by \p operation to reduce the tensor \p input across + //! the + //! axes specified by \p reduceAxes. + //! //! \see IReduceLayer //! //! \warning If input is a shape tensor, ReduceOperation::kAVG is unsupported. @@ -4537,7 +5377,7 @@ class INetworkDefinition //! \see IRNNv2Layer //! //! \warning RNN inputs do not support wildcard dimensions or explicit batch size networks. - //! \warning Int32 tensors are not valid input tensors. + //! \warning Int32 tensors are not valid input tensors, only for sequence lengths. //! //! \return The new RNN layer, or nullptr if it could not be created. //! @@ -4675,7 +5515,7 @@ class INetworkDefinition virtual IShapeLayer* addShape(ITensor& input) TRTNOEXCEPT = 0; //! - //! \brief True if tensors have implicit batch dimension. + //! \brief Query whether the network was created with an implicit batch dimension. //! //! \return True if tensors have implicit batch dimension, false otherwise. //! @@ -4831,6 +5671,53 @@ class INetworkDefinition //! \return True if network has explicit precision, false otherwise. //! virtual bool hasExplicitPrecision() const TRTNOEXCEPT = 0; + + //! + //! \brief Add a loop to the network. + //! + //! An ILoop provides a way to specify a recurrent subgraph. + //! + //! \return Pointer to ILoop that can be used to add loop boundary layers for the loop, + //! or nullptr if network has an implicit batch dimension or this version + //! of TensorRT does not support loops. + //! + virtual ILoop* addLoop() noexcept = 0; + + //! \brief Add a select layer to the network. + //! + //! \param condition The condition tensor to the layer. + //! \param thenInput The "then" input tensor to the layer. + //! \param elseInput The "else" input tensor to the layer. + //! + //! \see ISelectLayer + //! + //! \return The new select layer, or nullptr if it could not be created. + virtual ISelectLayer* addSelect(ITensor& condition, ITensor& thenInput, ITensor& elseInput) TRTNOEXCEPT = 0; + + //! \brief Add a fill layer to the network. + //! + //! \param dimensions The output tensor dimensions. + //! \param op The fill operation that the layer applies. + //! + //! \warning The dimensions's nbDims must be 1. + //! + //! \see IFillLayer + //! + //! \return The new fill layer, or nullptr if it could not be created. + virtual IFillLayer* addFill(Dims dimensions, FillOperation op) noexcept = 0; + + //! \brief Add a padding layer to the network. Only 2D padding is currently supported. + //! + //! \param input The input tensor to the layer. + //! \param prePadding The padding to apply to the start of the tensor. + //! \param postPadding The padding to apply to the end of the tensor. + //! + //! \see IPaddingLayer + //! + //! \return The new padding layer, or nullptr if it could not be created. + //! 
+ TRT_DEPRECATED virtual IPaddingLayer* addPaddingNd( + ITensor& input, Dims prePadding, Dims postPadding) TRTNOEXCEPT = 0; }; //! @@ -5040,8 +5927,8 @@ typedef uint32_t BuilderFlags; //! enum class BuilderFlag : int { - kFP16 = 0, //!< Enable FP16 layer selection. - kINT8 = 1, //!< Enable Int8 layer selection. + kFP16 = 0, //!< Enable FP16 layer selection, with FP32 fallback. + kINT8 = 1, //!< Enable Int8 layer selection, with FP32 fallback with FP16 fallback if kFP16 also specified. kDEBUG = 2, //!< Enable debugging of layers via synchronizing after every layer. kGPU_FALLBACK = 3, //!< Enable layers marked to execute on GPU if layer cannot execute on DLA. kSTRICT_TYPES = 4, //!< Enables strict type constraints. @@ -5208,7 +6095,7 @@ class IBuilderConfig //! //! \brief Set the device that this layer must execute on. - //! \param DeviceType that this layer must execute on. + //! \param deviceType that this layer must execute on. //! If DeviceType is not set or is reset, TensorRT will use the default DeviceType set in the builder. //! //! \note The device type for a layer must be compatible with the safety flow (if specified). @@ -5333,6 +6220,7 @@ class IBuilderConfig } }; + //! \typedef NetworkDefinitionCreationFlags //! //! \brief This bitset is capable of representing one or more NetworkDefinitionCreationFlag flags @@ -5380,6 +6268,7 @@ constexpr inline int EnumMax() return 2; } + //! //! \class IBuilder //! @@ -5815,18 +6704,21 @@ class IBuilder virtual nvinfer1::ICudaEngine* buildEngineWithConfig( INetworkDefinition& network, IBuilderConfig& config) TRTNOEXCEPT = 0; + //! \brief Create a network definition object //! //! Creates a network definition object with immutable properties specified using the flags parameter. Providing //! the kDEFAULT flag as parameter mimics the behaviour of createNetwork(). CreateNetworkV2 supports dynamic shapes //! and explicit batch dimensions when used with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. //! - //! \param flags Bitset of NetworkDefinitionCreationFlags specifying network properties + //! \param flags Bitset of NetworkDefinitionCreationFlags specifying network properties combined with bitwise OR. + //! e.g., 1U << NetworkDefinitionCreationFlag::kEXPLICIT_BATCH //! //! \see INetworkDefinition, NetworkDefinitionCreationFlags //! virtual nvinfer1::INetworkDefinition* createNetworkV2(NetworkDefinitionCreationFlags flags) TRTNOEXCEPT = 0; + //! \brief Create a new optimization profile. //! //! If the network has any dynamic input tensors, the appropriate calls to setDimensions() must be made. @@ -5872,10 +6764,16 @@ class IBuilder } // namespace nvinfer1 -extern "C" TENSORRTAPI void* createInferBuilder_INTERNAL(void* logger, int version); //!< Internal C entry point for creating IBuilder. +//! +//! Internal C entry point for creating IBuilder. +//! @private +//! +extern "C" TENSORRTAPI void* createInferBuilder_INTERNAL(void* logger, int version); namespace nvinfer1 { +namespace +{ //! //! \brief Create an instance of an IBuilder class. //! @@ -5883,8 +6781,6 @@ namespace nvinfer1 //! //! unnamed namespace avoids linkage surprises when linking objects built with different versions of this header. //! 
-namespace -{ inline IBuilder* createInferBuilder(ILogger& logger) { return static_cast(createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); diff --git a/include/NvInferRuntime.h b/include/NvInferRuntime.h index 5788e75a..3277e660 100644 --- a/include/NvInferRuntime.h +++ b/include/NvInferRuntime.h @@ -302,6 +302,9 @@ class IDimensionExpr //! If isConstant(), returns value of the constant. //! If !isConstant(), return std::numeric_limits::min(). virtual int getConstantValue() const = 0; + +protected: + virtual ~IDimensionExpr() {} }; //! @@ -458,6 +461,8 @@ class IPluginV2DynamicExt : public nvinfer1::IPluginV2Ext //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make //! algorithm choices on the basis of bounds on the input and output tensors, and the target value. //! + //! This function is also called once when the resource requirements are changed based on the optimization profiles. + //! //! \param in The input tensors attributes that are used for configuration. //! \param nbInputs Number of input tensors. //! \param out The output tensors attributes that are used for configuration. @@ -509,9 +514,7 @@ class IPluginV2DynamicExt : public nvinfer1::IPluginV2Ext TRT_DEPRECATED Dims getOutputDimensions(int /*index*/, const Dims* /*inputs*/, int /*nbInputDims*/) _TENSORRT_FINAL TRTNOEXCEPT { - Dims result; - result.nbDims = -1; - return result; + return Dims{-1, {}, {}}; } //! @@ -733,7 +736,7 @@ class IRuntime virtual IErrorRecorder* getErrorRecorder() const noexcept = 0; //! - //! \breif Deserialize an engine from a stream when plugin factory is not used. + //! \brief Deserialize an engine from a stream when plugin factory is not used. //! //! \param blob The memory that holds the serialized engine. //! \param size The size of the memory. @@ -816,7 +819,9 @@ class IRefitter //! //! Update dynamic range for a tensor. //! - //! \param name of an ITensor used to construct the network. + //! \param tensorName The name of an ITensor in the network. + //! \param min The minimum of the dynamic range for the tensor. + //! \param max The maximum of the dynamic range for the tensor. //! //! \return True if successful; false otherwise. //! @@ -907,6 +912,8 @@ class IPluginFactory //! \see IPlugin::serialize() //! virtual IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) TRTNOEXCEPT = 0; + + virtual ~IPluginFactory() {} }; //! @@ -1076,7 +1083,7 @@ class ICudaEngine { public: //! - //! \brief Get the number of binding indices. + //! \brief Get the number of binding indices. //! //! If the engine has been built for K profiles, the first getNbBindings() / K bindings are used by profile //! number 0, the following getNbBindings() / K bindings are used by profile number 1 etc. @@ -1146,6 +1153,8 @@ class ICudaEngine //! //! \brief Get the maximum batch size which can be used for inference. //! + //! For an engine built from an INetworkDefinition without an implicit batch dimension, this will always return 1. + //! //! \return The maximum batch size for this engine. //! virtual int getMaxBatchSize() const noexcept = 0; @@ -1414,6 +1423,22 @@ class ICudaEngine //! \see setErrorRecorder //! virtual IErrorRecorder* getErrorRecorder() const noexcept = 0; + + //! + //! \brief Query whether the engine was built with an implicit batch dimension. + //! + //! \return True if tensors have implicit batch dimension, false otherwise. + //! + //! This is an engine-wide property. 
Either all tensors in the engine + //! have an implicit batch dimension or none of them do. + //! + //! hasImplicitBatchDimension() is true if and only if the INetworkDefinition + //! from which this engine was built was created with createNetwork() or + //! createNetworkV2() without NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. + //! + //! \see createNetworkV2 + //! + virtual bool hasImplicitBatchDimension() const TRTNOEXCEPT = 0; }; //! @@ -1551,21 +1576,18 @@ class IExecutionContext //! //! The selected profile will be used in subsequent calls to execute() or enqueue(). //! - //! If the associated CUDA engine has dynamic inputs, this method must be called exactly once + //! If the associated CUDA engine has dynamic inputs, this method must be called at least once //! with a unique profileIndex before calling execute or enqueue (i.e. the profile index - //! may not be in use by another execution context that has not been destroyed yet). Once the - //! optimization profile has been set (getOptimizationProfile() != -1), it cannot be changed. + //! may not be in use by another execution context that has not been destroyed yet). //! For the first execution context that is created for an engine, setOptimizationProfile(0) - //! is called implicitly. This means users only ever need to call this method if they need more - //! than a single execution context. In this case, profileIdx must be nonzero and unique for - //! all execution contexts that are created after the first. + //! is called implicitly. //! - //! If the associated CUDA engine has not dynamic inputs, this method need not be + //! If the associated CUDA engine does not have inputs with dynamic shapes, this method need not be //! called, in which case the default profile index of 0 will be used (this is particularly //! the case for all safe engines). //! - //! setOptimizationProfile() must be called before calling setBindingDimensions() and - //! setInputShapeBinding() for all dynamic input tensors or input shape tensors, which in + //! setOptimizationProfile() must be called before calling setBindingDimensions() and + //! setInputShapeBinding() for all dynamic input tensors or input shape tensors, which in //! turn must be called before either execute() or enqueue(). //! //! \return true if the call succeeded, else false (e.g. input out of range) @@ -1740,10 +1762,17 @@ class IExecutionContext virtual bool enqueueV2(void** bindings, cudaStream_t stream, cudaEvent_t* inputConsumed) noexcept = 0; }; } +//! +//! Internal C entry point for creating IRuntime. +//! @private +//! +extern "C" TENSORRTAPI void* createInferRuntime_INTERNAL(void* logger, int version); -extern "C" TENSORRTAPI void* createInferRuntime_INTERNAL(void* logger, int version); //!< Internal C entry point for creating IRuntime. - -extern "C" TENSORRTAPI void* createInferRefitter_INTERNAL(void* engine, void* logger, int version); //!< Internal C entry point for creating IRefitter. +//! +//! Internal C entry point for creating IRefitter. +//! @private +//! +extern "C" TENSORRTAPI void* createInferRefitter_INTERNAL(void* engine, void* logger, int version); namespace nvinfer1 { @@ -1758,6 +1787,7 @@ inline IRuntime* createInferRuntime(ILogger& logger) { return static_cast(createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); } + //! //! \brief Create an instance of an IRefitter class. //! 
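To tie together the runtime-side changes described above (hasImplicitBatchDimension, the relaxed setOptimizationProfile rules, and enqueueV2), here is a hedged sketch of one inference call on an engine with dynamic shapes. It assumes the engine was already deserialized or built elsewhere, that device bindings and a CUDA stream exist, and that an input tensor named "input" takes an 8x3x224x224 batch; these names and shapes are placeholders, not part of the patch.

    #include <cuda_runtime_api.h>
    #include "NvInfer.h"
    using namespace nvinfer1;

    bool runOnce(ICudaEngine& engine, void** deviceBindings, cudaStream_t stream)
    {
        IExecutionContext* context = engine.createExecutionContext();

        if (!engine.hasImplicitBatchDimension())
        {
            // Profile 0 is selected implicitly for the first context; additional contexts
            // would call context->setOptimizationProfile(...) with their own profile index.
            const int inputIndex = engine.getBindingIndex("input"); // assumed tensor name
            context->setBindingDimensions(inputIndex, Dims4{8, 3, 224, 224});
            if (!context->allInputDimensionsSpecified())
            {
                context->destroy();
                return false;
            }
        }

        const bool ok = context->enqueueV2(deviceBindings, stream, nullptr);
        cudaStreamSynchronize(stream);
        context->destroy();
        return ok;
    }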
diff --git a/include/NvInferRuntimeCommon.h b/include/NvInferRuntimeCommon.h index 6d3308e6..27316036 100644 --- a/include/NvInferRuntimeCommon.h +++ b/include/NvInferRuntimeCommon.h @@ -21,7 +21,7 @@ #include #include "NvInferVersion.h" -#if __cplusplus > 201103L +#if __cplusplus >= 201103L #define _TENSORRT_FINAL final #define _TENSORRT_OVERRIDE override #else @@ -131,13 +131,14 @@ enum class DataType : int kFLOAT = 0, //!< FP32 format. kHALF = 1, //!< FP16 format. kINT8 = 2, //!< quantized INT8 format. - kINT32 = 3 //!< INT32 format. + kINT32 = 3, //!< INT32 format. + kBOOL = 4 //!< BOOL format. }; template <> constexpr inline int EnumMax() { - return 4; + return 5; } //!< Maximum number of elements in DataType enum. \see DataType //! @@ -243,7 +244,7 @@ enum class TensorFormat : int kCHW16 = 4, //! Thirty-two wide channel vectorized row major format. This format is - //! bound to INT8 or FP32. It is only available for dimensions >= 3. + //! only available for dimensions >= 3. //! For a tensor with dimensions {N, C, H, W}, //! the memory layout is equivalent to a C array with dimensions //! [N][(C+31)/32][H][W][32], with the tensor coordinates (n, c, h, w) @@ -995,7 +996,7 @@ enum class ErrorCode : int //! kSUCCESS = 0, - //! + //! //! An error that does not fall into any other category. This error is included for forward compatibility //! kUNSPECIFIED_ERROR = 1, @@ -1236,7 +1237,11 @@ class IErrorRecorder } // namespace nvinfer1 -extern "C" TENSORRTAPI void* createSafeInferRuntime_INTERNAL(void* logger, int version); //!< Internal C entry point for creating safe::IRuntime. +//! +//! Internal C entry point for creating safe::IRuntime. +//! @private +//! +extern "C" TENSORRTAPI void* createSafeInferRuntime_INTERNAL(void* logger, int version); //! //! \brief Return the logger object. diff --git a/include/NvInferVersion.h b/include/NvInferVersion.h index a277d8c1..6d94b788 100644 --- a/include/NvInferVersion.h +++ b/include/NvInferVersion.h @@ -17,13 +17,13 @@ #ifndef NV_INFER_VERSION_H #define NV_INFER_VERSION_H -#define NV_TENSORRT_MAJOR 6 //!< TensorRT major version. +#define NV_TENSORRT_MAJOR 7 //!< TensorRT major version. #define NV_TENSORRT_MINOR 0 //!< TensorRT minor version. -#define NV_TENSORRT_PATCH 1 //!< TensorRT patch version. -#define NV_TENSORRT_BUILD 0 //!< TensorRT build number. +#define NV_TENSORRT_PATCH 0 //!< TensorRT patch version. +#define NV_TENSORRT_BUILD 11 //!< TensorRT build number. -#define NV_TENSORRT_SONAME_MAJOR 6 //!< Shared object library major version number. +#define NV_TENSORRT_SONAME_MAJOR 7 //!< Shared object library major version number. #define NV_TENSORRT_SONAME_MINOR 0 //!< Shared object library minor version number. -#define NV_TENSORRT_SONAME_PATCH 1 //!< Shared object library patch version number. +#define NV_TENSORRT_SONAME_PATCH 0 //!< Shared object library patch version number. #endif // NV_INFER_VERSION_H diff --git a/include/NvOnnxParser.h b/include/NvOnnxParser.h index e6fc63b6..e9ab8bcc 100644 --- a/include/NvOnnxParser.h +++ b/include/NvOnnxParser.h @@ -18,6 +18,8 @@ #define NV_ONNX_PARSER_H #include "NvInfer.h" +#include +#include #define NV_ONNX_PARSER_MAJOR 0 #define NV_ONNX_PARSER_MINOR 1 @@ -25,6 +27,20 @@ static const int NV_ONNX_PARSER_VERSION = ((NV_ONNX_PARSER_MAJOR * 10000) + (NV_ONNX_PARSER_MINOR * 100) + NV_ONNX_PARSER_PATCH); +//! \typedef SubGraph_t +//! +//! \brief The data structure containing the parsing capability of +//! a set of nodes in an ONNX graph. +//! +typedef std::pair, bool> SubGraph_t; + +//! 
\typedef SubGraphCollection_t +//! +//! \brief The data structure containing all SubGraph_t partitioned +//! out of an ONNX graph. +//! +typedef std::vector SubGraphCollection_t; + class onnxTensorDescriptorV1; //! //! \namespace nvonnxparser @@ -128,10 +144,12 @@ class IParser * \param serialized_onnx_model Pointer to the serialized ONNX model * \param serialized_onnx_model_size Size of the serialized ONNX model * in bytes + * \param sub_graph_collection Container to hold supported subgraphs * \return true if the model is supported */ virtual bool supportsModel(void const* serialized_onnx_model, - size_t serialized_onnx_model_size) + size_t serialized_onnx_model_size, + SubGraphCollection_t& sub_graph_collection) = 0; /** \brief Parse a serialized ONNX model into the TensorRT network diff --git a/include/NvOnnxParserRuntime.h b/include/NvOnnxParserRuntime.h deleted file mode 100644 index 522a2f33..00000000 --- a/include/NvOnnxParserRuntime.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef NV_ONNX_PARSER_RUNTIME_H -#define NV_ONNX_PARSER_RUNTIME_H - -#include "NvOnnxParser.h" - -namespace nvonnxparser -{ - - /** \class IPluginFactory - * - * \brief a destroyable plugin factory object - */ -class IPluginFactory : public nvinfer1::IPluginFactory -{ -public: - /** \brief destroy this object - */ - virtual void destroy() = 0; -protected: - virtual ~IPluginFactory() {} -}; - -} // namespace nvonnxparser - -extern "C" TENSORRTAPI void* createNvOnnxParserPluginFactory_INTERNAL(void* logger, int version); - -namespace nvonnxparser -{ - -#ifdef SWIG -inline IPluginFactory* createPluginFactory(nvinfer1::ILogger* logger) -{ - return static_cast( - createNvOnnxParserPluginFactory_INTERNAL(logger, NV_ONNX_PARSER_VERSION)); -} -#endif // SWIG - -namespace -{ - -/** \brief Create a new plugin factory for deserializing engines built using - * the ONNX parser. - * - * This plugin factory handles deserialization of the plugins that are built - * into the ONNX parser. Engines built using the ONNX parser must use this - * plugin factory during deserialization. - * - * \param logger The logger to use - * - * \return a new plugin factory object or NULL if an error occurred - * \see IPluginFactory - */ -inline IPluginFactory* createPluginFactory(nvinfer1::ILogger& logger) -{ - return static_cast( - createNvOnnxParserPluginFactory_INTERNAL(&logger, NV_ONNX_PARSER_VERSION)); -} - -} // namespace - -} // namespace nvonnxparser - -#endif // NV_ONNX_PARSER_RUNTIME_H diff --git a/include/NvUffParser.h b/include/NvUffParser.h index db528f23..1e957ed7 100644 --- a/include/NvUffParser.h +++ b/include/NvUffParser.h @@ -112,6 +112,7 @@ class IPluginFactory virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights, const FieldCollection fc) TRTNOEXCEPT = 0; + virtual ~IPluginFactory() {} }; //! 
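The supportsModel change above now reports per-subgraph support through the new SubGraphCollection_t out-parameter. A minimal usage sketch follows; it assumes an explicit-batch INetworkDefinition and a logger already exist, that the serialized ONNX model has been read into a byte buffer by the caller, and that SubGraph_t pairs a vector of node indices with a supported flag as defined in onnx-tensorrt.

    #include <iostream>
    #include <vector>
    #include "NvInfer.h"
    #include "NvOnnxParser.h"

    void reportSupport(nvinfer1::INetworkDefinition& network, nvinfer1::ILogger& logger,
                       const std::vector<char>& onnxBlob)
    {
        auto* parser = nvonnxparser::createParser(network, logger);

        SubGraphCollection_t subGraphs;
        const bool fullySupported = parser->supportsModel(onnxBlob.data(), onnxBlob.size(), subGraphs);
        std::cout << "Model fully supported: " << std::boolalpha << fullySupported << "\n";

        // Each entry pairs the node indices of one partition with a flag saying
        // whether that partition can run in TensorRT.
        for (const auto& sg : subGraphs)
        {
            std::cout << "Subgraph of " << sg.first.size() << " nodes, supported: " << sg.second << "\n";
        }
        parser->destroy();
    }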
@@ -267,8 +268,13 @@ TENSORRTAPI IUffParser* createUffParser() TRTNOEXCEPT; //! TENSORRTAPI void shutdownProtobufLibrary(void) TRTNOEXCEPT; -} +} // namespace nvuffparser + +//! +//! Internal C entry point for creating IUffParser +//! @private +//! extern "C" TENSORRTAPI void* createNvUffParser_INTERNAL() TRTNOEXCEPT; -#endif // NV_UFF_PARSER_H +#endif /* !NV_UFF_PARSER_H */ \ No newline at end of file diff --git a/parsers/onnx b/parsers/onnx index b7c08404..84b5be1d 160000 --- a/parsers/onnx +++ b/parsers/onnx @@ -1 +1 @@ -Subproject commit b7c0840493e72891096771d000d6de26a03aed62 +Subproject commit 84b5be1d6fc03564f2c0dba85a2ee75bad242c2e diff --git a/plugin/batchedNMSPlugin/README.md b/plugin/batchedNMSPlugin/README.md index 94f879e1..7213df81 100644 --- a/plugin/batchedNMSPlugin/README.md +++ b/plugin/batchedNMSPlugin/README.md @@ -110,4 +110,5 @@ This is the first release of this `README.md` file. ## Known issues -When running `cub::DeviceSegmentedRadixSort::SortPairsDescending` with `cuda-memcheck --tool racecheck`, it will not work correctly. +- When running `cub::DeviceSegmentedRadixSort::SortPairsDescending` with `cuda-memcheck --tool racecheck`, it will not work correctly. +- BatchedNMS plugin cannot handle greater than 4096 rectangles in the input. diff --git a/plugin/embLayerNormPlugin/embLayerNormPlugin.cu b/plugin/embLayerNormPlugin/embLayerNormPlugin.cu index 403c2b6e..283b0c12 100644 --- a/plugin/embLayerNormPlugin/embLayerNormPlugin.cu +++ b/plugin/embLayerNormPlugin/embLayerNormPlugin.cu @@ -344,7 +344,7 @@ bool EmbLayerNormPluginDynamic::supportsFormatCombination( } // pos == 4: mask - return desc.type == DataType::kINT32 && desc.dims.nbDims == 1 && desc.dims.d[0] == prev.dims.d[0]; + return desc.type == DataType::kINT32 && desc.dims.nbDims == 1 && desc.dims.d[0] == prev.dims.d[1]; } void EmbLayerNormPluginDynamic::configurePlugin( diff --git a/plugin/fcPlugin/fcPlugin.cu b/plugin/fcPlugin/fcPlugin.cu index fc536132..821623d1 100644 --- a/plugin/fcPlugin/fcPlugin.cu +++ b/plugin/fcPlugin/fcPlugin.cu @@ -344,6 +344,7 @@ FCPluginDynamic::FCPluginDynamic(const std::string name, const DataType type, co { mB.count = 0; mB.values = nullptr; + memset(mAlgo.data, 0, sizeof(mAlgo.data)); } FCPluginDynamic::FCPluginDynamic(const std::string name, const void* data, size_t length) @@ -432,17 +433,20 @@ void FCPluginDynamic::configurePlugin( // max workspace size allowed for search size_t actualWorkspace = 0; - gLogVerbose << "Start cuBLAS GEMM search" << std::endl; - if (mType == DataType::kFLOAT) + if (mAlgo.data[0] == 0 && memcmp(mAlgo.data, mAlgo.data+1, sizeof(mAlgo.data)-sizeof(mAlgo.data[0])) == 0) { - mAlgo = gemmSearch(mOutDim, mNmax, mK, maxWorkspaceBytes, actualWorkspace); - } - else - { - mAlgo = gemmSearch(mOutDim, mNmax, mK, maxWorkspaceBytes, actualWorkspace); + gLogVerbose << "Start cuBLAS GEMM search" << std::endl; + if (mType == DataType::kFLOAT) + { + mAlgo = gemmSearch(mOutDim, mNmax, mK, maxWorkspaceBytes, actualWorkspace); + } + else + { + mAlgo = gemmSearch(mOutDim, mNmax, mK, maxWorkspaceBytes, actualWorkspace); + } + gLogVerbose << "Done cuBLAS GEMM search" << std::endl; } - gLogVerbose << "Done cuBLAS GEMM search" << std::endl; AlgoProps p; p.populate(mAlgo); diff --git a/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp index 6bdcccc8..589dfa29 100644 --- a/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp +++ 
b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp @@ -86,13 +86,23 @@ namespace { PluginFieldCollection InstanceNormalizationPluginCreator::mFC{}; std::vector InstanceNormalizationPluginCreator::mPluginAttributes; + +InstanceNormalizationPlugin::InstanceNormalizationPlugin( + float epsilon, const std::vector& scale, const std::vector& bias) + : _epsilon(epsilon) + , _nchan(scale.size()) + , _h_scale(scale) + , _h_bias(bias) + , _initialized(false) +{ + ASSERT(scale.size() == bias.size()); +} + InstanceNormalizationPlugin::InstanceNormalizationPlugin( float epsilon, nvinfer1::Weights const& scale, nvinfer1::Weights const& bias) : _epsilon(epsilon) , _nchan(scale.count) , _initialized(false) - , _scale(scale) - , _bias(bias) { ASSERT(scale.count == bias.count); if (scale.type == nvinfer1::DataType::kFLOAT) @@ -269,7 +279,7 @@ void InstanceNormalizationPlugin::destroy() IPluginV2DynamicExt* InstanceNormalizationPlugin::clone() const { - return new InstanceNormalizationPlugin{_epsilon, _scale, _bias}; + return new InstanceNormalizationPlugin{_epsilon, _h_scale, _h_bias}; } // Set plugin namespace @@ -385,7 +395,6 @@ IPluginV2DynamicExt* InstanceNormalizationPluginCreator::createPlugin(const char return obj; } -// TO TEST: IPluginV2DynamicExt* InstanceNormalizationPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { InstanceNormalizationPlugin* obj = new InstanceNormalizationPlugin{serialData, serialLength}; diff --git a/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h index bc077e35..9cd48d7c 100644 --- a/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h +++ b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h @@ -33,6 +33,7 @@ class InstanceNormalizationPlugin final : public nvinfer1::IPluginV2DynamicExt public: InstanceNormalizationPlugin(float epsilon, nvinfer1::Weights const& scale, nvinfer1::Weights const& bias); + InstanceNormalizationPlugin(float epsilon, const std::vector& scale, const std::vector& bias); InstanceNormalizationPlugin(void const* serialData, size_t serialLength); InstanceNormalizationPlugin() = delete; @@ -90,7 +91,6 @@ class InstanceNormalizationPlugin final : public nvinfer1::IPluginV2DynamicExt float* _d_scale; float* _d_bias; bool _initialized; - nvinfer1::Weights _scale, _bias; cudnnHandle_t _cudnn_handle; cudnnTensorDescriptor_t _x_desc, _y_desc, _b_desc; const char* mPluginNamespace; diff --git a/plugin/specialSlicePlugin/README.md b/plugin/specialSlicePlugin/README.md index fbf8f6a4..4dcdc8f9 100644 --- a/plugin/specialSlicePlugin/README.md +++ b/plugin/specialSlicePlugin/README.md @@ -24,14 +24,14 @@ This plugin generates one output tensor of shape `[N, num_det, 4]`. ## Parameters -This plugin has the plugin creator class `SpecialSliceCreator` and the plugin class `SpecialSlice`. +This plugin has the plugin creator class `FlattenConcatPluginCreator` and the plugin class `FlattenConcat`. This plugin has no parameter. 
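The instanceNormalizationPlugin change earlier in this patch replaces the stored nvinfer1::Weights references with owned std::vector<float> host copies (_h_scale, _h_bias), so clone() no longer depends on caller-owned weight memory remaining valid. The snippet below is only an illustration of that copy idiom, with a hypothetical helper name and the simplifying assumption of FP32 weights (the real plugin also converts kHALF values).

    #include <vector>
    #include "NvInfer.h"

    // Hypothetical helper: copy caller-owned Weights into an owned host vector so that
    // later clone()/serialize() calls never dereference memory the caller may have freed.
    inline std::vector<float> copyToHost(const nvinfer1::Weights& w)
    {
        // Assumes w.type == nvinfer1::DataType::kFLOAT.
        const float* begin = static_cast<const float*>(w.values);
        return std::vector<float>(begin, begin + w.count);
    }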
## Additional resources -The following resources provide a deeper understanding of the `SpecialSlice` plugin: +The following resources provide a deeper understanding of the `flattenConcat` plugin: - [MaskRCNN](https://github.com/matterport/Mask_RCNN) diff --git a/samples/CMakeSamplesTemplate.txt b/samples/CMakeSamplesTemplate.txt index 3c28f339..8b0dcec6 100644 --- a/samples/CMakeSamplesTemplate.txt +++ b/samples/CMakeSamplesTemplate.txt @@ -77,7 +77,6 @@ else() set(ONNX_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/parsers${NVINTERNAL_SUFFIX}/onnx CACHE STRING "Onnx include directory") endif() -add_compile_options("-fno-rtti") target_include_directories(${TARGET_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/include PUBLIC ${ONNX_INCLUDE_DIR} @@ -86,7 +85,9 @@ target_include_directories(${TARGET_NAME} PRIVATE ${TARGET_DIR} ) -set(SAMPLE_DEP_LIBS +target_compile_options(${TARGET_NAME} PUBLIC "-fno-rtti") + +set(SAMPLE_DEP_LIBS ${CUDART_LIB} ${CUBLAS_LIB} ${CUDNN_LIB} diff --git a/samples/common/BatchStream.h b/samples/common/BatchStream.h index c3c72447..306ed591 100644 --- a/samples/common/BatchStream.h +++ b/samples/common/BatchStream.h @@ -91,7 +91,7 @@ class MNISTBatchStream : public IBatchStream nvinfer1::Dims getDims() const override { - return mDims; + return Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}, {}}; } private: diff --git a/samples/common/EntropyCalibrator.h b/samples/common/EntropyCalibrator.h index efa80081..433b91e0 100644 --- a/samples/common/EntropyCalibrator.h +++ b/samples/common/EntropyCalibrator.h @@ -36,7 +36,7 @@ class EntropyCalibratorImpl , mReadCache(readCache) { nvinfer1::Dims dims = mStream.getDims(); - mInputCount = samplesCommon::volume(dims) * mStream.getBatchSize(); + mInputCount = samplesCommon::volume(dims); CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); mStream.reset(firstBatch); } diff --git a/samples/common/argsParser.h b/samples/common/argsParser.h index 0d98a40d..b59fe32a 100644 --- a/samples/common/argsParser.h +++ b/samples/common/argsParser.h @@ -83,6 +83,7 @@ struct Args int useDLACore{-1}; int batch{1}; std::vector dataDirs; + bool useILoop{false}; }; //! 
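The BatchStream and EntropyCalibrator changes above go together: MNISTBatchStream::getDims now returns the full 4-D dimensions including the batch dimension, so the calibrator sizes its device buffer as volume(dims) without multiplying by getBatchSize(). As a reference for that computation, here is a sketch equivalent to samplesCommon::volume; the surrounding calibrator code is only quoted in comments and assumes a {N, C, H, W} Dims.

    #include <functional>
    #include <numeric>
    #include "NvInfer.h"

    // Total element count across all dimensions (equivalent of samplesCommon::volume).
    inline int64_t volume(const nvinfer1::Dims& d)
    {
        return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>());
    }

    // With getDims() returning {4, {batchSize, C, H, W}}, the calibrator allocates
    //   mInputCount = volume(stream.getDims());
    //   cudaMalloc(&mDeviceInput, mInputCount * sizeof(float));
    // i.e. no extra batch factor is applied.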
@@ -98,8 +99,8 @@ inline bool parseArgs(Args& args, int argc, char* argv[]) { int arg; static struct option long_options[] = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, - {"int8", no_argument, 0, 'i'}, {"fp16", no_argument, 0, 'f'}, {"useDLACore", required_argument, 0, 'u'}, - {"batch", required_argument, 0, 'b'}, {nullptr, 0, nullptr, 0}}; + {"int8", no_argument, 0, 'i'}, {"fp16", no_argument, 0, 'f'}, {"useILoop", no_argument, 0, 'l'}, + {"useDLACore", required_argument, 0, 'u'}, {"batch", required_argument, 0, 'b'}, {nullptr, 0, nullptr, 0}}; int option_index = 0; arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); if (arg == -1) @@ -123,6 +124,7 @@ inline bool parseArgs(Args& args, int argc, char* argv[]) break; case 'i': args.runInInt8 = true; break; case 'f': args.runInFp16 = true; break; + case 'l': args.useILoop = true; break; case 'u': if (optarg) { diff --git a/samples/common/buffers.h b/samples/common/buffers.h index 47abf8ce..705ce586 100644 --- a/samples/common/buffers.h +++ b/samples/common/buffers.h @@ -344,6 +344,7 @@ class BufferManager case nvinfer1::DataType::kFLOAT: print(os, buf, bufSize, rowCount); break; case nvinfer1::DataType::kHALF: print(os, buf, bufSize, rowCount); break; case nvinfer1::DataType::kINT8: assert(0 && "Int8 network-level input and output is not supported"); break; + case nvinfer1::DataType::kBOOL: assert(0 && "Bool network-level input and output are not supported"); break; } } diff --git a/samples/common/common.h b/samples/common/common.h index 17032bd7..12feca56 100644 --- a/samples/common/common.h +++ b/samples/common/common.h @@ -63,8 +63,6 @@ using namespace plugin; #define ENABLE_DLA_API 1 #endif -#define TRT_UNUSED (void) - #define CHECK(status) \ do \ { \ @@ -607,6 +605,7 @@ inline unsigned int getElementSize(nvinfer1::DataType t) case nvinfer1::DataType::kINT32: return 4; case nvinfer1::DataType::kFLOAT: return 4; case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kBOOL: case nvinfer1::DataType::kINT8: return 1; } throw std::runtime_error("Invalid DataType."); @@ -625,6 +624,7 @@ inline unsigned int elementSize(DataType t) case DataType::kINT32: case DataType::kFLOAT: return 4; case DataType::kHALF: return 2; + case DataType::kBOOL: case DataType::kINT8: return 1; } return 0; diff --git a/samples/common/logging.h b/samples/common/logging.h index 63a0a3a1..602b69fb 100644 --- a/samples/common/logging.h +++ b/samples/common/logging.h @@ -72,7 +72,7 @@ class LogStreamConsumerBuffer : public std::stringbuf std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; - std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mon << "/"; + std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; diff --git a/samples/common/sampleConfig.h b/samples/common/sampleConfig.h index a5b29c13..d106eb5d 100644 --- a/samples/common/sampleConfig.h +++ b/samples/common/sampleConfig.h @@ -41,6 +41,7 @@ class SampleConfig : public nvonnxparser::IOnnxConfig std::string mReferenceFilename; std::string mOutputFilename; std::string mCalibrationFilename; + int64_t mLabel{-1}; int64_t mMaxBatchSize{32}; int64_t mMaxWorkspaceSize{1 * 1024 * 1024 * 1024}; int64_t mCalibBatchSize{0}; @@ -137,6 +138,14 @@ class 
SampleConfig : public nvonnxparser::IOnnxConfig { mFullTextFilename = string(fullTextFilename); } + void setLabel(int64_t label) + { + mLabel = label; + }//!< set the Label + int64_t getLabel() const + { + return mLabel; + }//!< get the Label bool getPrintLayerInfo() const { return mPrintLayercInfo; diff --git a/samples/common/sampleDevice.h b/samples/common/sampleDevice.h index 28bff280..b0efeca6 100644 --- a/samples/common/sampleDevice.h +++ b/samples/common/sampleDevice.h @@ -38,13 +38,21 @@ class TrtCudaEvent; namespace { +#if CUDA_VERSION < 10000 +void cudaSleep(cudaStream_t stream, cudaError_t status, void* sleep) +#else void cudaSleep(void* sleep) +#endif { std::this_thread::sleep_for(std::chrono::duration(*static_cast(sleep))); } } +//! +//! \class TrtCudaStream +//! \brief Managed CUDA stream +//! class TrtCudaStream { public: @@ -76,7 +84,11 @@ class TrtCudaStream void sleep(int* ms) { +#if CUDA_VERSION < 10000 + cudaCheck(cudaStreamAddCallback(mStream, cudaSleep, ms, 0)); +#else cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms)); +#endif } private: @@ -84,17 +96,20 @@ class TrtCudaStream cudaStream_t mStream{}; }; +//! +//! \class TrtCudaEvent +//! \brief Managed CUDA event +//! class TrtCudaEvent { public: - TrtCudaEvent(unsigned int flags) + explicit TrtCudaEvent(bool blocking = true) { + const unsigned int flags = blocking ? cudaEventBlockingSync : cudaEventDefault; cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); } - TrtCudaEvent() = default; - TrtCudaEvent(const TrtCudaEvent&) = delete; TrtCudaEvent& operator=(const TrtCudaEvent&) = delete; @@ -123,16 +138,11 @@ class TrtCudaEvent cudaCheck(cudaEventSynchronize(mEvent)); } - void reset(unsigned int flags = cudaEventDefault) - { - cudaCheck(cudaEventDestroy(mEvent)); - cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); - } - + // Returns time elapsed time in milliseconds float operator-(const TrtCudaEvent& e) const { float time{0}; - cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); + cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); return time; } @@ -146,6 +156,10 @@ inline void TrtCudaStream::wait(TrtCudaEvent& event) cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0)); } +//! +//! \class TrtCudaBuffer +//! \brief Managed buffer for host and device +//! template class TrtCudaBuffer { @@ -232,6 +246,47 @@ using TrtDeviceBuffer = TrtCudaBuffer; using TrtHostBuffer = TrtCudaBuffer; +//! +//! \class MirroredBuffer +//! \brief Coupled host and device buffers +//! 
+class MirroredBuffer +{ +public: + + void allocate(size_t size) + { + mSize = size; + mHostBuffer.allocate(size); + mDeviceBuffer.allocate(size); + } + + void* getDeviceBuffer() const { return mDeviceBuffer.get(); } + + void* getHostBuffer() const { return mHostBuffer.get(); } + + void hostToDevice(TrtCudaStream& stream) + { + cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); + } + + void deviceToHost(TrtCudaStream& stream) + { + cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); + } + + int getSize() const + { + return mSize; + } + +private: + + int mSize{0}; + TrtHostBuffer mHostBuffer; + TrtDeviceBuffer mDeviceBuffer; +}; + } // namespace sample #endif // TRT_SAMPLE_DEVICE_H diff --git a/samples/common/sampleEngines.cpp b/samples/common/sampleEngines.cpp index d53970d0..2aa8051b 100644 --- a/samples/common/sampleEngines.cpp +++ b/samples/common/sampleEngines.cpp @@ -286,12 +286,10 @@ ICudaEngine* networkToEngine(const BuildOptions& build, const SystemOptions& sys } else { - if (!build.shapes.empty()) - { - profile = builder.createOptimizationProfile(); - } + profile = builder.createOptimizationProfile(); } + bool hasDynamicShapes{false}; for (unsigned int i = 0, n = network.getNbInputs(); i < n; i++) { // Set formats and data types of inputs @@ -303,34 +301,79 @@ ICudaEngine* networkToEngine(const BuildOptions& build, const SystemOptions& sys } else { - input->setType(DataType::kFLOAT); + switch (input->getType()) + { + case DataType::kINT32: + case DataType::kBOOL: + // Leave these as is. + break; + case DataType::kFLOAT: + case DataType::kINT8: + case DataType::kHALF: + // User did not specify a floating-point format. Default to kFLOAT. + input->setType(DataType::kFLOAT); + break; + } input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); } if (profile) { - if (input->isShapeTensor()) - { - err << "Shape tensor inputs are unsupported" << std::endl; - return nullptr; - } - Dims profileDims = input->getDimensions();; - auto shape = build.shapes.find(input->getName()); - if (shape == build.shapes.end()) + Dims dims = input->getDimensions(); + const bool isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int dim){ return dim == -1; }) || input->isShapeTensor(); + if (isDynamicInput) { - err << "Dynamic dimensions required for input " << input->getName() << std::endl; - return nullptr; + hasDynamicShapes = true; + auto shape = build.shapes.find(input->getName()); + ShapeRange shapes{}; + + // If no shape is provided, set dynamic dimensions to 1. + if (shape == build.shapes.end()) + { + constexpr int DEFAULT_DIMENSION = 1; + Dims staticDims{}; + if (input->isShapeTensor()) + { + staticDims.nbDims = dims.d[0]; + std::fill(staticDims.d, staticDims.d + staticDims.nbDims, DEFAULT_DIMENSION); + } + else + { + staticDims.nbDims = dims.nbDims; + std::transform(dims.d, dims.d + dims.nbDims, staticDims.d, [&DEFAULT_DIMENSION](int dim) { return dim > 0 ? dim : DEFAULT_DIMENSION; }); + } + gLogWarning << "Dynamic dimensions required for input: " << input->getName() << ", but no shapes were provided. 
Automatically overriding shape to: " << staticDims << std::endl; + std::fill(shapes.begin(), shapes.end(), staticDims); + } + else + { + shapes = shape->second; + } + + Dims profileDims{}; + if (input->isShapeTensor()) + { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + profile->setShapeValues(input->getName(), OptProfileSelector::kMIN, profileDims.d, profileDims.nbDims); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + profile->setShapeValues(input->getName(), OptProfileSelector::kOPT, profileDims.d, profileDims.nbDims); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + profile->setShapeValues(input->getName(), OptProfileSelector::kMAX, profileDims.d, profileDims.nbDims); + } + else + { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + profile->setDimensions(input->getName(), OptProfileSelector::kMIN, profileDims); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + profile->setDimensions(input->getName(), OptProfileSelector::kOPT, profileDims); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + profile->setDimensions(input->getName(), OptProfileSelector::kMAX, profileDims); + } } - profileDims = shape->second[static_cast(OptProfileSelector::kMIN)]; - profile->setDimensions(input->getName(), OptProfileSelector::kMIN, profileDims); - profileDims = shape->second[static_cast(OptProfileSelector::kOPT)]; - profile->setDimensions(input->getName(), OptProfileSelector::kOPT, profileDims); - profileDims = shape->second[static_cast(OptProfileSelector::kMAX)]; - profile->setDimensions(input->getName(), OptProfileSelector::kMAX, profileDims); } } - if (profile) + if (profile && hasDynamicShapes) { if (!profile->isValid()) { @@ -351,7 +394,6 @@ ICudaEngine* networkToEngine(const BuildOptions& build, const SystemOptions& sys } else { - output->setType(DataType::kFLOAT); output->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); } } @@ -424,7 +466,8 @@ ICudaEngine* modelToEngine( err << "Builder creation failed" << std::endl; return nullptr; } - auto batchFlag = build.maxBatch ? 0U : 1U + const bool isOnnxModel = model.baseModel.format == ModelFormat::kONNX; + auto batchFlag = (build.maxBatch && !isOnnxModel) ? 0U : 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); TrtUniquePtr network{builder->createNetworkV2(batchFlag)}; if (!network) diff --git a/samples/common/sampleInference.cpp b/samples/common/sampleInference.cpp index 1e47bff6..0029a479 100644 --- a/samples/common/sampleInference.cpp +++ b/samples/common/sampleInference.cpp @@ -14,17 +14,21 @@ * limitations under the License. 
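The profile handling above is the core of dynamic-shape support: every input with a -1 dimension (or a shape-tensor input) gets a min/opt/max triple registered on the optimization profile, and the profile is only attached to the build when at least one input was dynamic. Condensed to a single execution-tensor input, the underlying TensorRT calls look roughly like this; builder, config, the input name and the concrete dimensions are assumptions for illustration, not values from the patch:

    nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 224, 224});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{8, 3, 224, 224});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{32, 3, 224, 224});
    if (profile->isValid())
    {
        config->addOptimizationProfile(profile);   // only needed when the network has dynamic inputs
    }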
*/ +#include #include -#include #include -#include +#include +#include #include #include #include #include +#include #include "NvInfer.h" +#include "logger.h" +#include "sampleDevice.h" #include "sampleUtils.h" #include "sampleOptions.h" #include "sampleReporting.h" @@ -35,15 +39,15 @@ namespace sample void setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inference) { - if (iEnv.profiler) - { - iEnv.context.front()->setProfiler(iEnv.profiler.get()); - } for (int s = 0; s < inference.streams; ++s) { iEnv.context.emplace_back(iEnv.engine->createExecutionContext()); + iEnv.bindings.emplace_back(new Bindings); + } + if (iEnv.profiler) + { + iEnv.context.front()->setProfiler(iEnv.profiler.get()); } - iEnv.bindings.resize(inference.streams); // Set all input dimensions before all bindings can be allocated for (int b = 0; b < iEnv.engine->getNbBindings(); ++b) @@ -51,13 +55,43 @@ void setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inferenc if (iEnv.engine->bindingIsInput(b)) { auto dims = iEnv.context.front()->getBindingDimensions(b); - if (std::any_of(dims.d, dims.d + dims.nbDims, [](int d) { return d == -1; })) + const bool isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int dim){ return dim == -1; }) || iEnv.engine->isShapeBinding(b); + if (isDynamicInput) { auto shape = inference.shapes.find(iEnv.engine->getBindingName(b)); - std::copy(shape->second.d, shape->second.d + shape->second.nbDims, dims.d); + + // If no shape is provided, set dynamic dimensions to 1. + nvinfer1::Dims staticDims{}; + if (shape == inference.shapes.end()) + { + constexpr int DEFAULT_DIMENSION = 1; + if (iEnv.engine->isShapeBinding(b)) + { + staticDims.nbDims = dims.d[0]; + std::fill(staticDims.d, staticDims.d + staticDims.nbDims, DEFAULT_DIMENSION); + } + else + { + staticDims.nbDims = dims.nbDims; + std::transform(dims.d, dims.d + dims.nbDims, staticDims.d, [&](int dim) { return dim > 0 ? dim : DEFAULT_DIMENSION; }); + } + gLogWarning << "Dynamic dimensions required for input: " << iEnv.engine->getBindingName(b) << ", but no shapes were provided. 
Automatically overriding shape to: " << staticDims << std::endl; + } + else + { + staticDims = shape->second; + } + for (auto& c : iEnv.context) { - c->setBindingDimensions(b, dims); + if (iEnv.engine->isShapeBinding(b)) + { + c->setInputShapeBinding(b, staticDims.d); + } + else + { + c->setBindingDimensions(b, staticDims); + } } } } @@ -65,169 +99,298 @@ void setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inferenc for (int b = 0; b < iEnv.engine->getNbBindings(); ++b) { - auto dims = iEnv.context.front()->getBindingDimensions(b); - auto vecDim = iEnv.engine->getBindingVectorizedDim(b); - if (vecDim != -1) - { - dims.d[vecDim] = roundUp(dims.d[vecDim], iEnv.engine->getBindingComponentsPerElement(b)); - } - auto name = iEnv.engine->getBindingName(b); - auto vol = volume(dims) * std::max(inference.batch, 1); - vol *= dataTypeSize(iEnv.engine->getBindingDataType(b)); - for (auto& bin : iEnv.bindings) + const auto dims = iEnv.context.front()->getBindingDimensions(b); + const auto vecDim = iEnv.engine->getBindingVectorizedDim(b); + const auto comps = iEnv.engine->getBindingComponentsPerElement(b); + const auto dataType = iEnv.engine->getBindingDataType(b); + const auto vol = volume(dims, vecDim, comps, inference.batch); + const auto name = iEnv.engine->getBindingName(b); + const auto isInput = iEnv.engine->bindingIsInput(b); + for (auto& bindings : iEnv.bindings) { - bin.addBinding(b, name, vol); + const auto input = inference.inputs.find(name); + if (isInput && input != inference.inputs.end()) + { + bindings->addBinding(b, name, isInput, vol, dataType, input->second); + } + else + { + bindings->addBinding(b, name, isInput, vol, dataType); + } } } } namespace { -struct SynchStruct +//! +//! \struct SyncStruct +//! \brief Threads synchronization structure +//! +struct SyncStruct { std::mutex mutex; TrtCudaStream mainStream; TrtCudaEvent mainStart{cudaEventBlockingSync}; int sleep{0}; - InferenceTime totalTime; }; -struct IterStruct +//! +//! \class EnqueueImplicit +//! \brief Functor to enqueue inference with implict batch +//! +class EnqueueImplicit { - TrtCudaStream stream; - TrtCudaEvent start{cudaEventBlockingSync}; - TrtCudaEvent end{cudaEventBlockingSync}; - nvinfer1::IExecutionContext* context{nullptr}; - void** buffers{nullptr}; -}; -inline -void enqueue(nvinfer1::IExecutionContext& context, int batch, void** buffers, TrtCudaStream& stream) -{ - if (batch) +public: + + explicit EnqueueImplicit(int batch): mBatch(batch) {} + + void operator() (nvinfer1::IExecutionContext& context, void** buffers, TrtCudaStream& stream) const { - context.enqueue(batch, buffers, stream.get(), nullptr); + context.enqueue(mBatch, buffers, stream.get(), nullptr); } - else + +private: + + int mBatch{}; +}; + +//! +//! \class EnqueueExplicit +//! \brief Functor to enqueue inference with explict batch +//! 
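Each in-flight iteration owns three streams (input copy, compute, output copy) and six events, one start/end pair per stage, so that the H2D transfer, the kernel execution and the D2H transfer of consecutive queries can overlap. Stripped of the wrapper classes, the dependency chain built per query looks roughly like the raw CUDA sketch below; ctx, the buffers and the byte counts are placeholders rather than names from the patch:

    cudaStream_t in, compute, out;
    cudaEvent_t inDone, computeDone;
    cudaStreamCreate(&in); cudaStreamCreate(&compute); cudaStreamCreate(&out);
    cudaEventCreate(&inDone); cudaEventCreate(&computeDone);

    cudaMemcpyAsync(dIn, hIn, inBytes, cudaMemcpyHostToDevice, in);      // stage 1: input DMA
    cudaEventRecord(inDone, in);

    cudaStreamWaitEvent(compute, inDone, 0);                             // compute waits for input DMA
    ctx->enqueueV2(deviceBuffers, compute, nullptr);                     // stage 2: inference
    cudaEventRecord(computeDone, compute);

    cudaStreamWaitEvent(out, computeDone, 0);                            // output DMA waits for compute
    cudaMemcpyAsync(hOut, dOut, outBytes, cudaMemcpyDeviceToHost, out);  // stage 3: output DMA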
+class EnqueueExplicit +{ + +public: + + void operator() (nvinfer1::IExecutionContext& context, void** buffers, TrtCudaStream& stream) const { context.enqueueV2(buffers, stream.get(), nullptr); } -} +}; + +using EnqueueFunction = std::function; -void inferenceLoop(std::vector& streamItor, SynchStruct& synch, int batch, int iterations, float maxDuration, float warmup, std::vector& times) +enum class StreamType : int { - float duration = 0; - float gpuStart = 0; - int skip = 0; + kINPUT = 0, + kCOMPUTE = 1, + kOUTPUT = 2, + kNUM = 3 +}; + +enum class EventType : int +{ + kINPUT_S = 0, + kINPUT_E = 1, + kCOMPUTE_S = 2, + kCOMPUTE_E = 3, + kOUTPUT_S = 4, + kOUTPUT_E = 5, + kNUM = 6 +}; - auto loopStart = std::chrono::high_resolution_clock::now(); - auto timingStart{loopStart}; - auto timingEnd{loopStart}; +using MultiStream = std::array(StreamType::kNUM)>; - for (int i = 0; i - skip < iterations || duration < maxDuration; ++i) - { - auto iterStart = std::chrono::high_resolution_clock::now(); +using MultiEvent = std::array, static_cast(EventType::kNUM)>; + +//! +//! \class Iteration +//! \brief Inference iteration and streams management +//! +class Iteration +{ - for (auto& s : streamItor) +public: + + Iteration(int id, bool overlap, bool spin, nvinfer1::IExecutionContext& context, Bindings& bindings, + EnqueueFunction enqueue): mContext(context), mBindings(bindings), mEnqueue(enqueue), + mStreamId(id), mDepth(1 + overlap), mActive(mDepth), mEvents(mDepth) + { + for (int d = 0; d < mDepth; ++d) { - s.start.record(s.stream); - enqueue(*s.context, batch, s.buffers, s.stream); - s.end.record(s.stream); + for (int e = 0; e < static_cast(EventType::kNUM); ++e) + { + mEvents[d][e].reset(new TrtCudaEvent(!spin)); + } } - float currentGpuStart = std::numeric_limits::max(); - for (auto& s : streamItor) + } + + void query() + { + if (mActive[mNext]) { - s.end.synchronize(); - currentGpuStart = std::min(currentGpuStart, s.start - synch.mainStart); + return; } - auto iterEnd = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration(iterEnd - loopStart).count(); + record(EventType::kINPUT_S, StreamType::kINPUT); + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + record(EventType::kINPUT_E, StreamType::kINPUT); - if (duration < warmup) - { - ++skip; - timingStart = std::chrono::high_resolution_clock::now(); - gpuStart = currentGpuStart; - continue; - } - else + wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute + record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE); + mEnqueue(mContext, mBindings.getDeviceBuffers(), getStream(StreamType::kCOMPUTE)); + record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE); + + wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA + record(EventType::kOUTPUT_S, StreamType::kOUTPUT); + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + record(EventType::kOUTPUT_E, StreamType::kOUTPUT); + + mActive[mNext] = true; + moveNext(); + } + + float sync(const TrtCudaEvent& start, std::vector& trace) + { + if (mActive[mNext]) { - timingEnd = iterEnd; + getEvent(EventType::kOUTPUT_E).synchronize(); + trace.emplace_back(getTrace(start)); + mActive[mNext] = false; + return getEvent(EventType::kCOMPUTE_S) - start; } + return 0; + } - float latency = std::chrono::duration(iterEnd - iterStart).count(); - for (auto& s : streamItor) + void syncAll(const TrtCudaEvent& start, std::vector& trace) + { + for (int d = 0; d < mDepth; ++d) { - float gpuTime = s.end - s.start; - 
times.emplace_back(latency, gpuTime); + sync(start, trace); + moveNext(); } } - float totalLatency = std::chrono::duration(timingEnd - timingStart).count(); - float totalGpuTime = 0; - for (auto& s : streamItor) + void wait(TrtCudaEvent& start) { - totalGpuTime = std::max(totalGpuTime, s.end - synch.mainStart); + getStream(StreamType::kINPUT).wait(start); + } + +private: + + void moveNext() + { + mNext = mDepth - 1 - mNext; } - totalGpuTime -= gpuStart; - times.emplace_back(totalLatency, totalGpuTime); -} -void inferenceExecution(const InferenceOptions& inference, InferenceEnvironment& iEnv, SynchStruct& synch, int offset, int streams, std::vector& trace) + TrtCudaStream& getStream(StreamType t) + { + return mStream[static_cast(t)]; + } + + TrtCudaEvent& getEvent(EventType t) + { + return *mEvents[mNext][static_cast(t)]; + } + + void record(EventType e, StreamType s) + { + getEvent(e).record(getStream(s)); + } + + void wait(EventType e, StreamType s) + { + getStream(s).wait(getEvent(e)); + } + + InferenceTrace getTrace(const TrtCudaEvent& start) + { + return InferenceTrace(mStreamId, getEvent(EventType::kINPUT_S) - start, getEvent(EventType::kINPUT_E) - start, + getEvent(EventType::kCOMPUTE_S) - start, getEvent(EventType::kCOMPUTE_E) - start, + getEvent(EventType::kOUTPUT_S)- start, getEvent(EventType::kOUTPUT_E)- start); + } + + nvinfer1::IExecutionContext& mContext; + Bindings& mBindings; + + EnqueueFunction mEnqueue; + + int mStreamId{0}; + int mNext{0}; + int mDepth{2}; // default to double buffer to hide DMA transfers + + std::vector mActive; + MultiStream mStream; + std::vector mEvents; +}; + +using IterationStreams = std::vector>; + +void inferenceLoop(IterationStreams& iStreams, const TrtCudaEvent& mainStart, int batch, int iterations, float maxDurationMs, float warmupMs, std::vector& trace) { - float warmup = static_cast(inference.warmup); - float duration = static_cast(inference.duration * 1000 + inference.warmup); + float durationMs = 0; + int skip = 0; - std::vector streamItor(streams); - for (auto& s : streamItor) + for (int i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) { - if (inference.spin) + for (auto& s : iStreams) { - s.start.reset(cudaEventDefault); - s.end.reset(cudaEventDefault); + s->query(); } - - s.context = iEnv.context[offset].get(); - s.buffers = iEnv.bindings[offset].getDeviceBuffers(); - ++offset; + for (auto& s : iStreams) + { + durationMs = std::max(durationMs, s->sync(mainStart, trace)); + } + if (durationMs < warmupMs) // Warming up + { + if (durationMs) // Skip complete iterations + { + ++skip; + } + continue; + } + } + for (auto& s : iStreams) + { + s->syncAll(mainStart, trace); } +} + +void inferenceExecution(const InferenceOptions& inference, InferenceEnvironment& iEnv, SyncStruct& sync, int offset, int streams, std::vector& trace) +{ + float warmupMs = static_cast(inference.warmup); + float durationMs = static_cast(inference.duration) * 1000 + warmupMs; - // Allocate enough space for all iterations and the duration assuming 1ms inference - // to avoid allocations during timing - std::vector times; - times.reserve(static_cast(std::max(inference.iterations, static_cast(duration * 1000)))); + auto enqueue = inference.batch ? 
EnqueueFunction(EnqueueImplicit(inference.batch)) : EnqueueFunction(EnqueueExplicit()); + + IterationStreams iStreams; + for (int s = 0; s < streams; ++s) + { + iStreams.emplace_back(new Iteration(offset + s, inference.overlap, inference.spin, *iEnv.context[offset], *iEnv.bindings[offset], enqueue)); + } - for (auto& s : streamItor) + for (auto& s : iStreams) { - s.stream.wait(synch.mainStart); + s->wait(sync.mainStart); } - inferenceLoop(streamItor, synch, inference.batch, inference.iterations, duration, warmup, times); + std::vector localTrace; + inferenceLoop(iStreams, sync.mainStart, inference.batch, inference.iterations, durationMs, warmupMs, localTrace); - synch.mutex.lock(); - trace.insert(trace.end(), times.begin(), times.end() - 1); - synch.totalTime.latency = std::max(synch.totalTime.latency, times.back().latency); - synch.totalTime.gpuTime = std::max(synch.totalTime.gpuTime, times.back().gpuTime); - synch.mutex.unlock(); + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); } inline -std::thread makeThread(const InferenceOptions& inference, InferenceEnvironment& iEnv, SynchStruct& synch, int thread, int streamsPerThread, std::vector& trace) +std::thread makeThread(const InferenceOptions& inference, InferenceEnvironment& iEnv, SyncStruct& sync, int thread, int streamsPerThread, std::vector& trace) { - return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(synch), thread, streamsPerThread, std::ref(trace)); + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), std::ref(sync), thread, streamsPerThread, std::ref(trace)); } } // namespace -void runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, std::vector& trace) +void runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, std::vector& trace) { trace.resize(0); - SynchStruct synch; - synch.sleep = inference.sleep; - synch.mainStream.sleep(&synch.sleep); - synch.mainStart.record(synch.mainStream); + SyncStruct sync; + sync.sleep = inference.sleep; + sync.mainStream.sleep(&sync.sleep); + sync.mainStart.record(sync.mainStream); int threadsNum = inference.threads ? inference.streams : 1; int streamsPerThread = inference.streams / threadsNum; @@ -235,14 +398,15 @@ void runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, std::vector threads; for (int t = 0; t < threadsNum; ++t) { - threads.emplace_back(makeThread(inference, iEnv, synch, t, streamsPerThread, trace)); + threads.emplace_back(makeThread(inference, iEnv, sync, t, streamsPerThread, trace)); } for (auto& th : threads) { th.join(); } - trace.emplace_back(synch.totalTime); + auto cmpTrace = [](const InferenceTrace& a, const InferenceTrace& b) { return a.inStart < b.inStart; }; + std::sort(trace.begin(), trace.end(), cmpTrace); } } // namespace sample diff --git a/samples/common/sampleInference.h b/samples/common/sampleInference.h index 5a4b0b41..6a293087 100644 --- a/samples/common/sampleInference.h +++ b/samples/common/sampleInference.h @@ -33,9 +33,9 @@ namespace sample struct InferenceEnvironment { TrtUniquePtr engine; - std::unique_ptr profiler; + std::unique_ptr profiler; std::vector> context; - std::vector bindings; + std::vector> bindings; }; //! @@ -46,7 +46,7 @@ void setUpInference(InferenceEnvironment& iEnv, const InferenceOptions& inferenc //! //! \brief Run inference and collect timing //! 
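Taken together, sampleInference.h expects a caller to populate an InferenceEnvironment with a deserialized engine, let setUpInference create the execution contexts and bindings, and then hand a trace vector to runInference. A hedged sketch of that call sequence, with error handling omitted; engine (a raw ICudaEngine*) and options (an InferenceOptions) are assumed to exist already:

    sample::InferenceEnvironment iEnv;
    iEnv.engine.reset(engine);                  // TrtUniquePtr takes ownership

    sample::setUpInference(iEnv, options);      // create contexts, allocate and fill bindings

    std::vector<sample::InferenceTrace> trace;
    if (!options.skip)
    {
        sample::runInference(options, iEnv, trace);
    }
    // trace now holds one InferenceTrace per completed query, sorted by input start time.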
-void runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, std::vector& trace); +void runInference(const InferenceOptions& inference, InferenceEnvironment& iEnv, std::vector& trace); } // namespace sample diff --git a/samples/common/sampleOptions.cpp b/samples/common/sampleOptions.cpp index 5df27f75..52ae56b0 100644 --- a/samples/common/sampleOptions.cpp +++ b/samples/common/sampleOptions.cpp @@ -24,6 +24,7 @@ #include "NvInfer.h" +#include "sampleUtils.h" #include "sampleOptions.h" namespace sample @@ -142,6 +143,37 @@ inline IOFormat stringToValue(const std::string& option) return ioFormat; } +template +inline std::pair splitNameAndValue(const std::string& s) +{ + std::string tensorName; + std::string valueString; + std::vector nameWithQuotes{splitToStringVec(s, '\'')}; + if (nameWithQuotes.size() == 1) + { + // Name not wrapped with single quotes + std::vector nameRange{splitToStringVec(s, ':')}; + tensorName = nameRange[0]; + valueString = nameRange[1]; + } + else + { + // Name wrapped with single quotes + tensorName = nameWithQuotes[1]; + valueString = splitToStringVec(nameWithQuotes[2], ':')[1]; + } + return std::pair(tensorName, stringToValue(valueString)); +} + +template +inline void splitInsertKeyValue(const std::vector& kvList, T& map) +{ + for (const auto& kv : kvList) + { + map.insert(splitNameAndValue(kv)); + } +} + inline const char* boolToEnabled(bool enable) { return enable ? "Enabled" : "Disabled"; @@ -342,37 +374,23 @@ void BuildOptions::parse(Arguments& arguments) getFormats(outputFormats, "--outputIOFormats"); auto getShapes = [&arguments](std::unordered_map& shapes, const char* argument, - nvinfer1::OptProfileSelector selector) - { + nvinfer1::OptProfileSelector selector) { std::string list; checkEraseOption(arguments, argument, list); std::vector shapeList{splitToStringVec(list, ',')}; for (const auto& s : shapeList) { - std::string tensorName, dimsString; - std::vector nameWithQuotes{splitToStringVec(s, '\'')}; - if (nameWithQuotes.size() == 1) - { - // Name not wrapped with single quotes - std::vector nameRange{splitToStringVec(s, ':')}; - tensorName = nameRange[0]; - dimsString = nameRange[1]; - } - else - { - // Name wrapped with single quotes - tensorName = nameWithQuotes[1]; - dimsString = splitToStringVec(nameWithQuotes[2], ':')[1]; - } + auto nameDimsPair = splitNameAndValue(s); + std::string tensorName = nameDimsPair.first; + nvinfer1::Dims dims = nameDimsPair.second; if (shapes.find(tensorName) == shapes.end()) { - auto dims = stringToValue(dimsString); insertShapes(shapes, tensorName, dims); } else { - shapes[tensorName][static_cast(selector)] = stringToValue(dimsString); + shapes[tensorName][static_cast(selector)] = dims; } } }; @@ -444,32 +462,25 @@ void InferenceOptions::parse(Arguments& arguments) checkEraseOption(arguments, "--duration", duration); checkEraseOption(arguments, "--warmUp", warmup); checkEraseOption(arguments, "--sleepTime", sleep); + bool exposeDMA{false}; + if (checkEraseOption(arguments, "--exposeDMA", exposeDMA)) + { + overlap = !exposeDMA; + } checkEraseOption(arguments, "--useSpinWait", spin); checkEraseOption(arguments, "--threads", threads); - checkEraseOption(arguments, "--loadInputs", inputs); checkEraseOption(arguments, "--useCudaGraph", graph); checkEraseOption(arguments, "--buildOnly", skip); std::string list; + checkEraseOption(arguments, "--loadInputs", list); + std::vector inputsList{splitToStringVec(list, ',')}; + splitInsertKeyValue(inputsList, inputs); + + list.erase(); checkEraseOption(arguments, 
"--shapes", list); std::vector shapeList{splitToStringVec(list, ',')}; - for (const auto& s : shapeList) - { - std::vector nameWithQuotes{splitToStringVec(s, '\'')}; - if (nameWithQuotes.size() == 1) - { - // Name not wrapped with single quotes - std::vector shapeSpec{splitToStringVec(s, ':')}; - shapes.insert({shapeSpec[0], stringToValue(shapeSpec[1])}); - } - else - { - // Name wrapped with single quotes - std::string tensorName = nameWithQuotes[1]; - std::string dimsString = splitToStringVec(nameWithQuotes[2], ':')[1]; - shapes.insert({tensorName, stringToValue(dimsString)}); - } - } + splitInsertKeyValue(shapeList, shapes); int batchOpt{0}; checkEraseOption(arguments, "--batch", batchOpt); @@ -521,7 +532,7 @@ void AllOptions::parse(Arguments& arguments) system.parse(arguments); inference.parse(arguments); - if ((!build.maxBatch && inference.batch && inference.batch != defaultBatch) + if ((!build.maxBatch && inference.batch && inference.batch != defaultBatch && !build.shapes.empty()) || (build.maxBatch && build.maxBatch != defaultMaxBatch && !inference.batch)) { // If either has selected implict batch and the other has selected explicit batch @@ -705,6 +716,11 @@ std::ostream& operator<<(std::ostream& os, const IOFormat& format) os << "int32:"; break; } + case nvinfer1::DataType::kBOOL: + { + os << "Bool:"; + break; + } } for (int f = 0; f < nvinfer1::EnumMax(); ++f) @@ -753,15 +769,6 @@ std::ostream& operator<<(std::ostream& os, const IOFormat& format) return os; }; -std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) -{ - for (int i = 0; i < dims.nbDims; ++i) - { - os << (i ? "x" : "") << dims.d[i]; - } - return os; -} - std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) { int i = 0; @@ -837,32 +844,37 @@ std::ostream& operator<<(std::ostream& os, const SystemOptions& options) std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) { // clang-format off - os << "=== Inference Options ===" << std::endl << + os << "=== Inference Options ===" << std::endl << "Batch: "; if (options.batch && options.shapes.empty()) { - os << options.batch << std::endl; + os << options.batch << std::endl; } else { - os << "Explicit" << std::endl; + os << "Explicit" << std::endl; } - os << "Iterations: " << options.iterations << " (" << options.warmup << - " ms warm up)" << std::endl << - "Inputs: " << options.inputs << std::endl << - "Duration: " << options.duration << "s" << std::endl << - "Sleep time: " << options.sleep << "ms" << std::endl << - "Streams: " << options.streams << std::endl << - "Spin-wait: " << boolToEnabled(options.spin) << std::endl << - "Multithreading: " << boolToEnabled(options.threads) << std::endl << - "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << - "Skip inference: " << boolToEnabled(options.skip) << std::endl; -// clang-format on + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Streams: " << options.streams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Skip inference: " << boolToEnabled(options.skip) << std::endl; if (options.batch) { printShapes(os, "inference", options.shapes); } +// clang-format on + os 
<< "Inputs:" << std::endl; + for (const auto& input : options.inputs) + { + os << input.first << "<-" << input.second << std::endl; + } return os; } @@ -936,6 +948,7 @@ void BuildOptions::help(std::ostream& os) " provided and assuming that opt will be equal to max unless they are both specified;" << std::endl << " partially specified shapes are applied starting from the batch size;" << std::endl << " dynamic shapes imply explicit batch" << std::endl << + " input names can be wrapped with single quotes (ex: 'Input:0')" << std::endl << " Input shapes spec ::= Ishp[\",\"spec]" << std::endl << " Ishp ::= name\":\"shape" << std::endl << " shape ::= N[[\"x\"N]*\"*\"]" << std::endl << @@ -950,8 +963,8 @@ void BuildOptions::help(std::ostream& os) << defaultMinTiming << ")" << std::endl << " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " << defaultAvgTiming << ")" << std::endl << - " --fp16 Enable fp16 mode (default = disabled)" << std::endl << - " --int8 Run in int8 mode (default = disabled)" << std::endl << + " --fp16 Enable fp16 algorithms, in addition to fp32 (default = disabled)" << std::endl << + " --int8 Enable int8 algorithms, in addition to fp32 (default = disabled)" << std::endl << " --calib= Read INT8 calibration cache file" << std::endl << " --safe Only test the functionality available in safety restricted flows" << std::endl << " --saveEngine= Save the serialized engine" << std::endl << @@ -974,26 +987,31 @@ void SystemOptions::help(std::ostream& os) void InferenceOptions::help(std::ostream& os) { // clang-format off - os << "=== Inference Options ===" << std::endl << - " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << - " --shapes=spec Set input shapes for explicit batch and dynamic shapes inputs" << std::endl << - " --loadInputs= Load input values from file (default = disabled)" << std::endl << - " Input shapes spec ::= Ishp[\",\"spec]" << std::endl << - " Ishp ::= name\":\"shape" << std::endl << - " shape ::= N[[\"x\"N]*\"*\"]" << std::endl << - " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " --shapes=spec Set input shapes for dynamic shapes inputs. Input names can be wrapped with single quotes" + "(ex: 'Input:0')" << std::endl << + " Input shapes spec ::= Ishp[\",\"spec]" << std::endl << + " Ishp ::= name\":\"shape" << std::endl << + " shape ::= N[[\"x\"N]*\"*\"]" << std::endl << + " --loadInputs=spec Load input values from files (default = generate random inputs). 
Input names can be " + "wrapped with single quotes (ex: 'Input:0')" << std::endl << + " Input values spec ::= Ival[\",\"spec]" << std::endl << + " Ival ::= name\":\"file" << std::endl << + " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " - << defaultWarmUp << ")" << std::endl << + << defaultWarmUp << ")" << std::endl << " --duration=N Run performance measurements for at least N seconds wallclock time (default = " - << defaultDuration << ")" << std::endl << + << defaultDuration << ")" << std::endl << " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " - "(default = " << defaultSleep << ")" << std::endl << - " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + "(default = " << defaultSleep << ")" << std::endl << + " --streams=N Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --exposeDMA Serialize DMA transfers to and from device. (default = disabled)" << std::endl << " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " - "increase CPU usage and power (default = false)" << std::endl << - " --threads Enable multithreading to drive engines with independent threads (default = disabled)" << std::endl << - " --useCudaGraph Use cuda graph to capture engine execution and then launch inference (default = false)" << std::endl << - " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + "increase CPU usage and power (default = disabled)" << std::endl << + " --threads Enable multithreading to drive engines with independent threads (default = disabled)" << std::endl << + " --useCudaGraph Use cuda graph to capture engine execution and then launch inference (default = disabled)" << std::endl << + " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; // clang-format on } diff --git a/samples/common/sampleOptions.h b/samples/common/sampleOptions.h index 932f7aad..88fb0e56 100644 --- a/samples/common/sampleOptions.h +++ b/samples/common/sampleOptions.h @@ -145,11 +145,12 @@ struct InferenceOptions : public Options int duration{defaultDuration}; int sleep{defaultSleep}; int streams{defaultStreams}; + bool overlap{true}; bool spin{false}; bool threads{false}; bool graph{false}; bool skip{false}; - std::string inputs; + std::unordered_map inputs; std::unordered_map shapes; void parse(Arguments& arguments) override; @@ -201,8 +202,6 @@ std::ostream& operator<<(std::ostream& os, const UffInput& input); std::ostream& operator<<(std::ostream& os, const IOFormat& format); -std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims); - std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); std::ostream& operator<<(std::ostream& os, const ModelOptions& options); diff --git a/samples/common/sampleReporting.cpp b/samples/common/sampleReporting.cpp index 8d490ab9..a987ee68 100644 --- a/samples/common/sampleReporting.cpp +++ b/samples/common/sampleReporting.cpp @@ -15,7 +15,11 @@ */ #include +#include +#include #include +#include +#include #include "sampleOptions.h" #include "sampleInference.h" @@ -29,114 +33,295 @@ namespace sample namespace { +//! +//! \brief Find percentile in an ascending sequence of timings +//! 
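Because findPercentile indexes from the top of an ascending sequence, a quick worked example helps: with 8 sorted timings and percentage = 75, exclude becomes (1 - 0.75) * 8 = 2 and the function returns timings[5], the value below which 75% of the queries fall. The same arithmetic as a tiny self-check, using plain floats instead of InferenceTime objects:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
        std::vector<float> timings{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}; // ascending, in ms
        const float percentage = 75.f;
        const int all = static_cast<int>(timings.size());
        const int exclude = static_cast<int>((1 - percentage / 100) * all); // (1 - 0.75) * 8 = 2
        assert(timings[std::max(all - 1 - exclude, 0)] == 6.f);             // 75% of queries are <= 6 ms
        return 0;
    }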
template -float percentile(float percentage, const std::vector& times, const T& toFloat) +float findPercentile(float percentage, const std::vector& timings, const T& toFloat) { - int all = static_cast(times.size()); - int exclude = static_cast((1 - percentage / 100) * all); + const int all = static_cast(timings.size()); + const int exclude = static_cast((1 - percentage / 100) * all); if (0 <= exclude && exclude <= all) { - return toFloat(times[std::max(all - 1 - exclude, 0)]); + return toFloat(timings[std::max(all - 1 - exclude, 0)]); } return std::numeric_limits::infinity(); } +//! +//! \brief Find median in a sorted sequence of timings +//! template -float median(const std::vector& times, const T& val) +float findMedian(const std::vector& timings, const T& toFloat) { - if (!times.size()) + if (timings.empty()) { return std::numeric_limits::infinity(); } - int m = times.size()/2; - if (times.size() % 2) + const int m = timings.size()/2; + if (timings.size() % 2) { - return val(times[m]); + return toFloat(timings[m]); } - return (val(times[m-1]) + val(times[m])) / 2; + return (toFloat(timings[m-1]) + toFloat(timings[m])) / 2; } +inline InferenceTime traceToTiming(const InferenceTrace& a) +{ + return InferenceTime((a.inEnd - a.inStart), (a.computeEnd - a.computeStart), (a.outEnd - a.outStart), (a.outEnd - a.inStart)); +}; + } // namespace -void printTimes(std::vector& times, const ReportingOptions& reporting, float queries, std::ostream& os) +void printProlog(int warmups, int timings, float warmupMs, float benchTimeMs, std::ostream& os) { - auto toGpuTime = [](const InferenceTime& t) { return t.gpuTime; }; - auto toLatency = [](const InferenceTime& t) { return t.latency; }; - - auto cmpGpuTime = [](const InferenceTime& a, const InferenceTime& b) { return a.gpuTime < b.gpuTime; }; - auto cmpLatency = [](const InferenceTime& a, const InferenceTime& b) { return a.latency < b.latency; }; + os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" << std::endl; + os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 << " s" << std::endl; +} - int avgs{0}; - InferenceTime sum = times.back(); - times.pop_back(); - InferenceTime average; +void printTiming(const std::vector& timings, int runsPerAvg, std::ostream& os) +{ + int count = 0; + InferenceTime sum; - os << "Timing trace of " << times.size() << " iterations over " << sum.latency << " s" << std::endl; - os << "Averages of " << reporting.avgs << " iterations:" << std::endl; - for (const auto& t : times) + os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; + for (const auto& t : timings) { - average.gpuTime += t.gpuTime; - average.latency += t.latency; + sum += t; - if (++avgs == reporting.avgs) + if (++count == runsPerAvg) { // clang off - os << "GPU compute time average over " << reporting.avgs << " runs is " << average.gpuTime/reporting.avgs << " ms" - " (latency average " << average.latency/reporting.avgs << " ms)" << std::endl; + os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (end to end " + << sum.e2e / runsPerAvg << " ms)" << std::endl; // clang on - avgs = 0; - average.gpuTime = 0; - average.latency = 0; + count = 0; + sum.in = 0; + sum.compute = 0; + sum.out = 0; + sum.e2e = 0; } } - os << std::endl; +} + +void printEpilog(std::vector timings, float walltimeMs, float percentile, int queries, std::ostream& os) +{ + const InferenceTime totalTime = 
std::accumulate(timings.begin(), timings.end(), InferenceTime()); + + const auto getLatency = [](const InferenceTime& t) { return t.latency(); }; + const auto cmpLatency = [](const InferenceTime& a, const InferenceTime& b) { return a.latency() < b.latency(); }; + std::sort(timings.begin(), timings.end(), cmpLatency); + const float latencyMin = timings.front().latency(); + const float latencyMax = timings.back().latency(); + const float latencyMedian = findMedian(timings, getLatency); + const float latencyPercentile = findPercentile(percentile, timings, getLatency); + const float latencyThroughput = queries * timings.size() / walltimeMs * 1000; - std::sort(times.begin(), times.end(), cmpLatency); - float latencyMin = times.front().latency; - float latencyMax = times.back().latency; - float latencyMedian = median(times, toLatency); - float latencyPercentile = percentile(reporting.percentile, times, toLatency); - float latencyThroughput = queries * times.size() * 1000 / sum.latency; + const auto getEndToEnd = [](const InferenceTime& t) { return t.e2e; }; + const auto cmpEndToEnd = [](const InferenceTime& a, const InferenceTime& b) { return a.e2e < b.e2e; }; + std::sort(timings.begin(), timings.end(), cmpEndToEnd); + const float endToEndMin = timings.front().e2e; + const float endToEndMax = timings.back().e2e; + const float endToEndMedian = findMedian(timings, getEndToEnd); + const float endToEndPercentile = findPercentile(percentile, timings, getEndToEnd); - std::sort(times.begin(), times.end(), cmpGpuTime); - float gpuMin = times.front().gpuTime; - float gpuMax = times.back().gpuTime; - float gpuMedian = median(times, toGpuTime); - float gpuPercentile = percentile(reporting.percentile, times, toGpuTime); - float gpuThroughput = queries * times.size() * 1000 / sum.gpuTime; + const auto getCompute = [](const InferenceTime& t) { return t.compute; }; + const auto cmpCompute = [](const InferenceTime& a, const InferenceTime& b) { return a.compute < b.compute; }; + std::sort(timings.begin(), timings.end(), cmpCompute); + const float gpuMin = timings.front().compute; + const float gpuMax = timings.back().compute; + const float gpuMedian = findMedian(timings, getCompute); + const float gpuPercentile = findPercentile(percentile, timings, getCompute); // clang off - os << "Host latency" << std::endl << - "min: " << latencyMin << " ms" << std::endl << - "max: " << latencyMax << " ms" << std::endl << - "mean: " << sum.latency/times.size() << " ms" << std::endl << - "median: " << latencyMedian << " ms" << std::endl << - "percentile: " << latencyPercentile << " ms (" << reporting.percentile << "%)" << std::endl << - "throughput: " << latencyThroughput << " qps" << std::endl << - "walltime: " << sum.latency/1000 << " s" << std::endl << std::endl << - "GPU Compute" << std::endl << - "min: " << gpuMin << " ms" << std::endl << - "max: " << gpuMax << " ms" << std::endl << - "mean: " << sum.gpuTime/times.size() << " ms" << std::endl << - "median: " << gpuMedian << " ms" << std::endl << - "percentile: " << gpuPercentile << " ms (" << reporting.percentile << "%)" << std::endl << - "throughput: " << gpuThroughput << " qps" << std::endl << - "walltime: " << sum.gpuTime/1000 << " s" << std::endl << std::endl; + os << "Host latency" << std::endl << + "min: " << latencyMin << " ms " + "(end to end " << endToEndMin << " ms)" << std::endl << + "max: " << latencyMax << " ms " + "(end to end " << endToEndMax << " ms)" << std::endl << + "mean: " << totalTime.latency() / timings.size() << " ms " + "(end to end " << 
totalTime.e2e / timings.size() << " ms)" << std::endl << + "median: " << latencyMedian << " ms " + "(end to end " << endToEndMedian << " ms)" << std::endl << + "percentile: " << latencyPercentile << " ms " + "at " << percentile << "% " + "(end to end " << endToEndPercentile << " ms " + "at " << percentile << "%)" << std::endl << + "throughput: " << latencyThroughput << " qps" << std::endl << + "walltime: " << walltimeMs / 1000 << " s" << std::endl << + "GPU Compute" << std::endl << + "min: " << gpuMin << " ms" << std::endl << + "max: " << gpuMax << " ms" << std::endl << + "mean: " << totalTime.compute / timings.size() << " ms" << std::endl << + "median: " << gpuMedian << " ms" << std::endl << + "percentile: " << gpuPercentile << " ms " + "at " << percentile << "%" << std::endl << + "total compute time: " << totalTime.compute / 1000 << " s" << std::endl; // clang on } -void dumpOutputs(const nvinfer1::ICudaEngine& engine, const std::vector& bindings, std::ostream& os) +void printPerformanceReport(const std::vector& trace, const ReportingOptions& reporting, float warmupMs, int queries, std::ostream& os) +{ + const auto isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; + const auto noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); + const int warmups = noWarmup - trace.begin(); + const float benchTime = trace.back().outEnd - noWarmup->inStart; + printProlog(warmups * queries, (trace.size() - warmups) * queries, warmupMs, benchTime, os); + + std::vector timings(trace.size() - warmups); + std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); + printTiming(timings, reporting.avgs, os); + printEpilog(timings, benchTime, reporting.percentile, queries, os); +} + +//! Printed format: +//! [ value, ...] +//! value ::= { "start in" : time, "end in" : time, "start compute" : time, "end compute" : time, "start out" : time, +//! "in" : time, "compute" : time, "out" : time, "latency" : time, "end to end" : time} +//! 
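For reference, one record written by exportJSONTrace (defined next) has the shape shown below; the numbers are hypothetical and only illustrate the invariants latencyMs = inMs + computeMs + outMs and endToEndMs = endOutMs - startInMs:

    { "startInMs" : 10.0, "endInMs" : 10.4,
      "startComputeMs" : 10.4, "endComputeMs" : 12.4,
      "startOutMs" : 12.4, "endOutMs" : 12.7,
      "inMs" : 0.4, "computeMs" : 2.0, "outMs" : 0.3,
      "latencyMs" : 2.7, "endToEndMs" : 2.7 }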
+void exportJSONTrace(const std::vector& trace, const std::string& fileName) { - os << std::endl << "Output tensors" << std::endl; - for (int b = 0; b < engine.getNbBindings(); ++b) + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl; + const char* sep = " "; + for (const auto& t : trace) { - if (!engine.bindingIsInput(b)) + const InferenceTime it(traceToTiming(t)); + os << sep << "{ "; + sep = ", "; +// clang off + os << "\"startInMs\" : " << t.inStart << sep << "\"endInMs\" : " << t.inEnd << sep + << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep + << "\"startOutMs\" : " << t.outStart << sep << "\"endOutMs\" : " << t.outEnd << sep + << "\"inMs\" : " << it.in << sep << "\"computeMs\" : " << it.compute << sep + << "\"outMs\" : " << it.out << sep << "\"latencyMs\" : " << it.latency() << sep + << "\"endToEndMs\" : " << it.e2e << " }" << std::endl; +// clang on + } + os << "]" << std::endl; +} + +void Profiler::reportLayerTime(const char* layerName, float timeMs) +{ + if (mIterator == mLayers.end()) + { + const bool first = !mLayers.empty() && mLayers.begin()->name == layerName; + mUpdatesCount += mLayers.empty() || first; + if (first) + { + mIterator = mLayers.begin(); + } + else { - os << engine.getBindingName(b) << std::endl; + mLayers.emplace_back(); + mLayers.back().name = layerName; + mIterator = mLayers.end() - 1; } } + + mIterator->timeMs += timeMs; + ++mIterator; +} + +void Profiler::print(std::ostream& os) const +{ + const std::string nameHdr("Layer"); + const std::string timeHdr(" Time (ms)"); + const std::string avgHdr(" Avg. Time (ms)"); + const std::string percentageHdr(" Time \%"); + + const float totalTimeMs = getTotalTime(); + + const auto cmpLayer = [](const LayerProfile& a, const LayerProfile& b) + { + return a.name.size() < b.name.size(); + }; + const auto longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); + const auto nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); + const auto timeLength = timeHdr.size(); + const auto avgLength = avgHdr.size(); + const auto percentageLength = percentageHdr.size(); + + os << std::endl << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl + << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << percentageHdr << std::endl; + + for (const auto& p : mLayers) + { +// clang off + os << std::setw(nameLength) << p.name + << std::setw(timeLength) << std::fixed << std::setprecision(2) << p.timeMs + << std::setw(avgLength) << std::fixed << std::setprecision(2) << p.timeMs / mUpdatesCount + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << p.timeMs / totalTimeMs * 100 + << std::endl; + } + { + os << std::setw(nameLength) << "Total" + << std::setw(timeLength) << std::fixed << std::setprecision(2) << totalTimeMs + << std::setw(avgLength) << std::fixed << std::setprecision(2) << totalTimeMs / mUpdatesCount + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 + << std::endl; +// clang on + } + os << std::endl; + +} + +void Profiler::exportJSONProfile(const std::string& fileName) const +{ + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl + << " { \"count\" : " << mUpdatesCount << " }" << std::endl; + + const auto totalTimeMs = getTotalTime(); + + for (const auto& l : mLayers) + { +// clang off + os << ", {" << " \"name\" : \"" << l.name << "\"" + ", \"timeMs\" : " << l.timeMs + << ", \"averageMs\" : " << l.timeMs / mUpdatesCount + << ", 
\"percentage\" : " << l.timeMs / totalTimeMs * 100 + << " }" << std::endl; +// clang on + } + os << "]" << std::endl; +} + +void dumpInputs(const nvinfer1::IExecutionContext& context, const Bindings& bindings, std::ostream& os) +{ + os << "Input Tensors:" << std::endl; + bindings.dumpInputs(context, os); +} + +void dumpOutputs(const nvinfer1::IExecutionContext& context, const Bindings& bindings, std::ostream& os) +{ + os << "Output Tensors:" << std::endl; + bindings.dumpOutputs(context, os); +} + +void exportJSONOutput(const nvinfer1::IExecutionContext& context, const Bindings& bindings, const std::string& fileName) +{ + std::ofstream os(fileName, std::ofstream::trunc); + std::string sep =" "; + const auto output = bindings.getOutputBindings(); + os << "[" << std::endl; + for (const auto& binding : output) + { +// clang off + os << sep << "{ \"name\" : \"" << binding.first << "\"" << std::endl; + sep = ", "; + os << " " << sep << "\"dimensions\" : \""; + bindings.dumpBindingDimensions(binding.second, context, os); + os << "\"" << std::endl; + os << " " << sep << "\"values\" : [ "; + bindings.dumpBindingValues(binding.second, os, sep); + os << " ]" << std::endl << " }" << std::endl; +// clang on + } + os << "]" << std::endl; } } // namespace sample diff --git a/samples/common/sampleReporting.h b/samples/common/sampleReporting.h index 25c36c47..c30e4b78 100644 --- a/samples/common/sampleReporting.h +++ b/samples/common/sampleReporting.h @@ -22,13 +22,18 @@ #include "NvInfer.h" #include "sampleOptions.h" +#include "sampleUtils.h" namespace sample { +//! +//! \struct InferenceTime +//! \brief Measurement times in milliseconds +//! struct InferenceTime { - InferenceTime(float l, float g): latency(l), gpuTime(g) {} + InferenceTime(float i, float c, float o, float e): in(i), compute(c), out(o), e2e(e) {} InferenceTime() = default; InferenceTime(const InferenceTime&) = default; @@ -37,27 +42,136 @@ struct InferenceTime InferenceTime& operator=(InferenceTime&&) = default; ~InferenceTime() = default; - float latency{}; - float gpuTime{}; + float in{0}; // Host to Device + float compute{0}; // Compute + float out{0}; // Device to Host + float e2e{0}; // end to end + + // ideal latency + float latency() const + { + return in + compute + out; + } +}; + +//! +//! \struct InferenceTrace +//! \brief Measurement points in milliseconds +//! +struct InferenceTrace +{ + InferenceTrace(int s, float is, float ie, float cs, float ce, float os, float oe): + stream(s), inStart(is), inEnd(ie), computeStart(cs), computeEnd(ce), outStart(os), outEnd(oe) {} + + InferenceTrace() = default; + InferenceTrace(const InferenceTrace&) = default; + InferenceTrace(InferenceTrace&&) = default; + InferenceTrace& operator=(const InferenceTrace&) = default; + InferenceTrace& operator=(InferenceTrace&&) = default; + ~InferenceTrace() = default; + + int stream{0}; + float inStart{0}; + float inEnd{0}; + float computeStart{0}; + float computeEnd{0}; + float outStart{0}; + float outEnd{0}; }; inline InferenceTime operator+(const InferenceTime& a, const InferenceTime& b) { - InferenceTime sum; - sum.gpuTime = a.gpuTime + b.gpuTime; - sum.latency = a.latency + b.latency; - return sum; + return InferenceTime(a.in + b.in, a.compute + b.compute, a.out + b.out, a.e2e + b.e2e); +} + +inline InferenceTime operator+=(InferenceTime& a, const InferenceTime& b) +{ + return a = a+b; } +//! +//! \brief Print benchmarking time and number of traces collected +//! 
+void printProlog(int warmups, int timings, float warmupMs, float walltime, std::ostream& os); + +//! +//! \brief Print a timing trace +//! +void printTiming(const std::vector& timings, int runsPerAvg, std::ostream& os); + +//! +//! \brief Print the performance summary of a trace +//! +void printEpilog(std::vector timings, float percentile, int queries, std::ostream& os); + //! //! \brief Print and summarize a timing trace //! -void printTimes(std::vector& times, const ReportingOptions& reporting, float queries, std::ostream& os); +void printPerformanceReport(const std::vector& trace, const ReportingOptions& reporting, float warmupMs, int queries, std::ostream& os); + +//! +//! \brief Export a timing trace to JSON file +//! +void exportJSONTrace(const std::vector& trace, const std::string& fileName); + +//! +//! \brief Print input tensors to stream +//! +void dumpInputs(const nvinfer1::IExecutionContext& context, const Bindings& bindings, std::ostream& os); //! //! \brief Print output tensors to stream //! -void dumpOutputs(const nvinfer1::ICudaEngine& engine, const std::vector& bindings, std::ostream& os); +void dumpOutputs(const nvinfer1::IExecutionContext& context, const Bindings& bindings, std::ostream& os); + +//! +//! \brief Export output tensors to JSON file +//! +void exportJSONOutput(const nvinfer1::IExecutionContext& context, const Bindings& bindings, const std::string& fileName); + +//! +//! \struct LayerProfile +//! \brief Layer profile information +//! +struct LayerProfile +{ + std::string name; + float timeMs{0}; +}; + +//! +//! \class Profiler +//! \brief Collect per-layer profile information, assuming times are reported in the same order +//! +class Profiler : public nvinfer1::IProfiler +{ + +public: + + void reportLayerTime(const char* layerName, float timeMs) override; + + void print(std::ostream& os) const; + + //! + //! \brief Export a profile to JSON file + //! + void exportJSONProfile(const std::string& fileName) const; + +private: + + float getTotalTime() const + { + const auto plusLayerTime = [](float accumulator, const LayerProfile& lp) + { + return accumulator + lp.timeMs; + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0, plusLayerTime); + } + + std::vector mLayers; + std::vector::iterator mIterator{mLayers.begin()}; + int mUpdatesCount{0}; +}; } // namespace sample diff --git a/samples/common/sampleUtils.h b/samples/common/sampleUtils.h index 60843339..c4fff26a 100644 --- a/samples/common/sampleUtils.h +++ b/samples/common/sampleUtils.h @@ -19,9 +19,16 @@ #include #include +#include +#include #include #include #include +#if CUDA_VERSION < 10000 +#include +#else +#include +#endif #include "NvInfer.h" @@ -31,80 +38,343 @@ namespace sample { template -struct TrtDestroyer -{ - void operator()(T* t) { t->destroy(); } -}; - -template using TrtUniquePtr = std::unique_ptr >; +inline T roundUp(T m, T n) { return ((m + n - 1) / n) * n; } inline int volume(const nvinfer1::Dims& d) { return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); } -inline int dataTypeSize(nvinfer1::DataType t) +inline int volume(nvinfer1::Dims dims, int vecDim, int comps, int batch) +{ + if (vecDim != -1) + { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return volume(dims) * std::max(batch, 1); +} + +inline +std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + for (int i = 0; i < dims.nbDims; ++i) + { + os << (i ? 
"x" : "") << dims.d[i]; + } + return os; +} + +inline int dataTypeSize(nvinfer1::DataType dataType) { - switch (t) + switch (dataType) { case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kFLOAT: return 4; case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kBOOL: case nvinfer1::DataType::kINT8: return 1; } return 0; } template -inline T roundUp(T m, T n) { return ((m + n - 1) / n) * n; } +inline void fillBuffer(void* buffer, int volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + if (std::is_integral::value) + { + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); + } + else + { + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); + } +} -class BindingBuffers +// Specialization needed for custom type __half +template +inline void fillBufferHalf(void* buffer, int volume, H min, H max) { -public: + H* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} +template <> +#if CUDA_VERSION < 10000 +inline void fillBuffer(void* buffer, int volume, half_float::half min, half_float::half max) +#else +inline void fillBuffer<__half>(void* buffer, int volume, __half min, __half max) +#endif +{ + fillBufferHalf(buffer, volume, min, max); +} - void allocate(size_t size) +template +inline void dumpBuffer(const void* buffer, int volume, const std::string& separator, std::ostream& os) +{ + const T* typedBuffer = static_cast(buffer); + std::string sep; + for (int v = 0; v < volume; ++v) { - mSize = size; - mHostBuffer.allocate(size); - mDeviceBuffer.allocate(size); + os << sep << typedBuffer[v]; + sep = separator; } +} - void* getDeviceBuffer() const { return mDeviceBuffer.get(); } +struct Binding +{ + bool isInput{false}; + MirroredBuffer buffer; + int volume{0}; + nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; - void* getHostBuffer() const { return mHostBuffer.get(); } + void fill(const std::string& fileName) + { + std::ifstream file(fileName, std::ios::in|std::ios::binary); + if (file.is_open()) + { + file.read(static_cast(buffer.getHostBuffer()), buffer.getSize()); + file.close(); + } + } -private: + void fill() + { + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + fillBuffer(buffer.getHostBuffer(), volume, 0, 1); + break; + } + case nvinfer1::DataType::kINT32: + { + fillBuffer(buffer.getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT8: + { + fillBuffer(buffer.getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kFLOAT: + { + fillBuffer(buffer.getHostBuffer(), volume, -1.0, 1.0); + break; + } + case nvinfer1::DataType::kHALF: + { +#if CUDA_VERSION < 10000 + fillBuffer(buffer.getHostBuffer(), volume, static_cast(-1.0), static_cast(-1.0)); +#else + fillBuffer<__half>(buffer.getHostBuffer(), volume, -1.0, 1.0); +#endif + break; + } + } + } + + void dump(std::ostream& os, const std::string separator = " ") const + { + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + dumpBuffer(buffer.getHostBuffer(), 
volume, separator, os); + break; + } + case nvinfer1::DataType::kINT32: + { + dumpBuffer(buffer.getHostBuffer(), volume, separator, os); + break; + } + case nvinfer1::DataType::kINT8: + { + dumpBuffer(buffer.getHostBuffer(), volume, separator, os); + break; + } + case nvinfer1::DataType::kFLOAT: + { + dumpBuffer(buffer.getHostBuffer(), volume, separator, os); + break; + } + case nvinfer1::DataType::kHALF: + { +#if CUDA_VERSION < 10000 + dumpBuffer(buffer.getHostBuffer(), volume, separator, os); +#else + dumpBuffer<__half>(buffer.getHostBuffer(), volume, separator, os); +#endif + break; + } + } + } - int mSize{0}; - TrtHostBuffer mHostBuffer; - TrtDeviceBuffer mDeviceBuffer; }; class Bindings { public: - void addBinding(int b, const std::string& name, size_t size) + void addBinding(int b, const std::string& name, bool isInput, int volume, nvinfer1::DataType dataType, const std::string& fileName = "") { - while (mBuffers.size() <= static_cast(b)) + while (mBindings.size() <= static_cast(b)) { - mBuffers.emplace_back(); + mBindings.emplace_back(); mDevicePointers.emplace_back(); } - mBindings[name] = b; - mBuffers[b].allocate(size); - mDevicePointers[b] = mBuffers[b].getDeviceBuffer(); + mNames[name] = b; + mBindings[b].isInput = isInput; + mBindings[b].buffer.allocate(volume * dataTypeSize(dataType)); + mBindings[b].volume = volume; + mBindings[b].dataType = dataType; + mDevicePointers[b] = mBindings[b].buffer.getDeviceBuffer(); + if (isInput) + { + if (fileName.empty()) + { + fill(b); + } + else + { + fill(b, fileName); + } + } } void** getDeviceBuffers() { return mDevicePointers.data(); } + void transferInputToDevice(TrtCudaStream& stream) + { + for (auto& b : mNames) + { + if (mBindings[b.second].isInput) + { + mBindings[b.second].buffer.hostToDevice(stream); + } + } + } + + void transferOutputToHost(TrtCudaStream& stream) + { + for (auto& b : mNames) + { + if (!mBindings[b.second].isInput) + { + mBindings[b.second].buffer.deviceToHost(stream); + } + } + } + + void fill(int binding, const std::string& fileName) + { + mBindings[binding].fill(fileName); + } + + void fill(int binding) + { + mBindings[binding].fill(); + } + + void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + const auto dims = context.getBindingDimensions(binding); + os << dims; + } + + void dumpBindingValues(int binding, std::ostream& os, const std::string& separator = " ") const + { + mBindings[binding].dump(os, separator); + } + + void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + auto isInput = [](const Binding& b) { return b.isInput; }; + dumpBindings(context, isInput, os); + } + + void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + auto isOutput = [](const Binding& b) { return !b.isInput; }; + dumpBindings(context, isOutput, os); + } + + void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const + { + auto all = [](const Binding& b) { return true; }; + dumpBindings(context, all, os); + } + + void dumpBindings(const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const + { + for (const auto& n : mNames) + { + const auto binding = n.second; + if (predicate(mBindings[binding])) + { + os << n.first << ": ("; + dumpBindingDimensions(binding, context, os); + os << ")" << std::endl; + dumpBindingValues(binding, os); + os << std::endl; + } + } + } + + std::unordered_map getInputBindings() const + { + auto isInput = 
[](const Binding& b) { return b.isInput; }; + return getBindings(isInput); + } + + std::unordered_map getOutputBindings() const + { + auto isOutput = [](const Binding& b) { return !b.isInput; }; + return getBindings(isOutput); + } + + std::unordered_map getBindings() const + { + auto all = [](const Binding& b) { return true; }; + return getBindings(all); + } + + std::unordered_map getBindings(bool (*predicate)(const Binding& b)) const + { + std::unordered_map bindings; + for (const auto& n : mNames) + { + const auto binding = n.second; + if (predicate(mBindings[binding])) + { + bindings.insert(n); + } + } + return bindings; + } + private: - std::unordered_map mBindings; - std::vector mBuffers; - std::vector mDevicePointers; + std::unordered_map mNames; + std::vector mBindings; + std::vector mDevicePointers; }; +template +struct TrtDestroyer +{ + void operator()(T* t) { t->destroy(); } +}; + +template using TrtUniquePtr = std::unique_ptr >; + } // namespace sample #endif // TRT_SAMPLE_UTILS_H diff --git a/samples/common/windows/getopt.c b/samples/common/windows/getopt.c index 7a4b10ab..1976bf6c 100644 --- a/samples/common/windows/getopt.c +++ b/samples/common/windows/getopt.c @@ -20,7 +20,6 @@ * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F39502-99-1-0512. */ - /*- * Copyright (c) 2000 The NetBSD Foundation, Inc. * All rights reserved. @@ -67,11 +66,11 @@ */ #include +#include +#include #include #include #include -#include -#include #include #define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ @@ -109,8 +108,10 @@ static char EMSG[] = ""; #define EMSG "" #endif -static int getopt_internal(int, char* const*, const char*, const struct option*, int*, int); -static int parse_long_options(char* const*, const char*, const struct option*, int*, int); +static int getopt_internal(int, char* const*, const char*, + const struct option*, int*, int); +static int parse_long_options(char* const*, const char*, + const struct option*, int*, int); static int gcd(int, int); static void permute_args(int, int, int, char* const*); @@ -128,7 +129,8 @@ static const char noarg[] = "option doesn't take an argument -- %.*s"; static const char illoptchar[] = "unknown option -- %c"; static const char illoptstring[] = "unknown option -- %s"; -static void _vwarnx(const char* fmt, va_list ap) +static void +_vwarnx(const char* fmt, va_list ap) { (void) fprintf(stderr, "%s: ", __progname); if (fmt != NULL) @@ -136,7 +138,8 @@ static void _vwarnx(const char* fmt, va_list ap) (void) fprintf(stderr, "\n"); } -static void warnx(const char* fmt, ...) +static void +warnx(const char* fmt, ...) { va_list ap; va_start(ap, fmt); @@ -147,7 +150,8 @@ static void warnx(const char* fmt, ...) /* * Compute the greatest common divisor of a and b. */ -static int gcd(int a, int b) +static int +gcd(int a, int b) { int c; @@ -167,14 +171,16 @@ static int gcd(int a, int b) * from nonopt_end to opt_end (keeping the same order of arguments * in each block). 
*/ -static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv) +static void +permute_args(int panonopt_start, int panonopt_end, int opt_end, + char* const* nargv) { int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; char* swap; /* - * compute lengths of blocks and number and size of cycles - */ + * compute lengths of blocks and number and size of cycles + */ nnonopts = panonopt_end - panonopt_start; nopts = opt_end - panonopt_end; ncycle = gcd(nnonopts, nopts); @@ -204,16 +210,16 @@ static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char * Parse long options in argc/argv argument vector. * Returns -1 if short_too is set and the option does not match long_options. */ -static int parse_long_options( - char* const* nargv, const char* options, const struct option* long_options, int* idx, int short_too) +static int +parse_long_options(char* const* nargv, const char* options, + const struct option* long_options, int* idx, int short_too) { char *current_argv, *has_equal; size_t current_argv_len; int i, ambiguous, match; -#define IDENTICAL_INTERPRETATION(_x, _y) \ - (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag \ - && long_options[(_x)].val == long_options[(_y)].val) +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag && long_options[(_x)].val == long_options[(_y)].val) current_argv = place; match = -1; @@ -233,7 +239,8 @@ static int parse_long_options( for (i = 0; long_options[i].name; i++) { /* find matching long option */ - if (strncmp(current_argv, long_options[i].name, current_argv_len)) + if (strncmp(current_argv, long_options[i].name, + current_argv_len)) continue; if (strlen(long_options[i].name) == current_argv_len) @@ -244,9 +251,9 @@ static int parse_long_options( break; } /* - * If this is a known short option, don't allow - * a partial match of a single character. - */ + * If this is a known short option, don't allow + * a partial match of a single character. + */ if (short_too && current_argv_len == 1) continue; @@ -259,19 +266,22 @@ static int parse_long_options( { /* ambiguous abbreviation */ if (PRINT_ERROR) - warnx(ambig, (int) current_argv_len, current_argv); + warnx(ambig, (int) current_argv_len, + current_argv); optopt = 0; return (BADCH); } if (match != -1) { /* option found */ - if (long_options[match].has_arg == no_argument && has_equal) + if (long_options[match].has_arg == no_argument + && has_equal) { if (PRINT_ERROR) - warnx(noarg, (int) current_argv_len, current_argv); + warnx(noarg, (int) current_argv_len, + current_argv); /* - * XXX: GNU sets optopt to val regardless of flag - */ + * XXX: GNU sets optopt to val regardless of flag + */ if (long_options[match].flag == NULL) optopt = long_options[match].val; else @@ -285,22 +295,24 @@ static int parse_long_options( else if (long_options[match].has_arg == required_argument) { /* - * optional argument doesn't use next nargv - */ + * optional argument doesn't use next nargv + */ optarg = nargv[optind++]; } } - if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) + if ((long_options[match].has_arg == required_argument) + && (optarg == NULL)) { /* - * Missing argument; leading ':' indicates no error - * should be generated. - */ + * Missing argument; leading ':' indicates no error + * should be generated. 
+ */ if (PRINT_ERROR) - warnx(recargstring, current_argv); + warnx(recargstring, + current_argv); /* - * XXX: GNU sets optopt to val regardless of flag - */ + * XXX: GNU sets optopt to val regardless of flag + */ if (long_options[match].flag == NULL) optopt = long_options[match].val; else @@ -337,8 +349,9 @@ static int parse_long_options( * getopt_internal -- * Parse argc/argv argument vector. Called by user level routines. */ -static int getopt_internal( - int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx, int flags) +static int +getopt_internal(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx, int flags) { const char* oli; /* option letter list index */ int optchar, short_too; @@ -348,19 +361,19 @@ static int getopt_internal( return (-1); /* - * XXX Some GNU programs (like cvs) set optind to 0 instead of - * XXX using optreset. Work around this braindamage. - */ + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ if (optind == 0) optind = optreset = 1; /* - * Disable GNU extensions if POSIXLY_CORRECT is set or options - * string begins with a '+'. - * - * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or - * optreset != 0 for GNU compatibility. - */ + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ if (posixly_correct == -1 || optreset != 0) posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); if (*options == '-') @@ -383,15 +396,16 @@ static int getopt_internal( if (nonopt_end != -1) { /* do permutation, if we have to */ - permute_args(nonopt_start, nonopt_end, optind, nargv); + permute_args(nonopt_start, nonopt_end, + optind, nargv); optind -= nonopt_end - nonopt_start; } else if (nonopt_start != -1) { /* - * If we skipped non-options, set optind - * to the first of them. - */ + * If we skipped non-options, set optind + * to the first of them. + */ optind = nonopt_start; } nonopt_start = nonopt_end = -1; @@ -403,18 +417,18 @@ static int getopt_internal( if (flags & FLAG_ALLARGS) { /* - * GNU extension: - * return non-option as argument to option 1 - */ + * GNU extension: + * return non-option as argument to option 1 + */ optarg = nargv[optind++]; return (INORDER); } if (!(flags & FLAG_PERMUTE)) { /* - * If no permutation wanted, stop parsing - * at first non-option. - */ + * If no permutation wanted, stop parsing + * at first non-option. + */ return (-1); } /* do permutation */ @@ -422,7 +436,8 @@ static int getopt_internal( nonopt_start = optind; else if (nonopt_end != -1) { - permute_args(nonopt_start, nonopt_end, optind, nargv); + permute_args(nonopt_start, nonopt_end, + optind, nargv); nonopt_start = optind - (nonopt_end - nonopt_start); nonopt_end = -1; } @@ -434,19 +449,20 @@ static int getopt_internal( nonopt_end = optind; /* - * If we have "-" do nothing, if "--" we are done. - */ + * If we have "-" do nothing, if "--" we are done. + */ if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { optind++; place = EMSG; /* - * We found an option (--), so if we skipped - * non-options, we have to permute. - */ + * We found an option (--), so if we skipped + * non-options, we have to permute. 
+ */ if (nonopt_end != -1) { - permute_args(nonopt_start, nonopt_end, optind, nargv); + permute_args(nonopt_start, nonopt_end, + optind, nargv); optind -= nonopt_end - nonopt_start; } nonopt_start = nonopt_end = -1; @@ -455,11 +471,11 @@ static int getopt_internal( } /* - * Check long options if: - * 1) we were passed some - * 2) the arg is not just "-" - * 3) either the arg starts with -- we are getopt_long_only() - */ + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY))) { short_too = 0; @@ -468,7 +484,8 @@ static int getopt_internal( else if (*place != ':' && strchr(options, *place) != NULL) short_too = 1; /* could be short option too */ - optchar = parse_long_options(nargv, options, long_options, idx, short_too); + optchar = parse_long_options(nargv, options, long_options, + idx, short_too); if (optchar != -1) { place = EMSG; @@ -476,14 +493,13 @@ static int getopt_internal( } } - if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') - || (oli = strchr(options, optchar)) == NULL) + if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0') || (oli = strchr(options, optchar)) == NULL) { /* - * If the user specified "-" and '-' isn't listed in - * options, return -1 (non-option) as per POSIX. - * Otherwise, it is an unknown option character (or ':'). - */ + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ if (optchar == (int) '-' && *place == '\0') return (-1); if (!*place) @@ -508,7 +524,8 @@ static int getopt_internal( } else /* white space */ place = nargv[optind]; - optchar = parse_long_options(nargv, options, long_options, idx, 0); + optchar = parse_long_options(nargv, options, long_options, + idx, 0); place = EMSG; return (optchar); } @@ -553,13 +570,13 @@ int getopt(int nargc, char* const* nargv, const char* options) { /* - * We don't pass FLAG_PERMUTE to getopt_internal() since - * the BSD getopt(3) (unlike GNU) has never done this. - * - * Furthermore, since many privileged programs call getopt() - * before dropping privileges it makes sense to keep things - * as simple (and bug-free) as possible. - */ + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); } #endif /* REPLACE_GETOPT */ @@ -568,18 +585,22 @@ int getopt(int nargc, char* const* nargv, const char* options) * getopt_long -- * Parse argc/argv argument vector. */ -int getopt_long(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx) +int getopt_long(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx) { - return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE)); + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE)); } /* * getopt_long_only -- * Parse argc/argv argument vector. 
*/ -int getopt_long_only(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx) +int getopt_long_only(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx) { - return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY)); + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE | FLAG_LONGONLY)); } diff --git a/samples/common/windows/getopt.h b/samples/common/windows/getopt.h index 9fc4e24a..6023b959 100644 --- a/samples/common/windows/getopt.h +++ b/samples/common/windows/getopt.h @@ -84,9 +84,10 @@ enum /* permitted values for its `has_arg' field... */ optional_argument /* option may take an argument */ }; -extern int getopt_long(int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx); -extern int getopt_long_only( - int nargc, char* const* nargv, const char* options, const struct option* long_options, int* idx); +extern int getopt_long(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx); +extern int getopt_long_only(int nargc, char* const* nargv, const char* options, + const struct option* long_options, int* idx); /* * Previous MinGW implementation had... */ diff --git a/samples/opensource/sampleCharRNN/README.md b/samples/opensource/sampleCharRNN/README.md index 873feca3..daaa5721 100644 --- a/samples/opensource/sampleCharRNN/README.md +++ b/samples/opensource/sampleCharRNN/README.md @@ -89,9 +89,11 @@ To see the full list of available options and their descriptions, use the `-h` o ``` Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=] ---help Display help information +--help Display help information ---datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use data/samples/char-rnn/ and data/char-rnn/ +--useILoop Use ILoop LSTM definition + +--datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. 
If no data directories are given, the default is to use data/samples/char-rnn/ and data/char-rnn/ ``` diff --git a/samples/opensource/sampleCharRNN/sampleCharRNN.cpp b/samples/opensource/sampleCharRNN/sampleCharRNN.cpp index 295ca713..b7227f8c 100644 --- a/samples/opensource/sampleCharRNN/sampleCharRNN.cpp +++ b/samples/opensource/sampleCharRNN/sampleCharRNN.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include "NvInfer.h" #include "NvUtils.h" @@ -47,14 +48,15 @@ const std::string gSampleName = "TensorRT.sample_char_rnn"; +static const std::array INDICES{0, 1, 2, 3}; + // The model used by this sample was trained using github repository: // https://github.com/crazydonkey200/tensorflow-char-rnn // // The data set used: tensorflow-char-rnn/data/tiny_shakespeare.txt // // The command used to train: -// python train.py --data_file=data/tiny_shakespeare.txt --num_epochs=100 --num_layer=2 --hidden_size=512 -// --embedding_size=512 --dropout=.5 +// python train.py --data_file=data/tiny_shakespeare.txt --num_epochs=100 --num_layer=2 --hidden_size=512 --embedding_size=512 --dropout=.5 // // Epochs trained: 100 // Test perplexity: 4.940 @@ -74,8 +76,8 @@ struct SampleCharRNNWeightNames const std::string FCB_NAME{"softmax_softmax_b"}; const std::string EMBED_NAME{"embedding"}; - std::unordered_set names - = {{RNNW_L0_NAME, RNNB_L0_NAME, RNNW_L1_NAME, RNNB_L1_NAME, FCW_NAME, FCB_NAME, EMBED_NAME}}; + std::unordered_set names = {{RNNW_L0_NAME, RNNB_L0_NAME, RNNW_L1_NAME, + RNNB_L1_NAME, FCW_NAME, FCB_NAME, EMBED_NAME}}; }; struct SampleCharRNNBindingNames @@ -92,19 +94,14 @@ struct SampleCharRNNBindingNames struct SampleCharRNNMaps { // A mapping from character to index used by the tensorflow model. - const std::map charToID{{'\n', 0}, {'!', 1}, {' ', 2}, {'$', 3}, {'\'', 4}, {'&', 5}, {'-', 6}, {',', 7}, - {'.', 8}, {'3', 9}, {';', 10}, {':', 11}, {'?', 12}, {'A', 13}, {'C', 14}, {'B', 15}, {'E', 16}, {'D', 17}, - {'G', 18}, {'F', 19}, {'I', 20}, {'H', 21}, {'K', 22}, {'J', 23}, {'M', 24}, {'L', 25}, {'O', 26}, {'N', 27}, - {'Q', 28}, {'P', 29}, {'S', 30}, {'R', 31}, {'U', 32}, {'T', 33}, {'W', 34}, {'V', 35}, {'Y', 36}, {'X', 37}, - {'Z', 38}, {'a', 39}, {'c', 40}, {'b', 41}, {'e', 42}, {'d', 43}, {'g', 44}, {'f', 45}, {'i', 46}, {'h', 47}, - {'k', 48}, {'j', 49}, {'m', 50}, {'l', 51}, {'o', 52}, {'n', 53}, {'q', 54}, {'p', 55}, {'s', 56}, {'r', 57}, - {'u', 58}, {'t', 59}, {'w', 60}, {'v', 61}, {'y', 62}, {'x', 63}, {'z', 64}}; + const std::map charToID{{'\n', 0}, {'!', 1}, {' ', 2}, {'$', 3}, {'\'', 4}, {'&', 5}, {'-', 6}, {',', 7}, {'.', 8}, {'3', 9}, {';', 10}, {':', 11}, {'?', 12}, {'A', 13}, {'C', 14}, {'B', 15}, {'E', 16}, {'D', 17}, {'G', 18}, {'F', 19}, {'I', 20}, {'H', 21}, {'K', 22}, {'J', 23}, {'M', 24}, {'L', 25}, {'O', 26}, {'N', 27}, {'Q', 28}, {'P', 29}, {'S', 30}, {'R', 31}, {'U', 32}, {'T', 33}, {'W', 34}, {'V', 35}, {'Y', 36}, {'X', 37}, {'Z', 38}, {'a', 39}, {'c', 40}, {'b', 41}, {'e', 42}, {'d', 43}, {'g', 44}, {'f', 45}, {'i', 46}, {'h', 47}, {'k', 48}, {'j', 49}, {'m', 50}, {'l', 51}, {'o', 52}, {'n', 53}, {'q', 54}, {'p', 55}, {'s', 56}, {'r', 57}, {'u', 58}, {'t', 59}, {'w', 60}, {'v', 61}, {'y', 62}, {'x', 63}, {'z', 64}}; // A mapping from index to character used by the tensorflow model. 
- const std::vector idToChar{{'\n', '!', ' ', '$', '\'', '&', '-', ',', '.', '3', ';', ':', '?', 'A', 'C', 'B', - 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'J', 'M', 'L', 'O', 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', - 'Z', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', - 'w', 'v', 'y', 'x', 'z'}}; + const std::vector idToChar{{'\n', '!', ' ', '$', '\'', '&', '-', ',', + '.', '3', ';', ':', '?', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', + 'J', 'M', 'L', 'O', 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', + 'Z', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', + 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'}}; }; struct SampleCharRNNParams : samplesCommon::SampleParams @@ -123,21 +120,22 @@ struct SampleCharRNNParams : samplesCommon::SampleParams vector inputSentences; vector outputSentences; + bool useILoop; }; //! -//! \brief The SampleCharRNN class implements the char_rnn sample +//! \brief The SampleCharRNNBase class implements the char_rnn sample //! //! \details It uses weights from a trained TensorFlow model and creates //! the network using the TensorRT network definition API //! -class SampleCharRNN +class SampleCharRNNBase { +public: template using SampleUniquePtr = std::unique_ptr; -public: - SampleCharRNN(const SampleCharRNNParams& params) + SampleCharRNNBase(const SampleCharRNNParams& params) : mParams(params) { } @@ -157,26 +155,33 @@ class SampleCharRNN //! bool teardown(); -private: +protected: //! - //! \brief Load requested weights from a formatted file into a map. + //! \brief Add inputs to the TensorRT network and configure LSTM layers using network definition API. //! - std::map loadWeights(const std::string file); + virtual nvinfer1::ILayer* addLSTMLayers(SampleUniquePtr& network) = 0; //! //! \brief Converts RNN weights from TensorFlow's format to TensorRT's format. //! - nvinfer1::Weights convertRNNWeights(nvinfer1::Weights input); + nvinfer1::Weights convertRNNWeights(nvinfer1::Weights input, int dataSize); //! //! \brief Converts RNN Biases from TensorFlow's format to TensorRT's format. //! nvinfer1::Weights convertRNNBias(nvinfer1::Weights input); + std::map mWeightMap; + SampleCharRNNParams mParams; + + nvinfer1::ITensor* addReshape( + SampleUniquePtr& network, nvinfer1::ITensor& tensor, nvinfer1::Dims dims); + +private: //! - //! \brief Add inputs to the TensorRT network and configure the RNNv2 layer using network definition API. + //! \brief Load requested weights from a formatted file into a map. //! - nvinfer1::IRNNv2Layer* addRNNv2Layer(SampleUniquePtr& network); + std::map loadWeights(const std::string file); //! //! \brief Create full model using the TensorRT network definition API and build the engine. @@ -192,19 +197,67 @@ class SampleCharRNN //! //! \brief Perform one time step of inference with the TensorRT execution context //! - bool stepOnce(samplesCommon::BufferManager& buffers, SampleUniquePtr& context, - cudaStream_t& stream); + bool stepOnce(samplesCommon::BufferManager& buffers, + SampleUniquePtr& context, cudaStream_t& stream); //! //! \brief Copies Ct/Ht output from the RNN to the Ct-1/Ht-1 input buffers for next time step //! 
void copyRNNOutputsToInputs(samplesCommon::BufferManager& buffers); - std::map mWeightMap; - SampleCharRNNParams mParams; std::shared_ptr mEngine{nullptr}; //!< The TensorRT engine used to run the network }; +class SampleCharRNNv2 : public SampleCharRNNBase +{ +public: + SampleCharRNNv2(SampleCharRNNParams params) + : SampleCharRNNBase(params) + { + } + +protected: + //! + //! \brief Add inputs to the TensorRT network and configure LSTM layers using network definition API. + //! + nvinfer1::ILayer* addLSTMLayers(SampleCharRNNBase::SampleUniquePtr& network) final; +}; + +class SampleCharRNNLoop : public SampleCharRNNBase +{ +public: + struct LstmIO + { + nvinfer1::ITensor* data; + nvinfer1::ITensor* hidden; + nvinfer1::ITensor* cell; + }; + + struct LstmParams + { + nvinfer1::ITensor* inputWeights; + nvinfer1::ITensor* recurrentWeights; + nvinfer1::ITensor* inputBias; + nvinfer1::ITensor* recurrentBias; + nvinfer1::ITensor* maxSequenceSize; + }; + + SampleCharRNNLoop(SampleCharRNNParams params) + : SampleCharRNNBase(params) + { + } + +protected: + //! + //! \brief Add inputs to the TensorRT network and configure LSTM layers using network definition API. + //! + nvinfer1::ILayer* addLSTMLayers(SampleCharRNNBase::SampleUniquePtr& network) final; + +private: + nvinfer1::ILayer* addLSTMCell(SampleUniquePtr& network, const LstmIO& inputTensors, + nvinfer1::ITensor* sequenceSize, const LstmParams& params, LstmIO& outputTensors); +}; + //! //! \brief Creates the network, configures the builder and creates //! the network engine @@ -216,14 +269,17 @@ class SampleCharRNN //! \return Returns true if the engine was created successfully and false //! otherwise //! -bool SampleCharRNN::build() +bool SampleCharRNNBase::build() { - auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); + NetworkDefinitionCreationFlags flags{ + mParams.useILoop ? 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH) : 0}; + auto builder = SampleUniquePtr( + nvinfer1::createInferBuilder(gLogger.getTRTLogger())); if (!builder) { return false; } - auto network = SampleUniquePtr(builder->createNetwork()); + auto network = SampleUniquePtr(builder->createNetworkV2(flags)); if (!network) { return false; @@ -234,9 +290,10 @@ bool SampleCharRNN::build() return false; } - mWeightMap = SampleCharRNN::loadWeights(mParams.weightFileName); + mWeightMap = SampleCharRNNBase::loadWeights(mParams.weightFileName); - builder->setMaxBatchSize(mParams.batchSize); + builder->setMaxBatchSize( + flags & static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH) ? 0 : mParams.batchSize); config->setMaxWorkspaceSize(32_MiB); config->setFlag(BuilderFlag::kGPU_FALLBACK); @@ -264,7 +321,7 @@ bool SampleCharRNN::build() //! for each buffer: [name] [type] [shape] \n //! Note: type is the integer value of the DataType enum in NvInfer.h. //! -std::map SampleCharRNN::loadWeights(const std::string file) +std::map SampleCharRNNBase::loadWeights(const std::string file) { std::map weightMap; @@ -348,13 +405,16 @@ std::map SampleCharRNN::loadWeights(const std::s //! TensorRT expects the format to laid out in memory: //! CellN: Wi, Wc, Wf, Wo, Ri, Rc, Rf, Ro //! 
-nvinfer1::Weights SampleCharRNN::convertRNNWeights(nvinfer1::Weights input) +nvinfer1::Weights SampleCharRNNBase::convertRNNWeights(nvinfer1::Weights orig, int dataSize) { + nvinfer1::Weights input{orig.type, orig.values, (dataSize + mParams.hiddenSize) * 4 * mParams.hiddenSize}; float* ptr = new float[input.count]; - int dims[4]{2, mParams.hiddenSize, 4, mParams.hiddenSize}; - int order[4]{0, 3, 1, 2}; - utils::reshapeWeights(input, dims, order, ptr, 4); - utils::transposeSubBuffers(ptr, DataType::kFLOAT, 2, mParams.hiddenSize * mParams.hiddenSize, 4); + const float* data = static_cast(input.values); + int dimsW[2]{dataSize, 4 * mParams.hiddenSize}; + int dimsR[2]{mParams.hiddenSize, 4 * mParams.hiddenSize}; + std::copy(data, data + input.count, ptr); + utils::transposeSubBuffers(ptr, DataType::kFLOAT, 1, dimsW[0], dimsW[1]); + utils::transposeSubBuffers(&ptr[dimsW[0] * dimsW[1]], DataType::kFLOAT, 1, dimsR[0], dimsR[1]); return nvinfer1::Weights{input.type, ptr, input.count}; } @@ -373,16 +433,187 @@ nvinfer1::Weights SampleCharRNN::convertRNNWeights(nvinfer1::Weights input) //! //! Since tensorflow already combines U and W, //! we double the size and set all of U to zero. -nvinfer1::Weights SampleCharRNN::convertRNNBias(nvinfer1::Weights input) +nvinfer1::Weights SampleCharRNNBase::convertRNNBias(nvinfer1::Weights input) { - const int sizeOfElement = samplesCommon::getElementSize(input.type); - char* ptr = new char[sizeOfElement * input.count * 2]; - const char* iptr = static_cast(input.values); - std::copy(iptr, iptr + 4 * mParams.hiddenSize * sizeOfElement, ptr); - std::fill(ptr + sizeOfElement * input.count, ptr + sizeOfElement * input.count * 2, 0); + float* ptr = new float[input.count * 2]; + const float* iptr = static_cast(input.values); + int64_t count = 4 * mParams.hiddenSize; + assert(input.count == count); + std::copy(iptr, iptr + count, ptr); + float* shiftedPtr = ptr + count; + std::fill(shiftedPtr, shiftedPtr + count, 0.0); return nvinfer1::Weights{input.type, ptr, input.count * 2}; } +nvinfer1::ILayer* SampleCharRNNLoop::addLSTMCell(SampleUniquePtr& network, + const LstmIO& inputTensors, nvinfer1::ITensor* sequenceSize, const LstmParams& params, LstmIO& outputTensors) +{ + nvinfer1::ILoop* sequenceLoop = network->addLoop(); + sequenceLoop->addTripLimit(*sequenceSize, nvinfer1::TripLimit::kCOUNT); + + nvinfer1::ITensor* input = sequenceLoop->addIterator(*inputTensors.data)->getOutput(0); + nvinfer1::IRecurrenceLayer* hidden = sequenceLoop->addRecurrence(*inputTensors.hidden); + nvinfer1::IRecurrenceLayer* cell = sequenceLoop->addRecurrence(*inputTensors.cell); + + nvinfer1::ITensor* mmInput = network + ->addMatrixMultiply(*input, nvinfer1::MatrixOperation::kVECTOR, + *params.inputWeights, nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + nvinfer1::ITensor* mmHidden = network + ->addMatrixMultiply(*hidden->getOutput(0), nvinfer1::MatrixOperation::kVECTOR, + *params.recurrentWeights, nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + nvinfer1::ITensor* mm + = network->addElementWise(*mmInput, *mmHidden, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + + nvinfer1::ITensor* bias + = network->addElementWise(*params.inputBias, *params.recurrentBias, nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + + nvinfer1::ITensor* gatesICFO + = network->addElementWise(*mm, *bias, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + + const auto isolateGate = [&](nvinfer1::ITensor& gates, int gateIndex) -> nvinfer1::ITensor* { + nvinfer1::ISliceLayer* 
slice = network->addSlice(gates, nvinfer1::Dims{1, {gateIndex * mParams.hiddenSize}}, + nvinfer1::Dims{1, {mParams.hiddenSize}}, nvinfer1::Dims{1, {1}}); + return addReshape(network, *slice->getOutput(0), nvinfer1::Dims{1, {mParams.hiddenSize}}); + }; + + nvinfer1::ITensor* i + = network->addActivation(*isolateGate(*gatesICFO, 0), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + nvinfer1::ITensor* c + = network->addActivation(*isolateGate(*gatesICFO, 1), nvinfer1::ActivationType::kTANH)->getOutput(0); + nvinfer1::ITensor* f + = network->addActivation(*isolateGate(*gatesICFO, 2), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + nvinfer1::ITensor* o + = network->addActivation(*isolateGate(*gatesICFO, 3), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + + nvinfer1::ITensor* C + = network + ->addElementWise(*network->addElementWise(*f, *cell->getOutput(0), nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0), + *network->addElementWise(*i, *c, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + nvinfer1::ITensor* H + = network + ->addElementWise(*o, *network->addActivation(*C, nvinfer1::ActivationType::kTANH)->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + + // Recurrent backedge input for hidden and cell. + cell->setInput(1, *C); + hidden->setInput(1, *H); + + nvinfer1::ILoopOutputLayer* outputLayer = sequenceLoop->addLoopOutput(*H, nvinfer1::LoopOutput::kCONCATENATE); + outputLayer->setInput(1, *params.maxSequenceSize); + nvinfer1::ITensor* hiddenOut + = sequenceLoop->addLoopOutput(*hidden->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0); + nvinfer1::ITensor* cellOut + = sequenceLoop->addLoopOutput(*cell->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0); + + outputTensors = LstmIO{outputLayer->getOutput(0), hiddenOut, cellOut}; + return outputLayer; +} + +nvinfer1::ITensor* SampleCharRNNBase::addReshape( + SampleUniquePtr& network, nvinfer1::ITensor& tensor, nvinfer1::Dims dims) +{ + nvinfer1::IShuffleLayer* shuffle = network->addShuffle(tensor); + shuffle->setReshapeDimensions(dims); + return shuffle->getOutput(0); +} + +nvinfer1::ILayer* SampleCharRNNLoop::addLSTMLayers(SampleUniquePtr& network) +{ + nvinfer1::ILayer* dataOut{nullptr}; + + nvinfer1::ITensor* data = network->addInput(mParams.bindingNames.INPUT_BLOB_NAME, nvinfer1::DataType::kFLOAT, + nvinfer1::Dims2(mParams.seqSize, mParams.dataSize)); + assert(data != nullptr); + + nvinfer1::ITensor* hiddenLayers = network->addInput(mParams.bindingNames.HIDDEN_IN_BLOB_NAME, + nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize)); + assert(hiddenLayers != nullptr); + + nvinfer1::ITensor* cellLayers = network->addInput(mParams.bindingNames.CELL_IN_BLOB_NAME, + nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize)); + assert(cellLayers != nullptr); + + nvinfer1::ITensor* sequenceSize + = network->addInput(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME, nvinfer1::DataType::kINT32, nvinfer1::Dims{}); + assert(sequenceSize != nullptr); + + // convert tensorflow weight format to trt weight format + std::array rnnw{ + SampleCharRNNBase::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L0_NAME], mParams.dataSize), + SampleCharRNNBase::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L1_NAME], mParams.hiddenSize)}; + std::array rnnb{ + SampleCharRNNBase::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L0_NAME]), + 
SampleCharRNNBase::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L1_NAME])}; + + // Store the transformed weights in the weight map so the memory can be properly released later. + mWeightMap["rnnwL0"] = rnnw[0]; + mWeightMap["rnnwL1"] = rnnw[1]; + mWeightMap["rnnbL0"] = rnnb[0]; + mWeightMap["rnnbL1"] = rnnb[1]; + + nvinfer1::ITensor* maxSequenceSize + = network->addConstant(nvinfer1::Dims{}, Weights{DataType::kINT32, &mParams.seqSize, 1})->getOutput(0); + assert(static_cast(mParams.layerCount) <= INDICES.size()); + LstmIO lstmNext{data, nullptr, nullptr}; + std::vector hiddenOutputs; + std::vector cellOutputs; + nvinfer1::Dims2 dimWL0(4 * mParams.hiddenSize, mParams.dataSize); + nvinfer1::Dims2 dimR(4 * mParams.hiddenSize, mParams.hiddenSize); + nvinfer1::Dims dimB{1, {4 * mParams.hiddenSize}}; + nvinfer1::Dims dim0{1, {0}}; + auto extractWeights = [](nvinfer1::Weights weights, Dims start, Dims size) -> nvinfer1::Weights { + const char* data = static_cast(weights.values); + int64_t shift = samplesCommon::volume(start); + const int sizeOfElement = samplesCommon::getElementSize(weights.type); + int64_t count = samplesCommon::volume(size); + assert(shift + count <= weights.count); + return nvinfer1::Weights{weights.type, data + shift * sizeOfElement, count}; + }; + for (int i = 0; i < mParams.layerCount; ++i) + { + nvinfer1::Dims dimW = i == 0 ? dimWL0 : dimR; + nvinfer1::ITensor* index + = network->addConstant(nvinfer1::Dims{}, Weights{DataType::kINT32, &INDICES[i], 1})->getOutput(0); + nvinfer1::ITensor* hidden = network->addGather(*hiddenLayers, *index, 0)->getOutput(0); + nvinfer1::ITensor* cell = network->addGather(*cellLayers, *index, 0)->getOutput(0); + nvinfer1::ITensor* weightIn = network->addConstant(dimW, extractWeights(rnnw[i], dim0, dimW))->getOutput(0); + nvinfer1::ITensor* weightRec = network->addConstant(dimR, extractWeights(rnnw[i], dimW, dimR))->getOutput(0); + nvinfer1::ITensor* biasIn = network->addConstant(dimB, extractWeights(rnnb[i], dim0, dimB))->getOutput(0); + nvinfer1::ITensor* biasRec = network->addConstant(dimB, extractWeights(rnnb[i], dimB, dimB))->getOutput(0); + LstmIO lstmInput{lstmNext.data, hidden, cell}; + LstmParams params{weightIn, weightRec, biasIn, biasRec, maxSequenceSize}; + + Dims2 dims{1, mParams.hiddenSize}; + dataOut = addLSTMCell(network, lstmInput, sequenceSize, params, lstmNext); + hiddenOutputs.push_back(addReshape(network, *lstmNext.hidden, dims)); + cellOutputs.push_back(addReshape(network, *lstmNext.cell, dims)); + } + + auto addConcatenation = [&network](std::vector tensors) -> nvinfer1::ITensor* { + nvinfer1::IConcatenationLayer* concat = network->addConcatenation(tensors.data(), tensors.size()); + concat->setAxis(0); + return concat->getOutput(0); + }; + + nvinfer1::ITensor* hiddenNext = addConcatenation(hiddenOutputs); + hiddenNext->setName(mParams.bindingNames.HIDDEN_OUT_BLOB_NAME); + network->markOutput(*hiddenNext); + + nvinfer1::ITensor* cellNext = addConcatenation(cellOutputs); + cellNext->setName(mParams.bindingNames.CELL_OUT_BLOB_NAME); + network->markOutput(*cellNext); + + return dataOut; +} //! //! \brief Add inputs to the network and configure the RNNv2 layer using network definition API. //! @@ -391,28 +622,23 @@ nvinfer1::Weights SampleCharRNN::convertRNNBias(nvinfer1::Weights input) //! //! \return Configured and added RNNv2 layer. //! 
-nvinfer1::IRNNv2Layer* SampleCharRNN::addRNNv2Layer(SampleUniquePtr& network) +nvinfer1::ILayer* SampleCharRNNv2::addLSTMLayers(SampleUniquePtr& network) { // Initialize data, hiddenIn, cellIn, and seqLenIn inputs into RNN Layer - nvinfer1::ITensor* data = network->addInput(mParams.bindingNames.INPUT_BLOB_NAME, nvinfer1::DataType::kFLOAT, - nvinfer1::Dims2(mParams.seqSize, mParams.dataSize)); + nvinfer1::ITensor* data = network->addInput(mParams.bindingNames.INPUT_BLOB_NAME, nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.seqSize, mParams.dataSize)); assert(data != nullptr); - nvinfer1::ITensor* hiddenIn = network->addInput(mParams.bindingNames.HIDDEN_IN_BLOB_NAME, - nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize)); + nvinfer1::ITensor* hiddenIn = network->addInput(mParams.bindingNames.HIDDEN_IN_BLOB_NAME, nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize)); assert(hiddenIn != nullptr); - nvinfer1::ITensor* cellIn = network->addInput(mParams.bindingNames.CELL_IN_BLOB_NAME, nvinfer1::DataType::kFLOAT, - nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize)); + nvinfer1::ITensor* cellIn = network->addInput(mParams.bindingNames.CELL_IN_BLOB_NAME, nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize)); assert(cellIn != nullptr); - nvinfer1::ITensor* seqLenIn - = network->addInput(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME, nvinfer1::DataType::kINT32, nvinfer1::Dims{}); + nvinfer1::ITensor* seqLenIn = network->addInput(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME, nvinfer1::DataType::kINT32, nvinfer1::Dims{}); assert(seqLenIn != nullptr); // create an RNN layer w/ 2 layers and 512 hidden states - nvinfer1::IRNNv2Layer* rnn = network->addRNNv2( - *data, mParams.layerCount, mParams.hiddenSize, mParams.seqSize, nvinfer1::RNNOperation::kLSTM); + nvinfer1::IRNNv2Layer* rnn = network->addRNNv2(*data, mParams.layerCount, mParams.hiddenSize, mParams.seqSize, nvinfer1::RNNOperation::kLSTM); assert(rnn != nullptr); // Set RNNv2 optional inputs @@ -430,36 +656,44 @@ nvinfer1::IRNNv2Layer* SampleCharRNN::addRNNv2Layer(SampleUniquePtrsetLocation(nvinfer1::TensorLocation::kDEVICE); // convert tensorflow weight format to trt weight format - nvinfer1::Weights rnnwL0 = SampleCharRNN::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L0_NAME]); - nvinfer1::Weights rnnbL0 = SampleCharRNN::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L0_NAME]); - nvinfer1::Weights rnnwL1 = SampleCharRNN::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L1_NAME]); - nvinfer1::Weights rnnbL1 = SampleCharRNN::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L1_NAME]); - - std::vector gateOrder({nvinfer1::RNNGateType::kINPUT, nvinfer1::RNNGateType::kCELL, - nvinfer1::RNNGateType::kFORGET, nvinfer1::RNNGateType::kOUTPUT}); + nvinfer1::Weights rnnwL0 + = SampleCharRNNBase::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L0_NAME], mParams.dataSize); + nvinfer1::Weights rnnbL0 = SampleCharRNNBase::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L0_NAME]); + nvinfer1::Weights rnnwL1 + = SampleCharRNNBase::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L1_NAME], mParams.hiddenSize); + nvinfer1::Weights rnnbL1 = SampleCharRNNBase::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L1_NAME]); + + std::vector gateOrder({nvinfer1::RNNGateType::kINPUT, + nvinfer1::RNNGateType::kCELL, + nvinfer1::RNNGateType::kFORGET, + nvinfer1::RNNGateType::kOUTPUT}); const nvinfer1::DataType dataType = static_cast(rnnwL0.type); 
const float* wtsL0 = static_cast(rnnwL0.values); const float* biasesL0 = static_cast(rnnbL0.values); const float* wtsL1 = static_cast(rnnwL1.values); const float* biasesL1 = static_cast(rnnbL1.values); - size_t kernelOffset = 0, biasOffset = 0; + size_t kernelOffsetL0 = 0, kernelOffsetL1 = 0, biasOffset = 0; for (int gateIndex = 0, numGates = gateOrder.size(); gateIndex < 2 * numGates; gateIndex++) { + bool isW = (gateIndex < numGates); + int64_t weightCountL0 = (isW ? mParams.dataSize : mParams.hiddenSize) * mParams.hiddenSize; + int64_t weightCountL1 = mParams.hiddenSize * mParams.hiddenSize; // extract weights and bias for a given gate and layer - nvinfer1::Weights gateWeightL0{dataType, wtsL0 + kernelOffset, mParams.dataSize * mParams.hiddenSize}; + nvinfer1::Weights gateWeightL0{dataType, wtsL0 + kernelOffsetL0, weightCountL0}; nvinfer1::Weights gateBiasL0{dataType, biasesL0 + biasOffset, mParams.hiddenSize}; - nvinfer1::Weights gateWeightL1{dataType, wtsL1 + kernelOffset, mParams.dataSize * mParams.hiddenSize}; + nvinfer1::Weights gateWeightL1{dataType, wtsL1 + kernelOffsetL1, weightCountL1}; nvinfer1::Weights gateBiasL1{dataType, biasesL1 + biasOffset, mParams.hiddenSize}; // set weights and bias for given gate - rnn->setWeightsForGate(0, gateOrder[gateIndex % numGates], (gateIndex < numGates), gateWeightL0); - rnn->setBiasForGate(0, gateOrder[gateIndex % numGates], (gateIndex < numGates), gateBiasL0); - rnn->setWeightsForGate(1, gateOrder[gateIndex % numGates], (gateIndex < numGates), gateWeightL1); - rnn->setBiasForGate(1, gateOrder[gateIndex % numGates], (gateIndex < numGates), gateBiasL1); + rnn->setWeightsForGate(0, gateOrder[gateIndex % numGates], isW, gateWeightL0); + rnn->setBiasForGate(0, gateOrder[gateIndex % numGates], isW, gateBiasL0); + rnn->setWeightsForGate(1, gateOrder[gateIndex % numGates], isW, gateWeightL1); + rnn->setBiasForGate(1, gateOrder[gateIndex % numGates], isW, gateBiasL1); // Update offsets - kernelOffset = kernelOffset + mParams.dataSize * mParams.hiddenSize; - biasOffset = biasOffset + mParams.hiddenSize; + kernelOffsetL0 += weightCountL0; + kernelOffsetL1 += weightCountL1; + biasOffset += mParams.hiddenSize; } // Store the transformed weights in the weight map so the memory can be properly released later. 
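Editor's aside (not part of the patch): the separate kernelOffsetL0/kernelOffsetL1 counters above exist because, with the sample's I, C, F, O gate order, layer 0's input (W) weights are hiddenSize x dataSize per gate while its recurrent (R) weights, and both weight blocks of layer 1, are hiddenSize x hiddenSize per gate. A minimal standalone sketch of that bookkeeping follows; the sizes are made up (dataSize = 8, hiddenSize = 4) purely so the diverging offsets are visible, and nothing here is code from the sample.

```cpp
#include <cstdint>
#include <iostream>

int main()
{
    const int numGates = 4;   // gate order: I, C, F, O
    const int dataSize = 8;   // hypothetical embedding size
    const int hiddenSize = 4; // hypothetical hidden size

    int64_t kernelOffsetL0 = 0, kernelOffsetL1 = 0, biasOffset = 0;
    for (int gateIndex = 0; gateIndex < 2 * numGates; ++gateIndex)
    {
        // First pass over the gates consumes the input (W) weights,
        // second pass the recurrent (R) weights.
        const bool isW = gateIndex < numGates;
        const int64_t weightCountL0 = static_cast<int64_t>(isW ? dataSize : hiddenSize) * hiddenSize;
        const int64_t weightCountL1 = static_cast<int64_t>(hiddenSize) * hiddenSize;

        std::cout << (isW ? "W" : "R") << " gate " << gateIndex % numGates
                  << ": L0 offset " << kernelOffsetL0
                  << ", L1 offset " << kernelOffsetL1
                  << ", bias offset " << biasOffset << "\n";

        kernelOffsetL0 += weightCountL0; // layer 0: W gates span H*D values, R gates H*H
        kernelOffsetL1 += weightCountL1; // layer 1: both W and R gates span H*H values
        biasOffset += hiddenSize;        // one bias vector of length H per gate
    }
    return 0;
}
```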
@@ -468,6 +702,13 @@ nvinfer1::IRNNv2Layer* SampleCharRNN::addRNNv2Layer(SampleUniquePtrgetOutput(1)->setName(mParams.bindingNames.HIDDEN_OUT_BLOB_NAME); + network->markOutput(*rnn->getOutput(1)); + if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM) + { + rnn->getOutput(2)->setName(mParams.bindingNames.CELL_OUT_BLOB_NAME); + network->markOutput(*rnn->getOutput(2)); + } return rnn; } @@ -477,19 +718,17 @@ nvinfer1::IRNNv2Layer* SampleCharRNN::addRNNv2Layer(SampleUniquePtr& builder, +void SampleCharRNNBase::constructNetwork(SampleUniquePtr& builder, SampleUniquePtr& network, SampleUniquePtr& config) { // add RNNv2 layer and set its parameters - auto rnn = SampleCharRNN::addRNNv2Layer(network); + auto rnn = addLSTMLayers(network); // Transpose FC weights since TensorFlow's weights are transposed when compared to TensorRT - utils::transposeSubBuffers((void*) mWeightMap[mParams.weightNames.FCW_NAME].values, nvinfer1::DataType::kFLOAT, 1, - mParams.hiddenSize, mParams.vocabSize); + utils::transposeSubBuffers((void*) mWeightMap[mParams.weightNames.FCW_NAME].values, nvinfer1::DataType::kFLOAT, 1, mParams.hiddenSize, mParams.vocabSize); // add Constant layers for fully connected weights - auto fcwts = network->addConstant( - nvinfer1::Dims2(mParams.vocabSize, mParams.hiddenSize), mWeightMap[mParams.weightNames.FCW_NAME]); + auto fcwts = network->addConstant(nvinfer1::Dims2(mParams.vocabSize, mParams.hiddenSize), mWeightMap[mParams.weightNames.FCW_NAME]); // Add matrix multiplication layer for multiplying rnn output with FC weights auto matrixMultLayer = network->addMatrixMultiply(*fcwts->getOutput(0), false, *rnn->getOutput(0), true); @@ -498,8 +737,7 @@ void SampleCharRNN::constructNetwork(SampleUniquePtr& builde // Add elementwise layer for adding bias auto fcbias = network->addConstant(nvinfer1::Dims2(mParams.vocabSize, 1), mWeightMap[mParams.weightNames.FCB_NAME]); - auto addBiasLayer = network->addElementWise( - *matrixMultLayer->getOutput(0), *fcbias->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); + auto addBiasLayer = network->addElementWise(*matrixMultLayer->getOutput(0), *fcbias->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); assert(addBiasLayer != nullptr); addBiasLayer->getOutput(0)->setName("Add Bias output"); @@ -512,13 +750,6 @@ void SampleCharRNN::constructNetwork(SampleUniquePtr& builde // Mark the outputs for the network network->markOutput(*pred->getOutput(1)); pred->getOutput(1)->setType(nvinfer1::DataType::kINT32); - rnn->getOutput(1)->setName(mParams.bindingNames.HIDDEN_OUT_BLOB_NAME); - network->markOutput(*rnn->getOutput(1)); - if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM) - { - rnn->getOutput(2)->setName(mParams.bindingNames.CELL_OUT_BLOB_NAME); - network->markOutput(*rnn->getOutput(2)); - } gLogInfo << "Done constructing network..." << std::endl; @@ -532,12 +763,13 @@ void SampleCharRNN::constructNetwork(SampleUniquePtr& builde //! \details This function is the main execution function of the sample. It //! allocates the buffer, sets inputs, executes the engine, and verifies the output. //! 
-bool SampleCharRNN::infer() +bool SampleCharRNNBase::infer() { // Create RAII buffer manager object samplesCommon::BufferManager buffers(mEngine, mParams.batchSize); - auto context = SampleUniquePtr(mEngine->createExecutionContext()); + auto context = SampleUniquePtr( + mEngine->createExecutionContext()); if (!context) { @@ -559,8 +791,7 @@ bool SampleCharRNN::infer() CHECK(cudaStreamCreate(&stream)); // Set sequence lengths to maximum - std::fill_n(reinterpret_cast(buffers.getHostBuffer(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME)), - mParams.batchSize, mParams.seqSize); + std::fill_n(reinterpret_cast(buffers.getHostBuffer(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME)), mParams.batchSize, mParams.seqSize); // Initialize hiddenIn and cellIn tensors to zero before seeding void* hiddenIn = buffers.getHostBuffer(mParams.bindingNames.HIDDEN_IN_BLOB_NAME); @@ -575,14 +806,14 @@ bool SampleCharRNN::infer() // Seed the RNN with the input sentence. for (auto& a : inputSentence) { - SampleCharRNN::copyEmbeddingToInput(buffers, a); + SampleCharRNNBase::copyEmbeddingToInput(buffers, a); - if (!SampleCharRNN::stepOnce(buffers, context, stream)) + if (!SampleCharRNNBase::stepOnce(buffers, context, stream)) { return false; } - SampleCharRNN::copyRNNOutputsToInputs(buffers); + SampleCharRNNBase::copyRNNOutputsToInputs(buffers); genstr.push_back(a); } @@ -593,14 +824,14 @@ bool SampleCharRNN::infer() // Generate predicted sequence of characters for (size_t x = 0, y = expected.size() - 1; x < y; x++) { - SampleCharRNN::copyEmbeddingToInput(buffers, *genstr.rbegin()); + SampleCharRNNBase::copyEmbeddingToInput(buffers, *genstr.rbegin()); - if (!SampleCharRNN::stepOnce(buffers, context, stream)) + if (!SampleCharRNNBase::stepOnce(buffers, context, stream)) { return false; } - SampleCharRNN::copyRNNOutputsToInputs(buffers); + SampleCharRNNBase::copyRNNOutputsToInputs(buffers); predIdx = *reinterpret_cast(buffers.getHostBuffer(mParams.bindingNames.OUTPUT_BLOB_NAME)); genstr.push_back(mParams.charMaps.idToChar.at(predIdx)); } @@ -616,27 +847,27 @@ bool SampleCharRNN::infer() //! //! \brief Looks up the embedding tensor for a given char and copies it to input buffer //! -void SampleCharRNN::copyEmbeddingToInput(samplesCommon::BufferManager& buffers, const char& c) +void SampleCharRNNBase::copyEmbeddingToInput(samplesCommon::BufferManager& buffers, const char& c) { auto embed = mWeightMap[mParams.weightNames.EMBED_NAME]; float* inputBuffer = static_cast(buffers.getHostBuffer(mParams.bindingNames.INPUT_BLOB_NAME)); auto index = mParams.charMaps.charToID.at(c); - std::memcpy(inputBuffer, static_cast(embed.values) + index * mParams.dataSize, - buffers.size(mParams.bindingNames.INPUT_BLOB_NAME)); + std::memcpy(inputBuffer, static_cast(embed.values) + index * mParams.dataSize, buffers.size(mParams.bindingNames.INPUT_BLOB_NAME)); } //! //! \brief Perform one time step of inference with the TensorRT execution context //! -bool SampleCharRNN::stepOnce( +bool SampleCharRNNBase::stepOnce( samplesCommon::BufferManager& buffers, SampleUniquePtr& context, cudaStream_t& stream) { // Asynchronously copy data from host input buffers to device input buffers buffers.copyInputToDeviceAsync(stream); // Asynchronously enqueue the inference work - if (!context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr)) + if (mParams.useILoop ? 
!context->enqueueV2(buffers.getDeviceBindings().data(), stream, nullptr) + : !context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr)) { return false; } @@ -650,7 +881,7 @@ bool SampleCharRNN::stepOnce( //! //! \brief Copies Ct/Ht output from the RNN to the Ct-1/Ht-1 input buffers for next time step //! -void SampleCharRNN::copyRNNOutputsToInputs(samplesCommon::BufferManager& buffers) +void SampleCharRNNBase::copyRNNOutputsToInputs(samplesCommon::BufferManager& buffers) { // Copy Ct/Ht to the Ct-1/Ht-1 slots. void* hiddenIn = buffers.getHostBuffer(mParams.bindingNames.HIDDEN_IN_BLOB_NAME); @@ -668,7 +899,7 @@ void SampleCharRNN::copyRNNOutputsToInputs(samplesCommon::BufferManager& buffers //! //! \brief Used to clean up any state created in the sample class //! -bool SampleCharRNN::teardown() +bool SampleCharRNNBase::teardown() { // Clean up runtime resources for (auto& mem : mWeightMap) @@ -705,6 +936,7 @@ SampleCharRNNParams initializeSampleParams(const samplesCommon::Args& args) params.vocabSize = 65; params.outputSize = 1; params.weightFileName = locateFile("char-rnn.wts", params.dataDirs); + params.useILoop = args.useILoop; // Input strings and their respective expected output strings const std::vector inS{ @@ -745,10 +977,8 @@ void printHelpInfo() { std::cout << "Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=]\n"; std::cout << "--help Display help information\n"; - std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " - "multiple times to add multiple directories. If no data directories are given, the default is to use " - "data/samples/char-rnn/ and data/char-rnn/" - << std::endl; + std::cout << "--useILoop Use ILoop LSTM definition\n"; + std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use data/samples/char-rnn/ and data/char-rnn/" << std::endl; } //! @@ -776,19 +1006,29 @@ int main(int argc, char** argv) gLogger.reportTestStart(sampleTest); SampleCharRNNParams params = initializeSampleParams(args); - SampleCharRNN sample(params); + std::unique_ptr sample; + + if (args.useILoop) + { + sample.reset(new SampleCharRNNLoop(params)); + } + else + { + sample.reset(new SampleCharRNNv2(params)); + } - gLogInfo << "Building and running a GPU inference engine for Char RNN model..." << std::endl; + gLogInfo << "Building and running a GPU inference engine for Char RNN model..." 
+ << std::endl; - if (!sample.build()) + if (!sample->build()) { return gLogger.reportFail(sampleTest); } - if (!sample.infer()) + if (!sample->infer()) { return gLogger.reportFail(sampleTest); } - if (!sample.teardown()) + if (!sample->teardown()) { return gLogger.reportFail(sampleTest); } diff --git a/samples/opensource/sampleDynamicReshape/README.md b/samples/opensource/sampleDynamicReshape/README.md index 66697eb3..bdeb77bf 100644 --- a/samples/opensource/sampleDynamicReshape/README.md +++ b/samples/opensource/sampleDynamicReshape/README.md @@ -33,11 +33,11 @@ Specifically, this sample: ### Creating the preprocessing network First, create a network with full dims support: -`auto preprocessorNetwork = this->makeUnique(builder->createNetworkV2(1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));` +`auto preprocessorNetwork = makeUnique(builder->createNetworkV2(1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));` Next, add an input layer that accepts an input with a dynamic shape, followed by a resize layer that will reshape the input to the shape the model expects: ``` -auto input = preprocessorNetwork->addInput("input", nvinfer1::DataType::kFLOAT, Dims3{1, -1, -1}); +auto input = preprocessorNetwork->addInput("input", nvinfer1::DataType::kFLOAT, Dims4{1, 1, -1, -1}); auto resizeLayer = preprocessorNetwork->addResize(*input); resizeLayer->setOutputDimensions(mPredictionInputDims); preprocessorNetwork->markOutput(*resizeLayer->getOutput(0)); @@ -47,9 +47,10 @@ The -1 dimensions denote dimensions that will be supplied at runtime. ### Parsing the ONNX MNIST model -First, create an empty network, and parser: +First, create an empty full-dims network, and parser: ``` -auto network = this->makeUnique(builder->createNetwork()); +const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +auto network = makeUnique(builder->createNetworkV2(explicitBatch)); auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger()); ``` @@ -62,17 +63,17 @@ parser->parseFromFile(locateFile(mParams.onnxFileName, mParams.dataDirs).c_str() When building the preprocessor engine, also provide an optimization profile so that TensorRT knows which input shapes to optimize for: ``` -auto preprocessorConfig = this->makeUnique(builder->createNetworkConfig()); +auto preprocessorConfig = makeUnique(builder->createNetworkConfig()); auto profile = builder->createOptimizationProfile(); ``` `OptProfileSelector::kOPT` specifies the dimensions that the profile will be optimized for, whereas `OptProfileSelector::kMIN` and `OptProfileSelector::kMAX` specify the minimum and maximum dimensions for which the profile will be valid: ``` -profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims3{1, 1, 1}); -profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims3{1, 28, 28}); -profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims3{1, 56, 56}); +profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{1, 1, 1, 1}); +profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{1, 1, 28, 28}); +profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{1, 1, 56, 56}); preprocessorConfig->addOptimizationProfile(profile); -mPreprocessorEngine = this->makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig)); +mPreprocessorEngine = makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig)); ``` For the MNIST model, attach 
a Softmax layer to the end of the network and replace the existing network output with the Softmax: @@ -83,7 +84,7 @@ network->markOutput(*softmax->getOutput(0)); ``` Finally, build as normal: -`mPredictionEngine = this->makeUnique(builder->buildEngineWithConfig(*network, *config));` +`mPredictionEngine = makeUnique(builder->buildEngineWithConfig(*network, *config));` ### Running inference @@ -104,7 +105,7 @@ bool status = mPreprocessorContext->executeV2(preprocessorBindings.data()); Then, run the MNIST engine: ``` std::vector predicitonBindings = {mPredictionInput.data(), mOutput.deviceBuffer.data()}; -status = mPredictionContext->execute(mParams.batchSize, predicitonBindings.data()); +status = mPredictionContext->executeV2(predicitonBindings.data()); ``` Finally, copy the output back to the host: diff --git a/samples/opensource/sampleINT8/README.md b/samples/opensource/sampleINT8/README.md index b10b2897..8dd7dd34 100644 --- a/samples/opensource/sampleINT8/README.md +++ b/samples/opensource/sampleINT8/README.md @@ -18,7 +18,7 @@ * [Generating batch files for non-Caffe users](#generating-batch-files-for-non-caffe-users) - [Running the sample](#running-the-sample) * [Sample `--help` options](#sample---help-options) -- [Additional resources](#additiona-resources) +- [Additional resources](#additional-resources) - [License](#license) - [Changelog](#changelog) - [Known issues](#known-issues) @@ -34,7 +34,7 @@ Specifically, this sample demonstrates how to perform inference in 8-bit integer INT8 engines are build from 32-bit network definitions, similarly to 32-bit and 16-bit engines, but with more configuration steps. In particular, the builder and network must be configured to use INT8, which requires per-tensor dynamic ranges. The INT8 calibrator can determine how best to represent weights and activations as 8-bit integers and sets the per tensor dynamic ranges accordingly. Alternatively, you can set custom per tensor dynamic ranges; this is covered in sampleINT8API. -This sample is accompanied by the [MNIST training set](https://github.com/BVLC/caffe/blob/master/data/mnist/get_mnist.sh) located in the `TensorRT-x.x.x.x/data/mnist` directory, where `x.x.x.x` is your installed version of TensorRT. The packaged MNIST model that is shipped with this sample is based on [lenet.prototxt](https://github.com/BVLC/caffe/edit/master/examples/mnist/lenet.prototxt). For more information, see the [MNIST BVLC Caffe example](https://github.com/BVLC/caffe/tree/master/examples/mnist). This sample can also be used with other Image classification models, for example, [deploy.prototxt](https://github.com/BVLC/caffe/blob/master/models/bvlc_googlenet/deploy.prototxt). +This sample is accompanied by the [MNIST training set](https://github.com/BVLC/caffe/blob/master/data/mnist/get_mnist.sh) located in the `TensorRT-x.x.x.x/data/mnist` directory, where `x.x.x.x` is your installed version of TensorRT. The packaged MNIST model that is shipped with this sample is based on [lenet.prototxt](https://github.com/BVLC/caffe/edit/master/examples/mnist/lenet.prototxt). For more information, see the [MNIST BVLC Caffe example](https://github.com/BVLC/caffe/tree/master/examples/mnist). The packaged data set file that is shipped with this sample is based on the [MNIST data set](https://github.com/BVLC/caffe/tree/master/data/mnist). However, the batch file generation from the above data set is described in [Batch files for calibration](#batch-files-for-calibration). 
diff --git a/samples/opensource/sampleINT8/sampleINT8.cpp b/samples/opensource/sampleINT8/sampleINT8.cpp index ff1e238e..19853583 100644 --- a/samples/opensource/sampleINT8/sampleINT8.cpp +++ b/samples/opensource/sampleINT8/sampleINT8.cpp @@ -282,7 +282,7 @@ bool SampleINT8::infer(std::pair& score, int firstScoreBatch, int } MNISTBatchStream batchStream( - mParams.batchSize, nbScoreBatches, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", mParams.dataDirs); + mParams.batchSize, nbScoreBatches + firstScoreBatch, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", mParams.dataDirs); batchStream.skip(firstScoreBatch); Dims outputDims = context->getEngine().getBindingDimensions( @@ -340,7 +340,7 @@ bool SampleINT8::infer(std::pair& score, int firstScoreBatch, int } } - int imagesRead = batchStream.getBatchesRead() * mParams.batchSize; + int imagesRead = (batchStream.getBatchesRead() - firstScoreBatch) * mParams.batchSize; score.first = float(top1) / float(imagesRead); score.second = float(top5) / float(imagesRead); @@ -454,16 +454,16 @@ void printHelpInfo() int main(int argc, char** argv) { - if (argc >= 2 && (!strncmp(argv[1], "help", 4) || !strncmp(argv[1], "--help", 6) || !strncmp(argv[1], "--h", 3))) + if (argc >= 2 && (!strncmp(argv[1], "help", 4) || !strncmp(argv[1], "--help", 6) || !strncmp(argv[1], "--h", 3) || !strncmp(argv[1], "-h", 2))) { printHelpInfo(); return EXIT_FAILURE; } - // By default we score over 40K images starting at 3200, so we don't score those used to search calibration + // By default we score over 57600 images starting at 512, so we don't score those used to search calibration int batchSize = 32; - int firstScoreBatch = 100; - int nbScoreBatches = 400; + int firstScoreBatch = 16; + int nbScoreBatches = 1800; // Parse extra arguments for (int i = 1; i < argc; ++i) @@ -488,9 +488,9 @@ int main(int argc, char** argv) return EXIT_FAILURE; } - if ((firstScoreBatch + nbScoreBatches) * batchSize > 500000) + if ((firstScoreBatch + nbScoreBatches) * batchSize > 60000) { - gLogError << "Only 50000 images available" << std::endl; + gLogError << "Only 60000 images available" << std::endl; return EXIT_FAILURE; } @@ -530,7 +530,7 @@ int main(int argc, char** argv) } auto isApproximatelyEqual = [](float a, float b, double tolerance) { return (std::abs(a - b) <= tolerance); }; - double fp16tolerance{0.5}, int8tolerance{1.0}; + double fp16tolerance{0.5}, int8tolerance{0.01}; if (scores[1].first != 0.0f && !isApproximatelyEqual(scores[0].first, scores[1].first, fp16tolerance)) { diff --git a/samples/opensource/sampleINT8API/README.md b/samples/opensource/sampleINT8API/README.md index 173ffdd7..b41e0dad 100644 --- a/samples/opensource/sampleINT8API/README.md +++ b/samples/opensource/sampleINT8API/README.md @@ -161,7 +161,7 @@ The ResNet-50 per tensor dynamic ranges file. The image to be inferred. 1. Download the [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/vision/classification/resnet/resnet50). - `wget https://s3.amazonaws.com/download.onnx/models/opset_3/resnet50.tar.gz` + `wget https://s3.amazonaws.com/download.onnx/models/opset_9/resnet50.tar.gz` 2. Unpackage the model file. `tar -xvzf resnet50.tar.gz` @@ -219,20 +219,20 @@ The image to be inferred. To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. 
For example: ``` Usage: - ./sample_int8_api [--model=model_file] [--ranges=per_tensor_dynamic_range_file] [--image=image_file] [--reference=reference_file] [--data=/path/to/data/dir] [--useDLACore=] [-v or --verbose] + ./sample_int8_api [-h or --help] [--model=] [--ranges=] [--image=] [--reference=] [--data=] [--useDLACore=] [-v or --verbose] Help: ./sample_int8_api [-h or --help] - -h or --help. Display This help information - --model=model_file.onnx or /absolute/path/to/model_file.onnx. Generate model file using README.md in case it does not exists. Default to resnet50.onnx - --image=image.ppm or /absolute/path/to/image.ppm. Image to infer. Defaults to airlines.ppm - --reference=reference.txt or /absolute/path/to/reference.txt. Reference labels file. Defaults to reference_labels.txt - --ranges=ranges.txt or /absolute/path/to/ranges.txt. Specify custom per tensor dynamic range for the network. Defaults to resnet50_per_tensor_dynamic_range.txt - --write_tensors. Option to generate file containing network tensors name. By default writes to network_tensors.txt file. To provide user defined file name use additional option --network_tensors_file. See --network_tensors_file option usage for more detail. - --network_tensors_file=network_tensors.txt or /absolute/path/to/network_tensors.txt. This option needs to be used with --write_tensors option. Specify file name (will write to current execution directory) or absolute path to file name to write network tensor names file. Dynamic range corresponding to each network tensor is required to run the sample. Defaults to network_tensors.txt - --data=/path/to/data/dir. Specify data directory to search for above files in case absolute paths to files are not provided. Defaults to data/samples/ int8_api/ or data/int8_api/ - --useDLACore=N. Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform. - --verbose. Outputs per tensor dynamic range and layer precision info for the network + --help, -h Display This help information + --model= Path to the model file (default = resnet50.onnx) + --image= Path to the image file to infer (default = airlines.ppm) + --reference= Path to the reference labels file (default = reference_labels.txt) + --ranges= Path to the custom per tensor dynamic range file for the network (default = resnet50_per_tensor_dynamic_range.txt) + --write_tensors Option to generate file containing network tensors name. By default writes to network_tensors.txt file. To provide user defined file name use additional option --network_tensors_file. See --network_tensors_file option usage for more detail. + --network_tensors_file= Path to the network tensors file. This option needs to be used with --write_tensors option. Specify file name (will write to current execution directory) or absolute path to file name to write network tensor names file. Dynamic range corresponding to each network tensor is required to run the sample (default = network_tensors.txt) + --data= Path to the data directory to search for above files in case absolute paths to files are not provided (default both data/samples/int8_api/ and data/int8_api/) + --useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform. 
+ --verbose Enable verbose prints ``` # Models other than ResNet-50 with custom configuration diff --git a/samples/opensource/sampleINT8API/sampleINT8API.cpp b/samples/opensource/sampleINT8API/sampleINT8API.cpp index 637af5df..7a8d3eff 100644 --- a/samples/opensource/sampleINT8API/sampleINT8API.cpp +++ b/samples/opensource/sampleINT8API/sampleINT8API.cpp @@ -43,12 +43,8 @@ const std::string gSampleName = "TensorRT.sample_int8_api"; struct SampleINT8APIPreprocessing { - // Preprocessing values are available here: - // https://github.com/onnx/models/tree/master/models/image_classification/resnet - std::vector mean{0.485f, 0.456f, 0.406f}; - std::vector std{0.229f, 0.224f, 0.225f}; - float scale{255.0f}; - std::vector inputDims{1, 3, 224, 224}; + // Preprocessing values are available here: https://github.com/onnx/models/tree/master/models/image_classification/resnet + std::vector inputDims{1,3,224,224}; }; //! @@ -223,8 +219,16 @@ void SampleINT8API::setLayerPrecision(SampleUniquePtrgetName(); gLogInfo << "Layer: " << layerName << ". Precision: INT8" << std::endl; } - // set computation precision of the layer - layer->setPrecision(nvinfer1::DataType::kINT8); + + // Don't set the precision on non-computation layers as they don't support + // int8. + if (layer->getType() != LayerType::kCONSTANT + && layer->getType() != LayerType::kCONCATENATION + && layer->getType() != LayerType::kSHAPE) + { + // set computation precision of the layer + layer->setPrecision(nvinfer1::DataType::kINT8); + } for (int j = 0; j < layer->getNbOutputs(); ++j) { @@ -234,8 +238,11 @@ void SampleINT8API::setLayerPrecision(SampleUniquePtrgetOutput(j)->getName(); gLogInfo << "Tensor: " << tensorName << ". OutputType: INT8" << std::endl; } - // set output type of the tensor - layer->setOutputType(j, nvinfer1::DataType::kINT8); + // set output type of execution tensors and not shape tensors. + if (layer->getOutput(j)->isExecutionTensor()) + { + layer->setOutputType(j, nvinfer1::DataType::kINT8); + } } } } @@ -305,15 +312,13 @@ bool SampleINT8API::setDynamicRange(SampleUniquePtrgetNbInputs(); ++i) { string tName = network->getInput(i)->getName(); if (mPerTensorDynamicRangeMap.find(tName) != mPerTensorDynamicRangeMap.end()) { - network->getInput(i)->setDynamicRange( - -mPerTensorDynamicRangeMap.at(tName), mPerTensorDynamicRangeMap.at(tName)); + network->getInput(i)->setDynamicRange(-mPerTensorDynamicRangeMap.at(tName), mPerTensorDynamicRangeMap.at(tName)); } else { @@ -324,6 +329,55 @@ bool SampleINT8API::setDynamicRange(SampleUniquePtrgetNbLayers(); ++i) + { + auto lyr = network->getLayer(i); + for (int j = 0, e = lyr->getNbOutputs(); j < e; ++j) + { + string tName = lyr->getOutput(j)->getName(); + if (mPerTensorDynamicRangeMap.find(tName) != mPerTensorDynamicRangeMap.end()) + { + // Calibrator generated dynamic range for network tensor can be overriden or set using below API + lyr->getOutput(j)->setDynamicRange( + -mPerTensorDynamicRangeMap.at(tName), mPerTensorDynamicRangeMap.at(tName)); + } + else if (lyr->getType() == LayerType::kCONSTANT) + { + IConstantLayer* cLyr = static_cast(lyr); + if (mParams.verbose) + { + gLogWarning << "Computing missing dynamic range for tensor, " << tName << ", from weights." 
+ << std::endl; + } + auto wts = cLyr->getWeights(); + double max = std::numeric_limits::min(); + for (int64_t wb = 0, we = wts.count; wb < we; ++wb) + { + double val; + switch (wts.type) + { + case DataType::kFLOAT: val = static_cast(wts.values)[wb]; break; + case DataType::kBOOL: val = static_cast(wts.values)[wb]; break; + case DataType::kINT8: val = static_cast(wts.values)[wb]; break; + case DataType::kHALF: val = static_cast(wts.values)[wb]; break; + case DataType::kINT32: val = static_cast(wts.values)[wb]; break; + } + max = std::max(max, std::abs(val)); + } + + lyr->getOutput(j)->setDynamicRange(-max, max); + } + else + { + if (mParams.verbose) + { + gLogWarning << "Missing dynamic range for tensor: " << tName << std::endl; + } + } + } + } + // set dynamic range for layer output tensors for (int i = 0; i < network->getNbLayers(); ++i) { @@ -398,9 +452,7 @@ bool SampleINT8API::prepareInput(const samplesCommon::BufferManager& buffers) // 1. Scale Image to range [0.f, 1.0f] // 2. Normalize Image using per channel Mean and per channel Standard Deviation // 3. Shuffle HWC to CHW form - hostInputBuffer[dstIdx] - = (float(fileData[srcIdx]) / mParams.mPreproc.scale - mParams.mPreproc.mean.at(c)) - / mParams.mPreproc.std.at(c); + hostInputBuffer[dstIdx] = (2.0 / 255.0) * static_cast(fileData[srcIdx]) - 1.0; } } } @@ -748,37 +800,35 @@ SampleINT8APIParams initializeSampleParams(SampleINT8APIArgs args) //! void printHelpInfo() { - std::cout << "Usage: ./sample_int8_api [-h or --help] [--model=model_file] " - "[--ranges=per_tensor_dynamic_range_file] [--image=image_file] [--reference=reference_file] " - "[--data=/path/to/data/dir] [--useDLACore=] [-v or --verbose]\n"; - std::cout << "-h or --help. Display This help information" << std::endl; - std::cout << "--model=model_file.onnx or /absolute/path/to/model_file.onnx. Generate model file using README.md in " - "case it does not exists. Default to resnet50.onnx" + std::cout << "Usage: ./sample_int8_api [-h or --help] [--model=] " + "[--ranges=] [--image=] [--reference=] " + "[--data=] [--useDLACore=] [-v or --verbose]\n"; + std::cout << "--help, -h Display This help information" << std::endl; + std::cout << "--model= Path to the model file (default = resnet50.onnx)" << std::endl; - std::cout << "--image=image.ppm or /absolute/path/to/image.ppm. Image to infer. Defaults to airlines.ppm" + std::cout << "--image= Path to the image file to infer (default = airlines.ppm)" << std::endl; - std::cout << "--reference=reference.txt or /absolute/path/to/reference.txt. Reference labels file. Defaults to " - "reference_labels.txt" + std::cout << "--reference= Path to the reference labels file (default = reference_labels.txt)" << std::endl; - std::cout << "--ranges=ranges.txt or /absolute/path/to/ranges.txt. Specify custom per tensor dynamic range for the " - "network. Defaults to resnet50_per_tensor_dynamic_range.txt" + std::cout << "--ranges= Path to the custom per tensor dynamic range file for the network " + "(default = resnet50_per_tensor_dynamic_range.txt)" << std::endl; - std::cout << "--write_tensors. Option to generate file containing network tensors name. By default writes to " + std::cout << "--write_tensors Option to generate file containing network tensors name. By default writes to " "network_tensors.txt file. To provide user defined file name use additional option " "--network_tensors_file. See --network_tensors_file option usage for more detail." 
<< std::endl; - std::cout << "--network_tensors_file=network_tensors.txt or /absolute/path/to/network_tensors.txt. This option " + std::cout << "--network_tensors_file= Path to the network tensors file. This option " "needs to be used with --write_tensors option. Specify file name (will write to current execution " "directory) or absolute path to file name to write network tensor names file. Dynamic range " - "corresponding to each network tensor is required to run the sample. Defaults to network_tensors.txt" + "corresponding to each network tensor is required to run the sample (default = network_tensors.txt)" << std::endl; - std::cout << "--data=/path/to/data/dir. Specify data directory to search for above files in case absolute paths to " - "files are not provided. Defaults to data/samples/int8_api/ or data/int8_api/" + std::cout << "--data= Path to the data directory to search for above files in case absolute paths to " + "files are not provided (default both data/samples/int8_api/ and data/int8_api/)" << std::endl; - std::cout << "--useDLACore=N. Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " + std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " "where n is the number of DLA engines on the platform." << std::endl; - std::cout << "--verbose. Outputs per tensor dynamic range and layer precision info for the network" << std::endl; + std::cout << "--verbose Enable verbose prints" << std::endl; } int main(int argc, char** argv) diff --git a/samples/opensource/sampleMNISTAPI/README.md b/samples/opensource/sampleMNISTAPI/README.md index adfd7fb3..361512d9 100644 --- a/samples/opensource/sampleMNISTAPI/README.md +++ b/samples/opensource/sampleMNISTAPI/README.md @@ -63,7 +63,7 @@ When you build a network by individually creating every layer, ensure you provid In this statement, we are loading the filter weights weightsMap["conv1filter"] and bias weightsMap["conv1bias"] to the convolution layer. ``` - IConvolutionLayer* conv1 = network->addConvolution(*scale_1->getOutput(0), 20, DimsHW{5, 5}, weightMap["conv1filter"], weightMap["conv1bias"]); + IConvolutionLayer* conv1 = network->addConvolutionNd(*scale_1->getOutput(0), 20, Dims{2, {5, 5}, {}}, weightMap["conv1filter"], weightMap["conv1bias"]); ``` ## Running the sample @@ -152,7 +152,7 @@ The following resources provide a deeper understanding about MNIST: - [MNIST dataset](https://github.com/NVIDIA/DIGITS/blob/master/docs/GettingStarted.md) **Documentation** -- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#samples) +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) - [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) - [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) diff --git a/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp b/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp index d03e7222..9d3452d1 100644 --- a/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp +++ b/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp @@ -185,24 +185,24 @@ bool SampleMNISTAPI::constructNetwork(SampleUniquePtr& build assert(scale_1); // Add convolution layer with 20 outputs and a 5x5 filter. 
- IConvolutionLayer* conv1 = network->addConvolution( - *scale_1->getOutput(0), 20, DimsHW{5, 5}, mWeightMap["conv1filter"], mWeightMap["conv1bias"]); + IConvolutionLayer* conv1 = network->addConvolutionNd( + *scale_1->getOutput(0), 20, Dims{2, {5, 5}, {}}, mWeightMap["conv1filter"], mWeightMap["conv1bias"]); assert(conv1); conv1->setStride(DimsHW{1, 1}); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. - IPoolingLayer* pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); + IPoolingLayer* pool1 = network->addPoolingNd(*conv1->getOutput(0), PoolingType::kMAX, Dims{2, {2, 2}, {}}); assert(pool1); pool1->setStride(DimsHW{2, 2}); // Add second convolution layer with 50 outputs and a 5x5 filter. - IConvolutionLayer* conv2 = network->addConvolution( - *pool1->getOutput(0), 50, DimsHW{5, 5}, mWeightMap["conv2filter"], mWeightMap["conv2bias"]); + IConvolutionLayer* conv2 = network->addConvolutionNd( + *pool1->getOutput(0), 50, Dims{2, {5, 5}, {}}, mWeightMap["conv2filter"], mWeightMap["conv2bias"]); assert(conv2); conv2->setStride(DimsHW{1, 1}); // Add second max pooling layer with stride of 2x2 and kernel size of 2x3> - IPoolingLayer* pool2 = network->addPooling(*conv2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); + IPoolingLayer* pool2 = network->addPoolingNd(*conv2->getOutput(0), PoolingType::kMAX, Dims{2, {2, 2}, {}}); assert(pool2); pool2->setStride(DimsHW{2, 2}); diff --git a/samples/opensource/sampleMovieLens/README.md b/samples/opensource/sampleMovieLens/README.md index 68e6d73c..cef6c719 100644 --- a/samples/opensource/sampleMovieLens/README.md +++ b/samples/opensource/sampleMovieLens/README.md @@ -90,14 +90,16 @@ This sample comes with a pre-trained model. However, if you want to train your o - `sampleMovieLens.pb`: The frozen TensorFlow graph which contains the information of the network structure and parameters. 5. Convert the trained model weights to UFF format which sampleMovieLens understands. - 1. Install UFF. The `convert_to_uff.py` utility is located in the `/usr/local/bin/convert-to-uff` directory. This utility is installed with the `UFF.whl` file that is shipped with TensorRT. - 2. Convert the `frozen .pb` file to `.uff` format. + 1. Convert the `frozen .pb` file to `.uff` format. ``` - python3 convert_to_uff.py sampleMovieLens.pb -p preprocess.py + convert-to-uff sampleMovieLens.pb -p preprocess.py ``` The `preprocess.py` script is a preprocessing step that needs to be applied to the TensorFlow graph before it can be used by TensorRT. The reason for this is that TensorFlow's concatenation operation accounts for the batch dimension while TensorRT's concatenation operation does not. - 3. Copy: + + The `convert-to-uff` tool is installed together with UFF installation. If you install UFF with deb/rpm, please use the `convert_to_uff.py` script located in `/usr/lib/python3.X/dist-packages/uff*/bin`. + + 2. Copy: - The `sampleMovieLens.uff` file to the `/data/movielens` directory. - The `movielens_ratings.txt` file to the `/data/movielens` directory. 
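Once the `.uff` file and the ratings file are in place, the sample builds its engine by parsing the UFF model. A rough sketch of such a parsing step is shown below; the tensor names are placeholders for illustration only, since the actual names come from the frozen TensorFlow graph:
```
// Illustrative only: parse a UFF file into a TensorRT network definition.
// The input/output names below are placeholders, not the real MovieLens names.
auto parser = nvuffparser::createUffParser();
parser->registerInput("user_input", nvinfer1::Dims3(1, 1, 1), nvuffparser::UffInputOrder::kNCHW);
parser->registerInput("item_input", nvinfer1::Dims3(1, 1, 1), nvuffparser::UffInputOrder::kNCHW);
parser->registerOutput("prediction/Sigmoid");
parser->parse("sampleMovieLens.uff", *network, nvinfer1::DataType::kFLOAT);
```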
diff --git a/samples/opensource/sampleNMT/trtUtil.cpp b/samples/opensource/sampleNMT/trtUtil.cpp index b2c51fa7..d51c3a3f 100644 --- a/samples/opensource/sampleNMT/trtUtil.cpp +++ b/samples/opensource/sampleNMT/trtUtil.cpp @@ -30,6 +30,7 @@ int inferTypeToBytes(nvinfer1::DataType t) case nvinfer1::DataType::kHALF: return sizeof(int16_t); break; default: assert(0); break; } + return 0; }; int getVolume(nvinfer1::Dims dims) diff --git a/samples/opensource/samplePlugin/samplePlugin.cpp b/samples/opensource/samplePlugin/samplePlugin.cpp index 253fc8f0..c8d7a450 100644 --- a/samples/opensource/samplePlugin/samplePlugin.cpp +++ b/samples/opensource/samplePlugin/samplePlugin.cpp @@ -158,7 +158,7 @@ bool SamplePlugin::build() { config->setFlag(BuilderFlag::kINT8); } - samplesCommon::setDummyInt8Scales(config.get(), network.get()); + samplesCommon::setAllTensorScales(network.get(), 127.0f, 127.0f); samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); @@ -273,7 +273,13 @@ bool SamplePlugin::infer() assert(mParams.outputTensorNames.size() == 1); bool outputCorrect = verifyOutput(buffers, mParams.outputTensorNames[0], digit); - return outputCorrect; + // The output correctness is not used to determine the test result. + if (!outputCorrect && mParams.dlaCore != -1) + { + gLogInfo << "Warning: infer result is not correct. It maybe caused by dummy scales in INT8 mode." << std::endl; + } + + return true; } //! diff --git a/samples/opensource/sampleSSD/batchPrepare.py b/samples/opensource/sampleSSD/batchPrepare.py index 832f4cbb..e1fd827d 100644 --- a/samples/opensource/sampleSSD/batchPrepare.py +++ b/samples/opensource/sampleSSD/batchPrepare.py @@ -35,7 +35,7 @@ width = 300 NUM_BATCHES = 0 NUM_PER_BATCH = 1 -NUM_CALIBRATION_IMAGES = 500 +NUM_CALIBRATION_IMAGES = 50 parser = argparse.ArgumentParser() parser.add_argument('--inDir', required=True, help='Input directory') diff --git a/samples/opensource/sampleSSD/sampleSSD.cpp b/samples/opensource/sampleSSD/sampleSSD.cpp index 133e7aea..108d6c40 100644 --- a/samples/opensource/sampleSSD/sampleSSD.cpp +++ b/samples/opensource/sampleSSD/sampleSSD.cpp @@ -187,7 +187,6 @@ bool SampleSSD::constructNetwork(SampleUniquePtr& builder, { config->setFlag(BuilderFlag::kFP16); } - samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); // Calibrator life time needs to last until after the engine is built. std::unique_ptr calibrator; @@ -203,6 +202,7 @@ bool SampleSSD::constructNetwork(SampleUniquePtr& builder, config->setInt8Calibrator(calibrator.get()); } + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); mEngine = std::shared_ptr( builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) @@ -398,7 +398,7 @@ SampleSSDParams initializeSampleParams(const samplesCommon::Args& args) params.outputClsSize = 21; params.keepTopK = 200; // Number of total bboxes to be kept per image after NMS step. 
It is same as // detection_output_param.keep_top_k in prototxt file - params.nbCalBatches = 500; + params.nbCalBatches = 50; params.visualThreshold = 0.6f; params.calibrationBatches = "batches/batch_calibration"; diff --git a/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp b/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp index d4718ca7..0a769b21 100644 --- a/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp +++ b/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp @@ -332,7 +332,7 @@ bool SampleUffFasterRcnn::infer() // Memcpy from host input buffers to device input buffers buffers.copyInputToDevice(); - bool status; + bool status{true}; for (int i = 0; i < mParams.repeat; ++i) { @@ -358,7 +358,7 @@ bool SampleUffFasterRcnn::infer() return false; } - return true; + return status; } bool SampleUffFasterRcnn::teardown() @@ -446,7 +446,7 @@ SampleUffFasterRcnnParams initializeSampleParams(const FrcnnArgs& args) params.dataDirs.push_back("data/samples/faster-rcnn/"); } - assert(args.batchSize == args.inputImages.size()); + assert(args.batchSize == static_cast(args.inputImages.size())); params.inputImages = args.inputImages; params.uffFileName = "faster_rcnn.uff"; params.inputNodeName = "input_1"; diff --git a/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp b/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp index 97a1fabc..eda6a0f9 100644 --- a/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp +++ b/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp @@ -14,6 +14,11 @@ * limitations under the License. */ +#ifndef _MSC_VER +#include +#include +#endif + #include #include #include @@ -23,7 +28,6 @@ #include #include #include -#include #include #include @@ -38,9 +42,6 @@ // max #include -// data type -#include - // MaskRCNN Parameter #include "mrcnn_config.h" diff --git a/samples/opensource/trtexec/CMakeLists.txt b/samples/opensource/trtexec/CMakeLists.txt index 8646eef6..1ea7b229 100644 --- a/samples/opensource/trtexec/CMakeLists.txt +++ b/samples/opensource/trtexec/CMakeLists.txt @@ -23,5 +23,4 @@ SET(SAMPLE_SOURCES set(SAMPLE_PARSERS "caffe" "uff" "onnx") set(PLUGINS_NEEDED ON) - include(../../CMakeSamplesTemplate.txt) diff --git a/samples/opensource/trtexec/README.md b/samples/opensource/trtexec/README.md index 0ddc25bc..f8978ef5 100644 --- a/samples/opensource/trtexec/README.md +++ b/samples/opensource/trtexec/README.md @@ -8,6 +8,8 @@ * [Example 2: Profiling a custom layer](#example-2-profiling-a-custom-layer) * [Example 3: Running a network on DLA](#example-3-running-a-network-on-dla) * [Example 4: Running an ONNX model with full dimensions and dynamic shapes](#example-4-running-an-onnx-model-with-full-dimensions-and-dynamic-shapes) + * [Example 5: Collecting and printing a timing trace](#example-5-collecting-and-printing-a-timing-trace) + * [Example 6: Tune throughput with multi-streaming](#example-6-tune-throughput-with-multi-streaming) - [Tool command line arguments](#tool-command-line-arguments) - [Additional resources](#additional-resources) - [License](#license) @@ -77,25 +79,53 @@ For more information about DLA, see [Working With DLA](https://docs.nvidia.com/d To run an ONNX model in full-dimensions mode with static input shapes: ``` -./trtexec --onnx=model.onnx --explicitBatch +./trtexec --onnx=model.onnx ``` The following examples assumes an ONNX model with one dynamic input with name `input` and dimensions `[-1, 3, 244, 244]` -To run an ONNX model with dynamic shapes with a given input 
shape: +To run an ONNX model in full-dimensions mode with an given input shape: ``` -./trtexec --onnx=model.onnx --explicitBatch --shapes=input:32x3x244x244 +./trtexec --onnx=model.onnx --shapes=input:32x3x244x244 ``` To benchmark your ONNX model with a range of possible input shapes: ``` -./trtexec --onnx=model.onnx --explicitBatch --minShapes=input:1x3x244x244 --optShapes=input:16x3x244x244 --maxShapes=input:32x3x244x244 --shapes=input:5x3x244x244 +./trtexec --onnx=model.onnx --minShapes=input:1x3x244x244 --optShapes=input:16x3x244x244 --maxShapes=input:32x3x244x244 --shapes=input:5x3x244x244 ``` For more information about using dynamic shapes, see [Working With Dynamic Shapes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#work_dynamic_shapes) +### Example 5: Collecting and printing a timing trace + +When running, `trtexec` prints the measured performance, but can also export the measurement trace to a json file: +``` +./trtexec --deploy=data/AlexNet/AlexNet_N2.prototxt --output=prob --exportTimes=trace.json +``` +Once the trace is stored in a file, it can be printed using the `tracer.py` utility. This tool prints timestamps and duration of input, compute, and output, in different forms: +``` +./tracer.py trace.json +``` +Similarly, profiles can also be printed and stored in a json file. The utility `profiler.py` can be used to read and print the profile from a json file. + +### Example 6: Tune throughput with multi-streaming + +Tuning throughput may require running multiple concurrent streams of execution. This is the case for example when the latency achieved is well within the desired +threshold, and we can increase the throughput, even at the expense of some latency. For example, saving engines for batch sizes 1 and 2 and assume that both +execute within 2ms, the latency threshold: +``` +trtexec --deploy=GoogleNet_N2.prototxt --output=prob --batch=1 --saveEngine=g1.trt --int8 --buildOnly +trtexec --deploy=GoogleNet_N2.prototxt --output=prob --batch=2 --saveEngine=g2.trt --int8 --buildOnly +``` +Now, the saved engines can be tried to find the combination batch/streams below 2 ms that maximizes the throughput: +``` +trtexec --loadEngine=g1.trt --batch=1 --streams=2 +trtexec --loadEngine=g1.trt --batch=1 --streams=3 +trtexec --loadEngine=g1.trt --batch=1 --streams=4 +trtexec --loadEngine=g2.trt --batch=2 --streams=2 +``` ## Tool command line arguments To see the full list of available options and their descriptions, issue the `./trtexec --help` command. diff --git a/samples/opensource/trtexec/giexec b/samples/opensource/trtexec/giexec index 47837dbf..a4d2c2d6 100755 --- a/samples/opensource/trtexec/giexec +++ b/samples/opensource/trtexec/giexec @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# + # Stub to call trtexec CUR_DPATH="$(dirname "$(readlink -f "${BASH_SOURCE}")")" diff --git a/samples/opensource/trtexec/prn_utils.py b/samples/opensource/trtexec/prn_utils.py new file mode 100644 index 00000000..8998d554 --- /dev/null +++ b/samples/opensource/trtexec/prn_utils.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +''' +Simple printing utils + +Utils to print traces and profiles in CSV format +''' + + +from __future__ import print_function + +def combine_descriptions(prolog, features, descriptions): + ''' Combine features with their descriptions ''' + + full_description = prolog + sep = ' ' + for feature, description in zip(features, descriptions): + full_description += sep + feature + ' (' + description + ')' + sep = ', ' + + return full_description + + + +def print_header(allFeatures, features, gp, count): + ''' Print table header ''' + + if gp: + sep = '#' + if count: + sep += 'count, ' + else: + sep = '' + + for feature in allFeatures: + if feature in features: + print(sep + feature, end = '') + sep = ', ' + + print('') + + + +def print_csv(data, count): + ''' Print trace in CSV format ''' + + c = 0 + for row in data: + if count: + print(c, end = '') + c += 1 + sep = ', ' + else: + sep = '' + for r in row: + print('{}{:.6}'.format(sep, float(r)), end = '') + sep = ', ' + print('') + + + +def filter_data(data, all_features, feature_set): + ''' Drop features not in the given set ''' + + filteredData = [] + + for d in data: + row = [] + for f in all_features: + if f in feature_set: + row.append(d[f]) + filteredData.append(row) + + return filteredData diff --git a/samples/opensource/trtexec/profiler.py b/samples/opensource/trtexec/profiler.py new file mode 100755 index 00000000..72fa2b04 --- /dev/null +++ b/samples/opensource/trtexec/profiler.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +''' +Print a trtexec profile from a JSON file + +Given a JSON file containing a trtexec profile, +this program prints the profile in CSV table format. +Each row represents a layer in the profile. + +The output format can be optionally converted to a +format suitable for GNUPlot. 
+''' + +import sys +import json +import argparse +import prn_utils + + +all_features = ['name', 'timeMs', 'averageMs', 'percentage'] + +default_features = ",".join(all_features) + +descriptions = ['layer name', 'total layer time', 'average layer time', 'percentage of total time'] + +features_description = prn_utils.combine_descriptions('Features are (times in ms):', + all_features, descriptions) + + + +def hasNames(feature_set): + ''' Check if the name is included in the set ''' + + return 'name' in feature_set + + + +def total_data(data, names): + ''' Add row at the bottom with the total ''' + + accumulator = [] + + if names: + start = 1 + accumulator.append('total') + else: + start = 0 + for f in range(start, len(data[0])): + accumulator.append(0) + + for row in data: + for f in range(start, len(row)): + accumulator[f] += row[f] + + data.append(accumulator) + + return data + + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--features', metavar='F[,F]*', default='name,timeMs,averageMs,percentage', + help='Comma separated list of features to print. ' + features_description) + parser.add_argument('--total', action='store_true', help='Add total time row.') + parser.add_argument('--gp', action='store_true', help='Print GNUPlot format.') + parser.add_argument('--no-header', action='store_true', help='Omit the header row.') + parser.add_argument('name', metavar='filename', help='Profile file.') + args = parser.parse_args() + + feature_set = args.features.split(',') + count = args.gp and not hasNames(feature_set) + + if not args.no_header: + prn_utils.print_header(all_features, feature_set, args.gp, count) + + with open(args.name) as f: + profile = json.load(f) + + data = prn_utils.filter_data(profile[1:], all_features, feature_set) + + if args.total: + data = total_data(data, hasNames(feature_set)) + + prn_utils.print_csv(data, count) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/samples/opensource/trtexec/tracer.py b/samples/opensource/trtexec/tracer.py new file mode 100755 index 00000000..2d3c6046 --- /dev/null +++ b/samples/opensource/trtexec/tracer.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +''' +Print a trtexec timing trace from a JSON file + +Given a JSON file containing a trtexec timing trace, +this program prints the trace in CSV table format. +Each row represents an entry point in the trace. + +The columns, as indicated by the header, respresent +one of the metric recorded. The output format can +be optionally converted to a format suitable for +GNUPlot. 
+''' + +import sys +import json +import argparse +import prn_utils + + +timestamps = ['startInMs', 'endInMs', 'startComputeMs', 'endComputeMs', 'startOutMs', 'endOutMs'] + +intervals = ['inMs', 'computeMs', 'outMs', 'latencyMs', 'endToEndMs'] + +all_metrics = timestamps + intervals + +default_metrics = ",".join(all_metrics) + +descriptions = ['start input', 'end input', 'start compute', 'end compute', 'start output', + 'end output', 'input', 'compute', 'output', 'latency', 'end to end latency'] + +metrics_description = prn_utils.combine_descriptions('Possible metrics (all in ms) are:', + all_metrics, descriptions) + + + +def skip_trace(trace, start): + ''' Skip trace entries until start time ''' + + trailing = [] + + for t in trace: + if t['start compute'] >= start: + trailing.append(t) + + return trailing + + + +def hasTimestamp(metric_set): + ''' Check if features have at least one timestamp ''' + + for timestamp in timestamps: + if timestamp in metric_set: + return True + return False; + + + +def avg_data(data, avg, times): + ''' Average trace entries (every avg entries) ''' + + averaged = [] + accumulator = [] + r = 0 + + for row in data: + if r == 0: + for t in range(len(row)): + accumulator.append(row[t]) + else: + for t in range(times, len(row)): + accumulator[t] += row[t] + + r += 1 + if r == avg: + for t in range(times, len(row)): + accumulator[t] /= avg + averaged.append(accumulator) + accumulator = [] + r = 0 + + return averaged + + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--metrics', metavar='M[,M]*', default=default_metrics, + help='Comma separated list of metrics to print. ' + metrics_description) + parser.add_argument('--avg', metavar='N', type=int, default=1, help='Print average every N records.') + parser.add_argument('--start', metavar='T', type=float, default=0, help='Start trace at time T (drop records with compute start before T).') + parser.add_argument('--gp', action='store_true', help='Print GNUPlot format.') + parser.add_argument('--no-header', action='store_true', help='Omit the header row.') + parser.add_argument('name', metavar='filename', help='Trace file.') + args = parser.parse_args() + + metric_set = args.metrics.split(',') + count = args.gp and ( not hasTimestamp(metric_set) or len(metric_set) == 1) + + if not args.no_header: + prn_utils.print_header(all_metrics, metric_set, args.gp, count) + + with open(args.name) as f: + trace = json.load(f) + + if args.start > 0: + trace = skip_trace(trace, args.start) + + data = prn_utils.filter_data(trace, all_metrics, metric_set) + + if args.avg > 1: + data = avg_data(data, args.avg, hasTimestamp(metric_set)) + + prn_utils.print_csv(data, count) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/samples/opensource/trtexec/trtexec.cpp b/samples/opensource/trtexec/trtexec.cpp index 91a5d01e..809dc88f 100644 --- a/samples/opensource/trtexec/trtexec.cpp +++ b/samples/opensource/trtexec/trtexec.cpp @@ -47,6 +47,8 @@ using namespace sample; int main(int argc, char** argv) { const std::string sampleName = "TensorRT.trtexec"; + const std::string supportNote{"Note: CUDA graphs is not supported in this version."}; + auto sampleTest = gLogger.defineTest(sampleName, argc, argv); gLogger.reportTestStart(sampleTest); @@ -79,10 +81,7 @@ int main(int argc, char** argv) if (failed) { AllOptions::help(std::cout); - std::cout << "Note: the following options are not fully supported in trtexec:" - " dynamic shapes, multistream/threads, cuda graphs, json logs," - " and 
actual data IO" - << std::endl; + std::cout << supportNote << std::endl; return gLogger.reportFail(sampleTest); } } @@ -94,10 +93,7 @@ int main(int argc, char** argv) if (options.helps) { AllOptions::help(std::cout); - std::cout << "Note: the following options are not fully supported in trtexec:" - " dynamic shapes, multistream/threads, cuda graphs, json logs," - " and actual data IO" - << std::endl; + std::cout << supportNote << std::endl; return gLogger.reportPass(sampleTest); } @@ -137,16 +133,37 @@ int main(int argc, char** argv) return gLogger.reportFail(sampleTest); } - if (options.reporting.profile) + if (options.reporting.profile || !options.reporting.exportTimes.empty()) { - iEnv.profiler.reset(new SimpleProfiler("Layer time")); + iEnv.profiler.reset(new Profiler); } setUpInference(iEnv, options.inference); - std::vector times; - runInference(options.inference, iEnv, times); + std::vector trace; + runInference(options.inference, iEnv, trace); - printTimes(times, options.reporting, options.inference.batch * options.inference.streams, gLogInfo); + printPerformanceReport(trace, options.reporting, static_cast(options.inference.warmup), options.inference.batch, gLogInfo); + + if (options.reporting.output) + { + dumpOutputs(*iEnv.context.front(), *iEnv.bindings.front(), gLogInfo); + } + if (!options.reporting.exportOutput.empty()) + { + exportJSONOutput(*iEnv.context.front(), *iEnv.bindings.front(), options.reporting.exportOutput); + } + if (!options.reporting.exportTimes.empty()) + { + exportJSONTrace(trace, options.reporting.exportTimes); + } + if (options.reporting.profile) + { + iEnv.profiler->print(gLogInfo); + } + if (!options.reporting.exportProfile.empty()) + { + iEnv.profiler->exportJSONProfile(options.reporting.exportProfile); + } return gLogger.reportPass(sampleTest); } diff --git a/scripts/stubify.sh b/scripts/stubify.sh new file mode 100755 index 00000000..036d6587 --- /dev/null +++ b/scripts/stubify.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This short shell script will extract all the strong "text" symbols from the +# shared library and create a new "stub" shared library with the same symbols. +# The body of these functions will be empty and therefore have no dependencies. +# This scripts uses whatever CC is defined in the user's environment. +# + +set -o pipefail + +# check arguments +if [ $# -ne 2 ] ; then + echo "Usage: $(basename $0) IN_LIBFILE OUT_LIBFILE" + exit 1 +fi + +IN_LIBFILE="$1" +OUT_LIBFILE="$2" + +# check compiler +if [ -z "${CC}" ] ; then + echo "Error: Environment variable 'CC' has not been defined" + exit 1 +fi + +SONAME=$(readelf -d "${IN_LIBFILE}" | grep '(SONAME)' | cut -d [ -f 2 | cut -d ] -f 1) + +# make stub library +nm -D "${IN_LIBFILE}" | \ + awk '{if ($2 == "T") { print "void",$3,"() {}" }}' | \ + "${CC}" -x c -O0 -fPIC -shared -Wl,-soname=${SONAME} -Wl,--strip-all -o "${OUT_LIBFILE}" - + +exit $? +
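The new `stubify.sh` script takes the real shared library and the desired stub output path as its two arguments, and it builds the stub with whatever compiler the `CC` environment variable points to. A hypothetical invocation (the library path is only an example) looks like:
```
CC=gcc ./scripts/stubify.sh /usr/lib/x86_64-linux-gnu/libnvinfer.so.7 build/libnvinfer_stub.so
```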