diff --git a/CHANGELOG.md b/CHANGELOG.md
index 66139b50..0ef9e135 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,23 @@
# TensorRT OSS Release Changelog
-## 10.0.0 EA - 2024-04-02
+## 10.0.1 GA - 2024-04-30
+
+Key Features and Updates:
+
+ - Parser changes
+ - Added support for building with `protobuf-lite`.
+ - Fixed issue when parsing and refitting models with nested `BatchNormalization` nodes.
+ - Added support for empty inputs in custom plugin nodes.
+ - Demo changes
+ - The following demos have been removed: Jasper, Tacotron2, HuggingFace Diffusers notebook
+ - Updated tooling
+ - Polygraphy v0.49.10
+ - ONNX-GraphSurgeon v0.5.2
+ - Build Containers
+ - Updated default CUDA version to `12.4.0`.
+ - Added Rocky Linux 8 and Rocky Linux 9 build containers
+
+## 10.0.0 EA - 2024-03-27
Key Features and Updates:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d29b78e..a1f072a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -143,7 +143,20 @@ if(BUILD_PARSERS)
configure_protobuf(${PROTOBUF_VERSION})
endif()
-find_library_create_target(nvinfer nvinfer SHARED ${TRT_LIB_DIR})
+# Resolve the on-disk names of the prebuilt TensorRT libraries. MSVC builds use
+# file names with the major version (TRT_SOVERSION) appended; every other
+# toolchain uses the bare library name. Each result lands in <name>_lib_name.
+foreach(trt_lib_basename nvinfer nvinfer_plugin nvinfer_vc_plugin nvonnxparser)
+    if (MSVC)
+        set(${trt_lib_basename}_lib_name "${trt_lib_basename}_${TRT_SOVERSION}")
+    else()
+        set(${trt_lib_basename}_lib_name "${trt_lib_basename}")
+    endif()
+endforeach()
+
+# Import the core inference library under the platform-specific name chosen above.
+find_library_create_target(nvinfer ${nvinfer_lib_name} SHARED ${TRT_LIB_DIR})
find_library(CUDART_LIB cudart_static HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)
@@ -165,7 +178,16 @@ else()
75
)
- string(REGEX MATCH "aarch64" IS_ARM "${TRT_PLATFORM_ID}")
+ # Detect an L4T target: natively when the nv_tegra_release marker file is
+ # found, or for cross builds when the IS_L4T_CROSS environment variable
+ # is set to "True".
+ # NOTE(review): L4T images normally ship the marker as /etc/nv_tegra_release;
+ # confirm that /env/ is the intended search path.
+ find_file(IS_L4T_NATIVE nv_tegra_release PATHS /env/)
+ set(IS_L4T_CROSS "False")
+ if (DEFINED ENV{IS_L4T_CROSS})
+     # Quoted so a defined-but-empty environment value yields "" instead of
+     # unsetting IS_L4T_CROSS.
+     set(IS_L4T_CROSS "$ENV{IS_L4T_CROSS}")
+ endif()
+
+ # The expansion is quoted: an empty value would otherwise collapse the
+ # condition to `if(IS_L4T_NATIVE OR STREQUAL "True")`, a configure-time error.
+ if (IS_L4T_NATIVE OR "${IS_L4T_CROSS}" STREQUAL "True")
+     # Only Orin (SM87) supported
+     list(APPEND GPU_ARCHS 87)
+ endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.0)
# Ampere GPU (SM80) support is only available in CUDA versions > 11.0
@@ -206,13 +228,13 @@ endif()
if(BUILD_PLUGINS)
add_subdirectory(plugin)
else()
- find_library_create_target(nvinfer_plugin nvinfer_plugin SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
+ find_library_create_target(nvinfer_plugin ${nvinfer_plugin_lib_name} SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
endif()
if(BUILD_PARSERS)
add_subdirectory(parsers)
else()
- find_library_create_target(nvonnxparser nvonnxparser SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
+ find_library_create_target(nvonnxparser ${nvonnxparser_lib_name} SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
endif()
if(BUILD_SAMPLES)
diff --git a/README.md b/README.md
index 28a3edba..9e2bf7b9 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ You can skip the **Build** section to enjoy TensorRT with Python.
To build the TensorRT-OSS components, you will first need the following software packages.
**TensorRT GA build**
-* TensorRT v10.0.0.6
+* TensorRT v10.0.1.6
* Available from direct download links listed below
**System Packages**
@@ -73,16 +73,16 @@ To build the TensorRT-OSS components, you will first need the following software
If using the TensorRT OSS build container, TensorRT libraries are preinstalled under `/usr/lib/x86_64-linux-gnu` and you may skip this step.
Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com) with the direct links below:
- - [TensorRT 10.0.0.6 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.0/tars/TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-11.8.tar.gz)
- - [TensorRT 10.0.0.6 for CUDA 12.4, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.0/tars/TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-12.4.tar.gz)
+ - [TensorRT 10.0.1.6 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz)
+ - [TensorRT 10.0.1.6 for CUDA 12.4, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz)
**Example: Ubuntu 20.04 on x86-64 with cuda-12.4**
```bash
cd ~/Downloads
- tar -xvzf TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
- export TRT_LIBPATH=`pwd`/TensorRT-10.0.0.6
+ tar -xvzf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
+ export TRT_LIBPATH=`pwd`/TensorRT-10.0.1.6
```
## Setting Up The Build Environment
@@ -92,16 +92,27 @@ For Linux platforms, we recommend that you generate a docker container for build
1. #### Generate the TensorRT-OSS build container.
The TensorRT-OSS build container can be generated using the supplied Dockerfiles and build scripts. The build containers are configured for building TensorRT OSS out-of-the-box.
- **Example: Ubuntu 20.04 on x86-64 with cuda-12.3.2 (default)**
+ **Example: Ubuntu 20.04 on x86-64 with cuda-12.4 (default)**
```bash
- ./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda12.3.2
+ ./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda12.4
+ ```
+ **Example: Rocky Linux 8 on x86-64 with cuda-12.4**
+ ```bash
+ ./docker/build.sh --file docker/rockylinux8.Dockerfile --tag tensorrt-rockylinux8-cuda12.4
+ ```
+ **Example: Ubuntu 22.04 cross-compile for Jetson (aarch64) with cuda-12.4 (JetPack SDK)**
+ ```bash
+ ./docker/build.sh --file docker/ubuntu-cross-aarch64.Dockerfile --tag tensorrt-jetpack-cuda12.4
+ ```
+ **Example: Ubuntu 22.04 on aarch64 with cuda-12.4**
+ ```bash
+ ./docker/build.sh --file docker/ubuntu-22.04-aarch64.Dockerfile --tag tensorrt-aarch64-ubuntu22.04-cuda12.4
```
-
2. #### Launch the TensorRT-OSS build container.
**Example: Ubuntu 20.04 build container**
```bash
- ./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.3.2 --gpus all
+ ./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.4 --gpus all
```
> NOTE:
1. Use the `--tag` corresponding to build container generated in Step 1.
@@ -112,13 +123,36 @@ For Linux platforms, we recommend that you generate a docker container for build
## Building TensorRT-OSS
* Generate Makefiles and build.
- **Example: Linux (x86-64) build with default cuda-12.3.2**
+ **Example: Linux (x86-64) build with default cuda-12.4**
```bash
cd $TRT_OSSPATH
mkdir -p build && cd build
cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out
make -j$(nproc)
```
+ **Example: Linux (aarch64) build with default cuda-12.4**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out -DCMAKE_TOOLCHAIN_FILE=$TRT_OSSPATH/cmake/toolchains/cmake_aarch64-native.toolchain
+ make -j$(nproc)
+ ```
+ **Example: Native build on Jetson (aarch64) with cuda-12.4**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out -DTRT_PLATFORM_ID=aarch64 -DCUDA_VERSION=12.4
+ CC=/usr/bin/gcc make -j$(nproc)
+ ```
+ > NOTE: The C compiler must be explicitly specified via `CC=` for native aarch64 builds of protobuf.
+
+ **Example: Ubuntu 22.04 Cross-Compile for Jetson (aarch64) with cuda-12.4 (JetPack)**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DCMAKE_TOOLCHAIN_FILE=$TRT_OSSPATH/cmake/toolchains/cmake_aarch64.toolchain -DCUDA_VERSION=12.4 -DCUDNN_LIB=/pdk_files/cudnn/usr/lib/aarch64-linux-gnu/libcudnn.so -DCUBLAS_LIB=/usr/local/cuda-12.4/targets/aarch64-linux/lib/stubs/libcublas.so -DCUBLASLT_LIB=/usr/local/cuda-12.4/targets/aarch64-linux/lib/stubs/libcublasLt.so -DTRT_LIB_DIR=/pdk_files/tensorrt/lib
+ make -j$(nproc)
+ ```
> NOTE:
1. The default CUDA version used by CMake is 12.2.0. To override this, for example to 11.8, append `-DCUDA_VERSION=11.8` to the cmake command.
diff --git a/VERSION b/VERSION
index efdce495..db243822 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-10.0.0.6
+10.0.1.6
diff --git a/cmake/modules/find_library_create_target.cmake b/cmake/modules/find_library_create_target.cmake
index a1d29efb..49441847 100644
--- a/cmake/modules/find_library_create_target.cmake
+++ b/cmake/modules/find_library_create_target.cmake
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,9 +25,6 @@ macro(find_library_create_target target_name lib libtype hints)
find_library(${lib}_LIB_PATH ${lib})
message(STATUS "Library that was found ${${lib}_LIB_PATH}")
add_library(${target_name} ${libtype} IMPORTED)
- set_property(TARGET ${target_name} PROPERTY IMPORTED_LOCATION ${${lib}_LIB_PATH}) # This should be .so or .dll file, currently its .a or .lib.
- if (WIN32)
- set_property(TARGET ${target_name} PROPERTY IMPORTED_IMPLIB ${${lib}_LIB_PATH}) # This should be a .lib file
- endif()
+ set_property(TARGET ${target_name} PROPERTY IMPORTED_LOCATION ${${lib}_LIB_PATH})
message(STATUS "==========================================================================================")
endmacro()
diff --git a/cmake/modules/set_ifndef.cmake b/cmake/modules/set_ifndef.cmake
index fbdc9be1..85d769e9 100644
--- a/cmake/modules/set_ifndef.cmake
+++ b/cmake/modules/set_ifndef.cmake
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_aarch64-android.toolchain b/cmake/toolchains/cmake_aarch64-android.toolchain
index 87e490f6..ec768aa4 100644
--- a/cmake/toolchains/cmake_aarch64-android.toolchain
+++ b/cmake/toolchains/cmake_aarch64-android.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_aarch64-native.toolchain b/cmake/toolchains/cmake_aarch64-native.toolchain
index fd4e30cc..bd49c9bb 100644
--- a/cmake/toolchains/cmake_aarch64-native.toolchain
+++ b/cmake/toolchains/cmake_aarch64-native.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_aarch64.toolchain b/cmake/toolchains/cmake_aarch64.toolchain
index 3c87fd65..020a1066 100644
--- a/cmake/toolchains/cmake_aarch64.toolchain
+++ b/cmake/toolchains/cmake_aarch64.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +19,8 @@ set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(TRT_PLATFORM_ID "aarch64")
+set(CMAKE_FIND_LIBRARY_PREFIXES "lib")
+set(CMAKE_FIND_LIBRARY_SUFFIXES .so)
if("$ENV{ARMSERVER}" AND "${CUDA_VERSION}" VERSION_GREATER_EQUAL 11.0)
set(CUDA_PLATFORM_ID "sbsa-linux")
@@ -46,10 +48,18 @@ set(BUILD_LIBRARY_ONLY 1)
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
set(CUDA_INCLUDE_DIRS ${CUDA_ROOT}/include)
+set(CMAKE_THREAD_LIBS_INIT "-lpthread")
+set(CMAKE_HAVE_THREADS_LIBRARY 1)
+set(CMAKE_USE_WIN32_THREADS_INIT 0)
+set(CMAKE_USE_PTHREADS_INIT 1)
+
find_library(RT_LIB rt PATHS /usr/aarch64-linux-gnu/lib /usr/lib/aarch64-linux-gnu)
if(NOT RT_LIB)
- message(WARNING "librt.so not found in default paths")
+ find_file(RT_LIB librt.so PATHS /usr/aarch64-linux-gnu/lib /usr/lib/aarch64-linux-gnu)
+ if(NOT RT_LIB)
+ message(WARNING "librt.so not found in default paths")
+ endif()
endif()
message("RT_LIB: ${RT_LIB}")
diff --git a/cmake/toolchains/cmake_aarch64_cross.toolchain b/cmake/toolchains/cmake_aarch64_cross.toolchain
index 177a82f9..844fdd89 100644
--- a/cmake/toolchains/cmake_aarch64_cross.toolchain
+++ b/cmake/toolchains/cmake_aarch64_cross.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_ppc64le.toolchain b/cmake/toolchains/cmake_ppc64le.toolchain
index 074c3fb0..2d6272f5 100644
--- a/cmake/toolchains/cmake_ppc64le.toolchain
+++ b/cmake/toolchains/cmake_ppc64le.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_qnx.toolchain b/cmake/toolchains/cmake_qnx.toolchain
index 95f337a8..60b36163 100644
--- a/cmake/toolchains/cmake_qnx.toolchain
+++ b/cmake/toolchains/cmake_qnx.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_x64_win.toolchain b/cmake/toolchains/cmake_x64_win.toolchain
index 5dad0ce7..87b04f5f 100644
--- a/cmake/toolchains/cmake_x64_win.toolchain
+++ b/cmake/toolchains/cmake_x64_win.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_x86_64.toolchain b/cmake/toolchains/cmake_x86_64.toolchain
index 8d452945..daf336ef 100644
--- a/cmake/toolchains/cmake_x86_64.toolchain
+++ b/cmake/toolchains/cmake_x86_64.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cmake/toolchains/cmake_x86_64_agnostic.toolchain b/cmake/toolchains/cmake_x86_64_agnostic.toolchain
index 8253d8f1..91c03095 100644
--- a/cmake/toolchains/cmake_x86_64_agnostic.toolchain
+++ b/cmake/toolchains/cmake_x86_64_agnostic.toolchain
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/CMakeLists.txt b/demo/BERT/CMakeLists.txt
index cc2c8fc9..94639130 100644
--- a/demo/BERT/CMakeLists.txt
+++ b/demo/BERT/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/README.md b/demo/BERT/README.md
index f867a321..27d141f5 100755
--- a/demo/BERT/README.md
+++ b/demo/BERT/README.md
@@ -73,9 +73,9 @@ The following software version configuration has been tested:
|Software|Version|
|--------|-------|
-|Python|>=3.6|
-|TensorRT|8.5.1|
-|CUDA|11.6|
+|Python|>=3.8|
+|TensorRT|10.0.1.6|
+|CUDA|12.4|
## Setup
diff --git a/demo/BERT/builder.py b/demo/BERT/builder.py
index 5eafe367..c5f21b0a 100755
--- a/demo/BERT/builder.py
+++ b/demo/BERT/builder.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -43,7 +43,7 @@
trt_version = [n for n in trt.__version__.split('.')]
# Import necessary plugins for demoBERT
-plugin_lib_name = "nvinfer_plugin.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
+plugin_lib_name = "nvinfer_plugin_10.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
env_name_to_add_path = "PATH" if sys.platform == "win32" else "LD_LIBRARY_PATH"
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
if not handle:
diff --git a/demo/BERT/builder_utils.py b/demo/BERT/builder_utils.py
index 248bee80..abf0f514 100644
--- a/demo/BERT/builder_utils.py
+++ b/demo/BERT/builder_utils.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/builder_varseqlen.py b/demo/BERT/builder_varseqlen.py
index ad25ef0c..66a9d571 100755
--- a/demo/BERT/builder_varseqlen.py
+++ b/demo/BERT/builder_varseqlen.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -42,7 +42,7 @@
trt_version = [n for n in trt.__version__.split('.')]
# Import necessary plugins for demoBERT
-plugin_lib_name = "nvinfer_plugin.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
+plugin_lib_name = "nvinfer_plugin_10.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
env_name_to_add_path = "PATH" if sys.platform == "win32" else "LD_LIBRARY_PATH"
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
if not handle:
diff --git a/demo/BERT/helpers/calibrator.py b/demo/BERT/helpers/calibrator.py
index beacc625..09e6014b 100644
--- a/demo/BERT/helpers/calibrator.py
+++ b/demo/BERT/helpers/calibrator.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/helpers/data_processing.py b/demo/BERT/helpers/data_processing.py
index 88459ebf..e7deae31 100644
--- a/demo/BERT/helpers/data_processing.py
+++ b/demo/BERT/helpers/data_processing.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/helpers/tokenization.py b/demo/BERT/helpers/tokenization.py
index 434f411d..9d3cb22d 100644
--- a/demo/BERT/helpers/tokenization.py
+++ b/demo/BERT/helpers/tokenization.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/infer_c/bert_infer.h b/demo/BERT/infer_c/bert_infer.h
index 2f72102a..d049877e 100644
--- a/demo/BERT/infer_c/bert_infer.h
+++ b/demo/BERT/infer_c/bert_infer.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -152,15 +152,12 @@ struct BertInference
mDeviceBuffers.emplace_back(devBuf);
mHostOutput.resize(numOutputItems);
- mBindings.resize(mEngine->getNbIOTensors() * mEngine->getNbOptimizationProfiles());
}
void prepare(int profIdx, int batchSize)
{
- mContext->setOptimizationProfile(profIdx);
- const int bindingIdxOffset = profIdx * mEngine->getNbIOTensors();
- std::copy(mDeviceBuffers.begin(), mDeviceBuffers.end(), mBindings.begin() + bindingIdxOffset);
+ mContext->setOptimizationProfileAsync(profIdx, mStream);
if (mEnableVariableLen)
{
@@ -191,13 +188,13 @@ struct BertInference
for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++)
{
auto const& name = mEngine->getIOTensorName(i);
- context->setTensorAddress(name, mBindings[i + bindingIdxOffset]);
+ mContext->setTensorAddress(name, mDeviceBuffers[i]);
}
cudaGraph_t graph;
cudaGraphExec_t exec;
// warm up and let mContext do cublas initialization
- bool status = mContext->enqueueV3(mStream, nullptr);
+ bool status = mContext->enqueueV3(mStream);
if (!status)
{
gLogError << "Enqueue failed\n";
@@ -206,7 +203,7 @@ struct BertInference
gLogVerbose << "Capturing graph\n";
gpuErrChk(cudaStreamBeginCapture(mStream, cudaStreamCaptureModeRelaxed));
- status = mContext->enqueueV3(mStream, nullptr);
+ status = mContext->enqueueV3(mStream);
if (!status)
{
gLogError << "Enqueue failed\n";
@@ -240,7 +237,7 @@ struct BertInference
}
else
{
- bool status = mContext->enqueueV3(mStream, nullptr);
+ bool status = mContext->enqueueV3(mStream);
if (!status)
{
gLogError << "Enqueue failed\n";
@@ -265,7 +262,7 @@ struct BertInference
}
else
{
- bool status = mContext->enqueueV3(mStream, nullptr);
+ bool status = mContext->enqueueV3(mStream);
if (!status)
{
gLogError << "Enqueue failed\n";
@@ -347,7 +344,6 @@ struct BertInference
TrtUniquePtr mRuntime{nullptr};
TrtUniquePtr mEngine{nullptr};
TrtUniquePtr mContext{nullptr};
- std::vector mBindings;
bool mEnableVariableLen;
std::vector mCuSeqlens;
diff --git a/demo/BERT/infer_c/common.h b/demo/BERT/infer_c/common.h
index b5280e2a..da29944c 100644
--- a/demo/BERT/infer_c/common.h
+++ b/demo/BERT/infer_c/common.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -169,7 +169,7 @@ struct TrtDestroyer
{
void operator()(T* t)
{
- t->destroy();
+ delete t;
}
};
diff --git a/demo/BERT/infer_c/infer_c.cpp b/demo/BERT/infer_c/infer_c.cpp
index b868a661..946ce663 100644
--- a/demo/BERT/infer_c/infer_c.cpp
+++ b/demo/BERT/infer_c/infer_c.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/infer_c/logging.cpp b/demo/BERT/infer_c/logging.cpp
index b6b14298..f651155c 100644
--- a/demo/BERT/infer_c/logging.cpp
+++ b/demo/BERT/infer_c/logging.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/infer_c/logging.h b/demo/BERT/infer_c/logging.h
index 2c36d039..2c137465 100644
--- a/demo/BERT/infer_c/logging.h
+++ b/demo/BERT/infer_c/logging.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/infer_c/perf.cpp b/demo/BERT/infer_c/perf.cpp
index bbc6de76..0208f2eb 100644
--- a/demo/BERT/infer_c/perf.cpp
+++ b/demo/BERT/infer_c/perf.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/inference.py b/demo/BERT/inference.py
index dc172181..aa0d0dd7 100644
--- a/demo/BERT/inference.py
+++ b/demo/BERT/inference.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -121,7 +121,7 @@ def question_features(tokens, question):
return dp.convert_example_to_features(tokens, question, tokenizer, max_seq_length, doc_stride, args.max_query_length)
# Import necessary plugins for demoBERT
- plugin_lib_name = "nvinfer_plugin.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
+ plugin_lib_name = "nvinfer_plugin_10.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
env_name_to_add_path = "PATH" if sys.platform == "win32" else "LD_LIBRARY_PATH"
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
if not handle:
diff --git a/demo/BERT/inference_c.py b/demo/BERT/inference_c.py
index e2bda9af..b10127bd 100644
--- a/demo/BERT/inference_c.py
+++ b/demo/BERT/inference_c.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/inference_varseqlen.py b/demo/BERT/inference_varseqlen.py
index 7eb87012..700ddcce 100644
--- a/demo/BERT/inference_varseqlen.py
+++ b/demo/BERT/inference_varseqlen.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -120,7 +120,7 @@ def question_features(tokens, question):
return dp.convert_example_to_features(tokens, question, tokenizer, max_seq_length, doc_stride, args.max_query_length)
# Import necessary plugins for demoBERT
- plugin_lib_name = "nvinfer_plugin.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
+ plugin_lib_name = "nvinfer_plugin_10.dll" if sys.platform == "win32" else "libnvinfer_plugin.so"
env_name_to_add_path = "PATH" if sys.platform == "win32" else "LD_LIBRARY_PATH"
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
if not handle:
diff --git a/demo/BERT/perf.py b/demo/BERT/perf.py
index 7b4e9da9..f3d2ab74 100644
--- a/demo/BERT/perf.py
+++ b/demo/BERT/perf.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/perf_varseqlen.py b/demo/BERT/perf_varseqlen.py
index 853201a4..6708f989 100644
--- a/demo/BERT/perf_varseqlen.py
+++ b/demo/BERT/perf_varseqlen.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/squad/evaluate-v1.1.py b/demo/BERT/squad/evaluate-v1.1.py
index c73db423..bde41564 100644
--- a/demo/BERT/squad/evaluate-v1.1.py
+++ b/demo/BERT/squad/evaluate-v1.1.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/BERT/squad/evaluate-v2.0.py b/demo/BERT/squad/evaluate-v2.0.py
index e36d3e9f..67518e3c 100644
--- a/demo/BERT/squad/evaluate-v2.0.py
+++ b/demo/BERT/squad/evaluate-v2.0.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/DeBERTa/deberta_onnx_modify.py b/demo/DeBERTa/deberta_onnx_modify.py
index 234c4659..f8fe61f5 100644
--- a/demo/DeBERTa/deberta_onnx_modify.py
+++ b/demo/DeBERTa/deberta_onnx_modify.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/DeBERTa/deberta_ort_inference.py b/demo/DeBERTa/deberta_ort_inference.py
index 17378989..05741733 100644
--- a/demo/DeBERTa/deberta_ort_inference.py
+++ b/demo/DeBERTa/deberta_ort_inference.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/DeBERTa/deberta_pytorch2onnx.py b/demo/DeBERTa/deberta_pytorch2onnx.py
index 51546b29..7745f0dc 100644
--- a/demo/DeBERTa/deberta_pytorch2onnx.py
+++ b/demo/DeBERTa/deberta_pytorch2onnx.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/DeBERTa/deberta_tensorrt_inference.py b/demo/DeBERTa/deberta_tensorrt_inference.py
index 378a5953..355ad7cf 100644
--- a/demo/DeBERTa/deberta_tensorrt_inference.py
+++ b/demo/DeBERTa/deberta_tensorrt_inference.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/DeBERTa/requirements.txt b/demo/DeBERTa/requirements.txt
index 59b63433..c52dd08a 100644
--- a/demo/DeBERTa/requirements.txt
+++ b/demo/DeBERTa/requirements.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/Diffusion/README.md b/demo/Diffusion/README.md
index d550c83b..42949381 100644
--- a/demo/Diffusion/README.md
+++ b/demo/Diffusion/README.md
@@ -19,15 +19,15 @@ Install nvidia-docker using [these intructions](https://docs.nvidia.com/datacent
docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash
```
+NOTE: The demo supports CUDA>=11.8
+
### Install latest TensorRT release
```bash
python3 -m pip install --upgrade pip
-python3 -m pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt
+pip install --pre tensorrt-cu12
```
-> NOTE: TensorRT 10.x is only available as a pre-release
-
Check your installed version using:
`python3 -c 'import tensorrt;print(tensorrt.__version__)'`
@@ -39,27 +39,24 @@ Check your installed version using:
export TRT_OSSPATH=/workspace
cd $TRT_OSSPATH/demo/Diffusion
pip3 install -r requirements.txt
-
```
-> NOTE: demoDiffusion has been tested on systems with NVIDIA A100, RTX3090, and RTX4090 GPUs, and the following software configuration.
+> NOTE: demoDiffusion has been tested on systems with NVIDIA H100, A100, L40, T4, and RTX4090 GPUs, and the following software configuration.
```
diffusers 0.26.3
onnx 1.15.0
-onnx-graphsurgeon 0.3.27
-onnxruntime 1.17.0
-polygraphy 0.49.7
-tensorrt 10.0.0.6
+onnx-graphsurgeon 0.5.2
+onnxruntime 1.16.3
+polygraphy 0.49.9
+tensorrt 10.0.1.6
tokenizers 0.13.3
-torch 2.1.0
-transformers 4.31.0
+torch 2.2.0
+transformers 4.33.1
controlnet-aux 0.0.6
-nvidia-ammo 0.7.0
+nvidia-ammo 0.9.4
```
-
> NOTE: optionally install HuggingFace [accelerate](https://pypi.org/project/accelerate/) package for faster and less memory-intense model loading.
-
# Running demoDiffusion
### Review usage instructions for the supported pipelines
@@ -75,6 +72,7 @@ python3 demo_txt2img_xl.py --help
### HuggingFace user access token
To download model checkpoints for the Stable Diffusion pipelines, obtain a `read` access token to HuggingFace Hub. See [instructions](https://huggingface.co/docs/hub/security-tokens).
+> NOTE: This step isn't required for many models now.
```bash
export HF_TOKEN=
@@ -144,10 +142,9 @@ python3 demo_txt2img_xl.py "Picture of a rustic Italian village with Olive trees
### Faster Text-to-image using SDXL & INT8 quantization using AMMO
```bash
-python3 demo_txt2img_xl.py "a photo of an astronaut riding a horse on mars" --version xl-1.0 --onnx-dir onnx-sdxl --engine-dir engine-sdxl --int8 --quantization-level 3
+python3 demo_txt2img_xl.py "a photo of an astronaut riding a horse on mars" --version xl-1.0 --onnx-dir onnx-sdxl --engine-dir engine-sdxl --int8
```
-
-Note that the calibration process can be quite time-consuming, and will be repeated if `--quantization-level`, `--denoising-steps`, or `--onnx-dir` is changed.
+> Note that INT8 quantization is only supported for SDXL, and won't work with LoRA weights. Some prompts may produce better outputs with fewer denoising steps (e.g. `--denoising-steps 20`), but changing the number of denoising steps will repeat the calibration, ONNX export, and engine building processes for the U-Net.
### Faster Text-to-Image using SDXL + LCM (Latent Consistency Model) LoRA weights
[LCM-LoRA](https://arxiv.org/abs/2311.05556) produces good quality images in 4 to 8 denoising steps instead of 30+ needed base model. Note that we use LCM scheduler and disable classifier-free-guidance by setting `--guidance-scale` to 0.
diff --git a/demo/Diffusion/calibration.py b/demo/Diffusion/calibration.py
deleted file mode 100644
index 98adb6d3..00000000
--- a/demo/Diffusion/calibration.py
+++ /dev/null
@@ -1,177 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import types
-from typing import Callable, Optional, Union
-
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from torch.distributed import ReduceOp
-from utilities import PercentileAmaxes
-
-from ammo.torch.quantization.model_calib import (
- enable_stats_collection,
- finish_stats_collection,
- max_calibrate,
-)
-from ammo.torch.quantization.utils import is_quantized_linear
-
-
-def precentile_calib_mode(base_unet, quant_config={}):
- def compute_amax(self, all_reduce=True):
- """Return the absolute max of all tensors collected."""
- if (
- self._calib_amax is not None
- and all_reduce
- and dist.is_available()
- and dist.is_initialized()
- and dist.get_world_size() > 1
- ):
- tmp_amax = self._calib_amax.clone()
- dist.all_reduce(tmp_amax, op=ReduceOp.MAX)
- self._calib_amax.copy_(tmp_amax)
- if self._track_amax:
- up_lim = int(self._amaxs.total_step * self._amaxs.percentile)
- if up_lim <= 0:
- up_lim = 1
- amaxs_values = [self._amaxs.data[i] for i in range(0, up_lim)]
- act_amax = (
- torch.tensor(np.vstack(amaxs_values).min(axis=0))
- .float()
- .squeeze(0)
- .to(self._calib_amax.device)
- .to(self._calib_amax.dtype)
- )
- return act_amax
- return self._calib_amax
-
- for _, module in base_unet.named_modules():
- if isinstance(module, (nn.Linear, nn.Conv2d)):
- module.input_quantizer._calibrator._track_amax = True
- module.input_quantizer._calibrator._amaxs = PercentileAmaxes(
- total_step=quant_config["base-step"], percentile=quant_config["percentile"]
- )
- module.input_quantizer._calibrator.compute_amax = types.MethodType(
- compute_amax, module.input_quantizer._calibrator
- )
-
-
-@torch.no_grad()
-def smoothquant(model, forward_loop=None):
- """
- Rewrite the original SmoothQuant method
- """
- assert forward_loop is not None, "forward_loop must be provided for smoothquant"
- max_calibrate(model, forward_loop)
-
- smoothed_modules = 0
- for name, module in model.named_modules():
- if is_quantized_linear(module):
- if not hasattr(module.input_quantizer, "_amax"):
- print(f"Warning: {name} is not calibrated, skip smoothing")
- continue
- if module.input_quantizer.num_bits != 8 or module.weight_quantizer.num_bits != 8:
- print(f"Warning: only int8 smoothing is supported, skip {name}")
- continue
- if module.input_quantizer.axis != -1:
- print(f"Warning: only per-channel smoothing is supported, skip {name}")
- continue
-
- alpha = 1.0
- if hasattr(module, "alpha"):
- alpha = module.alpha
- assert (
- module.input_quantizer._amax.numel() > 1
- ), f"Error: {name} has only one channel to smooth"
-
- # It is important to keep scaling math in fp32 to be numerically safe
- act_amax = module.input_quantizer.amax.float()
-
- act_device = act_amax.device
-
- # If model is split across devices, this tensor may be on wrong one
- act_amax = act_amax.to(module.weight.device)
-
- weight_scale = module.weight.abs().max(dim=0, keepdim=True)[0]
- scale_a = (weight_scale.pow(1 - alpha) / act_amax.pow(alpha)).squeeze()
-
- # Some channel could have 0 amax which causes scale_a to overflow. Explicitly mask them out here
- epsilon = 1.0 / (1 << 31)
- if act_amax.min() <= epsilon:
- zero_mask = act_amax <= epsilon
- scale_a[zero_mask] = 1
- inv_scale_a = 1.0 / scale_a
- inv_scale_a = inv_scale_a.squeeze()[None, :]
-
- # Use per-tensor quantization for activation, add a pre-quantization scale vector
- module.input_quantizer.pre_quant_scale = scale_a.to(module.weight.dtype).to(act_device)
- module.input_quantizer._axis = None
- delattr(module.input_quantizer, "_amax")
- module.input_quantizer.amax = torch.tensor(
- (act_amax * scale_a).max().item(),
- dtype=module.weight.dtype,
- device=module.weight.device,
- )
-
- # Multiply weight by inv_scale_a and recalibrate
- module.weight.detach().copy_(
- (module.weight.float() * inv_scale_a).to(module.weight.dtype)
- )
-
- enable_stats_collection(module.weight_quantizer)
- module.weight_quantizer(module.weight)
- finish_stats_collection(module.weight_quantizer)
-
- smoothed_modules += 1
- print(f"Smoothed {smoothed_modules} modules")
-
-
-def calibrate(
- model: nn.Module,
- algorithm: Union[str, dict, None] = "max",
- forward_loop: Optional[Callable] = None,
-) -> None:
- if algorithm is None:
- return
-
- if isinstance(algorithm, str):
- kwargs = {}
- elif isinstance(algorithm, dict):
- kwargs = algorithm.copy()
- algorithm = kwargs.pop("method")
- else:
- raise TypeError(f"Unsupported type for algorithm: {type(algorithm)}")
-
- if algorithm == "smoothquant":
- smoothquant(model, forward_loop)
- elif algorithm == "max":
- max_calibrate(model, forward_loop)
- else:
- raise ValueError(f"Unsupported calibration algorithm: {algorithm}")
-
-
-def reg_alpha_qkv(base_unet, alpha):
- """
- Only apply alpha to QKV layers
- """
- for name, module in base_unet.named_modules():
- if isinstance(module, torch.nn.Linear):
- if "to_q" in name or "to_k" in name or "to_v" in name:
- module.alpha = alpha
-
diff --git a/demo/Diffusion/demo_img2img.py b/demo/Diffusion/demo_img2img.py
index bf56f6a9..74ec90ad 100755
--- a/demo/Diffusion/demo_img2img.py
+++ b/demo/Diffusion/demo_img2img.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/Diffusion/demo_inpaint.py b/demo/Diffusion/demo_inpaint.py
index af635df0..29ca0ce2 100755
--- a/demo/Diffusion/demo_inpaint.py
+++ b/demo/Diffusion/demo_inpaint.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/Diffusion/demo_txt2img.py b/demo/Diffusion/demo_txt2img.py
index 3e33838f..84c9e164 100644
--- a/demo/Diffusion/demo_txt2img.py
+++ b/demo/Diffusion/demo_txt2img.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/Diffusion/demo_txt2img_xl.py b/demo/Diffusion/demo_txt2img_xl.py
index ea579279..96910756 100644
--- a/demo/Diffusion/demo_txt2img_xl.py
+++ b/demo/Diffusion/demo_txt2img_xl.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/Diffusion/models.py b/demo/Diffusion/models.py
index b1a196aa..b48028ff 100644
--- a/demo/Diffusion/models.py
+++ b/demo/Diffusion/models.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,7 +22,6 @@
ControlNetModel,
UNet2DConditionModel
)
-from diffusers.utils import convert_state_dict_to_diffusers
import json
import numpy as np
import onnx
@@ -159,13 +158,13 @@ def fuse_mha_qkv_int8_sq(self):
del tensors[k]
removed += 1
print(f"Removed {removed} QDQ nodes")
- return removed
+ return removed # expected 72 for L2.5
def get_path(version, pipeline, controlnets=None):
if controlnets is not None:
return ["lllyasviel/sd-controlnet-" + modality for modality in controlnets]
-
+
if version == "1.4":
if pipeline.is_inpaint():
return "runwayml/stable-diffusion-inpainting"
@@ -647,7 +646,7 @@ def __init__(self, unet, controlnets) -> None:
super().__init__()
self.unet = unet
self.controlnets = controlnets
-
+
def forward(self, sample, timestep, encoder_hidden_states, images, controlnet_scales):
for i, (image, conditioning_scale, controlnet) in enumerate(zip(images, controlnet_scales, self.controlnets)):
down_samples, mid_sample = controlnet(
@@ -663,7 +662,7 @@ def forward(self, sample, timestep, encoder_hidden_states, images, controlnet_sc
for down_sample in down_samples
]
mid_sample *= conditioning_scale
-
+
# merge samples
if i == 0:
down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
@@ -673,7 +672,7 @@ def forward(self, sample, timestep, encoder_hidden_states, images, controlnet_sc
for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
]
mid_block_res_sample += mid_sample
-
+
noise_pred = self.unet(
sample,
timestep,
@@ -744,7 +743,7 @@ def get_model(self, torch_inference=''):
def get_input_names(self):
if self.controlnets is None:
return ['sample', 'timestep', 'encoder_hidden_states']
- else:
+ else:
return ['sample', 'timestep', 'encoder_hidden_states', 'images', 'controlnet_scales']
def get_output_names(self):
@@ -820,14 +819,14 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape):
dtype = torch.float16 if self.fp16 else torch.float32
if self.controlnets is None:
return (
- torch.randn(batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device),
- torch.tensor([1.], dtype=torch.float32, device=self.device),
+ torch.randn(batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device),
+ torch.tensor([1.], dtype=dtype, device=self.device),
torch.randn(batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device)
)
else:
return (
- torch.randn(batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device),
- torch.tensor(999, dtype=torch.float32, device=self.device),
+ torch.randn(batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device),
+ torch.tensor(999, dtype=dtype, device=self.device),
torch.randn(batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
torch.randn(len(self.controlnets), batch_size, 3, image_height, image_width, dtype=dtype, device=self.device),
torch.randn(len(self.controlnets), dtype=dtype, device=self.device)
@@ -931,8 +930,8 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape):
latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
dtype = torch.float16 if self.fp16 else torch.float32
return (
- torch.randn(self.xB*batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device),
- torch.tensor([1.], dtype=torch.float32, device=self.device),
+ torch.randn(self.xB*batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device),
+ torch.tensor([1.], dtype=dtype, device=self.device),
torch.randn(self.xB*batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
{
'added_cond_kwargs': {
diff --git a/demo/Diffusion/requirements.txt b/demo/Diffusion/requirements.txt
index 4de26381..5fa939ec 100644
--- a/demo/Diffusion/requirements.txt
+++ b/demo/Diffusion/requirements.txt
@@ -1,4 +1,3 @@
-accelerate
colored
controlnet_aux==0.0.6
cuda-python
@@ -7,11 +6,10 @@ ftfy
matplotlib
nvtx
onnx==1.15.0
-onnxruntime==1.17.0
+onnxruntime==1.16.3
opencv-python==4.8.0.74
scipy
-transformers==4.31.0
---extra-index-url https://pypi.nvidia.com
-nvidia-ammo==0.7.0
+transformers==4.33.1
+nvidia-ammo==0.9.4
onnx-graphsurgeon
-polygraphy
+polygraphy==0.49.9
diff --git a/demo/Diffusion/stable_diffusion_pipeline.py b/demo/Diffusion/stable_diffusion_pipeline.py
index 13bd4156..10b7f57e 100755
--- a/demo/Diffusion/stable_diffusion_pipeline.py
+++ b/demo/Diffusion/stable_diffusion_pipeline.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,8 +15,8 @@
# limitations under the License.
#
+import ammo.torch.opt as ato
import ammo.torch.quantization as atq
-import calibration
from cuda import cudart
from diffusers import (
DDIMScheduler,
@@ -44,7 +44,6 @@
import numpy as np
import nvtx
import json
-import onnx
import os
import pathlib
import tensorrt as trt
@@ -55,17 +54,18 @@
PIPELINE_TYPE,
TRT_LOGGER,
Engine,
- filter_func,
- get_smoothquant_config,
get_refit_weights,
load_calib_prompts,
merge_loras,
prepare_mask_and_masked_image,
- quantize_lvl,
- replace_lora_layers,
save_image,
unload_model
)
+from utils_ammo import (
+ filter_func,
+ quantize_lvl,
+ get_int8_config,
+)
class StableDiffusionPipeline:
"""
@@ -76,7 +76,7 @@ def __init__(
version='1.5',
pipeline_type=PIPELINE_TYPE.TXT2IMG,
max_batch_size=16,
- denoising_steps=50,
+ denoising_steps=30,
scheduler=None,
guidance_scale=7.5,
device='cuda',
@@ -216,6 +216,11 @@ def makeScheduler(cls, subfolder="scheduler", **kwargs):
if self.pipeline_type.is_sd_xl():
self.config['clip_hidden_states'] = True
self.torch_inference = torch_inference
+ if self.torch_inference:
+ torch._inductor.config.conv_1x1_as_mm = True
+ torch._inductor.config.coordinate_descent_tuning = True
+ torch._inductor.config.epilogue_fusion = False
+ torch._inductor.config.coordinate_descent_check_all_directions = True
self.use_cuda_graph = use_cuda_graph
# initialized in loadEngines()
@@ -315,10 +320,11 @@ def loadEngines(
timing_cache=None,
int8=False,
quantization_level=2.5,
- quantization_percentile=0.4,
- quantization_alpha=0.6,
- calibration_steps=384,
- denoising_steps=50,
+ quantization_percentile=1.0,
+ quantization_alpha=0.8,
+ calibration_size=32,
+ calib_batch_size=2,
+ denoising_steps=30,
):
"""
Build and load engines for TensorRT accelerated inference.
@@ -349,6 +355,24 @@ def loadEngines(
Enable all tactic sources during TensorRT engine builds.
timing_cache (str):
Path to the timing cache to speed up TensorRT build.
+ int8 (bool):
+ Whether to quantize to int8 format or not (SDXL only).
+ quantization_level (float):
+ Controls which layers to quantize. 1: CNN, 2: CNN+FFN, 2.5: CNN+FFN+QKV, 3: CNN+FC
+ quantization_percentile (float):
+ Control quantization scaling factors (amax) collecting range, where the minimum amax in
+ range(n_steps * percentile) will be collected. Recommendation: 1.0
+ quantization_alpha (float):
+ The alpha parameter for SmoothQuant quantization used for linear layers.
+ Recommendation: 0.8 for SDXL
+ calibration_size (int):
+ The number of steps to use for calibrating the model for quantization.
+ Recommendation: 32, 64, 128 for SDXL
+ calib_batch_size (int):
+ The batch size to use for calibration. Defaults to 2.
+ denoising_steps (int):
+ The number of denoising steps.
+ More denoising steps usually lead to a higher quality image at the expense of slower inference.
"""
# Create directories if missing
for directory in [engine_dir, onnx_dir]:
@@ -411,7 +435,7 @@ def loadEngines(
if int8:
assert self.pipeline_type.is_sd_xl(), "int8 quantization only supported for SDXL pipeline"
use_int8['unetxl'] = True
- model_suffix['unetxl'] += f"-int8.l{quantization_level}.bs2.s{denoising_steps}.c{calibration_steps}.p{quantization_percentile}.a{quantization_alpha}"
+ model_suffix['unetxl'] += f"-int8.l{quantization_level}.bs2.s{denoising_steps}.c{calibration_size}.p{quantization_percentile}.a{quantization_alpha}"
onnx_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir, opt=False, suffix=model_suffix[model_name]) for model_name in model_names]))
onnx_opt_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir, suffix=model_suffix[model_name]) for model_name in model_names]))
engine_path = dict(zip(model_names, [self.getEnginePath(model_name, engine_dir, do_engine_refit[model_name], suffix=model_suffix[model_name]) for model_name in model_names]))
@@ -433,22 +457,16 @@ def loadEngines(
print(f"[I] Calibrated weights not found, generating {state_dict_path}")
pipeline = obj.get_pipeline()
model = pipeline.unet
- replace_lora_layers(model)
calibration_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'calibration-prompts.txt')
- # Use batch_size = 2 for UNet calibration
- calibration_prompts = load_calib_prompts(2, calibration_file)
- # TODO check size > calibration_steps
- quant_config = get_smoothquant_config(model, quantization_level)
- if quantization_percentile is not None:
- quant_config["percentile"] = quantization_percentile
- quant_config["base-step"] = int(denoising_steps)
-
- atq.replace_quant_module(model)
- atq.set_quantizer_by_cfg(model, quant_config["quant_cfg"])
- if quantization_percentile is not None:
- calibration.precentile_calib_mode(base_unet=model, quant_config=quant_config)
- if quantization_alpha is not None:
- calibration.reg_alpha_qkv(base_unet=model, alpha=quantization_alpha)
+ calibration_prompts = load_calib_prompts(calib_batch_size, calibration_file)
+ # TODO check size > calibration_size
+ quant_config = get_int8_config(
+ model,
+ quantization_level,
+ quantization_alpha,
+ quantization_percentile,
+ denoising_steps
+ )
def do_calibrate(base, calibration_prompts, **kwargs):
for i_th, prompts in enumerate(calibration_prompts):
@@ -462,34 +480,35 @@ def do_calibrate(base, calibration_prompts, **kwargs):
]
* len(prompts),
).images
-
- def calibration_loop():
+
+ def calibration_loop(unet):
+ pipeline.model = unet
do_calibrate(
base=pipeline,
calibration_prompts=calibration_prompts,
- calib_size=calibration_steps,
+ calib_size=calibration_size // calib_batch_size,
n_steps=denoising_steps,
)
- print(f"[I] Performing int8 calibration for {calibration_steps} steps. This can take a long time.")
- calibration.calibrate(model, quant_config["algorithm"], forward_loop=calibration_loop)
- torch.save(model.state_dict(), state_dict_path)
+ print(f"[I] Performing int8 calibration for {calibration_size} steps.")
+ atq.quantize(model, quant_config, forward_loop=calibration_loop)
+ ato.save(model, state_dict_path)
- print(f"[I] Generaing quantized ONNX model: {onnx_opt_path[model_name]}")
+ print(f"[I] Generating quantized ONNX model: {onnx_opt_path[model_name]}")
if not os.path.exists(onnx_path[model_name]):
model = obj.get_model()
- replace_lora_layers(model)
- atq.replace_quant_module(model)
- quant_config = atq.INT8_DEFAULT_CFG
- atq.set_quantizer_by_cfg(model, quant_config["quant_cfg"])
- model.load_state_dict(torch.load(state_dict_path), strict=True)
- quantize_lvl(model, quantization_level)
+ ato.restore(model, state_dict_path)
+ quantize_lvl(model, quantization_level)
atq.disable_quantizer(model, filter_func)
- model.to(torch.float32) # QDQ needs to be in FP32
+ model.to(torch.float32).to("cpu") # QDQ needs to be in FP32
+ # WAR to enable ONNX export of quantized UNet
+ obj.device="cpu"
+ obj.fp16=False
else:
model = None
obj.export_onnx(onnx_path[model_name], onnx_opt_path[model_name], onnx_opset, opt_image_height, opt_image_width, custom_model=model)
-
+ obj.fp16=True # Part of WAR, UNET obj.fp16 defaults to True so it is safe to reset this way
+
# FIXME do_export_weights_map needs ONNX graph
if do_export_weights_map:
print(f"[I] Saving weights map: {weights_map_path[model_name]}")
diff --git a/demo/Diffusion/utilities.py b/demo/Diffusion/utilities.py
index 62d582f5..11f36807 100644
--- a/demo/Diffusion/utilities.py
+++ b/demo/Diffusion/utilities.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,7 +25,6 @@
import numpy as np
import onnx
from onnx import numpy_helper
-import onnx_graphsurgeon as gs
import os
from PIL import Image
from polygraphy.backend.common import bytes_from_path
@@ -40,9 +39,7 @@
)
from polygraphy.logger import G_LOGGER
import random
-import re
import requests
-from scipy import integrate
import tensorrt as trt
import torch
import types
@@ -406,63 +403,6 @@ def load_calib_prompts(batch_size, calib_data_path):
lst = [line.rstrip("\n") for line in file]
return [lst[i : i + batch_size] for i in range(0, len(lst), batch_size)]
-def filter_func(name):
- pattern = re.compile(
- r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding).*"
- )
- return pattern.match(name) is not None
-
-def quantize_lvl(unet, quant_level=2.5):
- """
- We should disable the unwanted quantizer when exporting the onnx
- Because in the current ammo setting, it will load the quantizer amax for all the layers even
- if we didn't add that unwanted layer into the config during the calibration
- """
- for name, module in unet.named_modules():
- if isinstance(module, torch.nn.Conv2d):
- module.input_quantizer.enable()
- module.weight_quantizer.enable()
- elif isinstance(module, torch.nn.Linear):
- if (
- (quant_level >= 2 and "ff.net" in name)
- or (quant_level >= 2.5 and ("to_q" in name or "to_k" in name or "to_v" in name))
- or quant_level == 3
- ):
- module.input_quantizer.enable()
- module.weight_quantizer.enable()
- else:
- module.input_quantizer.disable()
- module.weight_quantizer.disable()
-
-def get_smoothquant_config(model, quant_level=3):
- quant_config = {
- "quant_cfg": {},
- "algorithm": "smoothquant",
- }
- for name, module in model.named_modules():
- w_name = f"{name}*weight_quantizer"
- i_name = f"{name}*input_quantizer"
-
- if (
- w_name in quant_config["quant_cfg"].keys() # type: ignore
- or i_name in quant_config["quant_cfg"].keys() # type: ignore
- ):
- continue
- if filter_func(name):
- continue
- if isinstance(module, torch.nn.Linear):
- if (
- (quant_level >= 2 and "ff.net" in name)
- or (quant_level >= 2.5 and ("to_q" in name or "to_k" in name or "to_v" in name))
- or quant_level == 3
- ):
- quant_config["quant_cfg"][w_name] = {"num_bits": 8, "axis": 0} # type: ignore
- quant_config["quant_cfg"][i_name] = {"num_bits": 8, "axis": -1} # type: ignore
- elif isinstance(module, torch.nn.Conv2d):
- quant_config["quant_cfg"][w_name] = {"num_bits": 8, "axis": 0} # type: ignore
- quant_config["quant_cfg"][i_name] = {"num_bits": 8, "axis": None} # type: ignore
- return quant_config
-
class PercentileAmaxes:
def __init__(self, total_step, percentile) -> None:
self.data = {}
@@ -503,7 +443,7 @@ def add_arguments(parser):
# TensorRT engine build
parser.add_argument('--engine-dir', default='engine', help="Output directory for TensorRT engines")
parser.add_argument('--int8', action='store_true', help="Apply int8 quantization.")
- parser.add_argument('--quantization-level', type=float, default=3.0, choices=range(1,4), help="int8/fp8 quantization level, 1: CNN, 2: CNN+FFN, 2.5: CNN+FFN+QKV, 3: CNN+FC")
+ parser.add_argument('--quantization-level', type=float, default=2.5, choices=[1.0, 2.0, 2.5, 3.0], help="int8/fp8 quantization level, 1: CNN, 2: CNN+FFN, 2.5: CNN+FFN+QKV, 3: CNN+FC")
parser.add_argument('--build-static-batch', action='store_true', help="Build TensorRT engines with fixed batch size.")
parser.add_argument('--build-dynamic-shape', action='store_true', help="Build TensorRT engines with dynamic image shapes.")
parser.add_argument('--build-enable-refit', action='store_true', help="Enable Refit option in TensorRT engines during build.")
diff --git a/demo/Diffusion/utils_ammo.py b/demo/Diffusion/utils_ammo.py
new file mode 100644
index 00000000..8bfe44b8
--- /dev/null
+++ b/demo/Diffusion/utils_ammo.py
@@ -0,0 +1,160 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import re
+import torch
+
+from ammo.torch.quantization import utils as quant_utils
+from ammo.torch.quantization.calib.max import MaxCalibrator
+
+from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
+
+
+class PercentileCalibrator(MaxCalibrator):  # amax accumulator keyed by step slot (see collect / compute_amax)
+    def __init__(self, num_bits=8, axis=None, unsigned=False, track_amax=False, **kwargs):
+        super().__init__(num_bits, axis, unsigned, track_amax)
+        self.percentile = kwargs["percentile"]  # the three kwargs read here are required (KeyError if absent)
+        self.total_step = kwargs["total_step"]
+        self.global_min = kwargs["global_min"]
+        self.data = {}  # step slot (int) -> amax accumulated for that slot
+        self.i = 0  # count of completed collect() calls
+
+    def collect(self, x):
+        """Fold the amax of ``x`` into the accumulator slot for the current step.
+
+        Args:
+            x: Tensor to reduce; every axis not listed in ``self._axis`` is passed to ``reduce_amax``.
+
+        Raises:
+            NotImplementedError: If ``self._track_amax`` is set (raised after the slot is updated).
+        """
+        # Swap axis to reduce.
+        axis = self._axis if isinstance(self._axis, (list, tuple)) else [self._axis]
+        # Handle negative axis.
+        axis = [x.dim() + i if isinstance(i, int) and i < 0 else i for i in axis]
+        reduce_axis = []
+        for i in range(x.dim()):
+            if i not in axis:
+                reduce_axis.append(i)
+        local_amax = quant_utils.reduce_amax(x, axis=reduce_axis).detach()
+        _cur_step = self.i % self.total_step  # calls cycle through total_step slots
+        if _cur_step not in self.data.keys():
+            self.data[_cur_step] = local_amax
+        else:
+            if self.global_min:
+                self.data[_cur_step] = torch.min(self.data[_cur_step], local_amax)
+            else:
+                self.data[_cur_step] += local_amax  # in-place sum; compute_amax() divides by total_step
+        if self._track_amax:
+            raise NotImplementedError
+        self.i += 1
+
+    def compute_amax(self):
+        """Return the element-wise min over slots [0, int(total_step * percentile)) of data[slot] / total_step."""
+        up_lim = int(self.total_step * self.percentile)
+        amaxs_values = [self.data[i] / self.total_step for i in range(0, up_lim)]
+        act_amax = torch.vstack(amaxs_values).min(axis=0)[0]
+        self._calib_amax = act_amax
+        return self._calib_amax
+
+    def __str__(self):
+        s = "PercentileCalibrator"
+        return s.format(**self.__dict__)  # no placeholders in s, so this returns s unchanged
+
+    def __repr__(self):
+        s = "PercentileCalibrator("
+        s += super(MaxCalibrator, self).__repr__()  # starts lookup after MaxCalibrator in the MRO
+        s += " calib_amax={_calib_amax}"
+        if self._track_amax:
+            s += " amaxs={_amaxs}"
+        s += ")"
+        return s.format(**self.__dict__)
+
+def filter_func(name):  # True when `name` contains one of the keywords in the pattern below
+    pattern = re.compile(
+        r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding).*"
+    )
+    return pattern.match(name) is not None
+
+
+def quantize_lvl(unet, quant_level=2.5):
+    """
+    Enable or disable each layer's input/weight quantizers according to ``quant_level``.
+    Meant for ONNX export: per the original note, the current ammo setting loads quantizer amax
+    for all layers, even ones that were not added to the config during calibration.
+    """
+    for name, module in unet.named_modules():
+        if isinstance(module, (torch.nn.Conv2d, LoRACompatibleConv)):
+            module.input_quantizer.enable()  # conv layers are enabled at every level
+            module.weight_quantizer.enable()
+        elif isinstance(module, (torch.nn.Linear, LoRACompatibleLinear)):
+            if (
+                (quant_level >= 2 and "ff.net" in name)
+                or (quant_level >= 2.5 and ("to_q" in name or "to_k" in name or "to_v" in name))
+                or quant_level == 3  # exactly 3: every linear layer
+            ):
+                module.input_quantizer.enable()
+                module.weight_quantizer.enable()
+            else:
+                module.input_quantizer.disable()
+                module.weight_quantizer.disable()
+
+def get_int8_config(  # builds the quantization config dict: smoothquant algorithm plus per-layer 8-bit entries
+    model, quant_level=2.5, alpha=0.8, percentile=1.0, num_inference_steps=20, global_min=False
+):
+    quant_config = {
+        "quant_cfg": {
+            "*lm_head*": {"enable": False},
+            "*output_layer*": {"enable": False},
+            "default": {"num_bits": 8, "axis": None},
+        },
+        "algorithm": {"method": "smoothquant", "alpha": alpha},
+    }
+    for name, module in model.named_modules():
+        w_name = f"{name}*weight_quantizer"
+        i_name = f"{name}*input_quantizer"
+        # Leave a module alone if either key already exists or filter_func() matches its name.
+        if w_name in quant_config["quant_cfg"].keys() or i_name in quant_config["quant_cfg"].keys():
+            continue
+        if filter_func(name):
+            continue
+        if isinstance(module, (torch.nn.Linear, LoRACompatibleLinear)):
+            if (
+                (quant_level >= 2 and "ff.net" in name)
+                or (quant_level >= 2.5 and ("to_q" in name or "to_k" in name or "to_v" in name))
+                or quant_level == 3  # exactly 3: every linear layer
+            ):
+                quant_config["quant_cfg"][w_name] = {"num_bits": 8, "axis": 0}
+                quant_config["quant_cfg"][i_name] = {"num_bits": 8, "axis": -1}
+        elif isinstance(module, (torch.nn.Conv2d, LoRACompatibleConv)):
+            quant_config["quant_cfg"][w_name] = {"num_bits": 8, "axis": 0}
+            quant_config["quant_cfg"][i_name] = {
+                "num_bits": 8,
+                "axis": None,
+                "calibrator": (  # NOTE(review): presumably (class, args, kwargs) consumed by ammo - confirm
+                    PercentileCalibrator,
+                    (),
+                    {
+                        "num_bits": 8,
+                        "axis": None,
+                        "percentile": percentile,
+                        "total_step": num_inference_steps,  # one calibrator slot per inference step
+                        "global_min": global_min,
+                    },
+                ),
+            }
+    return quant_config
diff --git a/demo/Jasper/README.md b/demo/Jasper/README.md
deleted file mode 100644
index f8988c08..00000000
--- a/demo/Jasper/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Jasper Inference Using TensorRT
-
-[Jupyter Notebook](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/notebooks/)
diff --git a/demo/Tacotron2/README.md b/demo/Tacotron2/README.md
deleted file mode 100644
index c687c5ee..00000000
--- a/demo/Tacotron2/README.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# Tacotron 2 and WaveGlow Inference with TensorRT
-
-The Tacotron2 and WaveGlow models form a text-to-speech (TTS) system that enables users to synthesize natural sounding speech from raw transcripts without any additional information such as patterns and/or rhythms of speech. This is an implementation of Tacotron2 for PyTorch, tested and maintained by NVIDIA, and provides scripts to perform high-performance inference using NVIDIA TensorRT. More information about the TTS system and its training can be found in the
-[NVIDIA DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2).
-
-NVIDIA TensorRT is a platform for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications. After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, inference throughput increased by up to 1.4x over native PyTorch in mixed precision.
-
-### Software Versions
-
-|Software|Version|
-|--------|-------|
-|Python|3.8.10|
-|CUDA|12.2|
-|Apex|0.1|
-|TensorRT|9.0|
-|PyTorch|2.0.1|
-
-
-## Quick Start Guide
-
-1. Build and launch the container as described in [TensorRT OSS README](https://github.com/NVIDIA/TensorRT/blob/master/README.md).
-
- **Note:** After this point, all commands should be run from within the container.
-
-2. Verify TensorRT installation by printing the version:
- ```bash
- python3 -c "import tensorrt as trt; print(trt.__version__)"
- ```
-
-3. Install prerequisite software for TTS sample:
- ```bash
- cd $TRT_OSSPATH/demo/Tacotron2
- bash ./scripts/install_prerequisites.sh
- ```
-4. Download pretrained checkpoints from [NGC](https://ngc.nvidia.com/catalog/models) into the `./checkpoints` directory:
-
-- [Tacotron2 checkpoint](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16)
-- [WaveGlow checkpoint](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16)
-
- ```bash
- bash ./scripts/download_checkpoints.sh
- ```
-
-5. Export the models to ONNX intermediate representation (ONNX IR).
- Export Tacotron 2 to three ONNX parts: Encoder, Decoder, and Postnet:
-
- ```bash
- mkdir -p output
- python3 tensorrt/convert_tacotron22onnx.py --tacotron2 checkpoints/tacotron2_pyt_ckpt_amp_v19.09.0/nvidia_tacotron2pyt_fp16_20190427 -o output/ --fp16
- ```
-
- Convert WaveGlow to ONNX IR:
-
- ```bash
- python3 tensorrt/convert_waveglow2onnx.py --waveglow ./checkpoints/waveglow_ckpt_amp_256_v19.10.0/nvidia_waveglow256pyt_fp16 --config-file config.json --wn-channels 256 -o output/ --fp16
- ```
-
- The above commands store the generated ONNX files under the `./output/` directory:
- `encoder.onnx`, `decoder_iter.onnx`, `postnet.onnx`, `waveglow.onnx`, `loop_body_fp16.onnx`, and `decoder.onnx` (on TensorRT 8.0+ if `--no-loop` option is not specified).
-
-6. Export the ONNX IRs to TensorRT engines with fp16 mode enabled:
-
- ```bash
- python3 tensorrt/convert_onnx2trt.py --encoder output/encoder.onnx --decoder output/decoder.onnx --postnet output/postnet.onnx --waveglow output/waveglow.onnx -o output/ --fp16
- ```
-
- After running the command, there should be four new engine files in `./output/` directory:
- `encoder_fp16.engine`, `decoder_with_outer_loop_fp16.engine`, `postnet_fp16.engine`, and `waveglow_fp16.engine`. On TensorRT <8.0 or if `--no-loop` option is specified, `decoder_iter_fp16.engine` is generated instead.
-
-7. Run TTS inference pipeline with fp16:
-
-
- ```bash
- python3 tensorrt/inference_trt.py -i phrases/phrase.txt --encoder output/encoder_fp16.engine --decoder output/decoder_with_outer_loop_fp16.engine --postnet output/postnet_fp16.engine --waveglow output/waveglow_fp16.engine -o output/ --fp16
- ```
-
- On TensorRT <8.0 use `decoder_iter_fp16.engine` for the decoder instead.
-
-## Performance
-
-### Benchmarking
-
-The following section shows how to benchmark the TensorRT inference performance for our Tacotron2 + Waveglow TTS.
-
-#### TensorRT inference benchmark
-
-Before running the benchmark script, please download the checkpoints and build the TensorRT engines for the Tacotron2 and Waveglow models as prescribed in the [Quick Start Guide](#quick-start-guide) above.
-
-The inference benchmark is performed on a single GPU by the `inference_benchmark.sh` script, which runs 3 warm-up iterations then runs timed inference for 1000 iterations.
-
-```bash
-bash scripts/inference_benchmark.sh
-```
-
-*Note*: For benchmarking we use WaveGlow with 256 residual channels, and Tacotron2 decoder with outer loop for TensorRT inference.
-
-### Results
-
-> Note: Results last updated for TensorRT 8.0.1.6 release.
-
-#### Inference performance: NVIDIA T4 (16GB)
-
-|Framework|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up PyT+TRT/TRT|Avg mels generated (81 mels=1 sec of speech)| Avg audio length (s)| Avg RTF|
-|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-|PyT+TRT|1| 128| FP16| 0.1662 | 0.0036 | 0.1705 | 0.1717 | 0.1736 | 871,568 | 7.64 | 566 | 6.99 | 42.03 |
-|PyT |1| 128| FP16| 1.27 | 0.07 | 1.36 | 1.38 | 1.44 | 121,184 | 1.00 | 601 | 7.42 | 5.84 |
-
-#### Inference performance: NVIDIA V100 (16GB)
-
-|Framework|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up PyT+TRT/TRT|Avg mels generated (81 mels=1 sec of speech)| Avg audio length (s)| Avg RTF|
-|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-|PyT+TRT|1| 128| FP16| 0.1641 | 0.0046 | 0.1694 | 0.1707 | 0.1731 | 900,884 | 6.52 | 577 | 7.13 | 43.44 |
-|PyT |1| 128| FP16| 1.07 | 0.06 | 1.14 | 1.17 | 1.23 | 144,668 | 1.00 | 602 | 7.42 | 6.95 |
diff --git a/demo/Tacotron2/common/audio_processing.py b/demo/Tacotron2/common/audio_processing.py
deleted file mode 100644
index 7b261cec..00000000
--- a/demo/Tacotron2/common/audio_processing.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-import numpy as np
-from scipy.signal import get_window
-import librosa.util as librosa_util
-
-
-def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
- n_fft=800, dtype=np.float32, norm=None):
- """
- # from librosa 0.6
- Compute the sum-square envelope of a window function at a given hop length.
-
- This is used to estimate modulation effects induced by windowing
- observations in short-time fourier transforms.
-
- Parameters
- ----------
- window : string, tuple, number, callable, or list-like
- Window specification, as in `get_window`
-
- n_frames : int > 0
- The number of analysis frames
-
- hop_length : int > 0
- The number of samples to advance between frames
-
- win_length : [optional]
- The length of the window function. By default, this matches `n_fft`.
-
- n_fft : int > 0
- The length of each analysis frame.
-
- dtype : np.dtype
- The data type of the output
-
- Returns
- -------
- wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
- The sum-squared envelope of the window function
- """
- if win_length is None:
- win_length = n_fft
-
- n = n_fft + hop_length * (n_frames - 1)
- x = np.zeros(n, dtype=dtype)
-
- # Compute the squared window at the desired length
- win_sq = get_window(window, win_length, fftbins=True)
- win_sq = librosa_util.normalize(win_sq, norm=norm)**2
- win_sq = librosa_util.pad_center(win_sq, size=n_fft)
-
- # Fill the envelope
- for i in range(n_frames):
- sample = i * hop_length
- x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
- return x
-
-
-def griffin_lim(magnitudes, stft_fn, n_iters=30):
- """
- PARAMS
- ------
- magnitudes: spectrogram magnitudes
- stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
- """
-
- angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
- angles = angles.astype(np.float32)
- angles = torch.autograd.Variable(torch.from_numpy(angles))
- signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-
- for i in range(n_iters):
- _, angles = stft_fn.transform(signal)
- signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
- return signal
-
-
-def dynamic_range_compression(x, C=1, clip_val=1e-5):
- """
- PARAMS
- ------
- C: compression factor
- """
- return torch.log(torch.clamp(x, min=clip_val) * C)
-
-
-def dynamic_range_decompression(x, C=1):
- """
- PARAMS
- ------
- C: compression factor used to compress
- """
- return torch.exp(x) / C
diff --git a/demo/Tacotron2/common/layers.py b/demo/Tacotron2/common/layers.py
deleted file mode 100644
index cbeb4910..00000000
--- a/demo/Tacotron2/common/layers.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-from librosa.filters import mel as librosa_mel_fn
-from common.audio_processing import dynamic_range_compression, dynamic_range_decompression
-from common.stft import STFT
-
-
-class LinearNorm(torch.nn.Module):
- def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
- super(LinearNorm, self).__init__()
- self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
-
- torch.nn.init.xavier_uniform_(
- self.linear_layer.weight,
- gain=torch.nn.init.calculate_gain(w_init_gain))
-
- def forward(self, x):
- return self.linear_layer(x)
-
-
-class ConvNorm(torch.nn.Module):
- def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
- padding=None, dilation=1, bias=True, w_init_gain='linear'):
- super(ConvNorm, self).__init__()
- if padding is None:
- assert(kernel_size % 2 == 1)
- padding = int(dilation * (kernel_size - 1) / 2)
-
- self.conv = torch.nn.Conv1d(in_channels, out_channels,
- kernel_size=kernel_size, stride=stride,
- padding=padding, dilation=dilation,
- bias=bias)
-
- torch.nn.init.xavier_uniform_(
- self.conv.weight,
- gain=torch.nn.init.calculate_gain(w_init_gain))
-
- def forward(self, signal):
- return self.conv(signal)
-
-
-class TacotronSTFT(torch.nn.Module):
- def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
- n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
- mel_fmax=8000.0):
- super(TacotronSTFT, self).__init__()
- self.n_mel_channels = n_mel_channels
- self.sampling_rate = sampling_rate
- self.stft_fn = STFT(filter_length, hop_length, win_length)
- mel_basis = librosa_mel_fn(
- sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
- mel_basis = torch.from_numpy(mel_basis).float()
- self.register_buffer('mel_basis', mel_basis)
-
- def spectral_normalize(self, magnitudes):
- output = dynamic_range_compression(magnitudes)
- return output
-
- def spectral_de_normalize(self, magnitudes):
- output = dynamic_range_decompression(magnitudes)
- return output
-
- def mel_spectrogram(self, y):
- """Computes mel-spectrograms from a batch of waves
- PARAMS
- ------
- y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
-
- RETURNS
- -------
- mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
- """
- assert(torch.min(y.data) >= -1)
- assert(torch.max(y.data) <= 1)
-
- magnitudes, phases = self.stft_fn.transform(y)
- magnitudes = magnitudes.data
- mel_output = torch.matmul(self.mel_basis, magnitudes)
- mel_output = self.spectral_normalize(mel_output)
- return mel_output
diff --git a/demo/Tacotron2/common/stft.py b/demo/Tacotron2/common/stft.py
deleted file mode 100644
index 0341d60e..00000000
--- a/demo/Tacotron2/common/stft.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""
-BSD 3-Clause License
-
-Copyright (c) 2017, Prem Seetharaman
-All rights reserved.
-
-* Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice, this
- list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-
-import torch
-import numpy as np
-import torch.nn.functional as F
-from torch.autograd import Variable
-from scipy.signal import get_window
-from librosa.util import pad_center, tiny
-from common.audio_processing import window_sumsquare
-
-
-class STFT(torch.nn.Module):
- """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
- def __init__(self, filter_length=800, hop_length=200, win_length=800,
- window='hann'):
- super(STFT, self).__init__()
- self.filter_length = filter_length
- self.hop_length = hop_length
- self.win_length = win_length
- self.window = window
- self.forward_transform = None
- scale = self.filter_length / self.hop_length
- fourier_basis = np.fft.fft(np.eye(self.filter_length))
-
- cutoff = int((self.filter_length / 2 + 1))
- fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
- np.imag(fourier_basis[:cutoff, :])])
-
- forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
- inverse_basis = torch.FloatTensor(
- np.linalg.pinv(scale * fourier_basis).T[:, None, :].astype(np.float32))
-
- if window is not None:
- assert(filter_length >= win_length)
- # get window and zero center pad it to filter_length
- fft_window = get_window(window, win_length, fftbins=True)
- fft_window = pad_center(fft_window, size=filter_length)
- fft_window = torch.from_numpy(fft_window).float()
-
- # window the bases
- forward_basis *= fft_window
- inverse_basis *= fft_window
-
- self.register_buffer('forward_basis', forward_basis.float())
- self.register_buffer('inverse_basis', inverse_basis.float())
-
- def transform(self, input_data):
- num_batches = input_data.size(0)
- num_samples = input_data.size(1)
-
- self.num_samples = num_samples
-
- # similar to librosa, reflect-pad the input
- input_data = input_data.view(num_batches, 1, num_samples)
- input_data = F.pad(
- input_data.unsqueeze(1),
- (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
- mode='reflect')
- input_data = input_data.squeeze(1)
-
- forward_transform = F.conv1d(
- input_data,
- Variable(self.forward_basis, requires_grad=False),
- stride=self.hop_length,
- padding=0)
-
- cutoff = int((self.filter_length / 2) + 1)
- real_part = forward_transform[:, :cutoff, :]
- imag_part = forward_transform[:, cutoff:, :]
-
- magnitude = torch.sqrt(real_part**2 + imag_part**2)
- phase = torch.autograd.Variable(
- torch.atan2(imag_part.data, real_part.data))
-
- return magnitude, phase
-
- def inverse(self, magnitude, phase):
- recombine_magnitude_phase = torch.cat(
- [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
-
- inverse_transform = F.conv_transpose2d(
- recombine_magnitude_phase.unsqueeze(-1),
- Variable(self.inverse_basis.unsqueeze(-1), requires_grad=False),
- stride=(self.hop_length,1),
- padding=(0,0))
- inverse_transform = inverse_transform.squeeze(-1)
-
- if self.window is not None:
- window_sum = window_sumsquare(
- self.window, magnitude.size(-1), hop_length=self.hop_length,
- win_length=self.win_length, n_fft=self.filter_length,
- dtype=np.float32)
- # remove modulation effects
- approx_nonzero_indices = torch.from_numpy(
- np.where(window_sum > tiny(window_sum))[0])
- window_sum = torch.autograd.Variable(
- torch.from_numpy(window_sum), requires_grad=False)
- window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
- inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
-
- # scale by hop ratio
- inverse_transform *= float(self.filter_length) / self.hop_length
-
- inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
- inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):]
-
- return inverse_transform
-
- def forward(self, input_data):
- self.magnitude, self.phase = self.transform(input_data)
- reconstruction = self.inverse(self.magnitude, self.phase)
- return reconstruction
diff --git a/demo/Tacotron2/common/utils.py b/demo/Tacotron2/common/utils.py
deleted file mode 100644
index 6cccbf22..00000000
--- a/demo/Tacotron2/common/utils.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-from scipy.io.wavfile import read
-import torch
-import os
-
-import argparse
-import json
-
-class ParseFromConfigFile(argparse.Action):
-
- def __init__(self, option_strings, type, dest, help=None, required=False):
- super(ParseFromConfigFile, self).__init__(option_strings=option_strings, type=type, dest=dest, help=help, required=required)
-
- def __call__(self, parser, namespace, values, option_string):
- with open(values, 'r') as f:
- data = json.load(f)
-
- for group in data.keys():
- for k,v in data[group].items():
- underscore_k = k.replace('-', '_')
- setattr(namespace, underscore_k, v)
-
-def get_mask_from_lengths(lengths):
- max_len = torch.max(lengths).item()
- ids = torch.arange(0, max_len, device=lengths.device, dtype=lengths.dtype)
- mask = (ids < lengths.unsqueeze(1)).byte()
- mask = torch.le(mask, 0)
- return mask
-
-
-def load_wav_to_torch(full_path):
- sampling_rate, data = read(full_path)
- return torch.FloatTensor(data.astype(np.float32)), sampling_rate
-
-
-def load_filepaths_and_text(dataset_path, filename, split="|"):
- with open(filename, encoding='utf-8') as f:
- def split_line(root, line):
- parts = line.strip().split(split)
- if len(parts) > 2:
- raise Exception(
- "incorrect line format for file: {}".format(filename))
- path = os.path.join(root, parts[0])
- text = parts[1]
- return path,text
- filepaths_and_text = [split_line(dataset_path, line) for line in f]
- return filepaths_and_text
-
-
-def to_gpu(x):
- x = x.contiguous()
-
- if torch.cuda.is_available():
- x = x.cuda(non_blocking=True)
- return x
diff --git a/demo/Tacotron2/config.json b/demo/Tacotron2/config.json
deleted file mode 100644
index 07ab289e..00000000
--- a/demo/Tacotron2/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "audio": {
- "max-wav-value": 32768.0,
- "sampling-rate": 22050,
- "filter-length": 1024,
- "hop-length": 256,
- "win-length": 1024,
- "mel-fmin": 0.0,
- "mel-fmax": 7000.0
- }
-}
diff --git a/demo/Tacotron2/data_functions.py b/demo/Tacotron2/data_functions.py
deleted file mode 100644
index 623e5af6..00000000
--- a/demo/Tacotron2/data_functions.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-from tacotron2.data_function import TextMelCollate
-from tacotron2.data_function import TextMelLoader
-from waveglow.data_function import MelAudioLoader
-from tacotron2.data_function import batch_to_gpu as batch_to_gpu_tacotron2
-from waveglow.data_function import batch_to_gpu as batch_to_gpu_waveglow
-
-
-def get_collate_function(model_name, n_frames_per_step):
- if model_name == 'Tacotron2':
- collate_fn = TextMelCollate(n_frames_per_step)
- elif model_name == 'WaveGlow':
- collate_fn = torch.utils.data.dataloader.default_collate
- else:
- raise NotImplementedError(
- "unknown collate function requested: {}".format(model_name))
-
- return collate_fn
-
-
-def get_data_loader(model_name, dataset_path, audiopaths_and_text, args):
- if model_name == 'Tacotron2':
- data_loader = TextMelLoader(dataset_path, audiopaths_and_text, args)
- elif model_name == 'WaveGlow':
- data_loader = MelAudioLoader(dataset_path, audiopaths_and_text, args)
- else:
- raise NotImplementedError(
- "unknown data loader requested: {}".format(model_name))
-
- return data_loader
-
-
-def get_batch_to_gpu(model_name):
- if model_name == 'Tacotron2':
- batch_to_gpu = batch_to_gpu_tacotron2
- elif model_name == 'WaveGlow':
- batch_to_gpu = batch_to_gpu_waveglow
- else:
- raise NotImplementedError(
- "unknown batch_to_gpu requested: {}".format(model_name))
- return batch_to_gpu
diff --git a/demo/Tacotron2/inference.py b/demo/Tacotron2/inference.py
deleted file mode 100644
index 77bbccc1..00000000
--- a/demo/Tacotron2/inference.py
+++ /dev/null
@@ -1,266 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from tacotron2.text import text_to_sequence
-import models
-import torch
-import argparse
-import numpy as np
-from scipy.io.wavfile import write
-import matplotlib
-import matplotlib.pyplot as plt
-
-import sys
-
-import time
-import dllogger as DLLogger
-from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-
-from waveglow.denoiser import Denoiser
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('-i', '--input', type=str, required=True,
- help='Full path to the input text (phareses separated by new line)')
- parser.add_argument('-o', '--output', required=True,
- help='Output folder to save audio (file per phrase)')
- parser.add_argument('--suffix', type=str, default="",
- help="Output filename suffix")
- parser.add_argument('--tacotron2', type=str,
- help='Full path to the Tacotron2 model checkpoint file')
- parser.add_argument('--waveglow', type=str,
- help='Full path to the WaveGlow model checkpoint file')
- parser.add_argument('-s', '--sigma-infer', default=0.9, type=float,
- help='Standard deviation of the Gaussian distribution')
- parser.add_argument('-d', '--denoising-strength', default=0.01, type=float,
- help='Denoising strength for removing model bias')
- parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
- help='Sampling rate')
-
- run_mode = parser.add_mutually_exclusive_group()
- run_mode.add_argument('--fp16', action='store_true',
- help='Run inference with mixed precision')
- run_mode.add_argument('--cpu', action='store_true',
- help='Run inference on CPU')
-
- parser.add_argument('--log-file', type=str, default='nvlog.json',
- help='Filename for logging')
- parser.add_argument('--include-warmup', action='store_true',
- help='Include warmup')
- parser.add_argument('--stft-hop-length', type=int, default=256,
- help='STFT hop length for estimating audio length from mel size')
-
- return parser
-
-
-def checkpoint_from_distributed(state_dict):
- """
- Checks whether checkpoint was generated by DistributedDataParallel. DDP
- wraps model in additional "module.", it needs to be unwrapped for single
- GPU inference.
- :param state_dict: model's state dict
- """
- ret = False
- for key, _ in state_dict.items():
- if key.find('module.') != -1:
- ret = True
- break
- return ret
-
-
-def unwrap_distributed(state_dict):
- """
- Unwraps model from DistributedDataParallel.
- DDP wraps model in additional "module.", it needs to be removed for single
- GPU inference.
- :param state_dict: model's state dict
- """
- new_state_dict = {}
- for key, value in state_dict.items():
- new_key = key.replace('module.', '')
- new_state_dict[new_key] = value
- return new_state_dict
-
-
-def load_and_setup_model(model_name, parser, checkpoint, fp16_run, cpu_run, forward_is_infer=False):
- model_parser = models.parse_model_args(model_name, parser, add_help=False)
- model_args, _ = model_parser.parse_known_args()
-
- model_config = models.get_model_config(model_name, model_args)
- model = models.get_model(model_name, model_config, to_cuda=(not cpu_run),
- forward_is_infer=forward_is_infer)
-
- if checkpoint is not None:
- if cpu_run:
- state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))['state_dict']
- else:
- state_dict = torch.load(checkpoint)['state_dict']
- if checkpoint_from_distributed(state_dict):
- state_dict = unwrap_distributed(state_dict)
-
- model.load_state_dict(state_dict)
-
- if model_name == "WaveGlow":
- model = model.remove_weightnorm(model)
-
- model.eval()
-
- if fp16_run:
- model.half()
-
- return model
-
-
-# taken from tacotron2/data_function.py:TextMelCollate.__call__
-def pad_sequences(batch):
- # Right zero-pad all one-hot text sequences to max input length
- input_lengths, ids_sorted_decreasing = torch.sort(
- torch.LongTensor([len(x) for x in batch]),
- dim=0, descending=True)
- max_input_len = input_lengths[0]
-
- text_padded = torch.LongTensor(len(batch), max_input_len)
- text_padded.zero_()
- for i in range(len(ids_sorted_decreasing)):
- text = batch[ids_sorted_decreasing[i]]
- text_padded[i, :text.size(0)] = text
-
- return text_padded, input_lengths
-
-
-def prepare_input_sequence(texts, cpu_run=False):
-
- d = []
- for i,text in enumerate(texts):
- d.append(torch.IntTensor(
- text_to_sequence(text, ['english_cleaners'])[:]))
-
- text_padded, input_lengths = pad_sequences(d)
- if not cpu_run:
- text_padded = text_padded.cuda().long()
- input_lengths = input_lengths.cuda().long()
- else:
- text_padded = text_padded.long()
- input_lengths = input_lengths.long()
-
- return text_padded, input_lengths
-
-
-class MeasureTime():
- def __init__(self, measurements, key, cpu_run=False):
- self.measurements = measurements
- self.key = key
- self.cpu_run = cpu_run
-
- def __enter__(self):
- if not self.cpu_run:
- torch.cuda.synchronize()
- self.t0 = time.perf_counter()
-
- def __exit__(self, exc_type, exc_value, exc_traceback):
- if not self.cpu_run:
- torch.cuda.synchronize()
- self.measurements[self.key] = time.perf_counter() - self.t0
-
-
-def main():
- """
- Launches text to speech (inference).
- Inference is executed on a single GPU or CPU.
- """
- parser = argparse.ArgumentParser(
- description='PyTorch Tacotron 2 Inference')
- parser = parse_args(parser)
- args, _ = parser.parse_known_args()
-
- DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
- args.output+'/'+args.log_file),
- StdOutBackend(Verbosity.VERBOSE)])
- for k,v in vars(args).items():
- DLLogger.log(step="PARAMETER", data={k:v})
- DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
-
- tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
- args.fp16, args.cpu, forward_is_infer=True)
- waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
- args.fp16, args.cpu, forward_is_infer=True)
- denoiser = Denoiser(waveglow)
- if not args.cpu:
- denoiser.cuda()
-
- jitted_tacotron2 = torch.jit.script(tacotron2)
-
- texts = []
- try:
- f = open(args.input, 'r')
- texts = f.readlines()
- except:
- print("Could not read file")
- sys.exit(1)
-
- if args.include_warmup:
- sequence = torch.randint(low=0, high=148, size=(1,50)).long()
- input_lengths = torch.IntTensor([sequence.size(1)]).long()
- if not args.cpu:
- sequence = sequence.cuda()
- input_lengths = input_lengths.cuda()
- for i in range(3):
- with torch.no_grad():
- mel, mel_lengths, _ = jitted_tacotron2(sequence, input_lengths)
- _ = waveglow(mel)
-
- measurements = {}
-
- sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)
-
- with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu):
- mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded, input_lengths)
-
- with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu):
- audios = waveglow(mel, sigma=args.sigma_infer)
- audios = audios.float()
- with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu):
- audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
-
- print("Stopping after",mel.size(2),"decoder steps")
- tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
- waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time']
-
- DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
- DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
- DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
- DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
- DLLogger.log(step=0, data={"denoiser_latency": measurements['denoiser_time']})
- DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time']+measurements['denoiser_time'])})
-
- for i, audio in enumerate(audios):
-
- plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")
- figure_path = args.output+"alignment_"+str(i)+"_"+args.suffix+".png"
- plt.savefig(figure_path)
-
- audio = audio[:mel_lengths[i]*args.stft_hop_length]
- audio = audio/torch.max(torch.abs(audio))
- audio_path = args.output+"audio_"+str(i)+"_"+args.suffix+".wav"
- write(audio_path, args.sampling_rate, audio.cpu().numpy())
-
- DLLogger.flush()
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/inference_perf.py b/demo/Tacotron2/inference_perf.py
deleted file mode 100644
index cb13463e..00000000
--- a/demo/Tacotron2/inference_perf.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import models
-import torch
-import argparse
-import numpy as np
-import json
-import time
-
-from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, MeasureTime
-
-import dllogger as DLLogger
-from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-
-from apex import amp
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('-m', '--model-name', type=str, default='', required=True,
- help='Model to train')
- parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
- help='Sampling rate')
- parser.add_argument('--amp-run', action='store_true',
- help='Inference with Automatic Mixed Precision')
- parser.add_argument('-bs', '--batch-size', type=int, default=1,
- help='Batch size')
- parser.add_argument('-o', '--output', type=str, required=True,
- help='Directory to save results')
- parser.add_argument('--log-file', type=str, default='nvlog.json',
- help='Filename for logging')
-
- return parser
-
-
-def main():
- """
- Launches inference benchmark.
- Inference is executed on a single GPU.
- """
- parser = argparse.ArgumentParser(
- description='PyTorch Tacotron 2 Inference')
- parser = parse_args(parser)
- args, _ = parser.parse_known_args()
-
- log_file = args.log_file
-
- DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
- args.output+'/'+args.log_file),
- StdOutBackend(Verbosity.VERBOSE)])
- for k,v in vars(args).items():
- DLLogger.log(step="PARAMETER", data={k:v})
- DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
-
- model = load_and_setup_model(args.model_name, parser, None, args.amp_run,
- forward_is_infer=True)
-
- if args.model_name == "Tacotron2":
- model = torch.jit.script(model)
-
- warmup_iters = 3
- num_iters = 1+warmup_iters
-
- for i in range(num_iters):
-
- measurements = {}
-
- if args.model_name == 'Tacotron2':
- text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 140),
- dtype=torch.long).cuda()
- input_lengths = torch.IntTensor([text_padded.size(1)]*args.batch_size).cuda().long()
- with torch.no_grad(), MeasureTime(measurements, "inference_time"):
- mels, _, _ = model(text_padded, input_lengths)
- num_items = mels.size(0)*mels.size(2)
-
- if args.model_name == 'WaveGlow':
- n_mel_channels = model.upsample.in_channels
- num_mels = 895
- mel_padded = torch.zeros(args.batch_size, n_mel_channels,
- num_mels).normal_(-5.62, 1.98).cuda()
- if args.amp_run:
- mel_padded = mel_padded.half()
-
- with torch.no_grad(), MeasureTime(measurements, "inference_time"):
- audios = model(mel_padded)
- audios = audios.float()
- num_items = audios.size(0)*audios.size(1)
-
- if i >= warmup_iters:
- DLLogger.log(step=(i-warmup_iters,), data={"latency": measurements['inference_time']})
- DLLogger.log(step=(i-warmup_iters,), data={"items_per_sec": num_items/measurements['inference_time']})
-
- DLLogger.log(step=tuple(),
- data={'infer_latency': measurements['inference_time']})
- DLLogger.log(step=tuple(),
- data={'infer_items_per_sec': num_items/measurements['inference_time']})
-
- DLLogger.flush()
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/main.py b/demo/Tacotron2/main.py
deleted file mode 100644
index 2fee8563..00000000
--- a/demo/Tacotron2/main.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-from train import main as main_train
-from inference_perf import main as main_infer
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
-
- parser.add_argument('--bench-class', type=str, choices=['train', 'perf-infer', 'perf-train'], required=True, help='Choose test class')
-
- return parser
-
-def main():
-
- parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Testing')
- parser = parse_args(parser)
- args, unknown_args = parser.parse_known_args()
-
- if "train" in args.bench_class:
- main_train()
- else:
- main_infer()
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/models.py b/demo/Tacotron2/models.py
deleted file mode 100644
index fad8af46..00000000
--- a/demo/Tacotron2/models.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import sys
-from os.path import abspath, dirname
-# enabling modules discovery from global entrypoint
-sys.path.append(abspath(dirname(__file__)+'/'))
-from tacotron2.model import Tacotron2
-from waveglow.model import WaveGlow
-import torch
-
-
-def parse_model_args(model_name, parser, add_help=False):
- if model_name == 'Tacotron2':
- from tacotron2.arg_parser import parse_tacotron2_args
- return parse_tacotron2_args(parser, add_help)
- if model_name == 'WaveGlow':
- from waveglow.arg_parser import parse_waveglow_args
- return parse_waveglow_args(parser, add_help)
- else:
- raise NotImplementedError(model_name)
-
-
-def batchnorm_to_float(module):
- """Converts batch norm to FP32"""
- if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
- module.float()
- for child in module.children():
- batchnorm_to_float(child)
- return module
-
-
-def init_bn(module):
- if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
- if module.affine:
- module.weight.data.uniform_()
- for child in module.children():
- init_bn(child)
-
-
-def get_model(model_name, model_config, to_cuda,
- uniform_initialize_bn_weight=False, forward_is_infer=False):
- """ Code chooses a model based on name"""
- model = None
- if model_name == 'Tacotron2':
- if forward_is_infer:
- class Tacotron2__forward_is_infer(Tacotron2):
- def forward(self, inputs, input_lengths):
- return self.infer(inputs, input_lengths)
- model = Tacotron2__forward_is_infer(**model_config)
- else:
- model = Tacotron2(**model_config)
- elif model_name == 'WaveGlow':
- if forward_is_infer:
- class WaveGlow__forward_is_infer(WaveGlow):
- def forward(self, spect, sigma=1.0):
- return self.infer(spect, sigma)
- model = WaveGlow__forward_is_infer(**model_config)
- else:
- model = WaveGlow(**model_config)
- else:
- raise NotImplementedError(model_name)
-
- if uniform_initialize_bn_weight:
- init_bn(model)
-
- if to_cuda:
- model = model.cuda()
- return model
-
-
-def get_model_config(model_name, args):
- """ Code chooses a model based on name"""
- if model_name == 'Tacotron2':
- model_config = dict(
- # optimization
- mask_padding=args.mask_padding,
- # audio
- n_mel_channels=args.n_mel_channels,
- # symbols
- n_symbols=args.n_symbols,
- symbols_embedding_dim=args.symbols_embedding_dim,
- # encoder
- encoder_kernel_size=args.encoder_kernel_size,
- encoder_n_convolutions=args.encoder_n_convolutions,
- encoder_embedding_dim=args.encoder_embedding_dim,
- # attention
- attention_rnn_dim=args.attention_rnn_dim,
- attention_dim=args.attention_dim,
- # attention location
- attention_location_n_filters=args.attention_location_n_filters,
- attention_location_kernel_size=args.attention_location_kernel_size,
- # decoder
- n_frames_per_step=args.n_frames_per_step,
- decoder_rnn_dim=args.decoder_rnn_dim,
- prenet_dim=args.prenet_dim,
- max_decoder_steps=args.max_decoder_steps,
- gate_threshold=args.gate_threshold,
- p_attention_dropout=args.p_attention_dropout,
- p_decoder_dropout=args.p_decoder_dropout,
- # postnet
- postnet_embedding_dim=args.postnet_embedding_dim,
- postnet_kernel_size=args.postnet_kernel_size,
- postnet_n_convolutions=args.postnet_n_convolutions,
- decoder_no_early_stopping=args.decoder_no_early_stopping
- )
- return model_config
- elif model_name == 'WaveGlow':
- model_config = dict(
- n_mel_channels=args.n_mel_channels,
- n_flows=args.flows,
- n_group=args.groups,
- n_early_every=args.early_every,
- n_early_size=args.early_size,
- WN_config=dict(
- n_layers=args.wn_layers,
- kernel_size=args.wn_kernel_size,
- n_channels=args.wn_channels
- )
- )
- return model_config
- else:
- raise NotImplementedError(model_name)
diff --git a/demo/Tacotron2/multiproc.py b/demo/Tacotron2/multiproc.py
deleted file mode 100644
index d3eb63ad..00000000
--- a/demo/Tacotron2/multiproc.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import sys
-import subprocess
-
-import torch
-
-
-def main():
- argslist = list(sys.argv)[1:]
- world_size = torch.cuda.device_count()
-
- if '--world-size' in argslist:
- argslist[argslist.index('--world-size') + 1] = str(world_size)
- else:
- argslist.append('--world-size')
- argslist.append(str(world_size))
-
- workers = []
-
- for i in range(world_size):
- if '--rank' in argslist:
- argslist[argslist.index('--rank') + 1] = str(i)
- else:
- argslist.append('--rank')
- argslist.append(str(i))
- stdout = None if i == 0 else subprocess.DEVNULL
- worker = subprocess.Popen(
- [str(sys.executable)] + argslist, stdout=stdout)
- workers.append(worker)
-
- returncode = 0
- try:
- pending = len(workers)
- while pending > 0:
- for worker in workers:
- try:
- worker_returncode = worker.wait(1)
- except subprocess.TimeoutExpired:
- continue
- pending -= 1
- if worker_returncode != 0:
- if returncode != 1:
- for worker in workers:
- worker.terminate()
- returncode = 1
-
- except KeyboardInterrupt:
- print('Pressed CTRL-C, TERMINATING')
- for worker in workers:
- worker.terminate()
- for worker in workers:
- worker.wait()
- raise
-
- sys.exit(returncode)
-
-
-if __name__ == "__main__":
- main()
diff --git a/demo/Tacotron2/phrases/phrase.txt b/demo/Tacotron2/phrases/phrase.txt
deleted file mode 100644
index 8999934d..00000000
--- a/demo/Tacotron2/phrases/phrase.txt
+++ /dev/null
@@ -1 +0,0 @@
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves.
diff --git a/demo/Tacotron2/phrases/phrase_1_128.txt b/demo/Tacotron2/phrases/phrase_1_128.txt
deleted file mode 100644
index 2bd87ff0..00000000
--- a/demo/Tacotron2/phrases/phrase_1_128.txt
+++ /dev/null
@@ -1 +0,0 @@
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the
diff --git a/demo/Tacotron2/phrases/phrase_1_256.txt b/demo/Tacotron2/phrases/phrase_1_256.txt
deleted file mode 100644
index 8286058e..00000000
--- a/demo/Tacotron2/phrases/phrase_1_256.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-
diff --git a/demo/Tacotron2/phrases/phrase_1_64.txt b/demo/Tacotron2/phrases/phrase_1_64.txt
deleted file mode 100644
index 817a8a60..00000000
--- a/demo/Tacotron2/phrases/phrase_1_64.txt
+++ /dev/null
@@ -1 +0,0 @@
-She sells seashells by the seashore, shells she sells are great
diff --git a/demo/Tacotron2/phrases/phrase_4_256.txt b/demo/Tacotron2/phrases/phrase_4_256.txt
deleted file mode 100644
index 84de94bc..00000000
--- a/demo/Tacotron2/phrases/phrase_4_256.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
diff --git a/demo/Tacotron2/phrases/phrase_4_64.txt b/demo/Tacotron2/phrases/phrase_4_64.txt
deleted file mode 100644
index cd1d75b5..00000000
--- a/demo/Tacotron2/phrases/phrase_4_64.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
diff --git a/demo/Tacotron2/phrases/phrase_8_256.txt b/demo/Tacotron2/phrases/phrase_8_256.txt
deleted file mode 100644
index eace2b8e..00000000
--- a/demo/Tacotron2/phrases/phrase_8_256.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
-The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves and the form of printed letters should be beautiful, and that their arrangement on pages.
diff --git a/demo/Tacotron2/phrases/phrase_8_64.txt b/demo/Tacotron2/phrases/phrase_8_64.txt
deleted file mode 100644
index e3a97a5c..00000000
--- a/demo/Tacotron2/phrases/phrase_8_64.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
-She sells seashells by the seashore, shells she sells are great
diff --git a/demo/Tacotron2/preprocess_audio2mel.py b/demo/Tacotron2/preprocess_audio2mel.py
deleted file mode 100644
index 32026325..00000000
--- a/demo/Tacotron2/preprocess_audio2mel.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import torch
-
-from tacotron2.data_function import TextMelLoader
-from common.utils import load_filepaths_and_text
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('-d', '--dataset-path', type=str,
- default='./', help='Path to dataset')
- parser.add_argument('--wav-files', required=True,
- type=str, help='Path to filelist with audio paths and text')
- parser.add_argument('--mel-files', required=True,
- type=str, help='Path to filelist with mel paths and text')
- parser.add_argument('--text-cleaners', nargs='*',
- default=['english_cleaners'], type=str,
- help='Type of text cleaners for input text')
- parser.add_argument('--max-wav-value', default=32768.0, type=float,
- help='Maximum audiowave value')
- parser.add_argument('--sampling-rate', default=22050, type=int,
- help='Sampling rate')
- parser.add_argument('--filter-length', default=1024, type=int,
- help='Filter length')
- parser.add_argument('--hop-length', default=256, type=int,
- help='Hop (stride) length')
- parser.add_argument('--win-length', default=1024, type=int,
- help='Window length')
- parser.add_argument('--mel-fmin', default=0.0, type=float,
- help='Minimum mel frequency')
- parser.add_argument('--mel-fmax', default=8000.0, type=float,
- help='Maximum mel frequency')
- parser.add_argument('--n-mel-channels', default=80, type=int,
- help='Number of bins in mel-spectrograms')
-
- return parser
-
-
-def audio2mel(dataset_path, audiopaths_and_text, melpaths_and_text, args):
-
- melpaths_and_text_list = load_filepaths_and_text(dataset_path, melpaths_and_text)
- audiopaths_and_text_list = load_filepaths_and_text(dataset_path, audiopaths_and_text)
-
- data_loader = TextMelLoader(dataset_path, audiopaths_and_text, args)
-
- for i in range(len(melpaths_and_text_list)):
- if i%100 == 0:
- print("done", i, "/", len(melpaths_and_text_list))
-
- mel = data_loader.get_mel(audiopaths_and_text_list[i][0])
- torch.save(mel, melpaths_and_text_list[i][0])
-
-def main():
-
- parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
- parser = parse_args(parser)
- args = parser.parse_args()
- args.load_mel_from_disk = False
-
- audio2mel(args.dataset_path, args.wav_files, args.mel_files, args)
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/requirements.txt b/demo/Tacotron2/requirements.txt
deleted file mode 100644
index b6eb26de..00000000
--- a/demo/Tacotron2/requirements.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-numba>=0.48
-resampy>=0.3.1
-torch==2.0.1
-matplotlib
-numpy
-inflect
-librosa>=0.10.0
-scipy
-Unidecode
-git+https://github.com/NVIDIA/dllogger#egg=dllogger
---extra-index-url https://pypi.ngc.nvidia.com
-onnx-graphsurgeon
diff --git a/demo/Tacotron2/run_latency_tests.sh b/demo/Tacotron2/run_latency_tests.sh
deleted file mode 100644
index 85e5f0f8..00000000
--- a/demo/Tacotron2/run_latency_tests.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-unset CUDA_VISIBLE_DEVICES
-bash test_infer.sh -bs 1 -il 128 --fp16 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 4 -il 128 --fp16 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 1 -il 128 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 4 -il 128 --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
-export CUDA_VISIBLE_DEVICES=
-export OMP_NUM_THREADS=6
-export KMP_BLOCKTIME=0
-export KMP_AFFINITY=granularity=fine,compact,1,0
-bash test_infer.sh -bs 1 -il 128 --cpu --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
-bash test_infer.sh -bs 4 -il 128 --cpu --num-iters 1003 --tacotron2 ./checkpoints/tacotron2_1032590_6000_amp --waveglow ./checkpoints/waveglow_1076430_14000_amp --wn-channels 256
diff --git a/demo/Tacotron2/scripts/download_checkpoints.sh b/demo/Tacotron2/scripts/download_checkpoints.sh
deleted file mode 100755
index 0d23f2d3..00000000
--- a/demo/Tacotron2/scripts/download_checkpoints.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Prepare the download directory
-mkdir -p checkpoints && cd checkpoints
-
-# Download the Tacotron2 and Waveglow checkpoints
-if [ ! -f "checkpoints/tacotron2_pyt_ckpt_amp_v19.09.0/nvidia_tacotron2pyt_fp16_20190427" ]; then
- echo "Downloading Tacotron2 checkpoint from NGC"
- ngc registry model download-version nvidia/tacotron2_pyt_ckpt_amp:19.09.0
-fi;
-if [ ! -f "checkpoints/waveglow_ckpt_amp_256_v19.10.0/nvidia_waveglow256pyt_fp16" ]; then
- echo "Downloading Waveglow checkpoint from NGC"
- ngc registry model download-version nvidia/waveglow_ckpt_amp_256:19.10.0
-fi;
-
-cd -
diff --git a/demo/Tacotron2/scripts/inference_benchmark.sh b/demo/Tacotron2/scripts/inference_benchmark.sh
deleted file mode 100755
index 86200557..00000000
--- a/demo/Tacotron2/scripts/inference_benchmark.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-echo "TensorRT BS=1, S=128"
-bash test_infer.sh --test tensorrt/test_infer_trt.py -bs 1 -il 128 --fp16 --num-iters 103 --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_with_outer_loop_fp16.engine --postnet ./output/postnet_fp16.engine --waveglow ./output/waveglow_fp16.engine --wn-channels 256
-echo "PyTorch (GPU) BS=1, S=128"
-bash test_infer.sh -bs 1 -il 128 --fp16 --num-iters 103 --tacotron2 ./checkpoints/tacotron2_pyt_ckpt_amp_v19.09.0/nvidia_tacotron2pyt_fp16_20190427 --waveglow ./checkpoints/waveglow_ckpt_amp_256_v19.10.0/nvidia_waveglow256pyt_fp16 --wn-channels 256
diff --git a/demo/Tacotron2/scripts/install_prerequisites.sh b/demo/Tacotron2/scripts/install_prerequisites.sh
deleted file mode 100755
index 5a16d392..00000000
--- a/demo/Tacotron2/scripts/install_prerequisites.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-pip3 install -r requirements.txt
-echo "nvidia" | sudo -S apt-get install -y libsndfile1
-
-pushd /tmp
-git clone https://github.com/NVIDIA/apex
-cd apex
-pip3 install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./
-popd
diff --git a/demo/Tacotron2/scripts/prepare_dataset.sh b/demo/Tacotron2/scripts/prepare_dataset.sh
deleted file mode 100755
index d38be817..00000000
--- a/demo/Tacotron2/scripts/prepare_dataset.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-set -e
-
-DATADIR="LJSpeech-1.1"
-BZ2ARCHIVE="${DATADIR}.tar.bz2"
-ENDPOINT="http://data.keithito.com/data/speech/$BZ2ARCHIVE"
-
-if [ ! -d "$DATADIR" ]; then
- echo "dataset is missing, unpacking ..."
- if [ ! -f "$BZ2ARCHIVE" ]; then
- echo "dataset archive is missing, downloading ..."
- wget "$ENDPOINT"
- fi
- tar jxvf "$BZ2ARCHIVE"
-fi
diff --git a/demo/Tacotron2/scripts/prepare_mels.sh b/demo/Tacotron2/scripts/prepare_mels.sh
deleted file mode 100644
index b3843a26..00000000
--- a/demo/Tacotron2/scripts/prepare_mels.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-set -e
-
-DATADIR="LJSpeech-1.1"
-FILELISTSDIR="filelists"
-
-TESTLIST="$FILELISTSDIR/ljs_audio_text_test_filelist.txt"
-TRAINLIST="$FILELISTSDIR/ljs_audio_text_train_filelist.txt"
-VALLIST="$FILELISTSDIR/ljs_audio_text_val_filelist.txt"
-
-TESTLIST_MEL="$FILELISTSDIR/ljs_mel_text_test_filelist.txt"
-TRAINLIST_MEL="$FILELISTSDIR/ljs_mel_text_train_filelist.txt"
-VALLIST_MEL="$FILELISTSDIR/ljs_mel_text_val_filelist.txt"
-
-mkdir -p "$DATADIR/mels"
-if [ $(ls $DATADIR/mels | wc -l) -ne 13100 ]; then
- python3 preprocess_audio2mel.py --wav-files "$TRAINLIST" --mel-files "$TRAINLIST_MEL"
- python3 preprocess_audio2mel.py --wav-files "$TESTLIST" --mel-files "$TESTLIST_MEL"
- python3 preprocess_audio2mel.py --wav-files "$VALLIST" --mel-files "$VALLIST_MEL"
-fi
diff --git a/demo/Tacotron2/tacotron2/arg_parser.py b/demo/Tacotron2/tacotron2/arg_parser.py
deleted file mode 100644
index 2a450ef6..00000000
--- a/demo/Tacotron2/tacotron2/arg_parser.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-
-from tacotron2.text import symbols
-
-
-def parse_tacotron2_args(parent, add_help=False):
- """
- Parse commandline arguments.
- """
- parser = argparse.ArgumentParser(parents=[parent], add_help=add_help)
-
- # misc parameters
- parser.add_argument('--mask-padding', default=False, type=bool,
- help='Use mask padding')
- parser.add_argument('--n-mel-channels', default=80, type=int,
- help='Number of bins in mel-spectrograms')
-
- # symbols parameters
- global symbols
- len_symbols = len(symbols)
- symbols = parser.add_argument_group('symbols parameters')
- symbols.add_argument('--n-symbols', default=len_symbols, type=int,
- help='Number of symbols in dictionary')
- symbols.add_argument('--symbols-embedding-dim', default=512, type=int,
- help='Input embedding dimension')
-
- # encoder parameters
- encoder = parser.add_argument_group('encoder parameters')
- encoder.add_argument('--encoder-kernel-size', default=5, type=int,
- help='Encoder kernel size')
- encoder.add_argument('--encoder-n-convolutions', default=3, type=int,
- help='Number of encoder convolutions')
- encoder.add_argument('--encoder-embedding-dim', default=512, type=int,
- help='Encoder embedding dimension')
-
- # decoder parameters
- decoder = parser.add_argument_group('decoder parameters')
- decoder.add_argument('--n-frames-per-step', default=1,
- type=int,
- help='Number of frames processed per step') # currently only 1 is supported
- decoder.add_argument('--decoder-rnn-dim', default=1024, type=int,
- help='Number of units in decoder LSTM')
- decoder.add_argument('--prenet-dim', default=256, type=int,
- help='Number of ReLU units in prenet layers')
- decoder.add_argument('--max-decoder-steps', default=2000, type=int,
- help='Maximum number of output mel spectrograms')
- decoder.add_argument('--gate-threshold', default=0.5, type=float,
- help='Probability threshold for stop token')
- decoder.add_argument('--p-attention-dropout', default=0.1, type=float,
- help='Dropout probability for attention LSTM')
- decoder.add_argument('--p-decoder-dropout', default=0.1, type=float,
- help='Dropout probability for decoder LSTM')
- decoder.add_argument('--decoder-no-early-stopping', action='store_true',
- help='Stop decoding once all samples are finished')
-
- # attention parameters
- attention = parser.add_argument_group('attention parameters')
- attention.add_argument('--attention-rnn-dim', default=1024, type=int,
- help='Number of units in attention LSTM')
- attention.add_argument('--attention-dim', default=128, type=int,
- help='Dimension of attention hidden representation')
-
- # location layer parameters
- location = parser.add_argument_group('location parameters')
- location.add_argument(
- '--attention-location-n-filters', default=32, type=int,
- help='Number of filters for location-sensitive attention')
- location.add_argument(
- '--attention-location-kernel-size', default=31, type=int,
- help='Kernel size for location-sensitive attention')
-
- # Mel-post processing network parameters
- postnet = parser.add_argument_group('postnet parameters')
- postnet.add_argument('--postnet-embedding-dim', default=512, type=int,
- help='Postnet embedding dimension')
- postnet.add_argument('--postnet-kernel-size', default=5, type=int,
- help='Postnet kernel size')
- postnet.add_argument('--postnet-n-convolutions', default=5, type=int,
- help='Number of postnet convolutions')
-
- return parser
diff --git a/demo/Tacotron2/tacotron2/data_function.py b/demo/Tacotron2/tacotron2/data_function.py
deleted file mode 100644
index 5d2c0064..00000000
--- a/demo/Tacotron2/tacotron2/data_function.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import random
-import numpy as np
-import torch
-import torch.utils.data
-
-import common.layers as layers
-from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu
-from tacotron2.text import text_to_sequence
-
-class TextMelLoader(torch.utils.data.Dataset):
- """
- 1) loads audio,text pairs
- 2) normalizes text and converts them to sequences of one-hot vectors
- 3) computes mel-spectrograms from audio files.
- """
- def __init__(self, dataset_path, audiopaths_and_text, args):
- self.audiopaths_and_text = load_filepaths_and_text(dataset_path, audiopaths_and_text)
- self.text_cleaners = args.text_cleaners
- self.max_wav_value = args.max_wav_value
- self.sampling_rate = args.sampling_rate
- self.load_mel_from_disk = args.load_mel_from_disk
- self.stft = layers.TacotronSTFT(
- args.filter_length, args.hop_length, args.win_length,
- args.n_mel_channels, args.sampling_rate, args.mel_fmin,
- args.mel_fmax)
- random.seed(1234)
- random.shuffle(self.audiopaths_and_text)
-
- def get_mel_text_pair(self, audiopath_and_text):
- # separate filename and text
- audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
- len_text = len(text)
- text = self.get_text(text)
- mel = self.get_mel(audiopath)
- return (text, mel, len_text)
-
- def get_mel(self, filename):
- if not self.load_mel_from_disk:
- audio, sampling_rate = load_wav_to_torch(filename)
- if sampling_rate != self.stft.sampling_rate:
- raise ValueError("{} {} SR doesn't match target {} SR".format(
- sampling_rate, self.stft.sampling_rate))
- audio_norm = audio / self.max_wav_value
- audio_norm = audio_norm.unsqueeze(0)
- audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
- melspec = self.stft.mel_spectrogram(audio_norm)
- melspec = torch.squeeze(melspec, 0)
- else:
- melspec = torch.load(filename)
- assert melspec.size(0) == self.stft.n_mel_channels, (
- 'Mel dimension mismatch: given {}, expected {}'.format(
- melspec.size(0), self.stft.n_mel_channels))
-
- return melspec
-
- def get_text(self, text):
- text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
- return text_norm
-
- def __getitem__(self, index):
- return self.get_mel_text_pair(self.audiopaths_and_text[index])
-
- def __len__(self):
- return len(self.audiopaths_and_text)
-
-
-class TextMelCollate():
- """ Zero-pads model inputs and targets based on number of frames per setep
- """
- def __init__(self, n_frames_per_step):
- self.n_frames_per_step = n_frames_per_step
-
- def __call__(self, batch):
- """Collate's training batch from normalized text and mel-spectrogram
- PARAMS
- ------
- batch: [text_normalized, mel_normalized]
- """
- # Right zero-pad all one-hot text sequences to max input length
- input_lengths, ids_sorted_decreasing = torch.sort(
- torch.LongTensor([len(x[0]) for x in batch]),
- dim=0, descending=True)
- max_input_len = input_lengths[0]
-
- text_padded = torch.LongTensor(len(batch), max_input_len)
- text_padded.zero_()
- for i in range(len(ids_sorted_decreasing)):
- text = batch[ids_sorted_decreasing[i]][0]
- text_padded[i, :text.size(0)] = text
-
- # Right zero-pad mel-spec
- num_mels = batch[0][1].size(0)
- max_target_len = max([x[1].size(1) for x in batch])
- if max_target_len % self.n_frames_per_step != 0:
- max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
- assert max_target_len % self.n_frames_per_step == 0
-
- # include mel padded and gate padded
- mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
- mel_padded.zero_()
- gate_padded = torch.FloatTensor(len(batch), max_target_len)
- gate_padded.zero_()
- output_lengths = torch.LongTensor(len(batch))
- for i in range(len(ids_sorted_decreasing)):
- mel = batch[ids_sorted_decreasing[i]][1]
- mel_padded[i, :, :mel.size(1)] = mel
- gate_padded[i, mel.size(1)-1:] = 1
- output_lengths[i] = mel.size(1)
-
- # count number of items - characters in text
- len_x = [x[2] for x in batch]
- len_x = torch.Tensor(len_x)
- return text_padded, input_lengths, mel_padded, gate_padded, \
- output_lengths, len_x
-
-def batch_to_gpu(batch):
- text_padded, input_lengths, mel_padded, gate_padded, \
- output_lengths, len_x = batch
- text_padded = to_gpu(text_padded).long()
- input_lengths = to_gpu(input_lengths).long()
- max_len = torch.max(input_lengths.data).item()
- mel_padded = to_gpu(mel_padded).float()
- gate_padded = to_gpu(gate_padded).float()
- output_lengths = to_gpu(output_lengths).long()
- x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
- y = (mel_padded, gate_padded)
- len_x = torch.sum(output_lengths)
- return (x, y, len_x)
diff --git a/demo/Tacotron2/tacotron2/loss_function.py b/demo/Tacotron2/tacotron2/loss_function.py
deleted file mode 100644
index 07b3610e..00000000
--- a/demo/Tacotron2/tacotron2/loss_function.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from torch import nn
-
-
-class Tacotron2Loss(nn.Module):
- def __init__(self):
- super(Tacotron2Loss, self).__init__()
-
- def forward(self, model_output, targets):
- mel_target, gate_target = targets[0], targets[1]
- mel_target.requires_grad = False
- gate_target.requires_grad = False
- gate_target = gate_target.view(-1, 1)
-
- mel_out, mel_out_postnet, gate_out, _ = model_output
- gate_out = gate_out.view(-1, 1)
- mel_loss = nn.MSELoss()(mel_out, mel_target) + \
- nn.MSELoss()(mel_out_postnet, mel_target)
- gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
- return mel_loss + gate_loss
diff --git a/demo/Tacotron2/tacotron2/model.py b/demo/Tacotron2/tacotron2/model.py
deleted file mode 100644
index c8ba9f96..00000000
--- a/demo/Tacotron2/tacotron2/model.py
+++ /dev/null
@@ -1,681 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from math import sqrt
-import torch
-from torch import nn
-from torch.nn import functional as F
-import sys
-from os.path import abspath, dirname
-# enabling modules discovery from global entrypoint
-sys.path.append(abspath(dirname(__file__)+'/../'))
-from common.layers import ConvNorm, LinearNorm
-from common.utils import to_gpu, get_mask_from_lengths
-
-
-class LocationLayer(nn.Module):
- def __init__(self, attention_n_filters, attention_kernel_size,
- attention_dim):
- super(LocationLayer, self).__init__()
- padding = int((attention_kernel_size - 1) / 2)
- self.location_conv = ConvNorm(2, attention_n_filters,
- kernel_size=attention_kernel_size,
- padding=padding, bias=False, stride=1,
- dilation=1)
- self.location_dense = LinearNorm(attention_n_filters, attention_dim,
- bias=False, w_init_gain='tanh')
-
- def forward(self, attention_weights_cat):
- processed_attention = self.location_conv(attention_weights_cat)
- processed_attention = processed_attention.transpose(1, 2)
- processed_attention = self.location_dense(processed_attention)
- return processed_attention
-
-
-class Attention(nn.Module):
- def __init__(self, attention_rnn_dim, embedding_dim,
- attention_dim, attention_location_n_filters,
- attention_location_kernel_size):
- super(Attention, self).__init__()
- self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
- bias=False, w_init_gain='tanh')
- self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
- w_init_gain='tanh')
- self.v = LinearNorm(attention_dim, 1, bias=False)
- self.location_layer = LocationLayer(attention_location_n_filters,
- attention_location_kernel_size,
- attention_dim)
- self.score_mask_value = -float("inf")
-
- def get_alignment_energies(self, query, processed_memory,
- attention_weights_cat):
- """
- PARAMS
- ------
- query: decoder output (batch, n_mel_channels * n_frames_per_step)
- processed_memory: processed encoder outputs (B, T_in, attention_dim)
- attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
-
- RETURNS
- -------
- alignment (batch, max_time)
- """
-
- processed_query = self.query_layer(query.unsqueeze(1))
- processed_attention_weights = self.location_layer(attention_weights_cat)
- energies = self.v(torch.tanh(
- processed_query + processed_attention_weights + processed_memory))
-
- energies = energies.squeeze(2)
- return energies
-
- def forward(self, attention_hidden_state, memory, processed_memory,
- attention_weights_cat, mask):
- """
- PARAMS
- ------
- attention_hidden_state: attention rnn last output
- memory: encoder outputs
- processed_memory: processed encoder outputs
- attention_weights_cat: previous and cummulative attention weights
- mask: binary mask for padded data
- """
- alignment = self.get_alignment_energies(
- attention_hidden_state, processed_memory, attention_weights_cat)
-
- alignment = alignment.masked_fill(mask, self.score_mask_value)
-
- attention_weights = F.softmax(alignment, dim=1)
- attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
- attention_context = attention_context.squeeze(1)
-
- return attention_context, attention_weights
-
-
-class Prenet(nn.Module):
- def __init__(self, in_dim, sizes):
- super(Prenet, self).__init__()
- in_sizes = [in_dim] + sizes[:-1]
- self.layers = nn.ModuleList(
- [LinearNorm(in_size, out_size, bias=False)
- for (in_size, out_size) in zip(in_sizes, sizes)])
-
- def forward(self, x):
- for linear in self.layers:
- x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
- return x
-
-
-class Postnet(nn.Module):
- """Postnet
- - Five 1-d convolution with 512 channels and kernel size 5
- """
-
- def __init__(self, n_mel_channels, postnet_embedding_dim,
- postnet_kernel_size, postnet_n_convolutions):
- super(Postnet, self).__init__()
- self.convolutions = nn.ModuleList()
-
- self.convolutions.append(
- nn.Sequential(
- ConvNorm(n_mel_channels, postnet_embedding_dim,
- kernel_size=postnet_kernel_size, stride=1,
- padding=int((postnet_kernel_size - 1) / 2),
- dilation=1, w_init_gain='tanh'),
- nn.BatchNorm1d(postnet_embedding_dim))
- )
-
- for i in range(1, postnet_n_convolutions - 1):
- self.convolutions.append(
- nn.Sequential(
- ConvNorm(postnet_embedding_dim,
- postnet_embedding_dim,
- kernel_size=postnet_kernel_size, stride=1,
- padding=int((postnet_kernel_size - 1) / 2),
- dilation=1, w_init_gain='tanh'),
- nn.BatchNorm1d(postnet_embedding_dim))
- )
-
- self.convolutions.append(
- nn.Sequential(
- ConvNorm(postnet_embedding_dim, n_mel_channels,
- kernel_size=postnet_kernel_size, stride=1,
- padding=int((postnet_kernel_size - 1) / 2),
- dilation=1, w_init_gain='linear'),
- nn.BatchNorm1d(n_mel_channels))
- )
- self.n_convs = len(self.convolutions)
-
- def forward(self, x):
- i = 0
- for conv in self.convolutions:
- if i < self.n_convs - 1:
- x = F.dropout(torch.tanh(conv(x)), 0.5, training=self.training)
- else:
- x = F.dropout(conv(x), 0.5, training=self.training)
- i += 1
-
- return x
-
-
-class Encoder(nn.Module):
- """Encoder module:
- - Three 1-d convolution banks
- - Bidirectional LSTM
- """
- def __init__(self, encoder_n_convolutions,
- encoder_embedding_dim, encoder_kernel_size):
- super(Encoder, self).__init__()
-
- convolutions = []
- for _ in range(encoder_n_convolutions):
- conv_layer = nn.Sequential(
- ConvNorm(encoder_embedding_dim,
- encoder_embedding_dim,
- kernel_size=encoder_kernel_size, stride=1,
- padding=int((encoder_kernel_size - 1) / 2),
- dilation=1, w_init_gain='relu'),
- nn.BatchNorm1d(encoder_embedding_dim))
- convolutions.append(conv_layer)
- self.convolutions = nn.ModuleList(convolutions)
-
- self.lstm = nn.LSTM(encoder_embedding_dim,
- int(encoder_embedding_dim / 2), 1,
- batch_first=True, bidirectional=True)
-
- @torch.jit.ignore
- def forward(self, x, input_lengths):
- for conv in self.convolutions:
- x = F.dropout(F.relu(conv(x)), 0.5, self.training)
-
- x = x.transpose(1, 2)
-
- # pytorch tensor are not reversible, hence the conversion
- input_lengths = input_lengths.cpu().numpy()
- x = nn.utils.rnn.pack_padded_sequence(
- x, input_lengths, batch_first=True)
-
- self.lstm.flatten_parameters()
- outputs, _ = self.lstm(x)
-
- outputs, _ = nn.utils.rnn.pad_packed_sequence(
- outputs, batch_first=True)
-
- return outputs
-
- @torch.jit.export
- def infer(self, x, input_lengths):
- device = x.device
- for conv in self.convolutions:
- x = F.dropout(F.relu(conv(x.to(device))), 0.5, self.training)
-
- x = x.transpose(1, 2)
-
- input_lengths = input_lengths.cpu()
- x = nn.utils.rnn.pack_padded_sequence(
- x, input_lengths, batch_first=True)
-
- outputs, _ = self.lstm(x)
-
- outputs, _ = nn.utils.rnn.pad_packed_sequence(
- outputs, batch_first=True)
-
- return outputs
-
-
-class Decoder(nn.Module):
- def __init__(self, n_mel_channels, n_frames_per_step,
- encoder_embedding_dim, attention_dim,
- attention_location_n_filters,
- attention_location_kernel_size,
- attention_rnn_dim, decoder_rnn_dim,
- prenet_dim, max_decoder_steps, gate_threshold,
- p_attention_dropout, p_decoder_dropout,
- early_stopping):
- super(Decoder, self).__init__()
- self.n_mel_channels = n_mel_channels
- self.n_frames_per_step = n_frames_per_step
- self.encoder_embedding_dim = encoder_embedding_dim
- self.attention_rnn_dim = attention_rnn_dim
- self.decoder_rnn_dim = decoder_rnn_dim
- self.prenet_dim = prenet_dim
- self.max_decoder_steps = max_decoder_steps
- self.gate_threshold = gate_threshold
- self.p_attention_dropout = p_attention_dropout
- self.p_decoder_dropout = p_decoder_dropout
- self.early_stopping = early_stopping
-
- self.prenet = Prenet(
- n_mel_channels * n_frames_per_step,
- [prenet_dim, prenet_dim])
-
- self.attention_rnn = nn.LSTMCell(
- prenet_dim + encoder_embedding_dim,
- attention_rnn_dim)
-
- self.attention_layer = Attention(
- attention_rnn_dim, encoder_embedding_dim,
- attention_dim, attention_location_n_filters,
- attention_location_kernel_size)
-
- self.decoder_rnn = nn.LSTMCell(
- attention_rnn_dim + encoder_embedding_dim,
- decoder_rnn_dim, 1)
-
- self.linear_projection = LinearNorm(
- decoder_rnn_dim + encoder_embedding_dim,
- n_mel_channels * n_frames_per_step)
-
- self.gate_layer = LinearNorm(
- decoder_rnn_dim + encoder_embedding_dim, 1,
- bias=True, w_init_gain='sigmoid')
-
- def get_go_frame(self, memory):
- """ Gets all zeros frames to use as first decoder input
- PARAMS
- ------
- memory: decoder outputs
-
- RETURNS
- -------
- decoder_input: all zeros frames
- """
- B = memory.size(0)
- dtype = memory.dtype
- device = memory.device
- decoder_input = torch.zeros(
- B, self.n_mel_channels*self.n_frames_per_step,
- dtype=dtype, device=device)
- return decoder_input
-
- def initialize_decoder_states(self, memory):
- """ Initializes attention rnn states, decoder rnn states, attention
- weights, attention cumulative weights, attention context, stores memory
- and stores processed memory
- PARAMS
- ------
- memory: Encoder outputs
- mask: Mask for padded data if training, expects None for inference
- """
- B = memory.size(0)
- MAX_TIME = memory.size(1)
- dtype = memory.dtype
- device = memory.device
-
- attention_hidden = torch.zeros(
- B, self.attention_rnn_dim, dtype=dtype, device=device)
- attention_cell = torch.zeros(
- B, self.attention_rnn_dim, dtype=dtype, device=device)
-
- decoder_hidden = torch.zeros(
- B, self.decoder_rnn_dim, dtype=dtype, device=device)
- decoder_cell = torch.zeros(
- B, self.decoder_rnn_dim, dtype=dtype, device=device)
-
- attention_weights = torch.zeros(
- B, MAX_TIME, dtype=dtype, device=device)
- attention_weights_cum = torch.zeros(
- B, MAX_TIME, dtype=dtype, device=device)
- attention_context = torch.zeros(
- B, self.encoder_embedding_dim, dtype=dtype, device=device)
-
- processed_memory = self.attention_layer.memory_layer(memory)
-
- return (attention_hidden, attention_cell, decoder_hidden,
- decoder_cell, attention_weights, attention_weights_cum,
- attention_context, processed_memory)
-
- def parse_decoder_inputs(self, decoder_inputs):
- """ Prepares decoder inputs, i.e. mel outputs
- PARAMS
- ------
- decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
-
- RETURNS
- -------
- inputs: processed decoder inputs
-
- """
- # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
- decoder_inputs = decoder_inputs.transpose(1, 2)
- decoder_inputs = decoder_inputs.view(
- decoder_inputs.size(0),
- int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
- # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
- decoder_inputs = decoder_inputs.transpose(0, 1)
- return decoder_inputs
-
- def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
- """ Prepares decoder outputs for output
- PARAMS
- ------
- mel_outputs:
- gate_outputs: gate output energies
- alignments:
-
- RETURNS
- -------
- mel_outputs:
- gate_outpust: gate output energies
- alignments:
- """
- # (T_out, B) -> (B, T_out)
- alignments = alignments.transpose(0, 1).contiguous()
- # (T_out, B) -> (B, T_out)
- gate_outputs = gate_outputs.transpose(0, 1).contiguous()
- # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
- mel_outputs = mel_outputs.transpose(0, 1).contiguous()
- # decouple frames per step
- shape = (mel_outputs.shape[0], -1, self.n_mel_channels)
- mel_outputs = mel_outputs.view(*shape)
- # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
- mel_outputs = mel_outputs.transpose(1, 2)
-
- return mel_outputs, gate_outputs, alignments
-
- def decode(self, decoder_input, attention_hidden, attention_cell,
- decoder_hidden, decoder_cell, attention_weights,
- attention_weights_cum, attention_context, memory,
- processed_memory, mask):
- """ Decoder step using stored states, attention and memory
- PARAMS
- ------
- decoder_input: previous mel output
-
- RETURNS
- -------
- mel_output:
- gate_output: gate output energies
- attention_weights:
- """
- cell_input = torch.cat((decoder_input, attention_context), -1)
-
- attention_hidden, attention_cell = self.attention_rnn(
- cell_input, (attention_hidden, attention_cell))
- attention_hidden = F.dropout(
- attention_hidden, self.p_attention_dropout, self.training)
-
- attention_weights_cat = torch.cat(
- (attention_weights.unsqueeze(1),
- attention_weights_cum.unsqueeze(1)), dim=1)
- attention_context, attention_weights = self.attention_layer(
- attention_hidden, memory, processed_memory,
- attention_weights_cat, mask)
-
- attention_weights_cum += attention_weights
- decoder_input = torch.cat(
- (attention_hidden, attention_context), -1)
-
- decoder_hidden, decoder_cell = self.decoder_rnn(
- decoder_input, (decoder_hidden, decoder_cell))
- decoder_hidden = F.dropout(
- decoder_hidden, self.p_decoder_dropout, self.training)
-
- decoder_hidden_attention_context = torch.cat(
- (decoder_hidden, attention_context), dim=1)
- decoder_output = self.linear_projection(
- decoder_hidden_attention_context)
-
- gate_prediction = self.gate_layer(decoder_hidden_attention_context)
-
- return (decoder_output, gate_prediction, attention_hidden,
- attention_cell, decoder_hidden, decoder_cell, attention_weights,
- attention_weights_cum, attention_context)
-
- @torch.jit.ignore
- def forward(self, memory, decoder_inputs, memory_lengths):
- """ Decoder forward pass for training
- PARAMS
- ------
- memory: Encoder outputs
- decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
- memory_lengths: Encoder output lengths for attention masking.
-
- RETURNS
- -------
- mel_outputs: mel outputs from the decoder
- gate_outputs: gate outputs from the decoder
- alignments: sequence of attention weights from the decoder
- """
-
- decoder_input = self.get_go_frame(memory).unsqueeze(0)
- decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
- decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
- decoder_inputs = self.prenet(decoder_inputs)
-
- mask = get_mask_from_lengths(memory_lengths)
- (attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- processed_memory) = self.initialize_decoder_states(memory)
-
- mel_outputs, gate_outputs, alignments = [], [], []
- while len(mel_outputs) < decoder_inputs.size(0) - 1:
- decoder_input = decoder_inputs[len(mel_outputs)]
- (mel_output,
- gate_output,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context) = self.decode(decoder_input,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- memory,
- processed_memory,
- mask)
-
- mel_outputs += [mel_output.squeeze(1)]
- gate_outputs += [gate_output.squeeze()]
- alignments += [attention_weights]
-
- mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
- torch.stack(mel_outputs),
- torch.stack(gate_outputs),
- torch.stack(alignments))
-
- return mel_outputs, gate_outputs, alignments
-
- @torch.jit.export
- def infer(self, memory, memory_lengths):
- """ Decoder inference
- PARAMS
- ------
- memory: Encoder outputs
-
- RETURNS
- -------
- mel_outputs: mel outputs from the decoder
- gate_outputs: gate outputs from the decoder
- alignments: sequence of attention weights from the decoder
- """
- decoder_input = self.get_go_frame(memory)
-
- mask = get_mask_from_lengths(memory_lengths)
- (attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- processed_memory) = self.initialize_decoder_states(memory)
-
- mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device=memory.device)
- not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device=memory.device)
-
- mel_outputs, gate_outputs, alignments = (
- torch.zeros(1), torch.zeros(1), torch.zeros(1))
- first_iter = True
- while True:
- decoder_input = self.prenet(decoder_input)
- (mel_output,
- gate_output,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context) = self.decode(decoder_input,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- memory,
- processed_memory,
- mask)
-
- if first_iter:
- mel_outputs = mel_output.unsqueeze(0)
- gate_outputs = gate_output
- alignments = attention_weights
- first_iter = False
- else:
- mel_outputs = torch.cat(
- (mel_outputs, mel_output.unsqueeze(0)), dim=0)
- gate_outputs = torch.cat((gate_outputs, gate_output), dim=0)
- alignments = torch.cat((alignments, attention_weights), dim=0)
-
- dec = torch.le(torch.sigmoid(gate_output),
- self.gate_threshold).to(torch.int32).squeeze(1)
-
- not_finished = not_finished*dec
- mel_lengths += not_finished
-
- if self.early_stopping and torch.sum(not_finished) == 0:
- break
- if len(mel_outputs) == self.max_decoder_steps:
- print("Warning! Reached max decoder steps")
- break
-
- decoder_input = mel_output
-
- mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
- mel_outputs, gate_outputs, alignments)
-
- return mel_outputs, gate_outputs, alignments, mel_lengths
-
-
-class Tacotron2(nn.Module):
- def __init__(self, mask_padding, n_mel_channels,
- n_symbols, symbols_embedding_dim, encoder_kernel_size,
- encoder_n_convolutions, encoder_embedding_dim,
- attention_rnn_dim, attention_dim, attention_location_n_filters,
- attention_location_kernel_size, n_frames_per_step,
- decoder_rnn_dim, prenet_dim, max_decoder_steps, gate_threshold,
- p_attention_dropout, p_decoder_dropout,
- postnet_embedding_dim, postnet_kernel_size,
- postnet_n_convolutions, decoder_no_early_stopping):
- super(Tacotron2, self).__init__()
- self.mask_padding = mask_padding
- self.n_mel_channels = n_mel_channels
- self.n_frames_per_step = n_frames_per_step
- self.embedding = nn.Embedding(n_symbols, symbols_embedding_dim)
- std = sqrt(2.0 / (n_symbols + symbols_embedding_dim))
- val = sqrt(3.0) * std # uniform bounds for std
- self.embedding.weight.data.uniform_(-val, val)
- self.encoder = Encoder(encoder_n_convolutions,
- encoder_embedding_dim,
- encoder_kernel_size)
- self.decoder = Decoder(n_mel_channels, n_frames_per_step,
- encoder_embedding_dim, attention_dim,
- attention_location_n_filters,
- attention_location_kernel_size,
- attention_rnn_dim, decoder_rnn_dim,
- prenet_dim, max_decoder_steps,
- gate_threshold, p_attention_dropout,
- p_decoder_dropout,
- not decoder_no_early_stopping)
- self.postnet = Postnet(n_mel_channels, postnet_embedding_dim,
- postnet_kernel_size,
- postnet_n_convolutions)
-
- def parse_batch(self, batch):
- text_padded, input_lengths, mel_padded, gate_padded, \
- output_lengths = batch
- text_padded = to_gpu(text_padded).long()
- input_lengths = to_gpu(input_lengths).long()
- max_len = torch.max(input_lengths.data).item()
- mel_padded = to_gpu(mel_padded).float()
- gate_padded = to_gpu(gate_padded).float()
- output_lengths = to_gpu(output_lengths).long()
-
- return (
- (text_padded, input_lengths, mel_padded, max_len, output_lengths),
- (mel_padded, gate_padded))
-
- def parse_output(self, outputs, output_lengths):
- # type: (List[Tensor], Tensor) -> List[Tensor]
- if self.mask_padding and output_lengths is not None:
- mask = get_mask_from_lengths(output_lengths)
- mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
- mask = mask.permute(1, 0, 2)
-
- outputs[0].masked_fill_(mask, 0.0)
- outputs[1].masked_fill_(mask, 0.0)
- outputs[2].masked_fill_(mask[:, 0, :], 1e3) # gate energies
-
- return outputs
-
- def forward(self, inputs):
- inputs, input_lengths, targets, max_len, output_lengths = inputs
- input_lengths, output_lengths = input_lengths.data, output_lengths.data
-
- embedded_inputs = self.embedding(inputs).transpose(1, 2)
-
- encoder_outputs = self.encoder(embedded_inputs, input_lengths)
-
- mel_outputs, gate_outputs, alignments = self.decoder(
- encoder_outputs, targets, memory_lengths=input_lengths)
-
- mel_outputs_postnet = self.postnet(mel_outputs)
- mel_outputs_postnet = mel_outputs + mel_outputs_postnet
-
- return self.parse_output(
- [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
- output_lengths)
-
-
- def infer(self, inputs, input_lengths):
-
- embedded_inputs = self.embedding(inputs).transpose(1, 2)
- encoder_outputs = self.encoder.infer(embedded_inputs, input_lengths)
- mel_outputs, gate_outputs, alignments, mel_lengths = self.decoder.infer(
- encoder_outputs, input_lengths)
-
- mel_outputs_postnet = self.postnet(mel_outputs)
- mel_outputs_postnet = mel_outputs + mel_outputs_postnet
-
- BS = mel_outputs_postnet.size(0)
- alignments = alignments.unfold(1, BS, BS).transpose(0,2)
-
- return mel_outputs_postnet, mel_lengths, alignments
diff --git a/demo/Tacotron2/tacotron2/text/LICENCE b/demo/Tacotron2/tacotron2/text/LICENCE
deleted file mode 100644
index 8ac1abf2..00000000
--- a/demo/Tacotron2/tacotron2/text/LICENCE
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2017 Keith Ito
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
diff --git a/demo/Tacotron2/tacotron2/text/__init__.py b/demo/Tacotron2/tacotron2/text/__init__.py
deleted file mode 100644
index f81bab41..00000000
--- a/demo/Tacotron2/tacotron2/text/__init__.py
+++ /dev/null
@@ -1,74 +0,0 @@
-""" from https://github.com/keithito/tacotron """
-import re
-from tacotron2.text import cleaners
-from tacotron2.text.symbols import symbols
-
-
-# Mappings from symbol to numeric ID and vice versa:
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-# Regular expression matching text enclosed in curly braces:
-_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-
-
-def text_to_sequence(text, cleaner_names):
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-
- The text can optionally have ARPAbet sequences enclosed in curly braces embedded
- in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-
- Args:
- text: string to convert to a sequence
- cleaner_names: names of the cleaner functions to run the text through
-
- Returns:
- List of integers corresponding to the symbols in the text
- '''
- sequence = []
-
- # Check for curly braces and treat their contents as ARPAbet:
- while len(text):
- m = _curly_re.match(text)
- if not m:
- sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
- break
- sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
- sequence += _arpabet_to_sequence(m.group(2))
- text = m.group(3)
-
- return sequence
-
-
-def sequence_to_text(sequence):
- '''Converts a sequence of IDs back to a string'''
- result = ''
- for symbol_id in sequence:
- if symbol_id in _id_to_symbol:
- s = _id_to_symbol[symbol_id]
- # Enclose ARPAbet back in curly braces:
- if len(s) > 1 and s[0] == '@':
- s = '{%s}' % s[1:]
- result += s
- return result.replace('}{', ' ')
-
-
-def _clean_text(text, cleaner_names):
- for name in cleaner_names:
- cleaner = getattr(cleaners, name)
- if not cleaner:
- raise Exception('Unknown cleaner: %s' % name)
- text = cleaner(text)
- return text
-
-
-def _symbols_to_sequence(symbols):
- return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
-
-
-def _arpabet_to_sequence(text):
- return _symbols_to_sequence(['@' + s for s in text.split()])
-
-
-def _should_keep_symbol(s):
- return s in _symbol_to_id and s is not '_' and s is not '~'
diff --git a/demo/Tacotron2/tacotron2/text/cleaners.py b/demo/Tacotron2/tacotron2/text/cleaners.py
deleted file mode 100644
index 4cbcb015..00000000
--- a/demo/Tacotron2/tacotron2/text/cleaners.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-""" from https://github.com/keithito/tacotron """
-
-'''
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
- 1. "english_cleaners" for English text
- 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
- the Unidecode library (https://pypi.python.org/pypi/Unidecode)
- 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
- the symbols in symbols.py to match your data).
-'''
-
-import re
-from unidecode import unidecode
-from .numbers import normalize_numbers
-
-
-# Regular expression matching whitespace:
-_whitespace_re = re.compile(r'\s+')
-
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
- ('mrs', 'misess'),
- ('mr', 'mister'),
- ('dr', 'doctor'),
- ('st', 'saint'),
- ('co', 'company'),
- ('jr', 'junior'),
- ('maj', 'major'),
- ('gen', 'general'),
- ('drs', 'doctors'),
- ('rev', 'reverend'),
- ('lt', 'lieutenant'),
- ('hon', 'honorable'),
- ('sgt', 'sergeant'),
- ('capt', 'captain'),
- ('esq', 'esquire'),
- ('ltd', 'limited'),
- ('col', 'colonel'),
- ('ft', 'fort'),
-]]
-
-
-def expand_abbreviations(text):
- for regex, replacement in _abbreviations:
- text = re.sub(regex, replacement, text)
- return text
-
-
-def expand_numbers(text):
- return normalize_numbers(text)
-
-
-def lowercase(text):
- return text.lower()
-
-
-def collapse_whitespace(text):
- return re.sub(_whitespace_re, ' ', text)
-
-
-def convert_to_ascii(text):
- return unidecode(text)
-
-
-def basic_cleaners(text):
- '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
- text = lowercase(text)
- text = collapse_whitespace(text)
- return text
-
-
-def transliteration_cleaners(text):
- '''Pipeline for non-English text that transliterates to ASCII.'''
- text = convert_to_ascii(text)
- text = lowercase(text)
- text = collapse_whitespace(text)
- return text
-
-
-def english_cleaners(text):
- '''Pipeline for English text, including number and abbreviation expansion.'''
- text = convert_to_ascii(text)
- text = lowercase(text)
- text = expand_numbers(text)
- text = expand_abbreviations(text)
- text = collapse_whitespace(text)
- return text
diff --git a/demo/Tacotron2/tacotron2/text/cmudict.py b/demo/Tacotron2/tacotron2/text/cmudict.py
deleted file mode 100644
index b359b235..00000000
--- a/demo/Tacotron2/tacotron2/text/cmudict.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-""" from https://github.com/keithito/tacotron """
-
-import re
-
-
-valid_symbols = [
- 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
- 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
- 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
- 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
- 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
- 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
- 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
-]
-
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
- '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
- def __init__(self, file_or_path, keep_ambiguous=True):
- if isinstance(file_or_path, str):
- with open(file_or_path, encoding='latin-1') as f:
- entries = _parse_cmudict(f)
- else:
- entries = _parse_cmudict(file_or_path)
- if not keep_ambiguous:
- entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
- self._entries = entries
-
-
- def __len__(self):
- return len(self._entries)
-
-
- def lookup(self, word):
- '''Returns list of ARPAbet pronunciations of the given word.'''
- return self._entries.get(word.upper())
-
-
-
-_alt_re = re.compile(r'\([0-9]+\)')
-
-
-def _parse_cmudict(file):
- cmudict = {}
- for line in file:
- if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
- parts = line.split(' ')
- word = re.sub(_alt_re, '', parts[0])
- pronunciation = _get_pronunciation(parts[1])
- if pronunciation:
- if word in cmudict:
- cmudict[word].append(pronunciation)
- else:
- cmudict[word] = [pronunciation]
- return cmudict
-
-
-def _get_pronunciation(s):
- parts = s.strip().split(' ')
- for part in parts:
- if part not in _valid_symbol_set:
- return None
- return ' '.join(parts)
diff --git a/demo/Tacotron2/tacotron2/text/numbers.py b/demo/Tacotron2/tacotron2/text/numbers.py
deleted file mode 100644
index 43df588d..00000000
--- a/demo/Tacotron2/tacotron2/text/numbers.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-""" from https://github.com/keithito/tacotron """
-
-import inflect
-import re
-
-
-_inflect = inflect.engine()
-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
-_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
-_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
-_number_re = re.compile(r'[0-9]+')
-
-
-def _remove_commas(m):
- return m.group(1).replace(',', '')
-
-
-def _expand_decimal_point(m):
- return m.group(1).replace('.', ' point ')
-
-
-def _expand_dollars(m):
- match = m.group(1)
- parts = match.split('.')
- if len(parts) > 2:
- return match + ' dollars' # Unexpected format
- dollars = int(parts[0]) if parts[0] else 0
- cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
- if dollars and cents:
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
- cent_unit = 'cent' if cents == 1 else 'cents'
- return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
- elif dollars:
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
- return '%s %s' % (dollars, dollar_unit)
- elif cents:
- cent_unit = 'cent' if cents == 1 else 'cents'
- return '%s %s' % (cents, cent_unit)
- else:
- return 'zero dollars'
-
-
-def _expand_ordinal(m):
- return _inflect.number_to_words(m.group(0))
-
-
-def _expand_number(m):
- num = int(m.group(0))
- if num > 1000 and num < 3000:
- if num == 2000:
- return 'two thousand'
- elif num > 2000 and num < 2010:
- return 'two thousand ' + _inflect.number_to_words(num % 100)
- elif num % 100 == 0:
- return _inflect.number_to_words(num // 100) + ' hundred'
- else:
- return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
- else:
- return _inflect.number_to_words(num, andword='')
-
-
-def normalize_numbers(text):
- text = re.sub(_comma_number_re, _remove_commas, text)
- text = re.sub(_pounds_re, r'\1 pounds', text)
- text = re.sub(_dollars_re, _expand_dollars, text)
- text = re.sub(_decimal_number_re, _expand_decimal_point, text)
- text = re.sub(_ordinal_re, _expand_ordinal, text)
- text = re.sub(_number_re, _expand_number, text)
- return text
diff --git a/demo/Tacotron2/tacotron2/text/symbols.py b/demo/Tacotron2/tacotron2/text/symbols.py
deleted file mode 100644
index 604626ec..00000000
--- a/demo/Tacotron2/tacotron2/text/symbols.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-""" from https://github.com/keithito/tacotron """
-
-'''
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
-from tacotron2.text import cmudict
-
-_pad = '_'
-_punctuation = '!\'(),.:;? '
-_special = '-'
-_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
-
-# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in cmudict.valid_symbols]
-
-# Export all symbols:
-symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
diff --git a/demo/Tacotron2/tensorrt/convert_onnx2trt.py b/demo/Tacotron2/tensorrt/convert_onnx2trt.py
deleted file mode 100644
index dd24c801..00000000
--- a/demo/Tacotron2/tensorrt/convert_onnx2trt.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import sys
-import tensorrt as trt
-from os.path import join
-
-from trt_utils import build_engine, parse_dynamic_size
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('-o', '--output', required=True,
- help='output folder to save audio (file per phrase)')
- parser.add_argument('--encoder', type=str, default="",
- help='full path to the Encoder ONNX')
- parser.add_argument('--decoder', type=str, default="",
- help='full path to the Decoder or DecoderIter ONNX.')
- parser.add_argument('--postnet', type=str, default="",
- help='full path to the Postnet ONNX')
- parser.add_argument('--waveglow', type=str, default="",
- help='full path to the WaveGlow ONNX')
- parser.add_argument('--encoder_out', type=str,
- help='Filename of the exported encoder engine')
- parser.add_argument('--decoder_out', type=str,
- help='Filename of the exported decoder engine')
- parser.add_argument('--postnet_out', type=str,
- help='Filename of the exported postnet engine')
- parser.add_argument('--waveglow_out', type=str,
- help='Filename of the exported waveglow engine')
- parser.add_argument('--fp16', action='store_true',
- help='inference with FP16')
- parser.add_argument('-bs', '--batch-size', type=str, default="1",
- help='One or three comma separated integers specifying the batch size. Specify "min,opt,max" for dynamic shape')
- parser.add_argument('--mel-size', type=str, default="32,768,1664",
- help='One or three comma separated integers specifying the mels size for waveglow.')
- parser.add_argument('--z-size', type=str, default="1024,24576,53248",
- help='One or three comma separated integers specifying the z size for waveglow.')
- parser.add_argument('--loop', dest='loop', action='store_true',
- help='Includes the outer decoder loop in the ONNX model. Enabled by default and only supported on TensorRT 8.0 or later.')
- parser.add_argument('--no-loop', dest='loop', action='store_false',
- help='Excludes outer decoder loop from decoder ONNX model. Default behavior and necessary for TensorRT 7.2 or earlier.')
- parser.add_argument("-tcf", "--timing-cache-file", default=None, type=str,
- help="Path to tensorrt build timeing cache file, only available for tensorrt 8.0 and later. The cache file is assumed to be used exclusively. It's the users' responsibility to create file lock to prevent accessing conflict.",
- required=False)
- parser.set_defaults(loop=int(trt.__version__[0]) >= 8)
- return parser
-
-
-def main():
-
- parser = argparse.ArgumentParser(
- description='Export from ONNX to TensorRT for Tacotron 2 and WaveGlow')
- parser = parse_args(parser)
- args = parser.parse_args()
-
- precision = "fp16" if args.fp16 else "fp32"
- encoder_path = join(args.output, args.encoder_out if args.encoder_out else f"encoder_{precision}.engine")
- decoder_path = join(args.output, args.decoder_out if args.decoder_out else f"decoder_with_outer_loop_{precision}.engine" if args.loop else f"decoder_iter_{precision}.engine")
- postnet_path = join(args.output, args.postnet_out if args.postnet_out else f"postnet_{precision}.engine")
- waveglow_path = join(args.output, args.waveglow_out if args.waveglow_out else f"waveglow_{precision}.engine")
-
- bs_min, bs_opt, bs_max = parse_dynamic_size(args.batch_size)
- mel_min, mel_opt, mel_max = parse_dynamic_size(args.mel_size)
- z_min, z_opt, z_max = parse_dynamic_size(args.z_size)
-
- # Encoder
- shapes=[{"name": "sequences", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)},
- {"name": "sequence_lengths", "min": (bs_min,), "opt": (bs_opt,), "max": (bs_max,)}]
- if args.encoder != "":
- print("Building Encoder ...")
- encoder_engine = build_engine(args.encoder, shapes=shapes, fp16=args.fp16, timing_cache=args.timing_cache_file)
- if encoder_engine is not None:
- with open(encoder_path, 'wb') as f:
- f.write(encoder_engine)
- else:
- print("Failed to build engine from", args.encoder)
- sys.exit(1)
-
- if args.loop:
- # Decoder
- shapes=[{"name": "decoder_input_0", "min": (bs_min,80), "opt": (bs_opt,80), "max": (bs_max,80)},
- {"name": "attention_hidden_0", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "attention_cell_0", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "decoder_hidden_0", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "decoder_cell_0", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "attention_weights_0", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)},
- {"name": "attention_weights_cum_0", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)},
- {"name": "attention_context_0", "min": (bs_min,512), "opt": (bs_opt,512), "max": (bs_max,512)},
- {"name": "memory", "min": (bs_min,4,512), "opt": (bs_opt,128,512), "max": (bs_max,256,512)},
- {"name": "processed_memory", "min": (bs_min,4,128), "opt": (bs_opt,128,128), "max": (bs_max,256,128)},
- {"name": "mask", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)}]
- if args.decoder != "":
- print("Building Decoder with loop...")
- decoder_engine = build_engine(args.decoder, shapes=shapes, fp16=args.fp16, timing_cache=args.timing_cache_file)
- if decoder_engine is not None:
- with open(decoder_path, 'wb') as f:
- f.write(decoder_engine)
- else:
- print("Failed to build engine from", args.decoder)
- sys.exit(1)
- else:
- # DecoderIter
- shapes=[{"name": "decoder_input", "min": (bs_min,80), "opt": (bs_opt,80), "max": (bs_max,80)},
- {"name": "attention_hidden", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "attention_cell", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "decoder_hidden", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "decoder_cell", "min": (bs_min,1024), "opt": (bs_opt,1024), "max": (bs_max,1024)},
- {"name": "attention_weights", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)},
- {"name": "attention_weights_cum", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)},
- {"name": "attention_context", "min": (bs_min,512), "opt": (bs_opt,512), "max": (bs_max,512)},
- {"name": "memory", "min": (bs_min,4,512), "opt": (bs_opt,128,512), "max": (bs_max,256,512)},
- {"name": "processed_memory", "min": (bs_min,4,128), "opt": (bs_opt,128,128), "max": (bs_max,256,128)},
- {"name": "mask", "min": (bs_min,4), "opt": (bs_opt,128), "max": (bs_max,256)}]
- if args.decoder != "":
- print("Building Decoder ...")
- decoder_iter_engine = build_engine(args.decoder, shapes=shapes, fp16=args.fp16, timing_cache=args.timing_cache_file)
- if decoder_iter_engine is not None:
- with open(decoder_path, 'wb') as f:
- f.write(decoder_iter_engine)
- else:
- print("Failed to build engine from", args.decoder)
- sys.exit(1)
-
- # Postnet
- shapes=[{"name": "mel_outputs", "min": (bs_min,80,32), "opt": (bs_opt,80,768), "max": (bs_max,80,1664)}]
- if args.postnet != "":
- print("Building Postnet ...")
- postnet_engine = build_engine(args.postnet, shapes=shapes, fp16=args.fp16, timing_cache=args.timing_cache_file)
- if postnet_engine is not None:
- with open(postnet_path, 'wb') as f:
- f.write(postnet_engine)
- else:
- print("Failed to build engine from", args.postnet)
- sys.exit(1)
-
- # WaveGlow
- shapes=[{"name": "mel", "min": (bs_min,80,mel_min,1), "opt": (bs_opt,80,mel_opt,1), "max": (bs_max,80,mel_max,1)},
- {"name": "z", "min": (bs_min,8,z_min,1), "opt": (bs_opt,8,z_opt,1), "max": (bs_max,8,z_max,1)}]
- if args.waveglow != "":
- print("Building WaveGlow ...")
- waveglow_engine = build_engine(args.waveglow, shapes=shapes, fp16=args.fp16, timing_cache=args.timing_cache_file)
- if waveglow_engine is not None:
- with open(waveglow_path, 'wb') as f:
- f.write(waveglow_engine)
- else:
- print("Failed to build engine from", args.waveglow)
- sys.exit(1)
-
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/tensorrt/convert_tacotron22onnx.py b/demo/Tacotron2/tensorrt/convert_tacotron22onnx.py
deleted file mode 100644
index 361a2221..00000000
--- a/demo/Tacotron2/tensorrt/convert_tacotron22onnx.py
+++ /dev/null
@@ -1,418 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import tensorrt
-import torch
-from torch import nn
-from torch.nn import functional as F
-import argparse
-
-import sys
-import os
-from pathlib import Path
-sys.path.append(str(Path(__file__).parents[1]))
-
-import models
-from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, prepare_input_sequence
-from common.utils import to_gpu, get_mask_from_lengths
-
-torch.backends.cudnn.enabled = True
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('--tacotron2', type=str, required=True,
- help='Full path to the Tacotron2 model checkpoint file')
- parser.add_argument('-o', '--output', type=str, required=True,
- help='Directory for the exported Tacotron2 ONNX models')
- parser.add_argument('-e', '--encoder', type=str, required=False, default="encoder.onnx",
- help='Filename for exported encoder ONNX model')
- parser.add_argument('-d', '--decoder', type=str, required=False, default="decoder_iter.onnx",
- help='Filename for exported decoder ONNX model')
- parser.add_argument('-p', '--postnet', type=str, required=False, default="postnet.onnx",
- help='Filename for exported postnet ONNX model')
- parser.add_argument('--fp16', action='store_true',
- help='Export with half precision to ONNX')
- parser.add_argument('--loop', dest='loop', action='store_true',
- help='Includes the outer decoder loop in the ONNX model. Enabled by default and only supported on TensorRT 8.0 or later.')
- parser.add_argument('--no-loop', dest='loop', action='store_false',
- help='Excludes outer decoder loop from decoder ONNX model. Default behavior and necessary for TensorRT 7.2 or earlier.')
- parser.set_defaults(loop=int(tensorrt.__version__[0]) >= 8)
-
- return parser
-
-
-def encoder_infer(self, x, input_lengths):
- device = x.device
- for conv in self.convolutions:
- x = F.dropout(F.relu(conv(x.to(device))), 0.5, False)
-
- x = x.transpose(1, 2)
-
- x = nn.utils.rnn.pack_padded_sequence(
- x, input_lengths, batch_first=True)
-
- outputs, _ = self.lstm(x)
-
- outputs, _ = nn.utils.rnn.pad_packed_sequence(
- outputs, batch_first=True)
-
- lens = input_lengths*2
-
- return outputs, lens
-
-
-class Encoder(torch.nn.Module):
- def __init__(self, tacotron2):
- super(Encoder, self).__init__()
- self.tacotron2 = tacotron2
- self.tacotron2.encoder.lstm.flatten_parameters()
- self.infer = encoder_infer
-
- def forward(self, sequence, sequence_lengths):
- embedded_inputs = self.tacotron2.embedding(sequence).transpose(1, 2)
- memory, lens = self.infer(self.tacotron2.encoder, embedded_inputs, sequence_lengths)
- processed_memory = self.tacotron2.decoder.attention_layer.memory_layer(memory)
- return memory, processed_memory, lens
-
-class Postnet(torch.nn.Module):
- def __init__(self, tacotron2):
- super(Postnet, self).__init__()
- self.tacotron2 = tacotron2
-
- def forward(self, mel_outputs):
- mel_outputs_postnet = self.tacotron2.postnet(mel_outputs)
- return mel_outputs + mel_outputs_postnet
-
-def lstmcell2lstm_params(lstm_mod, lstmcell_mod):
- lstm_mod.weight_ih_l0 = torch.nn.Parameter(lstmcell_mod.weight_ih)
- lstm_mod.weight_hh_l0 = torch.nn.Parameter(lstmcell_mod.weight_hh)
- lstm_mod.bias_ih_l0 = torch.nn.Parameter(lstmcell_mod.bias_ih)
- lstm_mod.bias_hh_l0 = torch.nn.Parameter(lstmcell_mod.bias_hh)
-
-
-def prenet_infer(self, x):
- x1 = x[:]
- for linear in self.layers:
- x1 = F.relu(linear(x1))
- x0 = x1[0].unsqueeze(0)
- mask = torch.le(torch.rand(256, device='cuda').to(x.dtype), 0.5).to(x.dtype)
- mask = mask.expand(x1.size(0), x1.size(1))
- x1 = x1*mask*2.0
-
- return x1
-
-class DecoderIter(torch.nn.Module):
- def __init__(self, tacotron2):
- super(DecoderIter, self).__init__()
-
- self.tacotron2 = tacotron2
- dec = tacotron2.decoder
-
- self.p_attention_dropout = dec.p_attention_dropout
- self.p_decoder_dropout = dec.p_decoder_dropout
- self.prenet = dec.prenet
-
- self.prenet.infer = prenet_infer
-
- self.attention_rnn = nn.LSTM(dec.prenet_dim + dec.encoder_embedding_dim,
- dec.attention_rnn_dim, 1)
- lstmcell2lstm_params(self.attention_rnn, dec.attention_rnn)
- self.attention_rnn.flatten_parameters()
-
- self.attention_layer = dec.attention_layer
-
- self.decoder_rnn = nn.LSTM(dec.attention_rnn_dim + dec.encoder_embedding_dim,
- dec.decoder_rnn_dim, 1)
- lstmcell2lstm_params(self.decoder_rnn, dec.decoder_rnn)
- self.decoder_rnn.flatten_parameters()
-
- self.linear_projection = dec.linear_projection
- self.gate_layer = dec.gate_layer
-
-
- def decode(self, decoder_input, in_attention_hidden, in_attention_cell,
- in_decoder_hidden, in_decoder_cell, in_attention_weights,
- in_attention_weights_cum, in_attention_context, memory,
- processed_memory, mask):
-
- cell_input = torch.cat((decoder_input, in_attention_context), -1)
-
- _, (out_attention_hidden, out_attention_cell) = self.attention_rnn(
- cell_input.unsqueeze(0), (in_attention_hidden.unsqueeze(0),
- in_attention_cell.unsqueeze(0)))
- out_attention_hidden = out_attention_hidden.squeeze(0)
- out_attention_cell = out_attention_cell.squeeze(0)
-
- out_attention_hidden = F.dropout(
- out_attention_hidden, self.p_attention_dropout, False)
-
- attention_weights_cat = torch.cat(
- (in_attention_weights.unsqueeze(1),
- in_attention_weights_cum.unsqueeze(1)), dim=1)
- out_attention_context, out_attention_weights = self.attention_layer(
- out_attention_hidden, memory, processed_memory,
- attention_weights_cat, mask)
-
- out_attention_weights_cum = in_attention_weights_cum + out_attention_weights
- decoder_input_tmp = torch.cat(
- (out_attention_hidden, out_attention_context), -1)
-
- _, (out_decoder_hidden, out_decoder_cell) = self.decoder_rnn(
- decoder_input_tmp.unsqueeze(0), (in_decoder_hidden.unsqueeze(0),
- in_decoder_cell.unsqueeze(0)))
- out_decoder_hidden = out_decoder_hidden.squeeze(0)
- out_decoder_cell = out_decoder_cell.squeeze(0)
-
- out_decoder_hidden = F.dropout(
- out_decoder_hidden, self.p_decoder_dropout, False)
-
- decoder_hidden_attention_context = torch.cat(
- (out_decoder_hidden, out_attention_context), 1)
-
- decoder_output = self.linear_projection(
- decoder_hidden_attention_context)
-
- gate_prediction = self.gate_layer(decoder_hidden_attention_context)
-
- return (decoder_output, gate_prediction, out_attention_hidden,
- out_attention_cell, out_decoder_hidden, out_decoder_cell,
- out_attention_weights, out_attention_weights_cum, out_attention_context)
-
- # @torch.jit.script
- def forward(self,
- decoder_input,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- memory,
- processed_memory,
- mask):
- decoder_input1 = self.prenet.infer(self.prenet, decoder_input)
- outputs = self.decode(decoder_input1,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- memory,
- processed_memory,
- mask)
- return outputs
-
-
-def test_inference(encoder, decoder_iter, postnet):
-
- encoder.eval()
- decoder_iter.eval()
- postnet.eval()
-
- sys.path.append('./tensorrt')
- from inference_trt import init_decoder_inputs
-
- texts = ["Hello World, good day."]
- sequences, sequence_lengths = prepare_input_sequence(texts)
-
- measurements = {}
-
- print("Running Tacotron2 Encoder")
- with torch.no_grad():
- memory, processed_memory, lens = encoder(sequences, sequence_lengths)
-
- print("Running Tacotron2 Decoder")
- device = memory.device
- dtype = memory.dtype
- mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device = device)
- not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device = device)
- mel_outputs, gate_outputs, alignments = (torch.zeros(1), torch.zeros(1), torch.zeros(1))
- gate_threshold = 0.6
- max_decoder_steps = 1000
- first_iter = True
-
- (decoder_input, attention_hidden, attention_cell, decoder_hidden,
- decoder_cell, attention_weights, attention_weights_cum,
- attention_context, memory, processed_memory,
- mask) = init_decoder_inputs(memory, processed_memory, sequence_lengths)
-
- while True:
- with torch.no_grad():
- (mel_output, gate_output,
- attention_hidden, attention_cell,
- decoder_hidden, decoder_cell,
- attention_weights, attention_weights_cum,
- attention_context) = decoder_iter(decoder_input, attention_hidden, attention_cell, decoder_hidden,
- decoder_cell, attention_weights, attention_weights_cum,
- attention_context, memory, processed_memory, mask)
-
- if first_iter:
- mel_outputs = torch.unsqueeze(mel_output, 2)
- gate_outputs = torch.unsqueeze(gate_output, 2)
- alignments = torch.unsqueeze(attention_weights, 2)
- first_iter = False
- else:
- mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(mel_output, 2)), 2)
- gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(gate_output, 2)), 2)
- alignments = torch.cat((alignments, torch.unsqueeze(attention_weights, 2)), 2)
-
- dec = torch.le(torch.sigmoid(gate_output), gate_threshold).to(torch.int32).squeeze(1)
- not_finished = not_finished*dec
- mel_lengths += not_finished
-
- if torch.sum(not_finished) == 0:
- print("Stopping after ",mel_outputs.size(2)," decoder steps")
- break
- if mel_outputs.size(2) == max_decoder_steps:
- print("Warning! Reached max decoder steps")
- break
-
- decoder_input = mel_output
-
-
- print("Running Tacotron2 PostNet")
- with torch.no_grad():
- mel_outputs_postnet = postnet(mel_outputs)
-
- return mel_outputs_postnet
-
-def main():
-
- parser = argparse.ArgumentParser(
- description='PyTorch Tacotron 2 export to TRT')
- parser = parse_args(parser)
- args, _ = parser.parse_known_args()
-
- args.encoder = os.path.join(args.output, args.encoder)
- args.decoder = os.path.join(args.output, args.decoder)
- args.postnet = os.path.join(args.output, args.postnet)
-
- tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
- fp16_run=args.fp16, cpu_run=False)
-
- opset_version = 10
-
- sequences = torch.randint(low=0, high=148, size=(1,50),
- dtype=torch.long).cuda()
- sequence_lengths = torch.IntTensor([sequences.size(1)])
- dummy_input = (sequences, sequence_lengths)
-
- encoder = Encoder(tacotron2)
- encoder.eval()
- with torch.no_grad():
- encoder(*dummy_input)
-
- torch.onnx.export(encoder, dummy_input, args.encoder,
- opset_version=opset_version,
- do_constant_folding=True,
- input_names=["sequences", "sequence_lengths"],
- output_names=["memory", "processed_memory", "lens"],
- dynamic_axes={"sequences": {0: "batch_size", 1: "text_seq"},
- "sequence_lengths": {0: "batch_size"},
- "memory": {0: "batch_size", 1: "mem_seq"},
- "processed_memory": {0: "batch_size", 1: "mem_seq"},
- "lens": {0: "batch_size"}
- })
-
- decoder_iter = DecoderIter(tacotron2)
- memory = torch.randn((1,sequence_lengths[0],512)).cuda() #encoder_outputs
- if args.fp16:
- memory = memory.half()
- memory_lengths = sequence_lengths.cuda()
- # initialize decoder states for dummy_input
- decoder_input = tacotron2.decoder.get_go_frame(memory)
- mask = get_mask_from_lengths(memory_lengths)
- (attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- processed_memory) = tacotron2.decoder.initialize_decoder_states(memory)
- dummy_input = (decoder_input,
- attention_hidden,
- attention_cell,
- decoder_hidden,
- decoder_cell,
- attention_weights,
- attention_weights_cum,
- attention_context,
- memory,
- processed_memory,
- mask)
-
- decoder_iter = DecoderIter(tacotron2)
- decoder_iter.eval()
- with torch.no_grad():
- decoder_iter(*dummy_input)
-
- torch.onnx.export(decoder_iter, dummy_input, args.decoder,
- opset_version=opset_version,
- do_constant_folding=True,
- input_names=["decoder_input",
- "attention_hidden",
- "attention_cell",
- "decoder_hidden",
- "decoder_cell",
- "attention_weights",
- "attention_weights_cum",
- "attention_context",
- "memory",
- "processed_memory",
- "mask"],
- output_names=["decoder_output",
- "gate_prediction",
- "out_attention_hidden",
- "out_attention_cell",
- "out_decoder_hidden",
- "out_decoder_cell",
- "out_attention_weights",
- "out_attention_weights_cum",
- "out_attention_context"],
- dynamic_axes={"attention_weights" : {0: "batch_size", 1: "seq_len"},
- "attention_weights_cum" : {0: "batch_size", 1: "seq_len"},
- "memory" : {0: "batch_size", 1: "seq_len"},
- "processed_memory" : {0: "batch_size", 1: "seq_len"},
- "mask" : {0: "batch_size", 1: "seq_len"},
- "out_attention_weights" : {0: "batch_size", 1: "seq_len"},
- "out_attention_weights_cum" : {0: "batch_size", 1: "seq_len"}
- })
-
- if args.loop:
- from generate_decoder import insert_decoder_loop
- decoder_dir = os.path.dirname(os.path.abspath(args.decoder))
- insert_decoder_loop(args.decoder, decoder_dir, os.path.basename(args.decoder).replace("_iter", ""), args.fp16)
-
- postnet = Postnet(tacotron2)
- dummy_input = torch.randn((1,80,620)).cuda()
- if args.fp16:
- dummy_input = dummy_input.half()
- torch.onnx.export(postnet, dummy_input, args.postnet,
- opset_version=opset_version,
- do_constant_folding=True,
- input_names=["mel_outputs"],
- output_names=["mel_outputs_postnet"],
- dynamic_axes={"mel_outputs": {0: "batch_size", 2: "mel_seq"},
- "mel_outputs_postnet": {0: "batch_size", 2: "mel_seq"}})
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/tensorrt/convert_waveglow2onnx.py b/demo/Tacotron2/tensorrt/convert_waveglow2onnx.py
deleted file mode 100644
index 4b9aecbc..00000000
--- a/demo/Tacotron2/tensorrt/convert_waveglow2onnx.py
+++ /dev/null
@@ -1,167 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-import argparse
-import os
-import sys
-from pathlib import Path
-sys.path.append(str(Path(__file__).parents[1]))
-
-from common.utils import ParseFromConfigFile
-from inference import load_and_setup_model
-
-def convert_convinv_1d_to_2d(convinv):
- """
- Takes an invertible 1x1 1-d convolution and returns a 2-d convolution that does
- the inverse
- """
- conv2d = torch.nn.Conv2d(convinv.W_inverse.size(1),
- convinv.W_inverse.size(0),
- 1, bias=False)
- conv2d.weight.data[:,:,:,0] = convinv.W_inverse.data
- return conv2d
-
-
-def convert_conv_1d_to_2d(conv1d):
- conv2d = torch.nn.Conv2d(conv1d.weight.size(1),
- conv1d.weight.size(0),
- (conv1d.weight.size(2), 1),
- stride=(conv1d.stride[0], 1),
- dilation=(conv1d.dilation[0], 1),
- padding=(conv1d.padding[0], 0))
- conv2d.weight.data[:,:,:,0] = conv1d.weight.data
- conv2d.bias.data = conv1d.bias.data
- return conv2d
-
-
-def convert_WN_1d_to_2d_(WN):
- """
- Modifies the WaveNet like affine coupling layer in-place to use 2-d convolutions
- """
- WN.start = convert_conv_1d_to_2d(WN.start)
- WN.end = convert_conv_1d_to_2d(WN.end)
-
- for i in range(len(WN.in_layers)):
- WN.in_layers[i] = convert_conv_1d_to_2d(WN.in_layers[i])
-
- for i in range(len(WN.res_skip_layers)):
- WN.res_skip_layers[i] = convert_conv_1d_to_2d(WN.res_skip_layers[i])
-
- for i in range(len(WN.res_skip_layers)):
- WN.cond_layers[i] = convert_conv_1d_to_2d(WN.cond_layers[i])
-
-
-def convert_1d_to_2d_(glow):
- """
- Caffe2 and TensorRT don't seem to support 1-d convolutions or properly
- convert ONNX exports with 1d convolutions to 2d convolutions yet, so we
- do the conversion to 2-d convolutions before ONNX export
- """
- # Convert upsample to 2d
- upsample = torch.nn.ConvTranspose2d(glow.upsample.weight.size(0),
- glow.upsample.weight.size(1),
- (glow.upsample.weight.size(2), 1),
- stride=(glow.upsample.stride[0], 1))
- upsample.weight.data[:,:,:,0] = glow.upsample.weight.data
- upsample.bias.data = glow.upsample.bias.data
- glow.upsample = upsample.cuda()
-
- # Convert WN to 2d
- for WN in glow.WN:
- convert_WN_1d_to_2d_(WN)
-
- # Convert invertible conv to 2d
- for i in range(len(glow.convinv)):
- glow.convinv[i] = convert_convinv_1d_to_2d(glow.convinv[i])
-
- glow.cuda()
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('--waveglow', type=str, required=True,
- help='full path to the WaveGlow model checkpoint file')
- parser.add_argument('-o', '--output', type=str, required=True,
- help='Directory or file name for the exported WaveGlow ONNX model')
- parser.add_argument('--fp16', action='store_true',
- help='inference with AMP')
- parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
-
- parser.add_argument('--config-file', action=ParseFromConfigFile,
- type=str, help='Path to configuration file')
-
- return parser
-
-
-def export_onnx(parser, args):
-
- waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
- fp16_run=args.fp16, cpu_run=False,
- forward_is_infer=False)
-
- # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
- mel = torch.randn(1, 80, 620).cuda()
- stride = 256 # value from waveglow upsample
- n_group = 8
- z_size2 = (mel.size(2)*stride)//n_group
- z = torch.randn(1, n_group, z_size2, 1).cuda()
-
- if args.fp16:
- mel = mel.half()
- z = z.half()
- with torch.no_grad():
- # run inference to force calculation of inverses
- waveglow.infer(mel, sigma=args.sigma_infer)
-
- convert_1d_to_2d_(waveglow)
- mel = mel.unsqueeze(3)
-
- # export to ONNX
- if args.fp16:
- waveglow = waveglow.half()
-
- waveglow.forward = waveglow.infer_onnx
-
- opset_version = 11
-
- if os.path.isdir(args.output):
- output_path = os.path.join(args.output, "waveglow.onnx")
- else:
- output_path = args.output
-
- torch.onnx.export(waveglow, (mel, z), output_path,
- opset_version=opset_version,
- do_constant_folding=True,
- input_names=["mel", "z"],
- output_names=["audio"],
- dynamic_axes={"mel": {0: "batch_size", 2: "mel_seq"},
- "z": {0: "batch_size", 2: "z_seq"},
- "audio": {0: "batch_size", 1: "audio_seq"}})
-
-
-def main():
- parser = argparse.ArgumentParser(
- description='PyTorch Tacotron 2 Inference')
- parser = parse_args(parser)
- args, _ = parser.parse_known_args()
-
- export_onnx(parser, args)
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/tensorrt/generate_decoder.py b/demo/Tacotron2/tensorrt/generate_decoder.py
deleted file mode 100644
index 62f8b04e..00000000
--- a/demo/Tacotron2/tensorrt/generate_decoder.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import onnx_graphsurgeon as gs
-import onnx
-import sys
-import os
-import numpy as np
-import argparse
-
-def insert_decoder_loop(decoder_iter_onnx_path, output_dir, decoder_out_name, fp16):
- float_prec = np.float16 if fp16 else np.float32
-
- # Modify loop body so that it has 2+N inputs: (iteration_num, condition, loop carried dependencies...)
- # and 1+N+K outputs: (condition, loop carried dependencies..., scan_outputs...)
-
- # In this case, the loop carried dependencies include the following IN ORDER
- # - decoder_output/decoder_input
- # - attention_hidden
- # - attention_cell
- # - decoder_hidden
- # - decoder_cell
- # - attention_weights
- # - attention_weights_cum
- # - attention_context
- # - not_finished (bool tensor, initialized to all True)
- # - mel_lengths
-
- # The following are NOT loop carried dependencies (they remain constant through the loop), and must be moved to be inputs outside of the loop body
- # - memory
- # - processed_memory
- # - mask
-
- # The scan outputs are
- # - mel_outputs (which scans across decoder_output)
- # - gate_outputs (scans across gate_prediction)
- # - alignments (scans across attention_weights)
-
-
- loop_body = gs.import_onnx(onnx.load(decoder_iter_onnx_path))
- loop_tensors = loop_body.tensors()
-
- iteration_num = gs.Variable("iteration_num", dtype=np.int64, shape=())
- cond_in = gs.Variable("cond_in", dtype=bool, shape=())
- cond_out = gs.Variable("cond_out", dtype=bool, shape=())
- not_finished_in = gs.Variable("not_finished_in", shape=('batch_size', 1), dtype=bool)
- not_finished_out = gs.Variable("not_finished_out", shape=('batch_size', 1), dtype=bool)
- mel_lengths_in = gs.Variable("mel_lengths_in", shape=('batch_size', 1), dtype=np.int32)
- mel_lengths_out = gs.Variable("mel_lengths_out", shape=('batch_size', 1), dtype=np.int32)
-
-
- # Set loop body inputs in the correct order
- loop_body.inputs = [iteration_num, cond_in, loop_tensors["decoder_input"], loop_tensors["attention_hidden"], loop_tensors["attention_cell"], loop_tensors["decoder_hidden"], loop_tensors["decoder_cell"], loop_tensors["attention_weights"], loop_tensors["attention_weights_cum"], loop_tensors["attention_context"], not_finished_in, mel_lengths_in]
-
- # Set loop body outputs in the correct order
- loop_body.outputs = [cond_out, loop_tensors["decoder_output"], loop_tensors["out_attention_hidden"], loop_tensors["out_attention_cell"], loop_tensors["out_decoder_hidden"], loop_tensors["out_decoder_cell"], loop_tensors["out_attention_weights"], loop_tensors["out_attention_weights_cum"], loop_tensors["out_attention_context"], not_finished_out, mel_lengths_out, loop_tensors["decoder_output"], loop_tensors["gate_prediction"], loop_tensors["out_attention_weights"]]
-
- # The loop stop condition is given by the following lines in PyTorch
- # dec = torch.le(torch.sigmoid(decoder_outputs[8]), gate_threshold).to(torch.int32).squeeze(1)
- # not_finished = not_finished*dec
- # if torch.sum(not_finished) == 0:
- # break
-
- # To compute cond_out, we can essentially follow the same steps. Using Less instead of Greater+Not for now
-
- gate_threshold = gs.Constant("gate_threshold", np.array([0.5], dtype=float_prec))
- gate_sigmoid = gs.Variable("gate_sigmoid", dtype=float_prec, shape=())
- sigmoid = loop_body.nodes.append(gs.Node(op="Sigmoid", inputs=[loop_tensors["gate_prediction"]], outputs=[gate_sigmoid]))
-
- leq_output = gs.Variable("leq_output", dtype=bool)
- leq = loop_body.nodes.append(gs.Node(op="Less", inputs=[gate_sigmoid, gate_threshold], outputs=[leq_output]))
-
- loop_body.nodes.append(gs.Node(op="And", inputs=[not_finished_in, leq_output], outputs=[not_finished_out]))
-
- cast_output = gs.Variable("cast_output", dtype=np.int32)
- loop_body.nodes.append(gs.Node(op="Cast", inputs=[not_finished_out], outputs=[cast_output], attrs={"to": 6})) # int32
-
- reduce_output = gs.Variable("reduce_output", dtype=np.int32)
- loop_body.nodes.append( gs.Node(op="ReduceSum", inputs=[cast_output], outputs=[reduce_output], attrs={"axes": [0], "keepdims": 0}))
-
- unsqueezed_cond_out = gs.Variable("unsqueezed_cond_out", dtype=bool)
- loop_body.nodes.append(gs.Node(op="Equal", inputs=[reduce_output, gs.Constant("zero", np.array(0, dtype=np.int32))], outputs=[unsqueezed_cond_out]))
-
- squeezed_cond_out = gs.Variable("squeezed_cond_out", dtype=bool)
- loop_body.nodes.append(gs.Node(op="Squeeze", inputs=[unsqueezed_cond_out], outputs=[squeezed_cond_out], attrs={"axes": [0]}))
-
- loop_body.nodes.append(gs.Node(op="Not", inputs=[squeezed_cond_out], outputs=[cond_out]))
-
- # Compute mel_lengths
- # from PyTorch: mel_lengths += not_finished
-
- loop_body.nodes.append(gs.Node(op="Add", inputs=[mel_lengths_in, cast_output], outputs=[mel_lengths_out]))
-
- memory = gs.Variable("memory", dtype=float_prec, shape=('batch_size', 'seq_len', 512))
- processed_memory = gs.Variable("processed_memory", dtype=float_prec, shape=('batch_size', 'seq_len', 128))
- mask = gs.Variable("mask", dtype=bool, shape=('batch_size', 'seq_len'))
-
- loop_body.toposort()
- onnx.save(gs.export_onnx(loop_body), os.path.join(output_dir, "loop_body_{prec}.onnx".format(prec="fp16" if float_prec == np.float16 else "fp32")))
-
- # Create outer graph
-
- # Inputs to outer graph are the following (suffixed with _0 to signify initial states)
- # - decoder_input_0
- # - attention_hidden_0
- # - attention_cell_0
- # - decoder_hidden_0
- # - decoder_cell_0
- # - attention_weights_0
- # - attention_weights_cum_0
- # - attention_context_0
- # - memory
- # - processed_memory
- # - mask
-
- # Outputs are the following
- # - mel_outputs
- # - mel_lengths
-
- # Note: alignments and gate_outputs are scan outputs, but don't seem to be used later in the PyTorch implementation. For now, we will make them intermediate tensors that are not outputted
-
- graph = gs.Graph()
-
- decoder_input_0 = gs.Variable("decoder_input_0", dtype=float_prec, shape=('batch_size', 80))
- attention_hidden_0 = gs.Variable("attention_hidden_0", dtype=float_prec, shape=('batch_size', 1024))
- attention_cell_0 = gs.Variable("attention_cell_0", dtype=float_prec, shape=('batch_size', 1024))
- decoder_hidden_0 = gs.Variable("decoder_hidden_0", dtype=float_prec, shape=('batch_size', 1024))
- decoder_cell_0 = gs.Variable("decoder_cell_0", dtype=float_prec, shape=('batch_size', 1024))
- attention_weights_0 = gs.Variable("attention_weights_0", dtype=float_prec, shape=('batch_size', 'seq_len'))
- attention_weights_cum_0 = gs.Variable("attention_weights_cum_0", dtype=float_prec, shape=('batch_size', 'seq_len'))
- attention_context_0 = gs.Variable("attention_context_0", dtype=float_prec, shape=('batch_size', 512))
- not_finished_0 = gs.Variable("not_finished_0", dtype=bool)
- mel_lengths_0 = gs.Variable("mel_lengths_0", dtype=np.int32)
-
- # For not_finished, we need to generate a tensor of shape (batch_size) that is all 1s
- # We can use the ONNX ConstantOfShape op to do this
- not_finished_shape = gs.Variable("not_finished_shape", dtype=np.int64)
- reduced = gs.Variable("reduced", dtype=float_prec)
- graph.nodes.append(gs.Node(op="ReduceSum", inputs=[decoder_input_0], outputs=[reduced], attrs={"axes":[1], "keepdims": 1}))
- graph.nodes.append(gs.Node(op="Shape", inputs=[reduced], outputs=[not_finished_shape]))
- before_cast = gs.Variable("before_cast", dtype=np.int32)
- graph.nodes.append(gs.Node(op="ConstantOfShape", inputs=[not_finished_shape], outputs=[before_cast], attrs={"value":gs.Constant("one", np.array([1], dtype=np.int32))}))
- graph.nodes.append(gs.Node(op="Cast", inputs=[before_cast], outputs=[not_finished_0], attrs={"to": 9}))
-
- # Same thing for mel_lengths, but we need all 0s
- graph.nodes.append(gs.Node(op="ConstantOfShape", inputs=[not_finished_shape], outputs=[mel_lengths_0], attrs={"value":gs.Constant("zero", np.array([0], dtype=np.int32))}))
-
- # Loop carried dependecies at the end of the loop
- decoder_input_t = gs.Variable("decoder_input_t", dtype=float_prec, shape=('batch_size', 80))
- attention_hidden_t = gs.Variable("attention_hidden_t", dtype=float_prec, shape=('batch_size', 1024))
- attention_cell_t = gs.Variable("attention_cell_t", dtype=float_prec, shape=('batch_size', 1024))
- decoder_hidden_t = gs.Variable("decoder_hidden_t", dtype=float_prec, shape=('batch_size', 1024))
- decoder_cell_t = gs.Variable("decoder_cell_t", dtype=float_prec, shape=('batch_size', 1024))
- attention_weights_t = gs.Variable("attention_weights_t", dtype=float_prec, shape=('batch_size', 'seq_len'))
- attention_weights_cum_t = gs.Variable("attention_weights_cum_t", dtype=float_prec, shape=('batch_size', 'seq_len'))
- attention_context_t = gs.Variable("attention_context_t", dtype=float_prec, shape=('batch_size', 512))
- not_finished_t = gs.Variable("not_finished_t", dtype=bool)
- mel_lengths_t = gs.Variable("mel_lengths_t", dtype=np.int32, shape=('batch_size', 1))
-
- # Scan outputs
- mel_outputs_raw = gs.Variable("mel_outputs_raw", dtype=float_prec, shape=(-1, 'batch_size', 80))
- gate_outputs = gs.Variable("gate_outputs", dtype=float_prec, shape=(-1, 'batch_size', 1))
- alignments = gs.Variable("alignments", dtype=float_prec, shape=(-1, 1, 'seq_len'))
-
- mel_outputs = gs.Variable("mel_outputs", dtype=float_prec, shape=('batch_size', 80, -1))
-
- graph.inputs = [decoder_input_0, attention_hidden_0, attention_cell_0, decoder_hidden_0, decoder_cell_0, attention_weights_0, attention_weights_cum_0, attention_context_0, memory, processed_memory, mask]
- graph.outputs = [mel_outputs, mel_lengths_t]
-
- trip_count = gs.Constant("trip_count", np.array(0, dtype=np.int64)) # In ONNX, this is an optional parameter, but I don't think ONNX-GS supports optional inputs. To fix this, after we export the ONNX ModelProto from GS, we replace this input with ""
- initial_cond = gs.Constant("initial_cond", np.array(True, dtype=bool))
- loop_inputs = [trip_count, initial_cond, decoder_input_0, attention_hidden_0, attention_cell_0, decoder_hidden_0, decoder_cell_0, attention_weights_0, attention_weights_cum_0, attention_context_0, not_finished_0, mel_lengths_0]
- loop_outputs = [decoder_input_t, attention_hidden_t, attention_cell_t, decoder_hidden_t, decoder_cell_t, attention_weights_t, attention_weights_cum_t, attention_context_t, not_finished_t, mel_lengths_t, mel_outputs_raw, gate_outputs, alignments]
- decoder_loop = gs.Node(op="Loop", name="decoder_loop", inputs=loop_inputs, outputs=loop_outputs, attrs={"body": loop_body})
- graph.nodes.append(decoder_loop)
-
- graph.nodes.append(gs.Node(op="Transpose", inputs=[mel_outputs_raw], outputs=[mel_outputs], attrs={"perm": [1, 2, 0]})) # Output needs to have loop dimension as inner-most dim
-
- graph.toposort()
- exported_graph = gs.export_onnx(graph)
- [x for x in exported_graph.graph.node if x.name == "decoder_loop"][0].input[0] = "" # Remove trip count input
-
- onnx.save(exported_graph, os.path.join(output_dir, decoder_out_name))
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument('model_path', type=str,
- help='path to original decoder_iter ONNX model')
- parser.add_argument('-o', '--output_dir', type=str, default='.', help='Output directory')
- parser.add_argument('--decoder_out', type=str, help='Filename of the exported decoder with outer loop')
- parser.add_argument('--fp16', action='store_true')
-
- args = parser.parse_args()
-
- if args.decoder_out == None:
- args.decoder_out = "decoder_with_outer_loop_{}.onnx".format("fp16" if args.fp16 else "fp32")
-
- insert_decoder_loop(args.model_path, args.output_dir, args.decoder_out, args.fp16)
diff --git a/demo/Tacotron2/tensorrt/inference_trt.py b/demo/Tacotron2/tensorrt/inference_trt.py
deleted file mode 100644
index d1a6dabd..00000000
--- a/demo/Tacotron2/tensorrt/inference_trt.py
+++ /dev/null
@@ -1,491 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import tensorrt as trt
-import numpy as np
-from scipy.io.wavfile import write
-import time
-import torch
-import argparse
-import os.path as path
-
-import sys
-from pathlib import Path
-sys.path.append(str(Path(__file__).parents[1]))
-
-from common.utils import to_gpu, get_mask_from_lengths
-from tacotron2.text import text_to_sequence
-from inference import MeasureTime, prepare_input_sequence, load_and_setup_model
-import dllogger as DLLogger
-from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-from trt_utils import load_engine, run_trt_engine
-
-from waveglow.denoiser import Denoiser
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('-i', '--input', type=str, required=True,
- help='full path to the input text (phareses separated by new line)')
- parser.add_argument('-o', '--output', required=True,
- help='output folder to save audio (file per phrase)')
- parser.add_argument('--encoder', type=str, required=True,
- help='full path to the Encoder engine')
- parser.add_argument('--decoder', type=str, required=True,
- help='full path to the DecoderIter engine')
- parser.add_argument('--postnet', type=str, required=True,
- help='full path to the Postnet engine')
- parser.add_argument('--waveglow', type=str, required=True,
- help='full path to the WaveGlow engine')
- parser.add_argument('--waveglow-ckpt', type=str, default="",
- help='full path to the WaveGlow model checkpoint file')
- parser.add_argument('--log-file', type=str, default='nvlog.json',
- help='Filename for logging')
- parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
- parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
- help='Sampling rate')
- parser.add_argument('--stft-hop-length', type=int, default=256,
- help='STFT hop length for estimating audio length from mel size')
- parser.add_argument('--fp16', action='store_true',
- help='inference with FP16')
- parser.add_argument('--loop', dest='loop', action='store_true',
- help='Includes the outer decoder loop in the ONNX model. Enabled by default and only supported on TensorRT 8.0 or later.')
- parser.add_argument('--no-loop', dest='loop', action='store_false',
- help='Excludes outer decoder loop from decoder ONNX model. Default behavior and necessary for TensorRT 7.2 or earlier.')
- parser.set_defaults(loop=int(trt.__version__[0]) >= 8)
- parser.add_argument('--waveglow-onnxruntime', action='store_true',
- help='Specify this option to use ONNX runtime instead of TRT for running Waveglow')
- parser.add_argument('--decoder-onnxruntime', action='store_true',
- help='Specify this option to use ONNX runtime instead of TRT for running the TT2 Decoder with loop. When using this option, pass the decoder ONNX model to the --decoder argument')
- return parser
-
-
-def init_decoder_inputs(memory, processed_memory, memory_lengths):
-
- device = memory.device
- dtype = memory.dtype
- bs = memory.size(0)
- seq_len = memory.size(1)
- attention_rnn_dim = 1024
- decoder_rnn_dim = 1024
- encoder_embedding_dim = 512
- n_mel_channels = 80
-
- attention_hidden = torch.zeros(bs, attention_rnn_dim, device=device, dtype=dtype)
- attention_cell = torch.zeros(bs, attention_rnn_dim, device=device, dtype=dtype)
- decoder_hidden = torch.zeros(bs, decoder_rnn_dim, device=device, dtype=dtype)
- decoder_cell = torch.zeros(bs, decoder_rnn_dim, device=device, dtype=dtype)
- attention_weights = torch.zeros(bs, seq_len, device=device, dtype=dtype)
- attention_weights_cum = torch.zeros(bs, seq_len, device=device, dtype=dtype)
- attention_context = torch.zeros(bs, encoder_embedding_dim, device=device, dtype=dtype)
- mask = get_mask_from_lengths(memory_lengths).to(device)
- decoder_input = torch.zeros(bs, n_mel_channels, device=device, dtype=dtype)
-
- return (decoder_input, attention_hidden, attention_cell, decoder_hidden,
- decoder_cell, attention_weights, attention_weights_cum,
- attention_context, memory, processed_memory, mask)
-
-def init_decoder_outputs(memory, memory_lengths):
-
- device = memory.device
- dtype = memory.dtype
- bs = memory.size(0)
- seq_len = memory.size(1)
- attention_rnn_dim = 1024
- decoder_rnn_dim = 1024
- encoder_embedding_dim = 512
- n_mel_channels = 80
-
- attention_hidden = torch.zeros(bs, attention_rnn_dim, device=device, dtype=dtype)
- attention_cell = torch.zeros(bs, attention_rnn_dim, device=device, dtype=dtype)
- decoder_hidden = torch.zeros(bs, decoder_rnn_dim, device=device, dtype=dtype)
- decoder_cell = torch.zeros(bs, decoder_rnn_dim, device=device, dtype=dtype)
- attention_weights = torch.zeros(bs, seq_len, device=device, dtype=dtype)
- attention_weights_cum = torch.zeros(bs, seq_len, device=device, dtype=dtype)
- attention_context = torch.zeros(bs, encoder_embedding_dim, device=device, dtype=dtype)
- decoder_output = torch.zeros(bs, n_mel_channels, device=device, dtype=dtype)
- gate_prediction = torch.zeros(bs, 1, device=device, dtype=dtype)
-
- return (attention_hidden, attention_cell, decoder_hidden,
- decoder_cell, attention_weights, attention_weights_cum,
- attention_context, decoder_output, gate_prediction)
-
-def init_decoder_tensors(decoder_inputs, decoder_outputs):
-
- decoder_tensors = {
- "inputs" : {
- 'decoder_input': decoder_inputs[0],
- 'attention_hidden': decoder_inputs[1],
- 'attention_cell': decoder_inputs[2],
- 'decoder_hidden': decoder_inputs[3],
- 'decoder_cell': decoder_inputs[4],
- 'attention_weights': decoder_inputs[5],
- 'attention_weights_cum': decoder_inputs[6],
- 'attention_context': decoder_inputs[7],
- 'memory': decoder_inputs[8],
- 'processed_memory': decoder_inputs[9],
- 'mask': decoder_inputs[10]
- },
- "outputs" : {
- 'out_attention_hidden': decoder_outputs[0],
- 'out_attention_cell': decoder_outputs[1],
- 'out_decoder_hidden': decoder_outputs[2],
- 'out_decoder_cell': decoder_outputs[3],
- 'out_attention_weights': decoder_outputs[4],
- 'out_attention_weights_cum': decoder_outputs[5],
- 'out_attention_context': decoder_outputs[6],
- 'decoder_output': decoder_outputs[7],
- 'gate_prediction': decoder_outputs[8]
- }
- }
- return decoder_tensors
-
-def swap_inputs_outputs(decoder_inputs, decoder_outputs):
-
- new_decoder_inputs = (decoder_outputs[7], # decoder_output
- decoder_outputs[0], # attention_hidden
- decoder_outputs[1], # attention_cell
- decoder_outputs[2], # decoder_hidden
- decoder_outputs[3], # decoder_cell
- decoder_outputs[4], # attention_weights
- decoder_outputs[5], # attention_weights_cum
- decoder_outputs[6], # attention_context
- decoder_inputs[8], # memory
- decoder_inputs[9], # processed_memory
- decoder_inputs[10]) # mask
-
- new_decoder_outputs = (decoder_inputs[1], # attention_hidden
- decoder_inputs[2], # attention_cell
- decoder_inputs[3], # decoder_hidden
- decoder_inputs[4], # decoder_cell
- decoder_inputs[5], # attention_weights
- decoder_inputs[6], # attention_weights_cum
- decoder_inputs[7], # attention_context
- decoder_inputs[0], # decoder_input
- decoder_outputs[8])# gate_output
-
- return new_decoder_inputs, new_decoder_outputs
-
-
-def infer_tacotron2_trt(encoder, decoder_iter, postnet,
- encoder_context, decoder_context, postnet_context,
- sequences, sequence_lengths, measurements, fp16, loop):
-
- batch_size = len(sequence_lengths)
- max_sequence_len = sequence_lengths[0]
- memory = torch.zeros((batch_size, max_sequence_len, 512)).cuda()
- if fp16:
- memory = memory.half()
- device = memory.device
- dtype = memory.dtype
-
- processed_memory = torch.zeros((batch_size, max_sequence_len, 128), device=device, dtype=dtype)
- lens = torch.zeros_like(sequence_lengths)
- print(f"batch_size: {batch_size}, max sequence length: {max_sequence_len}")
-
- encoder_tensors = {
- "inputs" :
- {'sequences': sequences, 'sequence_lengths': sequence_lengths},
- "outputs" :
- {'memory': memory, 'lens': lens, 'processed_memory': processed_memory}
- }
-
- print("Running Tacotron2 Encoder")
- with MeasureTime(measurements, "tacotron2_encoder_time"):
- run_trt_engine(encoder_context, encoder, encoder_tensors)
- max_decoder_steps = 1024
- device = memory.device
- mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device = device)
- not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device = device)
- mel_outputs = torch.ones((batch_size, 80, max_decoder_steps), device = device, dtype=dtype).cuda()
- gate_threshold = 0.5
- first_iter = True
-
- decoder_inputs = init_decoder_inputs(memory, processed_memory, sequence_lengths)
- decoder_outputs = init_decoder_outputs(memory, sequence_lengths)
-
- if loop:
- if decoder_context is None:
- print("Running Tacotron2 Decoder with loop with ONNX-RT")
- decoder_inputs_onnxrt = [x.cpu().numpy().copy() for x in decoder_inputs]
- import onnx
- import onnxruntime
- sess = onnxruntime.InferenceSession(decoder_iter)
-
- with MeasureTime(measurements, "tacotron2_decoder_time"):
- result = sess.run(["mel_outputs", "mel_lengths_t"], {
- 'decoder_input_0': decoder_inputs_onnxrt[0],
- 'attention_hidden_0': decoder_inputs_onnxrt[1],
- 'attention_cell_0': decoder_inputs_onnxrt[2],
- 'decoder_hidden_0': decoder_inputs_onnxrt[3],
- 'decoder_cell_0': decoder_inputs_onnxrt[4],
- 'attention_weights_0': decoder_inputs_onnxrt[5],
- 'attention_weights_cum_0': decoder_inputs_onnxrt[6],
- 'attention_context_0': decoder_inputs_onnxrt[7],
- 'memory': decoder_inputs_onnxrt[8],
- 'processed_memory': decoder_inputs_onnxrt[9],
- 'mask': decoder_inputs_onnxrt[10]
- })
-
- mel_outputs = torch.tensor(result[0], device=device)
- mel_lengths = torch.tensor(result[1], device=device)
- else:
- print("Running Tacotron2 Decoder with loop")
- decoder_tensors = {
- "inputs" :
- {
- 'decoder_input_0': decoder_inputs[0],
- 'attention_hidden_0': decoder_inputs[1],
- 'attention_cell_0': decoder_inputs[2],
- 'decoder_hidden_0': decoder_inputs[3],
- 'decoder_cell_0': decoder_inputs[4],
- 'attention_weights_0': decoder_inputs[5],
- 'attention_weights_cum_0': decoder_inputs[6],
- 'attention_context_0': decoder_inputs[7],
- 'memory': decoder_inputs[8],
- 'processed_memory': decoder_inputs[9],
- 'mask': decoder_inputs[10]
- },
- "outputs" :
- {'mel_outputs': mel_outputs, 'mel_lengths_t': mel_lengths}
- }
-
- with MeasureTime(measurements, "tacotron2_decoder_time"):
- run_trt_engine(decoder_context, decoder_iter, decoder_tensors)
- mel_outputs = mel_outputs[:,:,:torch.max(mel_lengths)]
-
- else:
- print("Running Tacotron2 Decoder")
- measurements_decoder = {}
- while True:
- decoder_tensors = init_decoder_tensors(decoder_inputs, decoder_outputs)
- with MeasureTime(measurements_decoder, "step"):
- run_trt_engine(decoder_context, decoder_iter, decoder_tensors)
-
- if first_iter:
- mel_outputs = torch.unsqueeze(decoder_outputs[7], 2)
- gate_outputs = torch.unsqueeze(decoder_outputs[8], 2)
- alignments = torch.unsqueeze(decoder_outputs[4], 2)
- measurements['tacotron2_decoder_time'] = measurements_decoder['step']
- first_iter = False
- else:
- mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(decoder_outputs[7], 2)), 2)
- gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(decoder_outputs[8], 2)), 2)
- alignments = torch.cat((alignments, torch.unsqueeze(decoder_outputs[4], 2)), 2)
- measurements['tacotron2_decoder_time'] += measurements_decoder['step']
-
- dec = torch.le(torch.sigmoid(decoder_outputs[8]), gate_threshold).to(torch.int32).squeeze(1)
- not_finished = not_finished*dec
- mel_lengths += not_finished
-
- if torch.sum(not_finished) == 0:
- print("Stopping after",mel_outputs.size(2),"decoder steps")
- break
- if mel_outputs.size(2) == max_decoder_steps:
- print("Warning! Reached max decoder steps")
- break
-
- decoder_inputs, decoder_outputs = swap_inputs_outputs(decoder_inputs, decoder_outputs)
-
- mel_outputs = mel_outputs.clone().detach()
- mel_outputs_postnet = torch.zeros_like(mel_outputs, device=device, dtype=dtype)
-
- postnet_tensors = {
- "inputs" :
- {'mel_outputs': mel_outputs},
- "outputs" :
- {'mel_outputs_postnet': mel_outputs_postnet}
- }
- print("Running Tacotron2 Postnet")
- with MeasureTime(measurements, "tacotron2_postnet_time"):
- run_trt_engine(postnet_context, postnet, postnet_tensors)
-
- print("Tacotron2 Postnet done")
-
- return mel_outputs_postnet, mel_lengths
-
-
-def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, fp16):
-
- mel_size = mel.size(2)
- batch_size = mel.size(0)
- stride = 256
- n_group = 8
- z_size = mel_size*stride
- z_size = z_size//n_group
- z = torch.randn(batch_size, n_group, z_size).cuda()
- audios = torch.zeros(batch_size, mel_size*stride).cuda()
-
- mel = mel.unsqueeze(3)
- z = z.unsqueeze(3)
-
- if fp16:
- z = z.half()
- mel = mel.half()
- audios = audios.half()
-
- waveglow_tensors = {
- "inputs" : {'mel': mel, 'z': z},
- "outputs" : {'audio': audios}
- }
-
- print("Running WaveGlow with TensorRT")
- with MeasureTime(measurements, "waveglow_time"):
- run_trt_engine(waveglow_context, waveglow, waveglow_tensors)
-
- return audios
-
-def infer_waveglow_onnx(waveglow_path, mel, measurements, fp16):
- import onnx
- import onnxruntime
- sess = onnxruntime.InferenceSession(waveglow_path)
-
- device=mel.device
- mel_size = mel.size(2)
- batch_size = mel.size(0)
- stride = 256
- n_group = 8
- z_size = mel_size*stride
- z_size = z_size//n_group
- z = torch.randn(batch_size, n_group, z_size).cuda()
-
- mel = mel.unsqueeze(3)
- z = z.unsqueeze(3)
-
- if fp16:
- z = z.half()
- mel = mel.half()
-
- mel = mel.cpu().numpy().copy()
- z = z.cpu().numpy().copy()
-
- print("Running WaveGlow with ONNX Runtime")
- with MeasureTime(measurements, "waveglow_time"):
- result = sess.run(["audio"], {
- 'mel': mel,
- 'z': z
- })
- audios = torch.tensor(result[0], device=device)
- return audios
-
-def main():
-
- parser = argparse.ArgumentParser(
- description='TensorRT Tacotron 2 Inference')
- parser = parse_args(parser)
- args, _ = parser.parse_known_args()
-
- # initialize CUDA state
- torch.cuda.init()
-
- TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
- encoder = load_engine(args.encoder, TRT_LOGGER)
- postnet = load_engine(args.postnet, TRT_LOGGER)
-
- if args.waveglow_ckpt != "":
- # setup denoiser using WaveGlow PyTorch checkpoint
- waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
- True, forward_is_infer=True)
- denoiser = Denoiser(waveglow_ckpt).cuda()
- # after initialization, we don't need WaveGlow PyTorch checkpoint
- # anymore - deleting
- del waveglow_ckpt
- torch.cuda.empty_cache()
-
- # create TRT contexts for each engine
- encoder_context = encoder.create_execution_context()
- decoder_context = None
- if not args.decoder_onnxruntime:
- decoder_iter = load_engine(args.decoder, TRT_LOGGER)
- decoder_context = decoder_iter.create_execution_context()
- else:
- decoder_iter = args.decoder
- postnet_context = postnet.create_execution_context()
-
- waveglow_context = None
- if not args.waveglow_onnxruntime:
- waveglow = load_engine(args.waveglow, TRT_LOGGER)
- waveglow_context = waveglow.create_execution_context()
-
- DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
- path.join(args.output, args.log_file)),
- StdOutBackend(Verbosity.VERBOSE)])
-
- texts = []
- try:
- f = open(args.input, 'r')
- texts = f.readlines()
- except:
- print("Could not read file")
- sys.exit(1)
-
- measurements = {}
-
- sequences, sequence_lengths = prepare_input_sequence(texts)
- dt = encoder.get_tensor_dtype("sequences")
- sequences = sequences.to(torch.int64 if dt == trt.DataType.INT64 else torch.int32)
- sequence_lengths = sequence_lengths.to(torch.int32)
-
- with MeasureTime(measurements, "latency"):
- mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
- encoder_context, decoder_context, postnet_context,
- sequences, sequence_lengths, measurements, args.fp16, args.loop)
- audios = infer_waveglow_onnx(args.waveglow, mel, measurements, args.fp16) if args.waveglow_onnxruntime else \
- infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)
-
- with encoder_context, postnet_context:
- pass
-
- if decoder_context is not None:
- with decoder_context: pass
-
- if waveglow_context is not None:
- with waveglow_context: pass
-
- audios = audios.float()
- if args.waveglow_ckpt != "":
- with MeasureTime(measurements, "denoiser"):
- audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
-
- for i, audio in enumerate(audios):
- audio = audio[:mel_lengths[i]*args.stft_hop_length]
- audio = audio/torch.max(torch.abs(audio))
- audio_path = path.join(args.output, f"audio_{i}_trt.wav")
- write(audio_path, args.sampling_rate, audio.cpu().numpy())
-
-
- DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
- DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
- DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
- DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
- DLLogger.log(step=0, data={"latency": measurements['latency']})
-
- if args.waveglow_ckpt != "":
- DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
- DLLogger.flush()
-
- prec = "fp16" if args.fp16 else "fp32"
- latency = measurements['latency']
- throughput = audios.size(1)/latency
- log_data = f"1,{sequence_lengths[0].item()},{prec},{latency},{throughput},{mel_lengths[0].item()}\n"
- log_file = path.join(args.output, f"log_bs1_{prec}.log")
- with open(log_file, 'a') as f:
- f.write(log_data)
-
-if __name__ == "__main__":
- main()
diff --git a/demo/Tacotron2/tensorrt/run_latency_tests_trt.sh b/demo/Tacotron2/tensorrt/run_latency_tests_trt.sh
deleted file mode 100644
index a289cf63..00000000
--- a/demo/Tacotron2/tensorrt/run_latency_tests_trt.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-bash test_infer.sh --test tensorrt/test_infer_trt.py -bs 1 -il 128 --fp16 --num-iters 1003 --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_with_outer_loop_fp16.engine --postnet ./output/postnet_fp16.engine --waveglow ./output/waveglow_fp16.engine --wn-channels 256
diff --git a/demo/Tacotron2/tensorrt/test_infer_trt.py b/demo/Tacotron2/tensorrt/test_infer_trt.py
deleted file mode 100644
index 7023f02f..00000000
--- a/demo/Tacotron2/tensorrt/test_infer_trt.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import sys
-sys.path.append('./')
-from tacotron2.text import text_to_sequence
-import models
-import tensorrt as trt
-import torch
-import argparse
-import numpy as np
-from scipy.io.wavfile import write
-
-from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
-from inference_trt import infer_tacotron2_trt, infer_waveglow_trt
-
-from trt_utils import load_engine
-
-import time
-import dllogger as DLLogger
-from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-
-# from apex import amp
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('--encoder', type=str, required=True,
- help='full path to the Encoder engine')
- parser.add_argument('--decoder', type=str, required=True,
- help='full path to the DecoderIter engine')
- parser.add_argument('--postnet', type=str, required=True,
- help='full path to the Postnet engine')
- parser.add_argument('--waveglow', type=str, required=True,
- help='full path to the WaveGlow engine')
- parser.add_argument('--waveglow-ckpt', type=str, default="",
- help='full path to the WaveGlow model checkpoint file')
- parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
- parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
- help='Sampling rate')
- parser.add_argument('--fp16', action='store_true',
- help='inference with FP16')
- parser.add_argument('--log-file', type=str, default='nvlog.json',
- help='Filename for logging')
- parser.add_argument('--stft-hop-length', type=int, default=256,
- help='STFT hop length for estimating audio length from mel size')
- parser.add_argument('--num-iters', type=int, default=10,
- help='Number of iterations')
- parser.add_argument('-il', '--input-length', type=int, default=64,
- help='Input length')
- parser.add_argument('-bs', '--batch-size', type=int, default=1,
- help='Batch size')
-
- return parser
-
-
-def print_stats(measurements_all):
-
- print(np.mean(measurements_all['latency'][1:]),
- np.mean(measurements_all['throughput'][1:]),
- np.mean(measurements_all['pre_processing'][1:]),
- np.mean(measurements_all['type_conversion'][1:])+
- np.mean(measurements_all['storage'][1:])+
- np.mean(measurements_all['data_transfer'][1:]),
- np.mean(measurements_all['num_mels_per_audio'][1:]))
-
- throughput = measurements_all['throughput']
- preprocessing = measurements_all['pre_processing']
- type_conversion = measurements_all['type_conversion']
- storage = measurements_all['storage']
- data_transfer = measurements_all['data_transfer']
- postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
- latency = measurements_all['latency']
- num_mels_per_audio = measurements_all['num_mels_per_audio']
-
- latency.sort()
-
- cf_50 = max(latency[:int(len(latency)*0.50)])
- cf_90 = max(latency[:int(len(latency)*0.90)])
- cf_95 = max(latency[:int(len(latency)*0.95)])
- cf_99 = max(latency[:int(len(latency)*0.99)])
- cf_100 = max(latency[:int(len(latency)*1.0)])
-
- print("Throughput average (samples/sec) = {:.4f}".format(np.mean(throughput)))
- print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
- print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
- print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio))) #
- print("Latency average (seconds) = {:.4f}".format(np.mean(latency)))
- print("Latency std (seconds) = {:.4f}".format(np.std(latency)))
- print("Latency cl 50 (seconds) = {:.4f}".format(cf_50))
- print("Latency cl 90 (seconds) = {:.4f}".format(cf_90))
- print("Latency cl 95 (seconds) = {:.4f}".format(cf_95))
- print("Latency cl 99 (seconds) = {:.4f}".format(cf_99))
- print("Latency cl 100 (seconds) = {:.4f}".format(cf_100))
-
-
-def main():
- """
- Launches text to speech (inference).
- Inference is executed on a single GPU.
- """
- parser = argparse.ArgumentParser(
- description='PyTorch Tacotron 2 Inference')
- parser = parse_args(parser)
- args, unknown_args = parser.parse_known_args()
-
- DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
- StdOutBackend(Verbosity.VERBOSE)])
- for k,v in vars(args).items():
- DLLogger.log(step="PARAMETER", data={k:v})
- DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
-
- measurements_all = {"pre_processing": [],
- "tacotron2_encoder_time": [],
- "tacotron2_decoder_time": [],
- "tacotron2_postnet_time": [],
- "tacotron2_latency": [],
- "waveglow_latency": [],
- "latency": [],
- "type_conversion": [],
- "data_transfer": [],
- "storage": [],
- "tacotron2_items_per_sec": [],
- "waveglow_items_per_sec": [],
- "num_mels_per_audio": [],
- "throughput": []}
-
- print("args:", args, unknown_args)
-
- torch.cuda.init()
-
- TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
- encoder = load_engine(args.encoder, TRT_LOGGER)
- decoder_iter = load_engine(args.decoder, TRT_LOGGER)
- postnet = load_engine(args.postnet, TRT_LOGGER)
- waveglow = load_engine(args.waveglow, TRT_LOGGER)
-
- if args.waveglow_ckpt != "":
- # setup denoiser using WaveGlow PyTorch checkpoint
- waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
- args.waveglow_ckpt,
- fp16_run=args.fp16,
- cpu_run=False,
- forward_is_infer=True)
- denoiser = Denoiser(waveglow_ckpt).cuda()
- # after initialization, we don't need WaveGlow PyTorch checkpoint
- # anymore - deleting
- del waveglow_ckpt
- torch.cuda.empty_cache()
-
- # create TRT contexts for each engine
- encoder_context = encoder.create_execution_context()
- decoder_context = decoder_iter.create_execution_context()
- postnet_context = postnet.create_execution_context()
- waveglow_context = waveglow.create_execution_context()
-
-
- texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
- texts = [texts[0][:args.input_length]]
- texts = texts*args.batch_size
-
- warmup_iters = 3
-
- for iter in range(args.num_iters):
-
- measurements = {}
-
- with MeasureTime(measurements, "pre_processing"):
- sequences_padded, input_lengths = prepare_input_sequence(texts)
- sequences_padded = sequences_padded.to(torch.int32)
- input_lengths = input_lengths.to(torch.int32)
-
- with torch.no_grad():
- with MeasureTime(measurements, "latency"):
- with MeasureTime(measurements, "tacotron2_latency"):
- mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
- encoder_context, decoder_context, postnet_context,
- sequences_padded, input_lengths, measurements, args.fp16, True)
-
- with MeasureTime(measurements, "waveglow_latency"):
- audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)
-
- num_mels = mel.size(0)*mel.size(2)
- num_samples = audios.size(0)*audios.size(1)
-
- with MeasureTime(measurements, "type_conversion"):
- audios = audios.float()
-
- with MeasureTime(measurements, "data_transfer"):
- audios = audios.cpu()
-
- with MeasureTime(measurements, "storage"):
- audios = audios.numpy()
- for i, audio in enumerate(audios):
- audio_path = "audio_"+str(i)+".wav"
- write(audio_path, args.sampling_rate,
- audio[:mel_lengths[i]*args.stft_hop_length])
-
- measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
- measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
- measurements['num_mels_per_audio'] = mel.size(2)
- measurements['throughput'] = num_samples/measurements['latency']
-
- if iter >= warmup_iters:
- for k,v in measurements.items():
- if k in measurements_all.keys():
- measurements_all[k].append(v)
- DLLogger.log(step=(iter-warmup_iters), data={k: v})
-
- DLLogger.flush()
-
- print_stats(measurements_all)
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/tensorrt/trt_utils.py b/demo/Tacotron2/tensorrt/trt_utils.py
deleted file mode 100644
index e150983f..00000000
--- a/demo/Tacotron2/tensorrt/trt_utils.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import sys
-import tensorrt as trt
-
-# For a single dimension this will return the min, opt, and max size when given
-# input of either one or three (comma delimited) values
-# dim="1" or dim=1 returns (1, 1, 1)
-# dim="1,4,5" returns (1, 4, 5)
-def parse_dynamic_size(dim):
- split = str(dim).split(',')
- assert len(split) in (1,3) , "Dynamic size input must be either 1 or 3 comma-separated integers"
- ints = [int(i) for i in split]
-
- if len(ints) == 1:
- ints *= 3
-
- assert ints[0] <= ints[1] <= ints[2]
- return tuple(ints)
-
-
-def is_dimension_dynamic(dim):
- return dim is None or dim <= 0
-
-
-def is_shape_dynamic(shape):
- return any([is_dimension_dynamic(dim) for dim in shape])
-
-
-def run_trt_engine(context, engine, tensors):
-
- bindings = [0] * engine.num_io_tensors
-
- for i in range(engine.num_io_tensors):
- tensor_name = engine.get_tensor_name(i)
- if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
- tensor = tensors['inputs'][tensor_name]
- bindings[i] = tensor.data_ptr()
- if is_shape_dynamic(engine.get_tensor_shape(tensor_name)):
- context.set_input_shape(tensor_name, tensor.shape)
- elif engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.OUTPUT:
- tensor = tensors['outputs'][tensor_name]
- bindings[i] = tensor.data_ptr()
-
- context.execute_v2(bindings=bindings)
-
-
-def load_engine(engine_filepath, trt_logger):
- with open(engine_filepath, "rb") as f, trt.Runtime(trt_logger) as runtime:
- engine = runtime.deserialize_cuda_engine(f.read())
- return engine
-
-
-def engine_info(engine_filepath):
-
- TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
- engine = load_engine(engine_filepath, TRT_LOGGER)
-
- binding_template = r"""
-{btype} {{
- name: "{bname}"
- data_type: {dtype}
- dims: {dims}
-}}"""
- type_mapping = {"DataType.HALF": "TYPE_FP16",
- "DataType.FLOAT": "TYPE_FP32",
- "DataType.INT32": "TYPE_INT32",
- "DataType.BOOL" : "TYPE_BOOL"}
-
- print("engine name", engine.name)
- start_dim = 1
- print("num_optimization_profiles", engine.num_optimization_profiles)
- print("device_memory_size:", engine.device_memory_size)
- print("max_workspace_size:", engine.get_memory_pool_limit(trt.MemoryPoolType.WORKSPACE))
- print("num_layers:", engine.num_layers)
-
- for i in range(engine.num_io_tensors):
- tensor_name = engine.get_tensor_name(i)
- btype = "input" if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT else "output"
- dtype = engine.get_tensor_dtype(tensor_name)
- bdims = engine.get_tensor_shape(tensor_name)
- config_values = {
- "btype": btype,
- "bname": tensor_name,
- "dtype": type_mapping[str(dtype)],
- "dims": list(bdims[start_dim:])
- }
- final_binding_str = binding_template.format_map(config_values)
- print(final_binding_str)
-
-
-def build_engine(model_file, shapes, max_ws=512*1024*1024, fp16=False, timing_cache=None):
-
- TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
- builder = trt.Builder(TRT_LOGGER)
-
- config = builder.create_builder_config()
- config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, max_ws)
- if fp16:
- config.flags |= 1 << int(trt.BuilderFlag.FP16)
- profile = builder.create_optimization_profile()
- for s in shapes:
- profile.set_shape(s['name'], min=s['min'], opt=s['opt'], max=s['max'])
- config.add_optimization_profile(profile)
-
- timing_cache_available = int(trt.__version__[0]) >= 8 and timing_cache != None
- # load global timing cache
- if timing_cache_available:
- if os.path.exists(timing_cache):
- with open(timing_cache, "rb") as f:
- cache = config.create_timing_cache(f.read())
- config.set_timing_cache(cache, ignore_mismatch = False)
- else:
- cache = config.create_timing_cache(b"")
- config.set_timing_cache(cache, ignore_mismatch = False)
-
- network_creation_flag = 0
- if "EXPLICIT_BATCH" in trt.NetworkDefinitionCreationFlag.__members__.keys():
- network_creation_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
- network = builder.create_network(network_creation_flag)
-
- with trt.OnnxParser(network, TRT_LOGGER) as parser:
- with open(model_file, 'rb') as model:
- parsed = parser.parse(model.read())
- for i in range(parser.num_errors):
- print("TensorRT ONNX parser error:", parser.get_error(i))
- engine = builder.build_serialized_network(network, config=config)
-
- # save global timing cache
- if timing_cache_available:
- cache = config.get_timing_cache()
- with cache.serialize() as buffer:
- with open(timing_cache, "wb") as f:
- f.write(buffer)
- f.flush()
- os.fsync(f)
-
- return engine
diff --git a/demo/Tacotron2/test_infer.py b/demo/Tacotron2/test_infer.py
deleted file mode 100644
index 23816da9..00000000
--- a/demo/Tacotron2/test_infer.py
+++ /dev/null
@@ -1,198 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-import argparse
-import numpy as np
-from scipy.io.wavfile import write
-
-from inference import MeasureTime, prepare_input_sequence, load_and_setup_model
-
-import dllogger as DLLogger
-from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-
-from waveglow.denoiser import Denoiser
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
- parser.add_argument('--tacotron2', type=str,
- help='Full path to the Tacotron2 model checkpoint file')
- parser.add_argument('--waveglow', type=str,
- help='Full path to the WaveGlow model checkpoint file')
- parser.add_argument('-s', '--sigma-infer', default=0.6, type=float,
- help='Standard deviation of the Gaussian distribution')
- parser.add_argument('-d', '--denoising-strength', default=0.01, type=float,
- help='Denoising strength for removing model bias')
- parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
- help='Sampling rate')
-
- run_mode = parser.add_mutually_exclusive_group()
- run_mode.add_argument('--fp16', action='store_true',
- help='Run inference with FP16')
- run_mode.add_argument('--cpu', action='store_true',
- help='Run inference on CPU')
-
- parser.add_argument('--log-file', type=str, default='nvlog.json',
- help='Filename for logging')
- parser.add_argument('--stft-hop-length', type=int, default=256,
- help='STFT hop length for estimating audio length from mel size')
- parser.add_argument('--num-iters', type=int, default=10,
- help='Number of iterations')
- parser.add_argument('-il', '--input-length', type=int, default=64,
- help='Input length')
- parser.add_argument('-bs', '--batch-size', type=int, default=1,
- help='Batch size')
-
-
- return parser
-
-
-def print_stats(measurements_all):
-
- throughput = measurements_all['throughput']
- preprocessing = measurements_all['pre_processing']
- type_conversion = measurements_all['type_conversion']
- storage = measurements_all['storage']
- data_transfer = measurements_all['data_transfer']
- postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
- latency = measurements_all['latency']
- waveglow_latency = measurements_all['waveglow_latency']
- tacotron2_latency = measurements_all['tacotron2_latency']
- denoiser_latency = measurements_all['denoiser_latency']
- num_mels_per_audio = measurements_all['num_mels_per_audio']
-
- latency.sort()
-
- cf_50 = max(latency[:int(len(latency)*0.50)])
- cf_90 = max(latency[:int(len(latency)*0.90)])
- cf_95 = max(latency[:int(len(latency)*0.95)])
- cf_99 = max(latency[:int(len(latency)*0.99)])
- cf_100 = max(latency[:int(len(latency)*1.0)])
-
- print("Throughput average (samples/sec) = {:.0f}".format(np.mean(throughput)))
- print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
- print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
- print("Number of mels per audio average = {:.0f}".format(np.mean(num_mels_per_audio)))
- print("Tacotron2 latency average (seconds) = {:.2f}".format(np.mean(tacotron2_latency)))
- print("WaveGlow latency average (seconds) = {:.2f}".format(np.mean(waveglow_latency)))
- print("Denoiser latency average (seconds) = {:.4f}".format(np.mean(denoiser_latency)))
- print("Latency average (seconds) = {:.2f}".format(np.mean(latency)))
- print("Latency std (seconds) = {:.2f}".format(np.std(latency)))
- print("Latency cl 50 (seconds) = {:.2f}".format(cf_50))
- print("Latency cl 90 (seconds) = {:.2f}".format(cf_90))
- print("Latency cl 95 (seconds) = {:.2f}".format(cf_95))
- print("Latency cl 99 (seconds) = {:.2f}".format(cf_99))
- print("Latency cl 100 (seconds) = {:.2f}".format(cf_100))
-
-
-def main():
- """
- Launches text to speech (inference).
- Inference is executed on a single GPU or CPU.
- """
- parser = argparse.ArgumentParser(
- description='PyTorch Tacotron 2 Inference')
- parser = parse_args(parser)
- args, unknown_args = parser.parse_known_args()
-
- DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
- StdOutBackend(Verbosity.VERBOSE)])
- for k,v in vars(args).items():
- DLLogger.log(step="PARAMETER", data={k:v})
- DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
-
- measurements_all = {"pre_processing": [],
- "tacotron2_latency": [],
- "waveglow_latency": [],
- "denoiser_latency": [],
- "latency": [],
- "type_conversion": [],
- "data_transfer": [],
- "storage": [],
- "tacotron2_items_per_sec": [],
- "waveglow_items_per_sec": [],
- "num_mels_per_audio": [],
- "throughput": []}
-
- print("args:", args, unknown_args)
-
- tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
- args.fp16, args.cpu, forward_is_infer=True)
- waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
- args.fp16, args.cpu, forward_is_infer=True)
- denoiser = Denoiser(waveglow)
- if not args.cpu:
- denoiser.cuda()
-
- texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
- texts = [texts[0][:args.input_length]]
- texts = texts*args.batch_size
-
- warmup_iters = 3
-
- for iter in range(args.num_iters):
-
- measurements = {}
-
- with MeasureTime(measurements, "pre_processing", args.cpu):
- sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)
-
- with torch.no_grad():
- with MeasureTime(measurements, "latency", args.cpu):
- with MeasureTime(measurements, "tacotron2_latency", args.cpu):
- mel, mel_lengths, _ = tacotron2.infer(sequences_padded, input_lengths)
-
- with MeasureTime(measurements, "waveglow_latency", args.cpu):
- audios = waveglow.infer(mel, sigma=args.sigma_infer)
-
- num_mels = mel.size(0)*mel.size(2)
- num_samples = audios.size(0)*audios.size(1)
-
- with MeasureTime(measurements, "type_conversion", args.cpu):
- audios = audios.float()
-
- with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu):
- audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
-
- with MeasureTime(measurements, "data_transfer", args.cpu):
- audios = audios.cpu()
-
- with MeasureTime(measurements, "storage", args.cpu):
- audios = audios.numpy()
- for i, audio in enumerate(audios):
- audio_path = "audio_"+str(i)+".wav"
- write(audio_path, args.sampling_rate,
- audio[:mel_lengths[i]*args.stft_hop_length])
-
- measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
- measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
- measurements['num_mels_per_audio'] = mel.size(2)
- measurements['throughput'] = num_samples/measurements['latency']
-
- if iter >= warmup_iters:
- for k,v in measurements.items():
- measurements_all[k].append(v)
- DLLogger.log(step=(iter-warmup_iters), data={k: v})
-
- DLLogger.flush()
-
- print_stats(measurements_all)
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/test_infer.sh b/demo/Tacotron2/test_infer.sh
deleted file mode 100644
index 103fb941..00000000
--- a/demo/Tacotron2/test_infer.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-BATCH_SIZE=1
-INPUT_LENGTH=128
-NUM_ITERS=1003 # extra 3 iterations for warmup
-TACOTRON2_CKPT="nvidia_tacotron2pyt_fp16_20190427"
-WAVEGLOW_CKPT="nvidia_waveglow256pyt_fp16"
-RUN_MODE="" # = fp32
-LOG_RUN_MODE="gpu_fp32"
-TEST_PROGRAM="test_infer.py"
-WN_CHANNELS=512
-LOG_SUFFIX_ADD="" #additional info, e.g., GPU type
-
-while [ -n "$1" ]
-do
- case "$1" in
- -bs|--batch-size)
- BATCH_SIZE="$2"
- shift
- ;;
- -il|--input-length)
- INPUT_LENGTH="$2"
- shift
- ;;
- --num-iters)
- NUM_ITERS="$2"
- shift
- ;;
- --test)
- TEST_PROGRAM="$2"
- shift
- ;;
- --tacotron2)
- TACOTRON2_CKPT="$2"
- shift
- ;;
- --encoder)
- ENCODER_CKPT="$2"
- shift
- ;;
- --decoder)
- DECODER_CKPT="$2"
- shift
- ;;
- --postnet)
- POSTNET_CKPT="$2"
- shift
- ;;
- --waveglow)
- WAVEGLOW_CKPT="$2"
- shift
- ;;
- --wn-channels)
- WN_CHANNELS="$2"
- shift
- ;;
- --cpu)
- RUN_MODE="--cpu"
- LOG_RUN_MODE="cpu_fp32"
- ;;
- --fp16)
- RUN_MODE="--fp16"
- LOG_RUN_MODE="gpu_fp16"
- ;;
- --log-suffix)
- LOG_SUFFIX_ADD="$2"
- shift
- ;;
- *)
- echo "Option $1 not recognized"
- esac
- shift
-done
-
-LOG_SUFFIX=bs${BATCH_SIZE}_il${INPUT_LENGTH}_${LOG_RUN_MODE}_wn${WN_CHANNELS}_${LOG_SUFFIX_ADD}
-NVLOG_FILE=nvlog_${LOG_SUFFIX}.json
-TMP_LOGFILE=tmp_log_${LOG_SUFFIX}.log
-LOGFILE=log_${LOG_SUFFIX}.log
-
-
-if [ "$TEST_PROGRAM" = "tensorrt/test_infer_trt.py" ]
-then
- TACOTRON2_PARAMS="--encoder $ENCODER_CKPT --decoder $DECODER_CKPT --postnet $POSTNET_CKPT"
-else
- TACOTRON2_PARAMS="--tacotron2 $TACOTRON2_CKPT"
-fi
-
-set -x
-python3 $TEST_PROGRAM \
- $TACOTRON2_PARAMS \
- --waveglow $WAVEGLOW_CKPT \
- --batch-size $BATCH_SIZE \
- --input-length $INPUT_LENGTH \
- --log-file $NVLOG_FILE \
- --num-iters $NUM_ITERS \
- --wn-channels $WN_CHANNELS \
- $RUN_MODE \
- |& tee $TMP_LOGFILE
-set +x
-
-
-PERF=$(cat $TMP_LOGFILE | grep -F 'Throughput average (samples/sec)' | awk -F'= ' '{print $2}')
-NUM_MELS=$(cat $TMP_LOGFILE | grep -F 'Number of mels per audio average' | awk -F'= ' '{print $2}')
-LATENCY=$(cat $TMP_LOGFILE | grep -F 'Latency average (seconds)' | awk -F'= ' '{print $2}')
-LATENCYSTD=$(cat $TMP_LOGFILE | grep -F 'Latency std (seconds)' | awk -F'= ' '{print $2}')
-LATENCY50=$(cat $TMP_LOGFILE | grep -F 'Latency cl 50 (seconds)' | awk -F'= ' '{print $2}')
-LATENCY90=$(cat $TMP_LOGFILE | grep -F 'Latency cl 90 (seconds)' | awk -F'= ' '{print $2}')
-LATENCY95=$(cat $TMP_LOGFILE | grep -F 'Latency cl 95 (seconds)' | awk -F'= ' '{print $2}')
-LATENCY99=$(cat $TMP_LOGFILE | grep -F 'Latency cl 99 (seconds)' | awk -F'= ' '{print $2}')
-
-echo "$BATCH_SIZE,$INPUT_LENGTH,$LOG_RUN_MODE,$NUM_ITERS,$LATENCY,$LATENCYSTD,$LATENCY50,$LATENCY90,$LATENCY95,$LATENCY99,$PERF,$NUM_MELS" | tee $LOGFILE
diff --git a/demo/Tacotron2/train.py b/demo/Tacotron2/train.py
deleted file mode 100644
index 55a9e56f..00000000
--- a/demo/Tacotron2/train.py
+++ /dev/null
@@ -1,535 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import time
-import argparse
-import numpy as np
-from contextlib import contextmanager
-
-import torch
-from torch.utils.data import DataLoader
-from torch.autograd import Variable
-from torch.nn.parameter import Parameter
-
-import torch.distributed as dist
-from torch.utils.data.distributed import DistributedSampler
-
-from apex.parallel import DistributedDataParallel as DDP
-
-import models
-import loss_functions
-import data_functions
-
-import dllogger as DLLogger
-from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-
-from scipy.io.wavfile import write as write_wav
-
-from apex import amp
-amp.lists.functional_overrides.FP32_FUNCS.remove('softmax')
-amp.lists.functional_overrides.FP16_FUNCS.append('softmax')
-
-
-def parse_args(parser):
- """
- Parse commandline arguments.
- """
-
- parser.add_argument('-o', '--output', type=str, required=True,
- help='Directory to save checkpoints')
- parser.add_argument('-d', '--dataset-path', type=str,
- default='./', help='Path to dataset')
- parser.add_argument('-m', '--model-name', type=str, default='', required=True,
- help='Model to train')
- parser.add_argument('--log-file', type=str, default='nvlog.json',
- help='Filename for logging')
- parser.add_argument('--anneal-steps', nargs='*',
- help='Epochs after which decrease learning rate')
- parser.add_argument('--anneal-factor', type=float, choices=[0.1, 0.3], default=0.1,
- help='Factor for annealing learning rate')
-
- # training
- training = parser.add_argument_group('training setup')
- training.add_argument('--epochs', type=int, required=True,
- help='Number of total epochs to run')
- training.add_argument('--epochs-per-checkpoint', type=int, default=50,
- help='Number of epochs per checkpoint')
- training.add_argument('--checkpoint-path', type=str, default='',
- help='Checkpoint path to resume training')
- training.add_argument('--resume-from-last', action='store_true',
- help='Resumes training from the last checkpoint; uses the directory provided with \'--output\' option to search for the checkpoint \"checkpoint__last.pt\"')
- training.add_argument('--dynamic-loss-scaling', type=bool, default=True,
- help='Enable dynamic loss scaling')
- training.add_argument('--amp', action='store_true',
- help='Enable AMP')
- training.add_argument('--cudnn-enabled', action='store_true',
- help='Enable cudnn')
- training.add_argument('--cudnn-benchmark', action='store_true',
- help='Run cudnn benchmark')
- training.add_argument('--disable-uniform-initialize-bn-weight', action='store_true',
- help='disable uniform initialization of batchnorm layer weight')
-
- optimization = parser.add_argument_group('optimization setup')
- optimization.add_argument(
- '--use-saved-learning-rate', default=False, type=bool)
- optimization.add_argument('-lr', '--learning-rate', type=float, required=True,
- help='Learing rate')
- optimization.add_argument('--weight-decay', default=1e-6, type=float,
- help='Weight decay')
- optimization.add_argument('--grad-clip-thresh', default=1.0, type=float,
- help='Clip threshold for gradients')
- optimization.add_argument('-bs', '--batch-size', type=int, required=True,
- help='Batch size per GPU')
- optimization.add_argument('--grad-clip', default=5.0, type=float,
- help='Enables gradient clipping and sets maximum gradient norm value')
-
- # dataset parameters
- dataset = parser.add_argument_group('dataset parameters')
- dataset.add_argument('--load-mel-from-disk', action='store_true',
- help='Loads mel spectrograms from disk instead of computing them on the fly')
- dataset.add_argument('--training-files',
- default='filelists/ljs_audio_text_train_filelist.txt',
- type=str, help='Path to training filelist')
- dataset.add_argument('--validation-files',
- default='filelists/ljs_audio_text_val_filelist.txt',
- type=str, help='Path to validation filelist')
- dataset.add_argument('--text-cleaners', nargs='*',
- default=['english_cleaners'], type=str,
- help='Type of text cleaners for input text')
-
- # audio parameters
- audio = parser.add_argument_group('audio parameters')
- audio.add_argument('--max-wav-value', default=32768.0, type=float,
- help='Maximum audiowave value')
- audio.add_argument('--sampling-rate', default=22050, type=int,
- help='Sampling rate')
- audio.add_argument('--filter-length', default=1024, type=int,
- help='Filter length')
- audio.add_argument('--hop-length', default=256, type=int,
- help='Hop (stride) length')
- audio.add_argument('--win-length', default=1024, type=int,
- help='Window length')
- audio.add_argument('--mel-fmin', default=0.0, type=float,
- help='Minimum mel frequency')
- audio.add_argument('--mel-fmax', default=8000.0, type=float,
- help='Maximum mel frequency')
-
- distributed = parser.add_argument_group('distributed setup')
- # distributed.add_argument('--distributed-run', default=True, type=bool,
- # help='enable distributed run')
- distributed.add_argument('--rank', default=0, type=int,
- help='Rank of the process, do not set! Done by multiproc module')
- distributed.add_argument('--world-size', default=1, type=int,
- help='Number of processes, do not set! Done by multiproc module')
- distributed.add_argument('--dist-url', type=str, default='tcp://localhost:23456',
- help='Url used to set up distributed training')
- distributed.add_argument('--group-name', type=str, default='group_name',
- required=False, help='Distributed group name')
- distributed.add_argument('--dist-backend', default='nccl', type=str, choices={'nccl'},
- help='Distributed run backend')
-
- benchmark = parser.add_argument_group('benchmark')
- benchmark.add_argument('--bench-class', type=str, default='')
-
- return parser
-
-
-def reduce_tensor(tensor, num_gpus):
- rt = tensor.clone()
- dist.all_reduce(rt, op=dist.reduce_op.SUM)
- rt /= num_gpus
- return rt
-
-
-def init_distributed(args, world_size, rank, group_name):
- assert torch.cuda.is_available(), "Distributed mode requires CUDA."
- print("Initializing Distributed")
-
- # Set cuda device so everything is done on the right GPU.
- torch.cuda.set_device(rank % torch.cuda.device_count())
-
- # Initialize distributed communication
- dist.init_process_group(
- backend=args.dist_backend, init_method=args.dist_url,
- world_size=world_size, rank=rank, group_name=group_name)
-
- print("Done initializing distributed")
-
-
-def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_name,
- local_rank, world_size):
-
- random_rng_state = torch.random.get_rng_state().cuda()
- cuda_rng_state = torch.cuda.get_rng_state(local_rank).cuda()
-
- random_rng_states_all = [torch.empty_like(random_rng_state) for _ in range(world_size)]
- cuda_rng_states_all = [torch.empty_like(cuda_rng_state) for _ in range(world_size)]
-
- if world_size > 1:
- dist.all_gather(random_rng_states_all, random_rng_state)
- dist.all_gather(cuda_rng_states_all, cuda_rng_state)
- else:
- random_rng_states_all = [random_rng_state]
- cuda_rng_states_all = [cuda_rng_state]
-
- random_rng_states_all = torch.stack(random_rng_states_all).cpu()
- cuda_rng_states_all = torch.stack(cuda_rng_states_all).cpu()
-
- if local_rank == 0:
- checkpoint = {'epoch': epoch,
- 'cuda_rng_state_all': cuda_rng_states_all,
- 'random_rng_states_all': random_rng_states_all,
- 'config': config,
- 'state_dict': model.state_dict(),
- 'optimizer': optimizer.state_dict()}
- if amp_run:
- checkpoint['amp'] = amp.state_dict()
-
- checkpoint_filename = "checkpoint_{}_{}.pt".format(model_name, epoch)
- checkpoint_path = os.path.join(
- output_dir, checkpoint_filename)
- print("Saving model and optimizer state at epoch {} to {}".format(
- epoch, checkpoint_path))
- torch.save(checkpoint, checkpoint_path)
-
- symlink_src = checkpoint_filename
- symlink_dst = os.path.join(
- output_dir, "checkpoint_{}_last.pt".format(model_name))
- if os.path.exists(symlink_dst) and os.path.islink(symlink_dst):
- print("|||| Updating symlink", symlink_dst, "to point to", symlink_src)
- os.remove(symlink_dst)
-
- os.symlink(symlink_src, symlink_dst)
-
-
-def get_last_checkpoint_filename(output_dir, model_name):
- symlink = os.path.join(output_dir, "checkpoint_{}_last.pt".format(model_name))
- if os.path.exists(symlink):
- print("|||| Loading checkpoint from symlink", symlink)
- return os.path.join(output_dir, os.readlink(symlink))
- else:
- print("|||| No last checkpoint available - starting from epoch 0 ")
- return ""
-
-
-def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, local_rank):
-
- checkpoint = torch.load(filepath, map_location='cpu')
-
- epoch[0] = checkpoint['epoch']+1
- device_id = local_rank % torch.cuda.device_count()
- torch.cuda.set_rng_state(checkpoint['cuda_rng_state_all'][device_id])
- torch.random.set_rng_state(checkpoint['random_rng_states_all'][device_id])
- config = checkpoint['config']
- model.load_state_dict(checkpoint['state_dict'])
- optimizer.load_state_dict(checkpoint['optimizer'])
-
- if amp_run:
- amp.load_state_dict(checkpoint['amp'])
-
-
-# adapted from: https://discuss.pytorch.org/t/opinion-eval-should-be-a-context-manager/18998/3
-# Following snippet is licensed under MIT license
-
-@contextmanager
-def evaluating(model):
- '''Temporarily switch to evaluation mode.'''
- istrain = model.training
- try:
- model.eval()
- yield model
- finally:
- if istrain:
- model.train()
-
-
-def validate(model, criterion, valset, epoch, batch_iter, batch_size,
- world_size, collate_fn, distributed_run, rank, batch_to_gpu):
- """Handles all the validation scoring and printing"""
- with evaluating(model), torch.no_grad():
- val_sampler = DistributedSampler(valset) if distributed_run else None
- val_loader = DataLoader(valset, num_workers=1, shuffle=False,
- sampler=val_sampler,
- batch_size=batch_size, pin_memory=False,
- collate_fn=collate_fn)
-
- val_loss = 0.0
- num_iters = 0
- val_items_per_sec = 0.0
- for i, batch in enumerate(val_loader):
- torch.cuda.synchronize()
- iter_start_time = time.perf_counter()
-
- x, y, num_items = batch_to_gpu(batch)
- y_pred = model(x)
- loss = criterion(y_pred, y)
- if distributed_run:
- reduced_val_loss = reduce_tensor(loss.data, world_size).item()
- reduced_num_items = reduce_tensor(num_items.data, 1).item()
- else: #
- reduced_val_loss = loss.item()
- reduced_num_items = num_items.item()
- val_loss += reduced_val_loss
-
- torch.cuda.synchronize()
- iter_stop_time = time.perf_counter()
- iter_time = iter_stop_time - iter_start_time
-
- items_per_sec = reduced_num_items/iter_time
- DLLogger.log(step=(epoch, batch_iter, i), data={'val_items_per_sec': items_per_sec})
- val_items_per_sec += items_per_sec
- num_iters += 1
-
- val_loss = val_loss/(i + 1)
-
- DLLogger.log(step=(epoch,), data={'val_loss': val_loss})
- DLLogger.log(step=(epoch,), data={'val_items_per_sec':
- (val_items_per_sec/num_iters if num_iters > 0 else 0.0)})
-
- return val_loss
-
-def adjust_learning_rate(iteration, epoch, optimizer, learning_rate,
- anneal_steps, anneal_factor, rank):
-
- p = 0
- if anneal_steps is not None:
- for i, a_step in enumerate(anneal_steps):
- if epoch >= int(a_step):
- p = p+1
-
- if anneal_factor == 0.3:
- lr = learning_rate*((0.1 ** (p//2))*(1.0 if p % 2 == 0 else 0.3))
- else:
- lr = learning_rate*(anneal_factor ** p)
-
- if optimizer.param_groups[0]['lr'] != lr:
- DLLogger.log(step=(epoch, iteration), data={'learning_rate changed': str(optimizer.param_groups[0]['lr'])+" -> "+str(lr)})
-
- for param_group in optimizer.param_groups:
- param_group['lr'] = lr
-
-
-def main():
-
- parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
- parser = parse_args(parser)
- args, _ = parser.parse_known_args()
-
- if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
- local_rank = int(os.environ['LOCAL_RANK'])
- world_size = int(os.environ['WORLD_SIZE'])
- else:
- local_rank = args.rank
- world_size = args.world_size
-
- distributed_run = world_size > 1
-
- if local_rank == 0:
- DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
- args.output+'/'+args.log_file),
- StdOutBackend(Verbosity.VERBOSE)])
- else:
- DLLogger.init(backends=[])
-
- for k,v in vars(args).items():
- DLLogger.log(step="PARAMETER", data={k:v})
- DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
-
- model_name = args.model_name
- parser = models.parse_model_args(model_name, parser)
- args, _ = parser.parse_known_args()
-
- torch.backends.cudnn.enabled = args.cudnn_enabled
- torch.backends.cudnn.benchmark = args.cudnn_benchmark
-
- if distributed_run:
- init_distributed(args, world_size, local_rank, args.group_name)
-
- torch.cuda.synchronize()
- run_start_time = time.perf_counter()
-
- model_config = models.get_model_config(model_name, args)
- model = models.get_model(model_name, model_config,
- to_cuda=True,
- uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)
-
- if not args.amp and distributed_run:
- model = DDP(model)
-
- optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
- weight_decay=args.weight_decay)
-
- if args.amp:
- model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
- if distributed_run:
- model = DDP(model)
-
- try:
- sigma = args.sigma
- except AttributeError:
- sigma = None
-
- start_epoch = [0]
-
- if args.resume_from_last:
- args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name)
-
- if args.checkpoint_path is not "":
- load_checkpoint(model, optimizer, start_epoch, model_config,
- args.amp, args.checkpoint_path, local_rank)
-
- start_epoch = start_epoch[0]
-
- criterion = loss_functions.get_loss_function(model_name, sigma)
-
- try:
- n_frames_per_step = args.n_frames_per_step
- except AttributeError:
- n_frames_per_step = None
-
- collate_fn = data_functions.get_collate_function(
- model_name, n_frames_per_step)
- trainset = data_functions.get_data_loader(
- model_name, args.dataset_path, args.training_files, args)
- if distributed_run:
- train_sampler = DistributedSampler(trainset)
- shuffle = False
- else:
- train_sampler = None
- shuffle = True
-
- train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
- sampler=train_sampler,
- batch_size=args.batch_size, pin_memory=False,
- drop_last=True, collate_fn=collate_fn)
-
- valset = data_functions.get_data_loader(
- model_name, args.dataset_path, args.validation_files, args)
-
- batch_to_gpu = data_functions.get_batch_to_gpu(model_name)
-
- iteration = 0
- train_epoch_items_per_sec = 0.0
- val_loss = 0.0
- num_iters = 0
-
- model.train()
-
- for epoch in range(start_epoch, args.epochs):
- torch.cuda.synchronize()
- epoch_start_time = time.perf_counter()
- # used to calculate avg items/sec over epoch
- reduced_num_items_epoch = 0
-
- train_epoch_items_per_sec = 0.0
-
- num_iters = 0
- reduced_loss = 0
-
- # if overflow at the last iteration then do not save checkpoint
- overflow = False
-
- if distributed_run:
- train_loader.sampler.set_epoch(epoch)
-
- for i, batch in enumerate(train_loader):
- torch.cuda.synchronize()
- iter_start_time = time.perf_counter()
- DLLogger.log(step=(epoch, i),
- data={'glob_iter/iters_per_epoch': str(iteration)+"/"+str(len(train_loader))})
-
- adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate,
- args.anneal_steps, args.anneal_factor, local_rank)
-
- model.zero_grad()
- x, y, num_items = batch_to_gpu(batch)
-
- y_pred = model(x)
- loss = criterion(y_pred, y)
-
- if distributed_run:
- reduced_loss = reduce_tensor(loss.data, world_size).item()
- reduced_num_items = reduce_tensor(num_items.data, 1).item()
- else:
- reduced_loss = loss.item()
- reduced_num_items = num_items.item()
- if np.isnan(reduced_loss):
- raise Exception("loss is NaN")
-
- DLLogger.log(step=(epoch,i), data={'train_loss': reduced_loss})
-
- num_iters += 1
-
- # accumulate number of items processed in this epoch
- reduced_num_items_epoch += reduced_num_items
-
- if args.amp:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- grad_norm = torch.nn.utils.clip_grad_norm_(
- amp.master_params(optimizer), args.grad_clip_thresh)
- else:
- loss.backward()
- grad_norm = torch.nn.utils.clip_grad_norm_(
- model.parameters(), args.grad_clip_thresh)
-
- optimizer.step()
-
- torch.cuda.synchronize()
- iter_stop_time = time.perf_counter()
- iter_time = iter_stop_time - iter_start_time
- items_per_sec = reduced_num_items/iter_time
- train_epoch_items_per_sec += items_per_sec
-
- DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec})
- DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
- iteration += 1
-
- torch.cuda.synchronize()
- epoch_stop_time = time.perf_counter()
- epoch_time = epoch_stop_time - epoch_start_time
-
- DLLogger.log(step=(epoch,), data={'train_items_per_sec':
- (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
- DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
- DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})
-
- val_loss = validate(model, criterion, valset, epoch, iteration,
- args.batch_size, world_size, collate_fn,
- distributed_run, local_rank, batch_to_gpu)
-
- if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
- save_checkpoint(model, optimizer, epoch, model_config,
- args.amp, args.output, args.model_name,
- local_rank, world_size)
- if local_rank == 0:
- DLLogger.flush()
-
- torch.cuda.synchronize()
- run_stop_time = time.perf_counter()
- run_time = run_stop_time - run_start_time
- DLLogger.log(step=tuple(), data={'run_time': run_time})
- DLLogger.log(step=tuple(), data={'val_loss': val_loss})
- DLLogger.log(step=tuple(), data={'train_items_per_sec':
- (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
-
- if local_rank == 0:
- DLLogger.flush()
-
-if __name__ == '__main__':
- main()
diff --git a/demo/Tacotron2/waveglow/arg_parser.py b/demo/Tacotron2/waveglow/arg_parser.py
deleted file mode 100644
index 7002bf6d..00000000
--- a/demo/Tacotron2/waveglow/arg_parser.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-
-def parse_waveglow_args(parent, add_help=False):
- """
- Parse commandline arguments.
- """
- parser = argparse.ArgumentParser(parents=[parent], add_help=add_help)
-
- # misc parameters
- parser.add_argument('--n-mel-channels', default=80, type=int,
- help='Number of bins in mel-spectrograms')
-
- # glow parameters
- parser.add_argument('--flows', default=12, type=int,
- help='Number of steps of flow')
- parser.add_argument('--groups', default=8, type=int,
- help='Number of samples in a group processed by the steps of flow')
- parser.add_argument('--early-every', default=4, type=int,
- help='Determines how often (i.e., after how many coupling layers) \
- a number of channels (defined by --early-size parameter) are output\
- to the loss function')
- parser.add_argument('--early-size', default=2, type=int,
- help='Number of channels output to the loss function')
- parser.add_argument('--sigma', default=1.0, type=float,
- help='Standard deviation used for sampling from Gaussian')
- parser.add_argument('--segment-length', default=4000, type=int,
- help='Segment length (audio samples) processed per iteration')
-
- # wavenet parameters
- wavenet = parser.add_argument_group('WaveNet parameters')
- wavenet.add_argument('--wn-kernel-size', default=3, type=int,
- help='Kernel size for dialted convolution in the affine coupling layer (WN)')
- wavenet.add_argument('--wn-channels', default=512, type=int,
- help='Number of channels in WN')
- wavenet.add_argument('--wn-layers', default=8, type=int,
- help='Number of layers in WN')
-
- return parser
diff --git a/demo/Tacotron2/waveglow/data_function.py b/demo/Tacotron2/waveglow/data_function.py
deleted file mode 100644
index 62076eba..00000000
--- a/demo/Tacotron2/waveglow/data_function.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-import random
-import common.layers as layers
-from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu
-
-
-class MelAudioLoader(torch.utils.data.Dataset):
- """
- 1) loads audio,text pairs
- 2) computes mel-spectrograms from audio files.
- """
-
- def __init__(self, dataset_path, audiopaths_and_text, args):
- self.audiopaths_and_text = load_filepaths_and_text(dataset_path, audiopaths_and_text)
- self.max_wav_value = args.max_wav_value
- self.sampling_rate = args.sampling_rate
- self.stft = layers.TacotronSTFT(
- args.filter_length, args.hop_length, args.win_length,
- args.n_mel_channels, args.sampling_rate, args.mel_fmin,
- args.mel_fmax)
- self.segment_length = args.segment_length
- random.seed(1234)
- random.shuffle(self.audiopaths_and_text)
-
- def get_mel_audio_pair(self, filename):
- audio, sampling_rate = load_wav_to_torch(filename)
-
- if sampling_rate != self.stft.sampling_rate:
- raise ValueError("{} {} SR doesn't match target {} SR".format(
- sampling_rate, self.stft.sampling_rate))
-
- # Take segment
- if audio.size(0) >= self.segment_length:
- max_audio_start = audio.size(0) - self.segment_length
- audio_start = random.randint(0, max_audio_start)
- audio = audio[audio_start:audio_start+self.segment_length]
- else:
- audio = torch.nn.functional.pad(
- audio, (0, self.segment_length - audio.size(0)), 'constant').data
-
- audio = audio / self.max_wav_value
- audio_norm = audio.unsqueeze(0)
- audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
- melspec = self.stft.mel_spectrogram(audio_norm)
- melspec = melspec.squeeze(0)
-
- return (melspec, audio, len(audio))
-
- def __getitem__(self, index):
- return self.get_mel_audio_pair(self.audiopaths_and_text[index][0])
-
- def __len__(self):
- return len(self.audiopaths_and_text)
-
-
-def batch_to_gpu(batch):
- x, y, len_y = batch
- x = to_gpu(x).float()
- y = to_gpu(y).float()
- len_y = to_gpu(torch.sum(len_y))
- return ((x, y), y, len_y)
diff --git a/demo/Tacotron2/waveglow/denoiser.py b/demo/Tacotron2/waveglow/denoiser.py
deleted file mode 100644
index 5dc2d789..00000000
--- a/demo/Tacotron2/waveglow/denoiser.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import sys
-sys.path.append('tacotron2')
-import torch
-from common.layers import STFT
-
-
-class Denoiser(torch.nn.Module):
- """ Removes model bias from audio produced with waveglow """
-
- def __init__(self, waveglow, filter_length=1024, n_overlap=4,
- win_length=1024, mode='zeros'):
- super(Denoiser, self).__init__()
- device = waveglow.upsample.weight.device
- dtype = waveglow.upsample.weight.dtype
- self.stft = STFT(filter_length=filter_length,
- hop_length=int(filter_length/n_overlap),
- win_length=win_length).to(device)
- if mode == 'zeros':
- mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device)
- elif mode == 'normal':
- mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device)
- else:
- raise Exception("Mode {} if not supported".format(mode))
-
- with torch.no_grad():
- bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
- bias_spec, _ = self.stft.transform(bias_audio)
-
- self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])
-
- def forward(self, audio, strength=0.1):
- audio_spec, audio_angles = self.stft.transform(audio)
- audio_spec_denoised = audio_spec - self.bias_spec * strength
- audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
- audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)
- return audio_denoised
diff --git a/demo/Tacotron2/waveglow/loss_function.py b/demo/Tacotron2/waveglow/loss_function.py
deleted file mode 100644
index 75620df9..00000000
--- a/demo/Tacotron2/waveglow/loss_function.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-
-class WaveGlowLoss(torch.nn.Module):
- def __init__(self, sigma=1.0):
- super(WaveGlowLoss, self).__init__()
- self.sigma = sigma
-
- def forward(self, model_output, clean_audio):
- # clean_audio is unused;
- z, log_s_list, log_det_W_list = model_output
- for i, log_s in enumerate(log_s_list):
- if i == 0:
- log_s_total = torch.sum(log_s)
- log_det_W_total = log_det_W_list[i]
- else:
- log_s_total = log_s_total + torch.sum(log_s)
- log_det_W_total += log_det_W_list[i]
-
- loss = torch.sum(
- z * z) / (2 * self.sigma * self.sigma) - log_s_total - log_det_W_total # noqa: E501
- return loss / (z.size(0) * z.size(1) * z.size(2))
diff --git a/demo/Tacotron2/waveglow/model.py b/demo/Tacotron2/waveglow/model.py
deleted file mode 100644
index 00a26421..00000000
--- a/demo/Tacotron2/waveglow/model.py
+++ /dev/null
@@ -1,343 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-from torch.autograd import Variable
-import torch.nn.functional as F
-import numpy as np
-
-
-@torch.jit.script
-def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
- n_channels_int = n_channels[0]
- in_act = input_a + input_b
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
- acts = t_act * s_act
- return acts
-
-
-class Invertible1x1Conv(torch.nn.Module):
- """
- The layer outputs both the convolution, and the log determinant
- of its weight matrix. If reverse=True it does convolution with
- inverse
- """
-
- def __init__(self, c):
- super(Invertible1x1Conv, self).__init__()
- self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
- bias=False)
-
- # Sample a random orthonormal matrix to initialize weights
- W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
-
- # Ensure determinant is 1.0 not -1.0
- if torch.det(W) < 0:
- W[:, 0] = -1 * W[:, 0]
- W = W.view(c, c, 1)
- W = W.contiguous()
- self.conv.weight.data = W
-
- def forward(self, z):
- # shape
- batch_size, group_size, n_of_groups = z.size()
-
- W = self.conv.weight.squeeze()
-
- # Forward computation
- log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze()
- z = self.conv(z)
- return z, log_det_W
-
-
- def infer(self, z):
- # shape
- batch_size, group_size, n_of_groups = z.size()
-
- W = self.conv.weight.squeeze()
-
- if not hasattr(self, 'W_inverse'):
- # Reverse computation
- W_inverse = W.float().inverse()
- W_inverse = Variable(W_inverse[..., None])
- if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor':
- W_inverse = W_inverse.half()
- self.W_inverse = W_inverse
- z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
- return z
-
-
-class WN(torch.nn.Module):
- """
- This is the WaveNet like layer for the affine coupling. The primary
- difference from WaveNet is the convolutions need not be causal. There is
- also no dilation size reset. The dilation only doubles on each layer
- """
-
- def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
- kernel_size):
- super(WN, self).__init__()
- assert(kernel_size % 2 == 1)
- assert(n_channels % 2 == 0)
- self.n_layers = n_layers
- self.n_channels = n_channels
- self.in_layers = torch.nn.ModuleList()
- self.res_skip_layers = torch.nn.ModuleList()
- self.cond_layers = torch.nn.ModuleList()
-
- start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
- start = torch.nn.utils.weight_norm(start, name='weight')
- self.start = start
-
- # Initializing last layer to 0 makes the affine coupling layers
- # do nothing at first. This helps with training stability
- end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
- end.weight.data.zero_()
- end.bias.data.zero_()
- self.end = end
-
- for i in range(n_layers):
- dilation = 2 ** i
- padding = int((kernel_size * dilation - dilation) / 2)
- in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size,
- dilation=dilation, padding=padding)
- in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
- self.in_layers.append(in_layer)
-
- cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
- cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
- self.cond_layers.append(cond_layer)
-
- # last one is not necessary
- if i < n_layers - 1:
- res_skip_channels = 2 * n_channels
- else:
- res_skip_channels = n_channels
- res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
- res_skip_layer = torch.nn.utils.weight_norm(
- res_skip_layer, name='weight')
- self.res_skip_layers.append(res_skip_layer)
-
- def forward(self, forward_input):
- audio, spect = forward_input
- audio = self.start(audio)
-
- for i in range(self.n_layers):
- acts = fused_add_tanh_sigmoid_multiply(
- self.in_layers[i](audio),
- self.cond_layers[i](spect),
- torch.IntTensor([self.n_channels]))
-
- res_skip_acts = self.res_skip_layers[i](acts)
- if i < self.n_layers - 1:
- audio = res_skip_acts[:, :self.n_channels, :] + audio
- skip_acts = res_skip_acts[:, self.n_channels:, :]
- else:
- skip_acts = res_skip_acts
-
- if i == 0:
- output = skip_acts
- else:
- output = skip_acts + output
- return self.end(output)
-
-
-class WaveGlow(torch.nn.Module):
- def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
- n_early_size, WN_config):
- super(WaveGlow, self).__init__()
-
- self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
- n_mel_channels,
- 1024, stride=256)
- assert(n_group % 2 == 0)
- self.n_flows = n_flows
- self.n_group = n_group
- self.n_early_every = n_early_every
- self.n_early_size = n_early_size
- self.WN = torch.nn.ModuleList()
- self.convinv = torch.nn.ModuleList()
-
- n_half = int(n_group / 2)
-
- # Set up layers with the right sizes based on how many dimensions
- # have been output already
- n_remaining_channels = n_group
- for k in range(n_flows):
- if k % self.n_early_every == 0 and k > 0:
- n_half = n_half - int(self.n_early_size / 2)
- n_remaining_channels = n_remaining_channels - self.n_early_size
- self.convinv.append(Invertible1x1Conv(n_remaining_channels))
- self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
- self.n_remaining_channels = n_remaining_channels
-
- def forward(self, forward_input):
- """
- forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
- forward_input[1] = audio: batch x time
- """
- spect, audio = forward_input
-
- # Upsample spectrogram to size of audio
- spect = self.upsample(spect)
- assert(spect.size(2) >= audio.size(1))
- if spect.size(2) > audio.size(1):
- spect = spect[:, :, :audio.size(1)]
-
- spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
- spect = spect.contiguous().view(spect.size(0), spect.size(1), -1)
- spect = spect.permute(0, 2, 1)
-
- audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
- output_audio = []
- log_s_list = []
- log_det_W_list = []
-
- for k in range(self.n_flows):
- if k % self.n_early_every == 0 and k > 0:
- output_audio.append(audio[:, :self.n_early_size, :])
- audio = audio[:, self.n_early_size:, :]
-
- audio, log_det_W = self.convinv[k](audio)
- log_det_W_list.append(log_det_W)
-
- n_half = int(audio.size(1) / 2)
- audio_0 = audio[:, :n_half, :]
- audio_1 = audio[:, n_half:, :]
-
- output = self.WN[k]((audio_0, spect))
- log_s = output[:, n_half:, :]
- b = output[:, :n_half, :]
- audio_1 = torch.exp(log_s) * audio_1 + b
- log_s_list.append(log_s)
-
- audio = torch.cat([audio_0, audio_1], 1)
-
- output_audio.append(audio)
- return torch.cat(output_audio, 1), log_s_list, log_det_W_list
-
- def infer(self, spect, sigma=1.0):
-
- spect = self.upsample(spect)
- # trim conv artifacts. maybe pad spec to kernel multiple
- time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
- spect = spect[:, :, :-time_cutoff]
-
- spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
- spect = spect.contiguous().view(spect.size(0), spect.size(1), -1)
- spect = spect.permute(0, 2, 1)
-
- audio = torch.randn(spect.size(0),
- self.n_remaining_channels,
- spect.size(2), device=spect.device).to(spect.dtype)
-
- audio = torch.autograd.Variable(sigma * audio)
-
- for k in reversed(range(self.n_flows)):
- n_half = int(audio.size(1) / 2)
- audio_0 = audio[:, :n_half, :]
- audio_1 = audio[:, n_half:, :]
-
- output = self.WN[k]((audio_0, spect))
- s = output[:, n_half:, :]
- b = output[:, :n_half, :]
- audio_1 = (audio_1 - b) / torch.exp(s)
- audio = torch.cat([audio_0, audio_1], 1)
-
- audio = self.convinv[k].infer(audio)
-
- if k % self.n_early_every == 0 and k > 0:
- z = torch.randn(spect.size(0), self.n_early_size, spect.size(
- 2), device=spect.device).to(spect.dtype)
- audio = torch.cat((sigma * z, audio), 1)
-
- audio = audio.permute(
- 0, 2, 1).contiguous().view(
- audio.size(0), -1).data
- return audio
-
-
- def infer_onnx(self, spect, z, sigma=0.9):
-
- spect = self.upsample(spect)
- # trim conv artifacts. maybe pad spec to kernel multiple
- time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
- spect = spect[:, :, :-time_cutoff]
-
- length_spect_group = spect.size(2)//8
- mel_dim = 80
- batch_size = spect.size(0)
-
- spect = torch.squeeze(spect, 3)
- spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group))
- spect = spect.permute(0, 2, 1, 3)
- spect = spect.contiguous()
- spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim))
- spect = spect.permute(0, 2, 1)
- spect = torch.unsqueeze(spect, 3)
- spect = spect.contiguous()
-
- audio = z[:, :self.n_remaining_channels, :, :]
- z = z[:, self.n_remaining_channels:self.n_group, :, :]
-
- # Convert sigma to a torch tensor to ensure constant is exported properly
- if audio.type() == 'torch.cuda.HalfTensor' or audio.type() == 'torch.HalfTensor':
- sigma = torch.tensor(np.float16(sigma))
- else:
- sigma = torch.tensor(np.float32(sigma))
- audio = sigma * audio
-
- for k in reversed(range(self.n_flows)):
- n_half = int(audio.size(1) // 2)
- audio_0 = audio[:, :n_half, :, :]
- audio_1 = audio[:, n_half:(n_half+n_half), :, :]
-
- output = self.WN[k]((audio_0, spect))
- s = output[:, n_half:(n_half+n_half), :, :]
- b = output[:, :n_half, :, :]
- audio_1 = (audio_1 - b) / torch.exp(s)
- audio = torch.cat([audio_0, audio_1], 1)
- audio = self.convinv[k](audio)
-
- if k % self.n_early_every == 0 and k > 0:
- audio = torch.cat((z[:, :self.n_early_size, :, :], audio), 1)
- z = z[:, self.n_early_size:self.n_group, :, :]
-
- audio = torch.squeeze(audio, 3)
- audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group))
-
- return audio
-
-
- @staticmethod
- def remove_weightnorm(model):
- waveglow = model
- for WN in waveglow.WN:
- WN.start = torch.nn.utils.remove_weight_norm(WN.start)
- WN.in_layers = remove(WN.in_layers)
- WN.cond_layers = remove(WN.cond_layers)
- WN.res_skip_layers = remove(WN.res_skip_layers)
- return waveglow
-
-
-def remove(conv_list):
- new_conv_list = torch.nn.ModuleList()
- for old_conv in conv_list:
- old_conv = torch.nn.utils.remove_weight_norm(old_conv)
- new_conv_list.append(old_conv)
- return new_conv_list
diff --git a/demo/experimental/HuggingFace-Diffusers/README.md b/demo/experimental/HuggingFace-Diffusers/README.md
deleted file mode 100644
index d0e4e563..00000000
--- a/demo/experimental/HuggingFace-Diffusers/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Introduction
-
-This demo notebook showcases the acceleration of Stable Diffusion pipeline using TensorRT through HuggingFace pipelines.
-
-# Setup
-
-### Clone the TensorRT OSS repository
-
-```bash
-git clone git@github.com:NVIDIA/TensorRT.git -b release/9.3 --single-branch
-cd TensorRT/demo/experimental/HuggingFace-Diffusers
-```
-
-### Launch TensorRT NGC container
-
-Install nvidia-docker using [these intructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). Launch the docker container with the following command:
-
-```bash
-docker run --rm -it --gpus all -p 8888:8888 -v $PWD:/workspace nvcr.io/nvidia/tensorrt:23.04-py3 /bin/bash
-```
-
-### Run Jupyter Notebook
-
-Install `jupyter` with:
-
-```bash
-pip install jupyter
-```
-
-Launch the notebook within the container with:
-
-```bash
-jupyter notebook --ip 0.0.0.0 TensorRT-diffusers-txt2img.ipynb --allow-root --no-browser
-```
-
-Follow the console output for the link to run the notebook on your host machine.
diff --git a/demo/experimental/HuggingFace-Diffusers/TensorRT-diffusers-txt2img.ipynb b/demo/experimental/HuggingFace-Diffusers/TensorRT-diffusers-txt2img.ipynb
deleted file mode 100644
index 23eb1492..00000000
--- a/demo/experimental/HuggingFace-Diffusers/TensorRT-diffusers-txt2img.ipynb
+++ /dev/null
@@ -1,1290 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "14941611",
- "metadata": {},
- "source": [
- "# Stable Diffusion acceleration with TensorRT"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "47c80a60",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Copyright 2023 NVIDIA Corporation. All Rights Reserved.\n",
- "#\n",
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# http://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License.\n",
- "# =============================================================================="
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7a9c6d74",
- "metadata": {},
- "source": [
- "# Install Prerequisites"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "b32d847b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Disable warnings if pip is run as root.\n",
- "import os\n",
- "os.environ['PIP_ROOT_USER_ACTION']='ignore'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "cd9e73ba",
- "metadata": {},
- "outputs": [],
- "source": [
- "!python -m pip install --upgrade --quiet pip"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "d0214ad4",
- "metadata": {},
- "source": [
- "### Check NVIDIA GPU availability\n",
- "\n",
- "TensorRT acceleration for Diffusion models is available for NVIDIA Turing, Ampere, Ada Lovelace, and Hopper GPUs.\n",
- "\n",
- "For the following illustration we are using an A100 40GB GPU."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "362193c2",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Wed May 3 04:32:55 2023 \n",
- "+-----------------------------------------------------------------------------+\n",
- "| NVIDIA-SMI 515.44 Driver Version: 515.44 CUDA Version: 12.0 |\n",
- "|-------------------------------+----------------------+----------------------+\n",
- "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
- "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
- "| | | MIG M. |\n",
- "|===============================+======================+======================|\n",
- "| 0 NVIDIA Graphics... Off | 00000000:01:00.0 Off | 0 |\n",
- "| 65% 64C P0 81W / 200W | 86MiB / 40960MiB | 0% Default |\n",
- "| | | Disabled |\n",
- "+-------------------------------+----------------------+----------------------+\n",
- " \n",
- "+-----------------------------------------------------------------------------+\n",
- "| Processes: |\n",
- "| GPU GI CI PID Type Process name GPU Memory |\n",
- "| ID ID Usage |\n",
- "|=============================================================================|\n",
- "+-----------------------------------------------------------------------------+\n"
- ]
- }
- ],
- "source": [
- "!nvidia-smi"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "c79b497a",
- "metadata": {},
- "source": [
- "### Install PyTorch 1.x\n",
- "\n",
- "NOTE: this is a temporary workaround for ONNX export issues observed in PyTorch 2.0,"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "cabe1586",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --upgrade --quiet \"torch <2.0.0\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "f07ee31c",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "PyTorch version: 1.14.0a0+44dac51\n"
- ]
- }
- ],
- "source": [
- "import torch\n",
- "print(f\"PyTorch version: {torch.__version__}\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "f26c0286",
- "metadata": {},
- "source": [
- "### Install NVIDIA TensorRT\n",
- "\n",
- "TensorRT 8.6+ includes Stable Diffusion model optimizations out of the box."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "1e5b96f2",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --upgrade --quiet \"tensorrt>=8.6\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "34a83eb3",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "TensorRT version: 8.6.1\n"
- ]
- }
- ],
- "source": [
- "import tensorrt\n",
- "print(f\"TensorRT version: {tensorrt.__version__}\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "3a14e192",
- "metadata": {},
- "source": [
- "### Install TensorRT Utilities\n",
- "\n",
- "The TensorRT pipeline implementation in diffusers uses `polygraphy` API to reduce boilerplate code and simplify deployment of ONNX models in TensorRT.\n",
- "\n",
- "The pipeline also uses `onnx-graphsurgeon` and `onnxruntime` to sanitize (constant folding & shape inference) the exported ONNX models for deployment."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "465c891a",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --extra-index-url https://pypi.ngc.nvidia.com --upgrade --quiet \"onnx-graphsurgeon\" \"onnxruntime\" \"polygraphy\""
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "3d157e2d",
- "metadata": {},
- "source": [
- "### Install HuggingFace libraries\n",
- "\n",
- "HuggingFace `diffusers` library provides an implementation of the Stable Diffusion pipeline, including the constituent models. TensorRT txt2img pipeline was added in `diffusers` v0.16.0, which is a minimum requirement for the following illustration.\n",
- "\n",
- "The OpenAI CLIP text encoder and tokenizer models are obtained from HuggingFace `transformers` package."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "2c8f24c9",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --upgrade --quiet \"accelerate\" \"diffusers>=0.16\" \"transformers\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "eef75c7f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "diffusers version: 0.16.1\n"
- ]
- }
- ],
- "source": [
- "import diffusers\n",
- "print(f\"diffusers version: {diffusers.__version__}\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "7ee62e33",
- "metadata": {},
- "source": [
- "# Run Stable Diffusion\n",
- "\n",
- "The Stable Diffusion text2image pipeline takes a text prompt as an input and generates an image. A latent seed is used generate an initial random latent of size 64×64 and the text prompt is transformed to text embeddings of size 77×768 by a CLIP text encoder.\n",
- "\n",
- "Next the U-Net iteratively denoises the random latent representation over a user-specified number of steps while being conditioned on the text embeddings. The output of the U-Net in each iteration is a noise residual which is transformed into denoised latent image representation via a scheduler algorithm.\n",
- "\n",
- "For more information, see this [blog post](https://huggingface.co/blog/stable_diffusion)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "6892fdee",
- "metadata": {},
- "source": [
- "### Import SD pipeline from diffusers\n",
- "\n",
- "`StableDiffusionPipeline` contains all models required for inference - a tokenizer, `CLIPTextModel` (text encoder), `UNet2DConditionModel` (denoising UNet), and `AutoencoderKL` (VAE decoder)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "7d3abfe8",
- "metadata": {},
- "outputs": [],
- "source": [
- "from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "d68630b1",
- "metadata": {},
- "source": [
- "### Initialize DDIM scheduler\n",
- "\n",
- "A custom noise scheduler can be specified by the user. In our example we use [DDIM](https://huggingface.co/docs/diffusers/main/en/api/schedulers/ddim)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "8c0df48e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from diffusers import DDIMScheduler\n",
- "scheduler = DDIMScheduler.from_pretrained(\"stabilityai/stable-diffusion-2-1\", subfolder=\"scheduler\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "12fbcdc7",
- "metadata": {},
- "source": [
- "### Initialize native txt2img pipeline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "0e81860f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "e84d2ea17a5247fea357a7499fbc9cc3",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Fetching 11 files: 0%| | 0/11 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "pipe = StableDiffusionPipeline.from_pretrained(\n",
- " \"stabilityai/stable-diffusion-2-1\",\n",
- " revision='fp16',\n",
- " torch_dtype=torch.float16,\n",
- " scheduler=scheduler,\n",
- " image_height=512,\n",
- " image_width=512)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "c22d4314",
- "metadata": {},
- "source": [
- "### Load the pipeline models to GPU"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "04210f39",
- "metadata": {},
- "outputs": [],
- "source": [
- "pipe = pipe.to(\"cuda\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "a4be4ab7",
- "metadata": {},
- "source": [
- "### Run native txt2img pipeline\n",
- "\n",
- "The native pipeline in diffusers is implemented in PyTorch. Run it and display the generated image."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "d25b6e6a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "0cd1ce8dc34e4f71be0dc83f4cb99c7f",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/50 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "prompt = \"a beautiful photograph of Mt. Fuji during cherry blossom\"\n",
- "image = pipe(prompt).images[0]\n",
- "display(image)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "1b22dd3c",
- "metadata": {},
- "source": [
- "# Run Stable Diffusion with TensorRT"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "709ed5d5",
- "metadata": {},
- "source": [
- "### Initialize TensorRT txt2img pipeline\n",
- "\n",
- "TensorRT pipeline initialization is similar to the native pipeline, with a single extra option to specify the path to a [python file containing the TensorRT implementation](https://github.com/huggingface/diffusers/blob/main/examples/community/stable_diffusion_tensorrt_txt2img.py) in diffusers.\n",
- "`custom_pipeline=\"stable_diffusion_tensorrt_txt2img\"`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "fbd7f7a8",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.8/dist-packages/huggingface_hub/file_download.py:649: FutureWarning: 'cached_download' is the legacy way to download files from the HF hub, please consider upgrading to 'hf_hub_download'\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "pipe_trt = StableDiffusionPipeline.from_pretrained(\n",
- " \"stabilityai/stable-diffusion-2-1\",\n",
- " custom_pipeline=\"stable_diffusion_tensorrt_txt2img\",\n",
- " revision='fp16',\n",
- " torch_dtype=torch.float16,\n",
- " scheduler=scheduler,\n",
- " image_height=512,\n",
- " image_width=512)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "4e7e6c2e",
- "metadata": {},
- "source": [
- "### Specify cache folder name\n",
- "\n",
- "The ONNX models and TensorRT engines generated during the first inference run will be cached in this folder to speed up subsequent runs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "9d018680",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "b0caad71f89a45ceb6e6f790bfc28f71",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Fetching 16 files: 0%| | 0/16 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "pipe_trt.set_cached_folder(\"stabilityai/stable-diffusion-2-1\", revision='fp16')"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "aa7b0ede",
- "metadata": {},
- "source": [
- "### Build and load TensorRT engines\n",
- "\n",
- "The overloaded `to()` method builds the TensorRT engines and loads them up for inference.\n",
- "\n",
- "Note: ONNX export and TensorRT engine builds can take upto 20 minutes. Since the engines are cached, this latency is only observed on the first run below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "4761f142",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Running inference on device: cuda:0\n",
- "Building Engines...\n",
- "Engine build can take a while to complete\n",
- "Exporting model: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/clip.onnx\n",
- "/usr/local/lib/python3.8/dist-packages/transformers/models/clip/modeling_clip.py:759: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
- " mask.fill_(torch.tensor(torch.finfo(dtype).min))\n",
- "/usr/local/lib/python3.8/dist-packages/transformers/models/clip/modeling_clip.py:284: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n",
- "/usr/local/lib/python3.8/dist-packages/transformers/models/clip/modeling_clip.py:292: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):\n",
- "/usr/local/lib/python3.8/dist-packages/transformers/models/clip/modeling_clip.py:324: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n",
- "/usr/local/lib/python3.8/dist-packages/torch/onnx/symbolic_opset9.py:5502: UserWarning: Exporting aten::index operator of advanced indexing in opset 17 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
- " warnings.warn(\n",
- "Generating optimizing model: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/clip.opt.onnx\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "========== Diagnostic Run torch.onnx.export version 1.14.0a0+44dac51 ===========\n",
- "verbose: False, log level: Level.ERROR\n",
- "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n",
- "\n",
- "[W] 'colored' module is not installed, will not use colors when logging. To enable colors, please install the 'colored' module: python3 -m pip install colored\n",
- "[I] Folding Constants | Pass 1\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023-05-03 04:33:51.406400843 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/Unsqueeze\n",
- "2023-05-03 04:33:51.406435531 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/Unsqueeze_3\n",
- "2023-05-03 04:33:51.406444151 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/Unsqueeze_2\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Total Nodes | Original: 2984, After Folding: 1952 | 1032 Nodes Folded\n",
- "[I] Folding Constants | Pass 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023-05-03 04:33:52.696879372 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.696910088 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.696918469 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.696926095 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.696934158 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.696942510 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.696950232 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.696958072 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.22/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.696965180 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.696973214 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.696980375 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.696987618 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.696995459 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697004257 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697012159 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697019979 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.21/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697027040 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697035038 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697042100 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697049352 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697057307 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697065488 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697073272 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697081068 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.20/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697088186 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697098638 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697105869 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697113091 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697120929 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697129978 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697138083 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697145991 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.19/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697153267 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697161162 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697168356 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697175695 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697183855 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697192177 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697200276 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697208293 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.18/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697215648 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697223776 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697231416 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697239063 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697247190 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697255198 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697263097 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697271127 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.17/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697279108 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697287460 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697294912 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697302384 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697310386 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697318363 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697326285 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697334333 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.16/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697341645 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697349806 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697357245 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697366026 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697374221 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697382180 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697390024 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697397968 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.15/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697405246 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697413222 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697420545 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697428028 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697436037 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697444024 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697452014 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697460577 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.14/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697467959 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697475929 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697483044 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697490306 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697498342 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697506366 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697514334 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697522291 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.13/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697529593 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697537565 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697544812 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697552042 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697560183 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697567995 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697575812 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697583605 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.12/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697591077 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697599219 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697606357 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697613637 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697621600 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697629544 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697638262 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697646306 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.11/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697653469 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697661524 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697668673 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697675998 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697684020 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697691815 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697712938 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697722041 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.10/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697729341 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697737646 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697745086 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697752476 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697760702 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697768950 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697776922 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697784857 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.9/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697792275 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697800291 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697807601 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697814991 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697822961 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697831691 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697839756 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697847772 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.8/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697855185 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697863296 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697870474 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697877903 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697885902 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697893747 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697901474 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697919936 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.7/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697927886 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697935932 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.697943245 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.697950557 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.697958510 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.697966376 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.697974155 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.697982020 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.6/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.697989103 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.697996922 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.698004204 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.698011435 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.698020339 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.698028273 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.698036196 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.698044147 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.5/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.698051325 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.698059196 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.698066535 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.698073895 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.698081837 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.698089750 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.698097637 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.698105468 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.4/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.698112647 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.698120675 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.698127779 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.698135095 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.698143121 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.698151128 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.698159112 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.698167075 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.3/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.698174339 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.698182481 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.698189698 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.698197659 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.698205605 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.698213397 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.698221193 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.698228950 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.2/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.698236066 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.698243869 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.698251059 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.698258288 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.698266113 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.698273949 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.698281751 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.698289457 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.1/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.698296612 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_12\n",
- "2023-05-03 04:33:52.698304621 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_9\n",
- "2023-05-03 04:33:52.698311814 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_17\n",
- "2023-05-03 04:33:52.698318904 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_16\n",
- "2023-05-03 04:33:52.698326954 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_14\n",
- "2023-05-03 04:33:52.698334831 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_11\n",
- "2023-05-03 04:33:52.698342564 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_8\n",
- "2023-05-03 04:33:52.698350440 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/encoder/layers.0/self_attn/Unsqueeze_3\n",
- "2023-05-03 04:33:52.698357980 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /text_model/embeddings/Unsqueeze\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Total Nodes | Original: 1952, After Folding: 1625 | 327 Nodes Folded\n",
- "[I] Folding Constants | Pass 3\n",
- "[I] Total Nodes | Original: 1625, After Folding: 1625 | 0 Nodes Folded\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building Engines...\n",
- "Engine build can take a while to complete\n",
- "Exporting model: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/unet.onnx\n",
- "/usr/local/lib/python3.8/dist-packages/diffusers/models/unet_2d_condition.py:650: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):\n",
- "/usr/local/lib/python3.8/dist-packages/diffusers/models/resnet.py:200: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " assert hidden_states.shape[1] == self.channels\n",
- "/usr/local/lib/python3.8/dist-packages/diffusers/models/resnet.py:205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " assert hidden_states.shape[1] == self.channels\n",
- "/usr/local/lib/python3.8/dist-packages/diffusers/models/resnet.py:127: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " assert hidden_states.shape[1] == self.channels\n",
- "/usr/local/lib/python3.8/dist-packages/diffusers/models/resnet.py:140: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if hidden_states.shape[0] >= 64:\n",
- "/usr/local/lib/python3.8/dist-packages/diffusers/models/unet_2d_condition.py:793: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if not return_dict:\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "========== Diagnostic Run torch.onnx.export version 1.14.0a0+44dac51 ===========\n",
- "verbose: False, log level: Level.ERROR\n",
- "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Generating optimizing model: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/unet.opt.onnx\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Folding Constants | Pass 1\n",
- "[I] Total Nodes | Original: 7757, After Folding: 5379 | 2378 Nodes Folded\n",
- "[I] Folding Constants | Pass 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023-05-03 04:35:05.063804462 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063835442 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063851254 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/Unsqueeze_6\n",
- "2023-05-03 04:35:05.063860151 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/Unsqueeze_2\n",
- "2023-05-03 04:35:05.063874574 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063885250 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063899247 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/Unsqueeze_6\n",
- "2023-05-03 04:35:05.063907147 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/Unsqueeze_2\n",
- "2023-05-03 04:35:05.063921177 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063931750 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063945862 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.063953915 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.063968177 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063979539 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.063993087 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064000857 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064014245 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064024708 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064038411 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064046432 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064060263 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064070926 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064084326 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064092365 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064105810 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064116164 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064132918 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064140796 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064154446 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064165905 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064179209 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064187342 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064201199 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064211740 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064225424 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064233327 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064247287 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064257595 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064270903 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064279133 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064293283 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064303776 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064317285 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064325039 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064339012 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064349129 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064361976 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064369583 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064383610 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064394922 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064407734 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064415531 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064429270 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064439251 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064452187 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064459693 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064473401 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064483399 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064495818 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064505291 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064519225 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn2/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064529037 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn1/Unsqueeze_23\n",
- "2023-05-03 04:35:05.064541710 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/Unsqueeze_6\n",
- "2023-05-03 04:35:05.064549728 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/Unsqueeze_2\n",
- "2023-05-03 04:35:05.064556692 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064563536 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064570257 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064576986 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064583887 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064590714 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064597349 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064605340 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064612284 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064619173 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064626141 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064633046 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064639818 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064646608 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064653372 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064660184 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn2/Unsqueeze_20\n",
- "2023-05-03 04:35:05.064719945 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064730107 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064737080 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064744647 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.2/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064751229 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064758661 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064765408 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064772857 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.1/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064779651 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064787086 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064793967 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064801286 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.3/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064808977 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064816235 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064822982 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064830202 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.2/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064836835 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064844097 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064850828 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064858220 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064864754 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064872009 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064878486 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064885621 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064892385 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064899754 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064906377 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064913714 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.2/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064920540 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064927841 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064934645 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064941859 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064948485 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064956392 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064963043 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064970411 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /up_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.064977125 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.064984371 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.064990976 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.064998100 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /mid_block/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.065005010 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.065012308 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.065018874 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.065026019 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.1/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.065032681 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.065039872 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.065046731 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.065086682 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.2/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.065095119 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.065102513 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.065109193 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.065116469 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.1/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.065123211 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.065130426 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.065138081 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.065145644 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.1/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.065152308 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.065159575 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.065166147 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.065173277 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.1/transformer_blocks.0/attn2/Unsqueeze_7\n",
- "2023-05-03 04:35:05.065179792 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn2/Unsqueeze_16\n",
- "2023-05-03 04:35:05.065187044 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn2/Unsqueeze_13\n",
- "2023-05-03 04:35:05.065193654 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn2/Unsqueeze_10\n",
- "2023-05-03 04:35:05.065200911 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /down_blocks.0/attentions.0/transformer_blocks.0/attn2/Unsqueeze_7\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Total Nodes | Original: 5379, After Folding: 4208 | 1171 Nodes Folded\n",
- "[I] Folding Constants | Pass 3\n",
- "[I] Total Nodes | Original: 4208, After Folding: 4208 | 0 Nodes Folded\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building Engines...\n",
- "Engine build can take a while to complete\n",
- "Exporting model: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/vae.onnx\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "========== Diagnostic Run torch.onnx.export version 1.14.0a0+44dac51 ===========\n",
- "verbose: False, log level: Level.ERROR\n",
- "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Generating optimizing model: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/vae.opt.onnx\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Folding Constants | Pass 1\n",
- "[I] Total Nodes | Original: 671, After Folding: 500 | 171 Nodes Folded\n",
- "[I] Folding Constants | Pass 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023-05-03 04:35:36.443555280 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /decoder/mid_block/attentions.0/Unsqueeze_29\n",
- "2023-05-03 04:35:36.443582656 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /decoder/mid_block/attentions.0/Unsqueeze_26\n",
- "2023-05-03 04:35:36.443597966 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /decoder/mid_block/attentions.0/Unsqueeze_31\n",
- "2023-05-03 04:35:36.443606789 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node /decoder/mid_block/attentions.0/Unsqueeze_1\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Total Nodes | Original: 500, After Folding: 471 | 29 Nodes Folded\n",
- "[I] Folding Constants | Pass 3\n",
- "[I] Total Nodes | Original: 471, After Folding: 471 | 0 Nodes Folded\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building TensorRT engine for /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/clip.opt.onnx: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/clip.plan\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:604] Reading dangerously large protocol message. If the message turns out to be larger than 2147483647 bytes, parsing will be halted for security reasons. To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:81] The total number of bytes read was 681566094\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:604] Reading dangerously large protocol message. If the message turns out to be larger than 2147483647 bytes, parsing will be halted for security reasons. To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:81] The total number of bytes read was 681566094\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.\n",
- "[I] Configuring with profiles: [Profile().add('input_ids', min=(1, 77), opt=(1, 77), max=(4, 77))]\n",
- "[I] Loading tactic timing cache from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache\n",
- "[W] Timing cache file /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache not found, falling back to empty timing cache.\n",
- "[I] Building engine with configuration:\n",
- " Flags | [FP16]\n",
- " Engine Capability | EngineCapability.DEFAULT\n",
- " Memory Pools | [WORKSPACE: 40535.88 MiB, TACTIC_DRAM: 40535.88 MiB]\n",
- " Tactic Sources | []\n",
- " Profiling Verbosity | ProfilingVerbosity.DETAILED\n",
- " Preview Features | [DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\n",
- "[W] kFASTER_DYNAMIC_SHAPES_0805 preview feature is disabled.\n",
- "[W] TensorRT encountered issues when converting weights between types and that could affect accuracy.\n",
- "[W] If this is not the desired behavior, please modify the weights or retrain with regularization to adjust the magnitude of the weights.\n",
- "[W] Check verbose logs for the list of affected weights.\n",
- "[W] - 225 weights are affected by this issue: Detected subnormal FP16 values.\n",
- "[I] Finished engine building in 146.532 seconds\n",
- "[I] Saving tactic timing cache to /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache\n",
- "[I] Saving engine to /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/clip.plan\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building TensorRT engine for /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/unet.opt.onnx: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/unet.plan\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:604] Reading dangerously large protocol message. If the message turns out to be larger than 2147483647 bytes, parsing will be halted for security reasons. To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:81] The total number of bytes read was 1733934759\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:604] Reading dangerously large protocol message. If the message turns out to be larger than 2147483647 bytes, parsing will be halted for security reasons. To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.\n",
- "[libprotobuf WARNING google/protobuf/io/coded_stream.cc:81] The total number of bytes read was 1733934759\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[W] onnx2trt_utils.cpp:400: One or more weights outside the range of INT32 was clamped\n",
- "[I] Configuring with profiles: [Profile().add('sample', min=(2, 4, 96, 96), opt=(2, 4, 96, 96), max=(8, 4, 96, 96)).add('encoder_hidden_states', min=(2, 77, 1024), opt=(2, 77, 1024), max=(8, 77, 1024)).add('timestep', min=[1], opt=[1], max=[1])]\n",
- "[I] Loading tactic timing cache from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache\n",
- "[I] Building engine with configuration:\n",
- " Flags | [FP16]\n",
- " Engine Capability | EngineCapability.DEFAULT\n",
- " Memory Pools | [WORKSPACE: 40535.88 MiB, TACTIC_DRAM: 40535.88 MiB]\n",
- " Tactic Sources | []\n",
- " Profiling Verbosity | ProfilingVerbosity.DETAILED\n",
- " Preview Features | [DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\n",
- "[W] - 272 weights are affected by this issue: Detected subnormal FP16 values.\n",
- "[I] Finished engine building in 1032.233 seconds\n",
- "[I] Saving tactic timing cache to /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache\n",
- "[I] Saving engine to /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/unet.plan\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building TensorRT engine for /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/onnx/vae.opt.onnx: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/vae.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Configuring with profiles: [Profile().add('latent', min=(1, 4, 96, 96), opt=(1, 4, 96, 96), max=(4, 4, 96, 96))]\n",
- "[I] Loading tactic timing cache from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache\n",
- "[I] Building engine with configuration:\n",
- " Flags | [FP16]\n",
- " Engine Capability | EngineCapability.DEFAULT\n",
- " Memory Pools | [WORKSPACE: 40535.88 MiB, TACTIC_DRAM: 40535.88 MiB]\n",
- " Tactic Sources | []\n",
- " Profiling Verbosity | ProfilingVerbosity.DETAILED\n",
- " Preview Features | [DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\n",
- "[W] - 4 weights are affected by this issue: Detected subnormal FP16 values.\n",
- "[I] Finished engine building in 204.808 seconds\n",
- "[I] Saving tactic timing cache to /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/timing_cache\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading TensorRT engine: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/clip.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Saving engine to /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/vae.plan\n",
- "[I] Loading bytes from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/clip.plan\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading TensorRT engine: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/unet.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Loading bytes from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/unet.plan\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading TensorRT engine: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/vae.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Loading bytes from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/vae.plan\n"
- ]
- }
- ],
- "source": [
- "pipe_trt = pipe_trt.to(\"cuda\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "c7defb86",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Running inference on device: cuda:0\n",
- "Loading TensorRT engine: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/clip.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Loading bytes from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/clip.plan\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading TensorRT engine: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/unet.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Loading bytes from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/unet.plan\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading TensorRT engine: /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/vae.plan\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[I] Loading bytes from /root/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/f7f33030acc57428be85fbec092c37a78231d75a/engine/vae.plan\n"
- ]
- }
- ],
- "source": [
- "pipe_trt = pipe_trt.to(\"cuda\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2bdd0eaa",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/root/.cache/huggingface/modules/diffusers_modules/git/stable_diffusion_tensorrt_txt2img.py:907: FutureWarning: Accessing config attribute `in_channels` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'in_channels' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.in_channels'.\n",
- " num_channels_latents = self.unet.in_channels\n"
- ]
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "prompt = \"a beautiful photograph of Mt. Fuji during cherry blossom\"\n",
- "\n",
- "# warm up runs to stabilize performance benchmarking\n",
- "num_warm_up_steps=5\n",
- "for _ in range(num_warm_up_steps):\n",
- " _ = pipe_trt(prompt)\n",
- "\n",
- "image = pipe_trt(prompt).images[0]\n",
- "display(image)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docker/build.sh b/docker/build.sh
index b24029ae..33f52f55 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/docker/launch.sh b/docker/launch.sh
index 2fe9d299..c1b5d05d 100755
--- a/docker/launch.sh
+++ b/docker/launch.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/docker/rockylinux8.Dockerfile b/docker/rockylinux8.Dockerfile
new file mode 100644
index 00000000..dca7208c
--- /dev/null
+++ b/docker/rockylinux8.Dockerfile
@@ -0,0 +1,105 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ARG CUDA_VERSION=12.4.0
+
+FROM nvidia/cuda:${CUDA_VERSION}-devel-rockylinux8
+LABEL maintainer="NVIDIA CORPORATION"
+
+ENV CUDA_VERSION_MAJOR_MINOR=12.2
+ENV NV_CUDNN_VERSION 8.9.6.50-1
+ENV NV_CUDNN_PACKAGE libcudnn8-${NV_CUDNN_VERSION}.cuda12.2
+ENV NV_CUDNN_PACKAGE_DEV libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.2
+
+ENV TRT_VERSION 10.0.1.6
+SHELL ["/bin/bash", "-c"]
+
+RUN dnf install -y \
+ ${NV_CUDNN_PACKAGE} \
+ ${NV_CUDNN_PACKAGE_DEV} \
+ && dnf clean all \
+ && rm -rf /var/cache/dnf/*
+
+# Setup user account
+ARG uid=1000
+ARG gid=1000
+RUN groupadd -r -f -g ${gid} trtuser && useradd -o -r -l -u ${uid} -g ${gid} -ms /bin/bash trtuser
+RUN usermod -aG wheel trtuser
+RUN echo 'trtuser:nvidia' | chpasswd
+RUN mkdir -p /workspace && chown trtuser /workspace
+
+# Install required packages
+RUN dnf -y groupinstall "Development Tools"
+RUN dnf -y install \
+ openssl-devel \
+ bzip2-devel \
+ libffi-devel \
+ wget \
+ perl-core \
+ git \
+ pkg-config \
+ unzip \
+ sudo
+
+# Install python3
+RUN dnf install -y python38 python38-devel &&\
+ cd /usr/bin && ln -s /usr/bin/pip3.8 pip;
+
+
+# Install TensorRT
+RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib64 \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp38-none-linux_x86_64.whl ;\
+elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib64 \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp38-none-linux_x86_64.whl ;\
+else \
+ echo "Invalid CUDA_VERSION"; \
+ exit 1; \
+fi
+
+# Install PyPI packages (version specifiers are quoted so bash does not parse '>' as an output redirect)
+RUN pip install --upgrade pip
+RUN pip install "setuptools>=41.0.0"
+RUN pip install numpy
+RUN pip install jupyter jupyterlab
+
+# Install Cmake
+RUN cd /tmp && \
+ wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \
+ chmod +x cmake-3.14.4-Linux-x86_64.sh && \
+ ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
+ rm ./cmake-3.14.4-Linux-x86_64.sh
+
+# Download NGC client and pipe the answers to `ngc config set` (printf is used because bash's builtin echo does not expand \n)
+RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && printf "no-apikey\nascii\n" | ngc-cli/ngc config set
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Set environment and working directory
+ENV TRT_LIBPATH /usr/lib64
+ENV TRT_OSSPATH /workspace/TensorRT
+ENV PATH="${PATH}:/usr/local/bin/ngc-cli"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${TRT_OSSPATH}/build/out:${TRT_LIBPATH}"
+WORKDIR /workspace
+# Switch to trtuser and make bash the container's default command (RUN would only execute it once at build time)
+USER trtuser
+CMD ["/bin/bash"]
diff --git a/docker/rockylinux9.Dockerfile b/docker/rockylinux9.Dockerfile
new file mode 100644
index 00000000..ff00512a
--- /dev/null
+++ b/docker/rockylinux9.Dockerfile
@@ -0,0 +1,104 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ARG CUDA_VERSION=12.4.0
+
+FROM nvidia/cuda:${CUDA_VERSION}-devel-rockylinux9
+LABEL maintainer="NVIDIA CORPORATION"
+
+ENV CUDA_VERSION_MAJOR_MINOR=12.2
+ENV NV_CUDNN_VERSION 8.9.6.50-1
+ENV NV_CUDNN_PACKAGE libcudnn8-${NV_CUDNN_VERSION}.cuda12.2
+ENV NV_CUDNN_PACKAGE_DEV libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.2
+
+ENV TRT_VERSION 10.0.1.6
+SHELL ["/bin/bash", "-c"]
+
+RUN dnf install -y \
+ ${NV_CUDNN_PACKAGE} \
+ ${NV_CUDNN_PACKAGE_DEV} \
+ && dnf clean all \
+ && rm -rf /var/cache/dnf/*
+
+# Setup user account
+ARG uid=1000
+ARG gid=1000
+RUN groupadd -r -f -g ${gid} trtuser && useradd -o -r -l -u ${uid} -g ${gid} -ms /bin/bash trtuser
+RUN usermod -aG wheel trtuser
+RUN echo 'trtuser:nvidia' | chpasswd
+RUN mkdir -p /workspace && chown trtuser /workspace
+
+# Install python3
+RUN dnf install -y python39 python3-devel && \
+ cd /usr/bin && rm pip && ln -s /usr/bin/pip3.9 pip;
+
+# Install PyPI packages (version specifiers are quoted so bash does not parse '>' as an output redirect)
+RUN pip install --upgrade pip
+RUN pip install "setuptools>=41.0.0"
+RUN pip install numpy
+RUN pip install jupyter jupyterlab
+
+# Install required packages
+RUN dnf -y groupinstall "Development Tools"
+RUN dnf -y install \
+ openssl-devel \
+ bzip2-devel \
+ libffi-devel \
+ wget \
+ perl-core \
+ git \
+ pkg-config \
+ unzip \
+ sudo
+
+# Install TensorRT
+RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib64 \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp39-none-linux_x86_64.whl ;\
+elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib64 \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp39-none-linux_x86_64.whl ;\
+else \
+ echo "Invalid CUDA_VERSION"; \
+ exit 1; \
+fi
+
+# Install Cmake
+RUN cd /tmp && \
+ wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \
+ chmod +x cmake-3.14.4-Linux-x86_64.sh && \
+ ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
+ rm ./cmake-3.14.4-Linux-x86_64.sh
+
+# Download NGC client and pipe the answers to `ngc config set` (printf is used because bash's builtin echo does not expand \n)
+RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && printf "no-apikey\nascii\n" | ngc-cli/ngc config set
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Set environment and working directory
+ENV TRT_LIBPATH /usr/lib64
+ENV TRT_OSSPATH /workspace/TensorRT
+ENV PATH="${PATH}:/usr/local/bin/ngc-cli"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${TRT_OSSPATH}/build/out:${TRT_LIBPATH}"
+WORKDIR /workspace
+# Switch to trtuser and make bash the container's default command (RUN would only execute it once at build time)
+USER trtuser
+CMD ["/bin/bash"]
diff --git a/docker/ubuntu-20.04.Dockerfile b/docker/ubuntu-20.04.Dockerfile
index 0049d4c2..7498c124 100644
--- a/docker/ubuntu-20.04.Dockerfile
+++ b/docker/ubuntu-20.04.Dockerfile
@@ -15,7 +15,7 @@
# limitations under the License.
#
-ARG CUDA_VERSION=12.3.2
+ARG CUDA_VERSION=12.4.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
LABEL maintainer="NVIDIA CORPORATION"
@@ -28,7 +28,7 @@ ENV CUDA_VERSION_MAJOR_MINOR=12.2
ENV NV_CUDNN_PACKAGE "libcudnn8=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}"
ENV NV_CUDNN_PACKAGE_DEV "libcudnn8-dev=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}"
-ENV TRT_VERSION 10.0.0.6
+ENV TRT_VERSION 10.0.1.6
SHELL ["/bin/bash", "-c"]
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -84,15 +84,15 @@ RUN apt-get install -y --no-install-recommends \
# Install TensorRT
RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \
- wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.0/tars/TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
- && tar -xf TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
- && cp -a TensorRT-10.0.0.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
- && pip install TensorRT-10.0.0.6/python/tensorrt-10.0.0b6-cp38-none-linux_x86_64.whl ;\
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp38-none-linux_x86_64.whl ;\
elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \
- wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.0/tars/TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
- && tar -xf TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
- && cp -a TensorRT-10.0.0.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
- && pip install TensorRT-10.0.0.6/python/tensorrt-10.0.0b6-cp38-none-linux_x86_64.whl ;\
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp38-none-linux_x86_64.whl ;\
else \
echo "Invalid CUDA_VERSION"; \
exit 1; \
diff --git a/docker/ubuntu-22.04-aarch64.Dockerfile b/docker/ubuntu-22.04-aarch64.Dockerfile
new file mode 100644
index 00000000..ebac9297
--- /dev/null
+++ b/docker/ubuntu-22.04-aarch64.Dockerfile
@@ -0,0 +1,112 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ARG CUDA_VERSION=12.4.0
+
+# Multi-arch container support available in non-cudnn containers.
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+ENV TRT_VERSION 10.0.1.6
+SHELL ["/bin/bash", "-c"]
+
+# Setup user account
+ARG uid=1000
+ARG gid=1000
+RUN groupadd -r -f -g ${gid} trtuser && useradd -o -r -l -u ${uid} -g ${gid} -ms /bin/bash trtuser
+RUN usermod -aG sudo trtuser
+RUN echo 'trtuser:nvidia' | chpasswd
+RUN mkdir -p /workspace && chown trtuser /workspace
+
+# Required to build Ubuntu 22.04 without user prompts with DLFW container
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Update CUDA signing key
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa/3bf863cc.pub
+
+# Install required libraries
+RUN apt-get update && apt-get install -y software-properties-common
+RUN add-apt-repository ppa:ubuntu-toolchain-r/test
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ libcurl4-openssl-dev \
+ wget \
+ git \
+ pkg-config \
+ sudo \
+ ssh \
+ libssl-dev \
+ pbzip2 \
+ pv \
+ bzip2 \
+ unzip \
+ devscripts \
+ lintian \
+ fakeroot \
+ dh-make \
+ build-essential
+
+# Install python3
+RUN apt-get install -y --no-install-recommends \
+ python3 \
+ python3-pip \
+ python3-dev \
+ python3-wheel &&\
+ cd /usr/local/bin &&\
+ ln -s /usr/bin/python3 python &&\
+ ln -s /usr/bin/pip3 pip;
+
+# Install TensorRT. This will also pull in CUDNN
+RUN ver="${CUDA_VERSION%.*}" &&\
+ if [ "${ver%.*}" = "12" ] ; then \
+ ver="12.4"; \
+ fi &&\
+ v="${TRT_VERSION}-1+cuda${ver}" &&\
+ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa/3bf863cc.pub &&\
+ apt-get update &&\
+ sudo apt-get -y install libnvinfer10=${v} libnvonnxparsers10=${v} libnvinfer-plugin10=${v} \
+ libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvinfer-plugin-dev=${v} \
+ python3-libnvinfer=${v} libnvinfer-dispatch10=${v} libnvinfer-dispatch-dev=${v} libnvinfer-lean10=${v} \
+ libnvinfer-lean-dev=${v} libnvinfer-vc-plugin10=${v} libnvinfer-vc-plugin-dev=${v} \
+ libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v};
+
+# Install Cmake
+RUN cd /tmp && \
+ wget https://github.com/Kitware/CMake/releases/download/v3.21.4/cmake-3.21.4-linux-aarch64.sh && \
+ chmod +x cmake-3.21.4-linux-aarch64.sh && \
+ ./cmake-3.21.4-linux-aarch64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
+ rm ./cmake-3.21.4-linux-aarch64.sh
+
+# Install PyPI packages
+RUN pip3 install --upgrade pip
+RUN pip3 install "setuptools>=41.0.0"
+COPY requirements.txt /tmp/requirements.txt
+RUN pip3 install -r /tmp/requirements.txt
+RUN pip3 install jupyter jupyterlab
+# Workaround to remove numpy installed with tensorflow
+RUN pip3 install --upgrade numpy
+
+# Download NGC client
+RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_arm64.zip && unzip ngccli_arm64.zip && chmod u+x ngc-cli/ngc && rm ngccli_arm64.zip ngc-cli.md5 && echo "no-apikey\nascii\n" | ngc-cli/ngc config set
+
+# Set environment and working directory
+ENV TRT_LIBPATH /usr/lib/aarch64-linux-gnu/
+ENV TRT_OSSPATH /workspace/TensorRT
+ENV PATH="${PATH}:/usr/local/bin/ngc-cli"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${TRT_OSSPATH}/build/out:${TRT_LIBPATH}"
+WORKDIR /workspace
+
+USER trtuser
+RUN ["/bin/bash"]
diff --git a/docker/ubuntu-22.04.Dockerfile b/docker/ubuntu-22.04.Dockerfile
index ebe90f71..a7e0d6a1 100644
--- a/docker/ubuntu-22.04.Dockerfile
+++ b/docker/ubuntu-22.04.Dockerfile
@@ -15,7 +15,7 @@
# limitations under the License.
#
-ARG CUDA_VERSION=12.3.2
+ARG CUDA_VERSION=12.4.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
LABEL maintainer="NVIDIA CORPORATION"
@@ -28,7 +28,7 @@ ENV CUDA_VERSION_MAJOR_MINOR=12.2
ENV NV_CUDNN_PACKAGE "libcudnn8=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}"
ENV NV_CUDNN_PACKAGE_DEV "libcudnn8-dev=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}"
-ENV TRT_VERSION 10.0.0.6
+ENV TRT_VERSION 10.0.1.6
SHELL ["/bin/bash", "-c"]
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -49,7 +49,7 @@ RUN mkdir -p /workspace && chown trtuser /workspace
ENV DEBIAN_FRONTEND=noninteractive
# Update CUDA signing key
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub
# Install requried libraries
RUN apt-get update && apt-get install -y software-properties-common
@@ -84,15 +84,15 @@ RUN apt-get install -y --no-install-recommends \
# Install TensorRT
RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \
- wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.0/tars/TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
- && tar -xf TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
- && cp -a TensorRT-10.0.0.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
- && pip install TensorRT-10.0.0.6/python/tensorrt-10.0.0b6-cp310-none-linux_x86_64.whl ;\
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp310-none-linux_x86_64.whl ;\
elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \
- wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.0/tars/TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
- && tar -xf TensorRT-10.0.0.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
- && cp -a TensorRT-10.0.0.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
- && pip install TensorRT-10.0.0.6/python/tensorrt-10.0.0b6-cp310-none-linux_x86_64.whl ;\
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && tar -xf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz \
+ && cp -a TensorRT-10.0.1.6/lib/*.so* /usr/lib/x86_64-linux-gnu \
+ && pip install TensorRT-10.0.1.6/python/tensorrt-10.0.1-cp310-none-linux_x86_64.whl ;\
else \
echo "Invalid CUDA_VERSION"; \
exit 1; \
diff --git a/docker/ubuntu-cross-aarch64.Dockerfile b/docker/ubuntu-cross-aarch64.Dockerfile
new file mode 100644
index 00000000..eb2e100b
--- /dev/null
+++ b/docker/ubuntu-cross-aarch64.Dockerfile
@@ -0,0 +1,134 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ARG CUDA_VERSION=12.4.0
+ARG OS_VERSION=22.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION}
+LABEL maintainer="NVIDIA CORPORATION"
+
+ENV TRT_VERSION 10.0.1.6
+ENV DEBIAN_FRONTEND=noninteractive
+
+ARG uid=1000
+ARG gid=1000
+RUN groupadd -r -f -g ${gid} trtuser && useradd -o -r -l -u ${uid} -g ${gid} -ms /bin/bash trtuser
+RUN usermod -aG sudo trtuser
+RUN echo 'trtuser:nvidia' | chpasswd
+RUN mkdir -p /workspace && chown trtuser /workspace
+
+# Install required libraries
+RUN apt-get update && apt-get install -y software-properties-common
+RUN add-apt-repository ppa:ubuntu-toolchain-r/test
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ libcurl4-openssl-dev \
+ wget \
+ git \
+ pkg-config \
+ python3 \
+ python3-pip \
+ python3-dev \
+ python3-wheel \
+ sudo \
+ ssh \
+ pbzip2 \
+ pv \
+ bzip2 \
+ unzip \
+ build-essential
+
+RUN cd /usr/local/bin &&\
+ ln -s /usr/bin/python3 python &&\
+ ln -s /usr/bin/pip3 pip
+RUN pip3 install --upgrade pip
+RUN pip3 install "setuptools>=41.0.0"
+
+# Install Cmake
+RUN cd /tmp && \
+ wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \
+ chmod +x cmake-3.14.4-Linux-x86_64.sh && \
+ ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
+ rm ./cmake-3.14.4-Linux-x86_64.sh
+
+# Skip installing PyPI packages and NGC client on cross-build container
+
+COPY docker/jetpack_files /pdk_files
+COPY scripts/stubify.sh /pdk_files
+
+# Update CUDA signing keys
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub
+
+# Install CUDA cross compile toolchain
+RUN dpkg -i /pdk_files/cuda-repo-cross-aarch64*.deb /pdk_files/cuda-repo-ubuntu*_amd64.deb \
+ && sudo cp /var/cuda-repo-cross-aarch64*/cuda-*keyring.gpg /usr/share/keyrings/ \
+ && sudo cp /var/cuda-repo-ubuntu2204*/cuda-*keyring.gpg /usr/share/keyrings/ \
+ && apt-get update \
+ && apt-get install -y cuda-cross-aarch64 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Unpack cudnn
+RUN dpkg -x /pdk_files/cudnn-local*.deb /pdk_files/cudnn_extract \
+ && dpkg -x /pdk_files/cudnn_extract/var/cudnn-local*/libcudnn8_*.deb /pdk_files/cudnn \
+ && dpkg -x /pdk_files/cudnn_extract/var/cudnn-local*/libcudnn8-dev*.deb /pdk_files/cudnn \
+ && cd /pdk_files/cudnn/usr/lib/aarch64-linux-gnu \
+ && cd /pdk_files/cudnn \
+ && ln -s usr/include/aarch64-linux-gnu include \
+ && ln -s usr/lib/aarch64-linux-gnu lib \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_adv_infer_v[7-9].h /usr/include/cudnn_adv_infer.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_adv_train_v[7-9].h /usr/include/cudnn_adv_train.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_backend_v[7-9].h /usr/include/cudnn_backend.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_cnn_infer_v[7-9].h /usr/include/cudnn_cnn_infer.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_cnn_train_v[7-9].h /usr/include/cudnn_cnn_train.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_ops_infer_v[7-9].h /usr/include/cudnn_ops_infer.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_ops_train_v[7-9].h /usr/include/cudnn_ops_train.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_v[7-9].h /usr/include/cudnn.h \
+ && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_version_v[7-9].h /usr/include/cudnn_version.h
+
+# Unpack libnvinfer
+RUN dpkg -x /pdk_files/libnvinfer10_*-1+cuda12.[0-9]_arm64.deb /pdk_files/tensorrt \
+ && dpkg -x /pdk_files/libnvinfer-dev_*-1+cuda12.[0-9]_arm64.deb /pdk_files/tensorrt \
+ && dpkg -x /pdk_files/libnvinfer-plugin10_*-1+cuda12.[0-9]_arm64.deb /pdk_files/tensorrt \
+ && dpkg -x /pdk_files/libnvinfer-plugin-dev_*-1+cuda12.[0-9]_arm64.deb /pdk_files/tensorrt \
+ && dpkg -x /pdk_files/libnvonnxparsers10_*-1+cuda12.[0-9]_arm64.deb /pdk_files/tensorrt \
+ && dpkg -x /pdk_files/libnvonnxparsers-dev_*-1+cuda12.[0-9]_arm64.deb /pdk_files/tensorrt
+
+# Clean up debs
+RUN rm -rf /pdk_files/*.deb
+
+# set up librt.so symlink
+RUN ln -sf /usr/aarch64-linux-gnu/lib/librt.so.1 /usr/aarch64-linux-gnu/lib/librt.so
+RUN ln -sf /usr/lib/aarch64-linux-gnu/librt.so.1 /usr/lib/aarch64-linux-gnu/librt.so
+
+# create stub libraries
+RUN cd /pdk_files/tensorrt \
+ && ln -s usr/include/aarch64-linux-gnu include \
+ && ln -s usr/lib/aarch64-linux-gnu lib \
+ && cd lib \
+ && mkdir stubs \
+ && for x in nvinfer nvparsers nvinfer_plugin nvonnxparser; \
+ do \
+ CC=aarch64-linux-gnu-gcc /pdk_files/stubify.sh lib${x}.so stubs/lib${x}.so \
+ ; done
+
+# Set environment and working directory
+ENV TRT_LIBPATH /pdk_files/tensorrt/lib
+ENV TRT_OSSPATH /workspace/TensorRT
+ENV IS_L4T_CROSS True
+WORKDIR /workspace
+
+USER trtuser
+RUN ["/bin/bash"]
diff --git a/include/NvInfer.h b/include/NvInfer.h
index 7fff86b1..c921ede0 100644
--- a/include/NvInfer.h
+++ b/include/NvInfer.h
@@ -1282,7 +1282,7 @@ class IConvolutionLayer : public ILayer
//!
//! If executing this layer on DLA, only support 2D padding, both height and width must be in the range [1,32].
//!
- //! \see getDilation()
+ //! \see getDilationNd()
//!
void setDilationNd(Dims const& dilation) noexcept
{
@@ -1292,7 +1292,7 @@ class IConvolutionLayer : public ILayer
//!
//! \brief Get the multi-dimension dilation of the convolution.
//!
- //! \see setDilation()
+ //! \see setDilationNd()
//!
Dims getDilationNd() const noexcept
{
@@ -3716,10 +3716,9 @@ class IRaggedSoftMaxLayer : public ILayer
//! Two types are compatible if they are identical, or are both in {kFLOAT, kHALF}.
//! Implicit conversion between incompatible types, i.e. without using setOutputType,
//! is recognized as incorrect as of TensorRT 8.4, but is retained for API compatibility
-//! within TensorRT 8.x releases. In a future major release the behavior will change
-//! to record an error if the network output tensor type is incompatible with the layer
-//! output type. E.g., implicit conversion from kFLOAT to kINT32 will not be allowed,
-//! and instead such a conversion will require calling setOutputType(DataType::kINT32).
+//! within TensorRT 8.x releases. From TensorRT 10.0 onwards, it is an error if the network output tensor type is
+//! incompatible with the layer output type. E.g., implicit conversion from kFLOAT to kINT32 is not allowed; use
+//! setOutputType(DataType::kINT32) to explicitly convert kFLOAT to kINT32.
//!
//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI.
//!
@@ -4343,6 +4342,14 @@ class ILoop;
//!
//! \brief This is a base class for Loop boundary layers.
//!
+//! The loop boundary layers are used to define loops within a network, enabling the implementation
+//! of recurrences. The boundary layers for a loop are created by class ILoop.
+//!
+//! There are four kinds of boundary layers.
+//! * ITripLimitLayer: controls the number of loop iterations.
+//! * IIterationLayer: iterates over an input tensor.
+//! * IRecurrenceLayer: returns an initial value or value from the previous loop iteration.
+//! * ILoopOutputLayer: generates an output tensor from the loop iterations.
class ILoopBoundaryLayer : public ILayer
{
public:
@@ -4526,6 +4533,8 @@ class IIfConditional : public INoCopy
//!
//! \brief A recurrence layer in a network definition.
//!
+//! The recurrence layer allows a loop iteration to compute a result from a value computed in the previous iteration.
+//!
class IRecurrenceLayer : public ILoopBoundaryLayer
{
public:
@@ -4641,6 +4650,12 @@ class ILoopOutputLayer : public ILoopBoundaryLayer
//!
//! \brief A layer that represents a trip-count limiter.
//!
+//! The trip limit layer sets the execution condition for loops, using kCOUNT to define the number of iterations or
+//! kWHILE for a conditional loop. A loop can have one of each kind of limit, in which case the loop exits when
+//! the trip count is reached or the condition becomes false.
+//!
+//! \see ILoop::addTripLimit()
+//!
class ITripLimitLayer : public ILoopBoundaryLayer
{
public:
@@ -4662,6 +4677,11 @@ class ITripLimitLayer : public ILoopBoundaryLayer
//!
//! \brief A layer to do iterations.
//!
+//! The iterator layer iterates over a tensor along the given axis and in the given direction.
+//! It enables each loop iteration to inspect a different slice of the tensor.
+//!
+//! \see ILoop::addIterator()
+//!
class IIteratorLayer : public ILoopBoundaryLayer
{
public:
@@ -4715,6 +4735,10 @@ class IIteratorLayer : public ILoopBoundaryLayer
//!
//! \brief Helper for creating a recurrent subgraph.
//!
+//! An ILoop defines a loop within a network. It supports the implementation of recurrences,
+//! which are crucial for iterative computations, such as RNNs for natural language processing and
+//! time-series analysis.
+//!
class ILoop : public INoCopy
{
public:
@@ -4809,7 +4833,12 @@ class ILoop : public INoCopy
//!
//! \class ISelectLayer
//!
-//! \brief A select layer in a network definition.
+//! \brief Select elements from two data tensors based on a condition tensor.
+//!
+//! The select layer makes elementwise selections from two data tensors based on a condition tensor,
+//! behaving similarly to the numpy.where function with three parameters.
+//! The three input tensors must share the same rank. Multidirectional broadcasting is supported.
+//! The output tensor has the dimensions of the inputs AFTER applying the broadcast rule.
//!
//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI.
//!
@@ -8361,13 +8390,16 @@ enum class MemoryPoolType : int32_t
kTACTIC_DRAM = 4,
//!
- //! kTACTIC_SHARED_MEMORY defines the maximum shared memory size utilized for executing
- //! the backend CUDA kernel implementation. Adjust this value to restrict tactics that exceed
- //! the specified threshold en masse. The default value is device max capability. This value must
+ //! kTACTIC_SHARED_MEMORY defines the maximum sum of shared memory reserved by the driver and
+ //! used for executing CUDA kernels. Adjust this value to restrict tactics that exceed the
+ //! specified threshold en masse. The default value is device max capability. This value must
//! be less than 1GiB.
//!
+ //! The driver reserved shared memory can be queried from cuDeviceGetAttribute(&reservedShmem,
+ //! CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK).
+ //!
//! Updating this flag will override the shared memory limit set by \ref HardwareCompatibilityLevel,
- //! which defaults to 48KiB.
+ //! which defaults to 48KiB - reservedShmem.
//!
kTACTIC_SHARED_MEMORY = 5,
};
@@ -8430,10 +8462,15 @@ enum class HardwareCompatibilityLevel : int32_t
//! built.
kNONE = 0,
- //! Require that the engine is compatible with Ampere and newer GPUs. This will limit the max shared memory usage to
- //! 48KiB, may reduce the number of available tactics for each layer, and may prevent some fusions from occurring.
- //! Thus this can decrease the performance, especially for tf32 models.
+ //! Require that the engine is compatible with Ampere and newer GPUs. This will limit the combined usage of driver
+ //! reserved and backend kernel max shared memory to 48KiB, may reduce the number of available tactics for each
+ //! layer, and may prevent some fusions from occurring. Thus this can decrease the performance, especially for tf32
+ //! models.
//! This option will disable cuDNN, cuBLAS, and cuBLAS LT as tactic sources.
+ //!
+ //! The driver reserved shared memory can be queried from cuDeviceGetAttribute(&reservedShmem,
+ //! CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK).
+ //!
kAMPERE_PLUS = 1,
};
diff --git a/include/NvInferConsistency.h b/include/NvInferConsistency.h
index 5096c3f4..32bca28b 100644
--- a/include/NvInferConsistency.h
+++ b/include/NvInferConsistency.h
@@ -19,7 +19,9 @@
#define NV_INFER_CONSISTENCY_H
#include "NvInferConsistencyImpl.h"
+#define NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE 1
#include "NvInferRuntimeBase.h"
+#undef NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE
#include "NvInferRuntimePlugin.h"
//!
diff --git a/include/NvInferLegacyDims.h b/include/NvInferLegacyDims.h
index 204d17a8..2725d184 100644
--- a/include/NvInferLegacyDims.h
+++ b/include/NvInferLegacyDims.h
@@ -18,7 +18,9 @@
#ifndef NV_INFER_LEGACY_DIMS_H
#define NV_INFER_LEGACY_DIMS_H
-#include "NvInferRuntimeCommon.h"
+#define NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE 1
+#include "NvInferRuntimeBase.h"
+#undef NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE
//!
//! \file NvInferLegacyDims.h
diff --git a/include/NvInferRuntimeBase.h b/include/NvInferRuntimeBase.h
index 60006e6c..3624706c 100644
--- a/include/NvInferRuntimeBase.h
+++ b/include/NvInferRuntimeBase.h
@@ -64,9 +64,15 @@
//!
//! This file contains common definitions, data structures and interfaces shared between the standard and safe runtime.
//!
-//! \warning Do not directly include this file. Instead include either NvInferRuntime.h (for the standard runtime) or
-//! NvInferSafeRuntime.h (for the safety runtime).
-//!
+//! \warning Do not directly include this file. Instead include one of:
+//! * NvInferRuntime.h (for the standard runtime)
+//! * NvInferSafeRuntime.h (for the safety runtime)
+//! * NvInferConsistency.h (for consistency checker)
+//! * NvInferPluginUtils.h (for plugin utilities)
+//!
+#if !defined(NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE) && !defined(TRT_VCAST_SAFE)
+static_assert(false, "Do not directly include this file. Include NvInferRuntime.h or NvInferSafeRuntime.h or NvInferConsistency.h or NvInferPluginUtils.h");
+#endif
//! Forward declare some CUDA types to avoid an include dependency.
@@ -864,6 +870,8 @@ class IErrorRecorder : public IVersionedInterface
//!
//! \brief The length limit for an error description in bytes, excluding the '\0' string terminator.
+ //! Only applicable to safe runtime.
+ //! General error recorder implementation can use any size appropriate for the use case.
//!
static constexpr size_t kMAX_DESC_LENGTH{127U};
@@ -982,10 +990,10 @@ class IErrorRecorder : public IVersionedInterface
//!
//! \brief Report an error to the error recorder with the corresponding enum and description.
//!
- //! \param val The error code enum that is being reported.
- //! \param desc The string description of the error, which will be a NULL-terminated string of kMAX_DESC_LENGTH
- //! bytes or less (excluding the NULL terminator). Descriptions that exceed this limit will be silently
- //! truncated.
+ //! \param val The error code enum that is being reported.
+ //! \param desc The string description of the error, which will be a NULL-terminated string.
+ //! For safety use cases its length is limited to kMAX_DESC_LENGTH bytes
+ //! (excluding the NULL terminator) and descriptions that exceed this limit will be silently truncated.
//!
//! Report an error to the user that has a given value and human readable description. The function returns false
//! if processing can continue, which implies that the reported error is not fatal. This does not guarantee that
diff --git a/include/NvInferRuntimeCommon.h b/include/NvInferRuntimeCommon.h
index 65a3c220..13e42f4f 100644
--- a/include/NvInferRuntimeCommon.h
+++ b/include/NvInferRuntimeCommon.h
@@ -28,7 +28,9 @@
//!
//! \warning Do not directly include this file. Instead include NvInferRuntime.h
//!
+#define NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE 1
#include "NvInferRuntimeBase.h"
+#undef NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE
#include "NvInferRuntimePlugin.h"
namespace nvinfer1
diff --git a/include/NvInferRuntimePlugin.h b/include/NvInferRuntimePlugin.h
index ecae2ce9..5f97f4a5 100644
--- a/include/NvInferRuntimePlugin.h
+++ b/include/NvInferRuntimePlugin.h
@@ -18,7 +18,9 @@
#ifndef NV_INFER_RUNTIME_PLUGIN_H
#define NV_INFER_RUNTIME_PLUGIN_H
+#define NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE 1
#include "NvInferRuntimeBase.h"
+#undef NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE
//!
//! \file NvInferRuntimePlugin.h
diff --git a/include/NvInferSafeRuntime.h b/include/NvInferSafeRuntime.h
index 1c322c4e..6dc503e0 100644
--- a/include/NvInferSafeRuntime.h
+++ b/include/NvInferSafeRuntime.h
@@ -18,7 +18,9 @@
#ifndef NV_INFER_SAFE_RUNTIME_H
#define NV_INFER_SAFE_RUNTIME_H
+#define NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE 1
#include "NvInferRuntimeBase.h"
+#undef NV_INFER_INTERNAL_INCLUDE_RUNTIME_BASE
#include "NvInferRuntimePlugin.h"
#include
#include
diff --git a/include/NvInferVersion.h b/include/NvInferVersion.h
index 8c99bea7..13861a12 100644
--- a/include/NvInferVersion.h
+++ b/include/NvInferVersion.h
@@ -25,7 +25,7 @@
#define NV_TENSORRT_MAJOR 10 //!< TensorRT major version.
#define NV_TENSORRT_MINOR 0 //!< TensorRT minor version.
-#define NV_TENSORRT_PATCH 0 //!< TensorRT patch version.
+#define NV_TENSORRT_PATCH 1 //!< TensorRT patch version.
#define NV_TENSORRT_BUILD 6 //!< TensorRT build number.
#define NV_TENSORRT_LWS_MAJOR 0 //!< TensorRT LWS major version.
@@ -36,6 +36,6 @@
#define NV_TENSORRT_RELEASE_TYPE_RELEASE_CANDIDATE 1 //!< A release candidate
#define NV_TENSORRT_RELEASE_TYPE_GENERAL_AVAILABILITY 2 //!< A final release
-#define NV_TENSORRT_RELEASE_TYPE NV_TENSORRT_RELEASE_TYPE_EARLY_ACCESS //!< TensorRT release type
+#define NV_TENSORRT_RELEASE_TYPE NV_TENSORRT_RELEASE_TYPE_GENERAL_AVAILABILITY //!< TensorRT release type
#endif // NV_INFER_VERSION_H
diff --git a/parsers/CMakeLists.txt b/parsers/CMakeLists.txt
index 750942e6..6b4858ba 100644
--- a/parsers/CMakeLists.txt
+++ b/parsers/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/parsers/common/half.h b/parsers/common/half.h
index 7497459a..a66c197c 100644
--- a/parsers/common/half.h
+++ b/parsers/common/half.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/parsers/common/ieee_half.h b/parsers/common/ieee_half.h
index 071aee09..ac78fd6b 100644
--- a/parsers/common/ieee_half.h
+++ b/parsers/common/ieee_half.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/parsers/common/parserUtils.h b/parsers/common/parserUtils.h
index 115a2efa..eeb14724 100644
--- a/parsers/common/parserUtils.h
+++ b/parsers/common/parserUtils.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/parsers/onnx b/parsers/onnx
index 973d68d0..eb43908b 160000
--- a/parsers/onnx
+++ b/parsers/onnx
@@ -1 +1 @@
-Subproject commit 973d68d06f671998ddcc0c504b9a2fdfcfc85a62
+Subproject commit eb43908b02a296ea0594432f06e9d3fac288d672
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 2e708d3a..2007b7ed 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,10 +16,10 @@
#
add_custom_target(plugin)
-set(TARGET_NAME nvinfer_plugin)
+set(TARGET_NAME ${nvinfer_plugin_lib_name})
set(SHARED_TARGET ${TARGET_NAME})
set(STATIC_TARGET ${TARGET_NAME}_static)
-set(VFC_TARGET_NAME nvinfer_vc_plugin)
+set(VFC_TARGET_NAME ${nvinfer_vc_plugin_lib_name})
set(VFC_SHARED_TARGET ${VFC_TARGET_NAME})
set(TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR})
@@ -143,10 +143,6 @@ else()
set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL -Wl,-Bsymbolic -Wl,--version-script=${PLUGIN_EXPORT_MAP} -Wl,--no-undefined")
endif()
-if (ADDITIONAL_PLATFORM_LIB_FLAGS)
- set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS ${ADDITIONAL_PLATFORM_LIB_FLAGS})
-endif()
-
set_target_properties(${SHARED_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_POSTFIX})
set_target_properties(${SHARED_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION ${TRT_SOVERSION} )
@@ -155,7 +151,7 @@ set_property(TARGET ${SHARED_TARGET} PROPERTY CUDA_STANDARD 14)
target_link_libraries(${SHARED_TARGET}
${CUDART_LIB}
- ${nvinfer_LIB_PATH}
+ ${${nvinfer_lib_name}_LIB_PATH}
${CMAKE_DL_LIBS}
)
@@ -189,10 +185,6 @@ set_target_properties(${STATIC_TARGET} PROPERTIES
set_target_properties(${STATIC_TARGET} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL")
-if (ADDITIONAL_PLATFORM_LIB_FLAGS)
- set_target_properties(${STATIC_TARGET} PROPERTIES LINK_FLAGS ${ADDITIONAL_PLATFORM_LIB_FLAGS})
-endif()
-
set_target_properties(${STATIC_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_POSTFIX})
set_target_properties(${STATIC_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION ${TRT_SOVERSION} )
@@ -230,10 +222,6 @@ else()
set_target_properties(${VFC_SHARED_TARGET} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL -Wl,-Bsymbolic -Wl,--version-script=${VFC_PLUGIN_EXPORT_MAP} -Wl,--no-undefined")
endif()
-if (ADDITIONAL_PLATFORM_LIB_FLAGS)
- set_target_properties(${VFC_SHARED_TARGET} PROPERTIES LINK_FLAGS ${ADDITIONAL_PLATFORM_LIB_FLAGS})
-endif()
-
set_target_properties(${VFC_SHARED_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_POSTFIX})
set_target_properties(${VFC_SHARED_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION ${TRT_SOVERSION} )
@@ -242,7 +230,7 @@ set_property(TARGET ${VFC_SHARED_TARGET} PROPERTY CUDA_STANDARD 14)
target_link_libraries(${VFC_SHARED_TARGET}
${CUDART_LIB}
- ${nvinfer_LIB_PATH}
+ ${${nvinfer_lib_name}_LIB_PATH}
${CMAKE_DL_LIBS}
)
diff --git a/plugin/batchTilePlugin/CMakeLists.txt b/plugin/batchTilePlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/batchTilePlugin/CMakeLists.txt
+++ b/plugin/batchTilePlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchTilePlugin/batchTilePlugin.cpp b/plugin/batchTilePlugin/batchTilePlugin.cpp
index 7b99d578..1e98ac6e 100644
--- a/plugin/batchTilePlugin/batchTilePlugin.cpp
+++ b/plugin/batchTilePlugin/batchTilePlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchTilePlugin/batchTilePlugin.h b/plugin/batchTilePlugin/batchTilePlugin.h
index 0ff85bb0..fe1ce902 100644
--- a/plugin/batchTilePlugin/batchTilePlugin.h
+++ b/plugin/batchTilePlugin/batchTilePlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchedNMSPlugin/CMakeLists.txt b/plugin/batchedNMSPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/batchedNMSPlugin/CMakeLists.txt
+++ b/plugin/batchedNMSPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchedNMSPlugin/batchedNMSInference.cu b/plugin/batchedNMSPlugin/batchedNMSInference.cu
index 9d01f5b8..2a0ceff3 100644
--- a/plugin/batchedNMSPlugin/batchedNMSInference.cu
+++ b/plugin/batchedNMSPlugin/batchedNMSInference.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp b/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp
index 40ff8671..428db1ad 100644
--- a/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp
+++ b/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchedNMSPlugin/batchedNMSPlugin.h b/plugin/batchedNMSPlugin/batchedNMSPlugin.h
index 418333e8..4c6c749f 100644
--- a/plugin/batchedNMSPlugin/batchedNMSPlugin.h
+++ b/plugin/batchedNMSPlugin/batchedNMSPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/batchedNMSPlugin/gatherNMSOutputs.h b/plugin/batchedNMSPlugin/gatherNMSOutputs.h
index f245eb93..0e9b78e4 100644
--- a/plugin/batchedNMSPlugin/gatherNMSOutputs.h
+++ b/plugin/batchedNMSPlugin/gatherNMSOutputs.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/CMakeLists.txt b/plugin/bertQKVToContextPlugin/CMakeLists.txt
index 6bdff6d7..da805cd2 100644
--- a/plugin/bertQKVToContextPlugin/CMakeLists.txt
+++ b/plugin/bertQKVToContextPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/CMakeLists.txt b/plugin/bertQKVToContextPlugin/fused_multihead_attention/CMakeLists.txt
index 1d53970e..91e05d03 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/CMakeLists.txt
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h b/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h
index d59e8a73..e1b51b9d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,6 +32,236 @@
namespace nvinfer1
{
+
+namespace pluginInternal
+{
+template <typename TKernelMeta, typename TKernelParam>
+class TFusedMultiHeadAttentionXMMAKernel
+{
+public:
+ using KernelMeta = TKernelMeta;
+ using KernelParam = TKernelParam;
+ inline uint64_t hashID(uint32_t s, uint32_t d) const
+ {
+ return (uint64_t) s << 32 | d;
+ }
+ virtual uint64_t hashID(const KernelMeta& kernelMeta) const
+ {
+ return hashID(kernelMeta.mS, kernelMeta.mD);
+ }
+
+ TFusedMultiHeadAttentionXMMAKernel(
+ const TKernelMeta* pMetaStart, uint32_t nMetaCount, plugin::bert::Data_type type, uint32_t sm)
+ : mDataType(type)
+ , mKernelMeta(pMetaStart)
+ , mKernelMetaCount(nMetaCount)
+ , mSM(sm)
+ {
+ PLUGIN_ASSERT(mKernelMetaCount && "No kernels were loaded correctly.");
+ }
+
+ void loadXMMAKernels(uint32_t smVersion)
+ {
+ for (uint32_t i = 0; i < mKernelMetaCount; ++i)
+ {
+ const auto& kernelMeta = mKernelMeta[i];
+ const auto kernelKey = hashID(kernelMeta);
+ if (kernelMeta.mSM == smVersion && kernelMeta.mDataType == mDataType
+ && mFunctions.find(kernelKey) == mFunctions.end())
+ {
+ const uint32_t DEFAULT_SMEM_SIZE{48 * 1024};
+ if (kernelMeta.mSharedMemBytes >= DEFAULT_SMEM_SIZE)
+ {
+ int32_t deviceID{0};
+ cudaGetDevice(&deviceID);
+ int32_t sharedMemPerMultiprocessor{0};
+ if (cudaDeviceGetAttribute(
+ &sharedMemPerMultiprocessor, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID)
+ != cudaSuccess
+ || sharedMemPerMultiprocessor < static_cast<int32_t>(kernelMeta.mSharedMemBytes))
+ {
+ // skip load function because not enough shared memory to launch the kernel
+ continue;
+ }
+ }
+
+ CUmodule hmod{0};
+ auto findModuleIter = mModules.find(kernelMeta.mCubin);
+ if (findModuleIter != mModules.end())
+ {
+ hmod = findModuleIter->second;
+ }
+ else
+ {
+ cuErrCheck(mDriver.cuModuleLoadData(&hmod, kernelMeta.mCubin), mDriver);
+ mModules.insert(std::make_pair(kernelMeta.mCubin, hmod));
+ }
+
+ FusedMultiHeadAttentionKernelInfo funcInfo;
+ funcInfo.mMetaInfoIndex = i;
+ cuErrCheck(mDriver.cuModuleGetFunction(&funcInfo.mDeviceFunction, hmod, kernelMeta.mFuncName), mDriver);
+ if (kernelMeta.mSharedMemBytes >= DEFAULT_SMEM_SIZE)
+ {
+ if (mDriver.cuFuncSetAttribute(funcInfo.mDeviceFunction,
+ CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, kernelMeta.mSharedMemBytes)
+ != CUDA_SUCCESS)
+ {
+ // some chip may not have enough shared memory to launch the kernel
+ continue;
+ }
+ }
+ mFunctions.insert({kernelKey, funcInfo});
+ uint64_t const s = kernelMeta.mS;
+ uint64_t const headSize = kernelMeta.mD;
+ uint64_t key = (headSize << 32 | s);
+ if (mValidSequences.find(key) == mValidSequences.end())
+ {
+ mValidSequences.insert(key);
+ }
+ }
+ }
+ }
+
+ void loadXMMAKernels()
+ {
+ if (!mFunctions.empty())
+ {
+ return;
+ }
+
+ loadXMMAKernels(mSM);
+
+ // sm_86 chips prefer sm_86 sass, but can also use sm_80 sass if sm_86 not exist.
+ // sm_87 cannot run sm_80 sass
+ if (mSM == kSM_86)
+ {
+ loadXMMAKernels(kSM_80);
+ }
+
+ // sm_89 will reuse sm_80 and sm_86 kernels
+ if (mSM == kSM_89)
+ {
+ loadXMMAKernels(kSM_86);
+ loadXMMAKernels(kSM_80);
+ }
+ }
+
+ bool isValid(int32_t headSize, int32_t s) const
+ {
+ uint64_t key = (static_cast<uint64_t>(headSize) << 32 | static_cast<uint64_t>(s));
+ return (mValidSequences.find(key) != mValidSequences.end());
+ }
+
+ virtual void run(TKernelParam& params, cudaStream_t ss) const
+ {
+ const auto findIter = mFunctions.find(hashID(params.s, params.d));
+ std::stringstream errMsg;
+ errMsg << "Could not find kernel for:\n"
+ << "\t s: " << params.s << "\n"
+ << "\t d: " << params.d << "\n"
+ << "Was the plugin compiled on a compatible CUDA and SM version?\n"
+ << "\t Compiled on CUDA " << CUDA_VERSION << "\n"
+ << "\t Current SM version: " << mSM << "\n"
+ << "\t SM versions enabled during compilation: "
+#if defined(ENABLE_SM72)
+ << "72 "
+#endif
+#if defined(ENABLE_SM75)
+ << "75 "
+#endif
+#if defined(ENABLE_SM80)
+ << "80 "
+#endif
+#if defined(ENABLE_SM86)
+ << "86 "
+#endif
+#if defined(ENABLE_SM87)
+ << "87 "
+#endif
+#if defined(ENABLE_SM89)
+ << "89 "
+#endif
+#if defined(ENABLE_SM90)
+ << "90 "
+#endif
+ << "\n";
+ PLUGIN_VALIDATE(findIter != mFunctions.end(), errMsg.str().c_str());
+
+ const auto& kernelMeta = mKernelMeta[findIter->second.mMetaInfoIndex];
+ const CUfunction func = findIter->second.mDeviceFunction;
+
+ void* kernelParams[] = {&params, nullptr};
+ cuErrCheck(mDriver.cuLaunchKernel(func, params.h, params.b, 1, kernelMeta.mThreadsPerCTA, 1, 1,
+ kernelMeta.mSharedMemBytes, ss, kernelParams, nullptr),
+ mDriver);
+ }
+
+ virtual ~TFusedMultiHeadAttentionXMMAKernel() = default;
+
+protected:
+ nvinfer1::CUDADriverWrapper mDriver;
+
+ plugin::bert::Data_type mDataType;
+ const TKernelMeta* mKernelMeta;
+ uint32_t mKernelMetaCount;
+ uint32_t mSM;
+ std::unordered_map<const unsigned char*, CUmodule> mModules;
+ struct FusedMultiHeadAttentionKernelInfo
+ {
+ uint32_t mMetaInfoIndex;
+ CUfunction mDeviceFunction;
+ };
+ std::unordered_map<uint64_t, FusedMultiHeadAttentionKernelInfo> mFunctions;
+ // Set of valid sequence and head size combination. We use (headSize << 32 | sequence) as key here.
+ std::unordered_set<uint64_t> mValidSequences;
+};
+template <typename TFusedMHAKernelList>
+class TFusedMHAKernelFactory
+{
+public:
+ const TFusedMHAKernelList* getXMMAKernels(const typename TFusedMHAKernelList::KernelMeta* pKernelList,
+ uint32_t nbKernels, plugin::bert::Data_type type, uint32_t sm)
+ {
+ static std::mutex s_mutex;
+ std::lock_guard<std::mutex> lg(s_mutex);
+
+ const auto id = hashID(type, sm);
+ const auto findIter = mKernels.find(id);
+ if (findIter == mKernels.end())
+ {
+ TFusedMHAKernelList* newKernel = new TFusedMHAKernelList{pKernelList, nbKernels, type, sm};
+ newKernel->loadXMMAKernels();
+ mKernels.insert(std::make_pair(id, std::unique_ptr<TFusedMHAKernelList>(newKernel)));
+ return newKernel;
+ }
+ return findIter->second.get();
+ }
+
+ static TFusedMHAKernelFactory& Get()
+ {
+ static TFusedMHAKernelFactory s_factory;
+ return s_factory;
+ }
+
+private:
+ TFusedMHAKernelFactory() = default;
+
+ inline uint64_t hashID(plugin::bert::Data_type type, uint32_t sm) const
+ {
+ // use deviceID in hasID for multi GPU support before driver support context-less loading of cubin
+ int32_t deviceID{0};
+ CSC(cudaGetDevice(&deviceID), STATUS_FAILURE);
+
+ PLUGIN_ASSERT((deviceID & 0xFFFF) == deviceID);
+ PLUGIN_ASSERT((type & 0xFFFF) == type);
+ PLUGIN_ASSERT((sm & 0xFFFFFFFF) == sm);
+ return (uint64_t) type << 48 | (uint64_t) deviceID << 32 | sm;
+ }
+
+ std::unordered_map<uint64_t, const std::unique_ptr<TFusedMHAKernelList>> mKernels;
+};
+} // namespace pluginInternal
+
namespace plugin
{
namespace bert
@@ -324,235 +554,10 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV1
#endif // defined(ENABLE_SM90)
};
-template <typename TKernelMeta, typename TKernelParam>
-class TFusedMultiHeadAttentionXMMAKernel
-{
-public:
- using KernelMeta = TKernelMeta;
- using KernelParam = TKernelParam;
- inline uint64_t hashID(uint32_t s, uint32_t d) const
- {
- return (uint64_t) s << 32 | d;
- }
- virtual uint64_t hashID(const KernelMeta& kernelMeta) const
- {
- return hashID(kernelMeta.mS, kernelMeta.mD);
- }
-
- TFusedMultiHeadAttentionXMMAKernel(const TKernelMeta* pMetaStart, uint32_t nMetaCount, Data_type type, uint32_t sm)
- : mDataType(type)
- , mKernelMeta(pMetaStart)
- , mKernelMetaCount(nMetaCount)
- , mSM(sm)
- {
- PLUGIN_ASSERT(mKernelMetaCount && "No kernels were loaded correctly.");
- }
-
- void loadXMMAKernels(uint32_t smVersion)
- {
- for (uint32_t i = 0; i < mKernelMetaCount; ++i)
- {
- const auto& kernelMeta = mKernelMeta[i];
- const auto kernelKey = hashID(kernelMeta);
- if (kernelMeta.mSM == smVersion && kernelMeta.mDataType == mDataType
- && mFunctions.find(kernelKey) == mFunctions.end())
- {
- const uint32_t DEFAULT_SMEM_SIZE{48 * 1024};
- if (kernelMeta.mSharedMemBytes >= DEFAULT_SMEM_SIZE)
- {
- int32_t deviceID{0};
- cudaGetDevice(&deviceID);
- int32_t sharedMemPerMultiprocessor{0};
- if (cudaDeviceGetAttribute(
- &sharedMemPerMultiprocessor, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID)
- != cudaSuccess
- || sharedMemPerMultiprocessor < static_cast<int32_t>(kernelMeta.mSharedMemBytes))
- {
- // skip load function because not enough shared memory to launch the kernel
- continue;
- }
- }
-
- CUmodule hmod{0};
- auto findModuleIter = mModules.find(kernelMeta.mCubin);
- if (findModuleIter != mModules.end())
- {
- hmod = findModuleIter->second;
- }
- else
- {
- cuErrCheck(mDriver.cuModuleLoadData(&hmod, kernelMeta.mCubin), mDriver);
- mModules.insert(std::make_pair(kernelMeta.mCubin, hmod));
- }
-
- FusedMultiHeadAttentionKernelInfo funcInfo;
- funcInfo.mMetaInfoIndex = i;
- cuErrCheck(mDriver.cuModuleGetFunction(&funcInfo.mDeviceFunction, hmod, kernelMeta.mFuncName), mDriver);
- if (kernelMeta.mSharedMemBytes >= DEFAULT_SMEM_SIZE)
- {
- if (mDriver.cuFuncSetAttribute(funcInfo.mDeviceFunction,
- CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, kernelMeta.mSharedMemBytes)
- != CUDA_SUCCESS)
- {
- // some chip may not have enough shared memory to launch the kernel
- continue;
- }
- }
- mFunctions.insert({kernelKey, funcInfo});
- uint64_t const s = kernelMeta.mS;
- uint64_t const headSize = kernelMeta.mD;
- uint64_t key = (headSize << 32 | s);
- if (mValidSequences.find(key) == mValidSequences.end())
- {
- mValidSequences.insert(key);
- }
- }
- }
- }
-
- void loadXMMAKernels()
- {
- if (!mFunctions.empty())
- {
- return;
- }
-
- loadXMMAKernels(mSM);
-
- // sm_86 chips prefer sm_86 sass, but can also use sm_80 sass if sm_86 not exist.
- // sm_87 cannot run sm_80 sass
- if (mSM == kSM_86)
- {
- loadXMMAKernels(kSM_80);
- }
-
- // sm_89 will reuse sm_80 and sm_86 kernels
- if (mSM == kSM_89)
- {
- loadXMMAKernels(kSM_86);
- loadXMMAKernels(kSM_80);
- }
- }
-
- bool isValid(int32_t headSize, int32_t s) const
- {
- uint64_t key = (static_cast<uint64_t>(headSize) << 32 | static_cast<uint64_t>(s));
- return (mValidSequences.find(key) != mValidSequences.end());
- }
-
- virtual void run(TKernelParam& params, cudaStream_t ss) const
- {
- const auto findIter = mFunctions.find(hashID(params.s, params.d));
- std::stringstream errMsg;
- errMsg << "Could not find kernel for:\n"
- << "\t s: " << params.s << "\n"
- << "\t d: " << params.d << "\n"
- << "Was the plugin compiled on a compatible CUDA and SM version?\n"
- << "\t Compiled on CUDA " << CUDA_VERSION << "\n"
- << "\t Current SM version: " << mSM << "\n"
- << "\t SM versions enabled during compilation: "
-#if defined(ENABLE_SM72)
- << "72 "
-#endif
-#if defined(ENABLE_SM75)
- << "75 "
-#endif
-#if defined(ENABLE_SM80)
- << "80 "
-#endif
-#if defined(ENABLE_SM86)
- << "86 "
-#endif
-#if defined(ENABLE_SM87)
- << "87 "
-#endif
-#if defined(ENABLE_SM89)
- << "89 "
-#endif
-#if defined(ENABLE_SM90)
- << "90 "
-#endif
- << "\n";
- PLUGIN_VALIDATE(findIter != mFunctions.end(), errMsg.str().c_str());
-
- const auto& kernelMeta = mKernelMeta[findIter->second.mMetaInfoIndex];
- const CUfunction func = findIter->second.mDeviceFunction;
-
- void* kernelParams[] = {&params, nullptr};
- cuErrCheck(mDriver.cuLaunchKernel(func, params.h, params.b, 1, kernelMeta.mThreadsPerCTA, 1, 1,
- kernelMeta.mSharedMemBytes, ss, kernelParams, nullptr),
- mDriver);
- }
-
- virtual ~TFusedMultiHeadAttentionXMMAKernel() = default;
-
-protected:
- nvinfer1::CUDADriverWrapper mDriver;
-
- Data_type mDataType;
- const TKernelMeta* mKernelMeta;
- uint32_t mKernelMetaCount;
- uint32_t mSM;
- std::unordered_map<const unsigned char*, CUmodule> mModules;
- struct FusedMultiHeadAttentionKernelInfo
- {
- uint32_t mMetaInfoIndex;
- CUfunction mDeviceFunction;
- };
- std::unordered_map<uint64_t, FusedMultiHeadAttentionKernelInfo> mFunctions;
- // Set of valid sequence and head size combination. We use (headSize << 32 | sequence) as key here.
- std::unordered_set<uint64_t> mValidSequences;
-};
-
-template <typename TFusedMHAKernelList>
-class TFusedMHAKernelFactory
-{
-public:
- const TFusedMHAKernelList* getXMMAKernels(
- const typename TFusedMHAKernelList::KernelMeta* pKernelList, uint32_t nbKernels, Data_type type, uint32_t sm)
- {
- static std::mutex s_mutex;
- std::lock_guard<std::mutex> lg(s_mutex);
-
- const auto id = hashID(type, sm);
- const auto findIter = mKernels.find(id);
- if (findIter == mKernels.end())
- {
- TFusedMHAKernelList* newKernel = new TFusedMHAKernelList{pKernelList, nbKernels, type, sm};
- newKernel->loadXMMAKernels();
- mKernels.insert(std::make_pair(id, std::unique_ptr<TFusedMHAKernelList>(newKernel)));
- return newKernel;
- }
- return findIter->second.get();
- }
-
- static TFusedMHAKernelFactory& Get()
- {
- static TFusedMHAKernelFactory s_factory;
- return s_factory;
- }
-
-private:
- TFusedMHAKernelFactory() = default;
-
- inline uint64_t hashID(Data_type type, uint32_t sm) const
- {
- // use deviceID in hasID for multi GPU support before driver support context-less loading of cubin
- int32_t deviceID{0};
- CSC(cudaGetDevice(&deviceID), STATUS_FAILURE);
-
- PLUGIN_ASSERT((deviceID & 0xFFFF) == deviceID);
- PLUGIN_ASSERT((type & 0xFFFF) == type);
- PLUGIN_ASSERT((sm & 0xFFFFFFFF) == sm);
- return (uint64_t) type << 48 | (uint64_t) deviceID << 32 | sm;
- }
-
- std::unordered_map<uint64_t, const std::unique_ptr<TFusedMHAKernelList>> mKernels;
-};
-
using FusedMultiHeadAttentionXMMAKernel
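- = TFusedMultiHeadAttentionXMMAKernel<FusedMultiHeadAttentionKernelMetaInfoV1, Fused_multihead_attention_params>;
-using FusedMHAKernelFactory = TFusedMHAKernelFactory<FusedMultiHeadAttentionXMMAKernel>;
+ = pluginInternal::TFusedMultiHeadAttentionXMMAKernel<FusedMultiHeadAttentionKernelMetaInfoV1, Fused_multihead_attention_params>;
+using FusedMHAKernelFactory = pluginInternal::TFusedMHAKernelFactory<FusedMultiHeadAttentionXMMAKernel>;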
inline const FusedMultiHeadAttentionXMMAKernel* getXMMAKernels(Data_type type, uint32_t sm)
{
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention_common.h b/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention_common.h
index 11d4b954..e1fe7d40 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention_common.h
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention_common.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm75.cpp
index af45426d..9ae4c46d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm80.cpp
index 3e5031b1..aef4ae47 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm87.cpp
index 0d0a6ed7..6846143d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm90.cpp
index a5134aaf..41bd15fa 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_128_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm75.cpp
index e2604633..59cadd97 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm80.cpp
index 035270eb..ab54f6b9 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm86.cpp
index 81f7a887..9189749c 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm87.cpp
index 929c0a4b..92e6811e 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm90.cpp
index a9592f3f..a2a10d10 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_384_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_512_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_512_64_kernel.sm90.cpp
index a5a19772..690e6f42 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_512_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_512_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm75.cpp
index 9dc6ffa6..6d8c23da 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm80.cpp
index 588d5dc8..34eba769 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm87.cpp
index 4d6308d3..9268ddc3 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm90.cpp
index fd292683..43b2bd85 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_64_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm75.cpp
index 238e9fbd..f345e66c 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm80.cpp
index a2eb24f7..c61eb87a 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm87.cpp
index 5b39da95..29d128ef 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm90.cpp
index 1af3e96a..18e389ca 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_fp16_96_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm75.cpp
index a18e4874..26ca9b77 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm80.cpp
index 0c079b17..ffb0d50d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm87.cpp
index b88a696d..26b7460f 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm90.cpp
index 457af2b6..eb18694d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_128_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm75.cpp
index 22611907..941996d1 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm80.cpp
index bf716793..5fe88e45 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm87.cpp
index c4376f86..0d23c4a1 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm90.cpp
index 44f159a7..576b0e17 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_384_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_512_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_512_64_kernel.sm90.cpp
index fd51119e..6cef65c5 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_512_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_512_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_64_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_64_64_kernel.sm80.cpp
index 062ce999..6211cf87 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_64_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_64_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_96_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_96_64_kernel.sm80.cpp
index 017f6862..b94a6a7b 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_96_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/src/fused_multihead_attention_int8_96_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/CMakeLists.txt b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/CMakeLists.txt
index 1d53970e..91e05d03 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/CMakeLists.txt
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h
index bb729359..ecc3684d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -832,14 +832,14 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
};
class FusedMultiHeadAttentionXMMAKernelV2
- : public TFusedMultiHeadAttentionXMMAKernel
+ : public pluginInternal::TFusedMultiHeadAttentionXMMAKernel
{
public:
FusedMultiHeadAttentionXMMAKernelV2(
const FusedMultiHeadAttentionKernelMetaInfoV2* pMetaStart, uint32_t nMetaCount, Data_type type, uint32_t sm)
- : TFusedMultiHeadAttentionXMMAKernel(pMetaStart, nMetaCount, type, sm)
+ : pluginInternal::TFusedMultiHeadAttentionXMMAKernel(pMetaStart, nMetaCount, type, sm)
{
}
@@ -988,7 +988,7 @@ class FusedMultiHeadAttentionXMMAKernelV2
}
};
-using FusedMHAKernelFactoryV2 = TFusedMHAKernelFactory;
+using FusedMHAKernelFactoryV2 = pluginInternal::TFusedMHAKernelFactory;
inline const FusedMultiHeadAttentionXMMAKernelV2* getXMMAKernelsV2(Data_type type, uint32_t sm)
{
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm75.cpp
index 373f496a..d82cc0cb 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm80.cpp
index 1e3ff7c6..3f992060 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm75.cpp
index ece2d0eb..c146aa40 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm80.cpp
index dbc34090..6ae22e4a 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm86.cpp
index ff794f09..f8a98908 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm87.cpp
index d957a175..6f3b27e3 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm90.cpp
index 910c2772..d56ece44 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_128_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm75.cpp
index f466437c..05ffdb23 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm80.cpp
index 643f3abe..0d6a6c53 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm75.cpp
index b193aac5..d549443c 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm80.cpp
index eedf762f..e8b7ec1d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm86.cpp
index 17cdf962..1d7791b3 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm87.cpp
index 3943f07e..a03a9a39 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm90.cpp
index 8aebf6e4..6e04b4e7 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_256_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm75.cpp
index 47d6f8b4..b9f264c2 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm80.cpp
index 2c0141c6..1b5e752a 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm86.cpp
index 007b0ca5..320a9e88 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm87.cpp
index e47a0eb5..e264c016 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm90.cpp
index 71047e0d..f9ce8e34 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_384_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm75.cpp
index e424fd93..8f766536 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm80.cpp
index f3b2aec9..b22936ed 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm75.cpp
index 6706f1e1..624e6e0b 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm80.cpp
index 57d31338..c8a9c2f9 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm90.cpp
index d9bbd955..a03160a2 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_512_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp
index a93f1f80..4642ab1a 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm80.cpp
index fc6e825e..fae19400 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm86.cpp
index dc64aaf1..c5dc1be8 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm87.cpp
index 17394f7b..b93318b7 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm90.cpp
index 30a6a139..192047d0 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm75.cpp
index 75826861..a4dd7851 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm80.cpp
index a5a9db91..9f8557f6 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm86.cpp
index 5c0e4792..e45804f5 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm87.cpp
index 75cca5b0..f9fd241e 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm90.cpp
index 05ed3a7d..93e21e59 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_96_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_32_kernel.sm80.cpp
index 7377bb87..cf253602 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm87.cpp
index c486ba74..a446bdc9 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm90.cpp
index ff8b71b7..52dd640b 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_128_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm87.cpp
index b55a9b29..5f51d0f9 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm90.cpp
index a486db0f..2b0aec86 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_192_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm87.cpp
index dcac39f3..3cd2a96d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm90.cpp
index 9826a2c2..5a3744f1 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_256_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm87.cpp
index b6659f16..10b61245 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm90.cpp
index bbb5eeeb..b52902cb 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_384_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm80.cpp
index f9fd6183..84db4d9b 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm87.cpp
index 6441c74a..abdc3f80 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm90.cpp
index df8cda25..dc88c038 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_64_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm80.cpp
index e62d93aa..014442ea 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm87.cpp
index 590c0df4..6d830826 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm90.cpp
index be698b64..b345aad7 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_il_int8_96_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm75.cpp
index ce3baa27..310bd7b3 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm80.cpp
index 0abcf4e3..754f1117 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm72.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm72.cpp
index fbf16481..e8d90371 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm72.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm72.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm75.cpp
index 56cb1930..208f99b1 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm80.cpp
index f7b86091..28063c3a 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm86.cpp
index fe49aaa3..9073a280 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm87.cpp
index b84b0dc8..a7c7067b 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm90.cpp
index 6f889451..2db0cf89 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_128_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm72.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm72.cpp
index 3c3735d1..81f815ab 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm72.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm72.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm75.cpp
index dfe6d8ce..c2725c28 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm80.cpp
index 8a1d2d2c..9e310f3e 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm86.cpp
index 31dd3150..d7c891c6 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm87.cpp
index aa2a81c9..a1a73aed 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm90.cpp
index a5e4c65e..e2ba6e02 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_192_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm75.cpp
index 2a729502..6b74c2fe 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm80.cpp
index aeac0ebd..ecb4e343 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm72.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm72.cpp
index a62c2cf9..248e3096 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm72.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm72.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm75.cpp
index 3fa33ae5..c9a585ee 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm80.cpp
index f597a37e..fe195f5e 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm86.cpp
index 24d31716..6afbe0c8 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm87.cpp
index b70f696d..f8e37cfb 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm90.cpp
index 07f7b870..d170e2f1 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_256_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm72.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm72.cpp
index 2d62254b..cb17f7ab 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm72.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm72.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm75.cpp
index b373a064..9fbd6434 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm80.cpp
index 86517581..d8c78ccd 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm86.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm86.cpp
index c9196880..aeac0b9e 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm86.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm86.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm87.cpp
index 70e699f8..044654af 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm90.cpp
index 848c68be..6028ed75 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_384_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm75.cpp
index baaf7441..36ece8b7 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm80.cpp
index 68204bf6..590cbecb 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_32_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm75.cpp
index 8ee4ced0..15312cff 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm75.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm75.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm80.cpp
index e9bd8613..0cd60732 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm90.cpp
index 48644b36..58e28091 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_512_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm80.cpp
index 77ccb240..23019c5a 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm87.cpp
index 2eb5c132..35635613 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm90.cpp
index 2280de3b..4161dcd5 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_64_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm80.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm80.cpp
index b7a7f1db..f9056c6d 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm80.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm80.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm87.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm87.cpp
index c2e6aca4..e5689381 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm87.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm87.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm90.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm90.cpp
index a4516a2d..427ab9f8 100644
--- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm90.cpp
+++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_int8_96_64_kernel.sm90.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp b/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp
index f62f2c9a..40a42af0 100644
--- a/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp
+++ b/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/zeroPadding2d.cu b/plugin/bertQKVToContextPlugin/zeroPadding2d.cu
index aa8a70c9..f8135ada 100644
--- a/plugin/bertQKVToContextPlugin/zeroPadding2d.cu
+++ b/plugin/bertQKVToContextPlugin/zeroPadding2d.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/bertQKVToContextPlugin/zeroPadding2d.h b/plugin/bertQKVToContextPlugin/zeroPadding2d.h
index bc1409a2..faa85ebe 100644
--- a/plugin/bertQKVToContextPlugin/zeroPadding2d.h
+++ b/plugin/bertQKVToContextPlugin/zeroPadding2d.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/clipPlugin/CMakeLists.txt b/plugin/clipPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/clipPlugin/CMakeLists.txt
+++ b/plugin/clipPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/clipPlugin/clip.cu b/plugin/clipPlugin/clip.cu
index f407ebbc..44bc1f73 100644
--- a/plugin/clipPlugin/clip.cu
+++ b/plugin/clipPlugin/clip.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/clipPlugin/clip.h b/plugin/clipPlugin/clip.h
index 70a53143..e21e8b43 100644
--- a/plugin/clipPlugin/clip.h
+++ b/plugin/clipPlugin/clip.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/CMakeLists.txt b/plugin/common/CMakeLists.txt
index 12ab940b..af59d7f7 100644
--- a/plugin/common/CMakeLists.txt
+++ b/plugin/common/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/bboxUtils.h b/plugin/common/bboxUtils.h
index 028eeb81..6419611d 100644
--- a/plugin/common/bboxUtils.h
+++ b/plugin/common/bboxUtils.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/bertCommon.h b/plugin/common/bertCommon.h
index e34e954f..4cb33551 100644
--- a/plugin/common/bertCommon.h
+++ b/plugin/common/bertCommon.h
@@ -86,6 +86,17 @@ constexpr size_t packedMaskSize384 = xmmasM384 * threadsPerCta384;
namespace nvinfer1
{
+namespace pluginInternal
+{
+template <typename T>
+struct CudaDeleter
+{
+ void operator()(T* buf)
+ {
+ PLUGIN_CUASSERT(cudaFree(buf));
+ }
+};
+} // namespace pluginInternal
namespace plugin
{
namespace bert
@@ -308,16 +319,7 @@ struct CublasConfigHelper
};
template <typename T>
-struct CudaDeleter
-{
- void operator()(T* buf)
- {
- PLUGIN_CUASSERT(cudaFree(buf));
- }
-};
-
-template <typename T>
-using cuda_unique_ptr = std::unique_ptr<T, bert::CudaDeleter<T>>;
+using cuda_unique_ptr = std::unique_ptr<T, pluginInternal::CudaDeleter<T>>;
template <typename T>
using cuda_shared_ptr = std::shared_ptr<T>;
@@ -325,7 +327,7 @@ using cuda_shared_ptr = std::shared_ptr<T>;
template <typename T>
void make_cuda_shared(cuda_shared_ptr<T>& ptr, void* cudaMem)
{
- ptr.reset(static_cast<T*>(cudaMem), bert::CudaDeleter<T>());
+ ptr.reset(static_cast<T*>(cudaMem), pluginInternal::CudaDeleter<T>());
}
struct WeightsWithOwnership : public nvinfer1::Weights
diff --git a/plugin/common/cub_helper.h b/plugin/common/cub_helper.h
index ee8402c4..7cc35848 100644
--- a/plugin/common/cub_helper.h
+++ b/plugin/common/cub_helper.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/cudaDriverWrapper.cpp b/plugin/common/cudaDriverWrapper.cpp
index 5e317564..fa83866c 100644
--- a/plugin/common/cudaDriverWrapper.cpp
+++ b/plugin/common/cudaDriverWrapper.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/cudaDriverWrapper.h b/plugin/common/cudaDriverWrapper.h
index b105e3c2..209ed3f8 100644
--- a/plugin/common/cudaDriverWrapper.h
+++ b/plugin/common/cudaDriverWrapper.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/dimsHelpers.h b/plugin/common/dimsHelpers.h
index 8198590b..239a63ac 100644
--- a/plugin/common/dimsHelpers.h
+++ b/plugin/common/dimsHelpers.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/half.h b/plugin/common/half.h
index 28825bb1..af49356a 100644
--- a/plugin/common/half.h
+++ b/plugin/common/half.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/CMakeLists.txt b/plugin/common/kernels/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/common/kernels/CMakeLists.txt
+++ b/plugin/common/kernels/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/bboxDeltas2Proposals.cu b/plugin/common/kernels/bboxDeltas2Proposals.cu
index 945d3bc5..0be5e90d 100644
--- a/plugin/common/kernels/bboxDeltas2Proposals.cu
+++ b/plugin/common/kernels/bboxDeltas2Proposals.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/cropAndResizeKernel.cu b/plugin/common/kernels/cropAndResizeKernel.cu
index aa1bec14..fdae167b 100644
--- a/plugin/common/kernels/cropAndResizeKernel.cu
+++ b/plugin/common/kernels/cropAndResizeKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/decodeBbox3DKernels.cu b/plugin/common/kernels/decodeBbox3DKernels.cu
index ac53c098..f1592e49 100644
--- a/plugin/common/kernels/decodeBbox3DKernels.cu
+++ b/plugin/common/kernels/decodeBbox3DKernels.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/detectionForward.cu b/plugin/common/kernels/detectionForward.cu
index 09cba7dd..6f28c15a 100644
--- a/plugin/common/kernels/detectionForward.cu
+++ b/plugin/common/kernels/detectionForward.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/extractFgScores.cu b/plugin/common/kernels/extractFgScores.cu
index f087e012..1785bf0a 100644
--- a/plugin/common/kernels/extractFgScores.cu
+++ b/plugin/common/kernels/extractFgScores.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/generateAnchors.cu b/plugin/common/kernels/generateAnchors.cu
index 398cf1b7..b80383f7 100644
--- a/plugin/common/kernels/generateAnchors.cu
+++ b/plugin/common/kernels/generateAnchors.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/gridAnchorLayer.cu b/plugin/common/kernels/gridAnchorLayer.cu
index 666997c5..2475a943 100644
--- a/plugin/common/kernels/gridAnchorLayer.cu
+++ b/plugin/common/kernels/gridAnchorLayer.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/kernel.cpp b/plugin/common/kernels/kernel.cpp
index 7f8a00dc..d5c0966a 100644
--- a/plugin/common/kernels/kernel.cpp
+++ b/plugin/common/kernels/kernel.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/lReLU.cu b/plugin/common/kernels/lReLU.cu
index 8a720ff1..87c42724 100644
--- a/plugin/common/kernels/lReLU.cu
+++ b/plugin/common/kernels/lReLU.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/maskRCNNKernels.cu b/plugin/common/kernels/maskRCNNKernels.cu
index b79d55e0..0a9d8083 100644
--- a/plugin/common/kernels/maskRCNNKernels.cu
+++ b/plugin/common/kernels/maskRCNNKernels.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/maskRCNNKernels.h b/plugin/common/kernels/maskRCNNKernels.h
index 71ed0784..433d7ca2 100644
--- a/plugin/common/kernels/maskRCNNKernels.h
+++ b/plugin/common/kernels/maskRCNNKernels.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/nmsLayer.cu b/plugin/common/kernels/nmsLayer.cu
index 0fdcdf39..8ce2a8f2 100644
--- a/plugin/common/kernels/nmsLayer.cu
+++ b/plugin/common/kernels/nmsLayer.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/permuteData.cu b/plugin/common/kernels/permuteData.cu
index dd43f04c..185e4c53 100644
--- a/plugin/common/kernels/permuteData.cu
+++ b/plugin/common/kernels/permuteData.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/pillarScatterKernels.cu b/plugin/common/kernels/pillarScatterKernels.cu
index 528a2665..6ee3c3e8 100644
--- a/plugin/common/kernels/pillarScatterKernels.cu
+++ b/plugin/common/kernels/pillarScatterKernels.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/priorBoxLayer.cu b/plugin/common/kernels/priorBoxLayer.cu
index 3c6e160b..af17af22 100644
--- a/plugin/common/kernels/priorBoxLayer.cu
+++ b/plugin/common/kernels/priorBoxLayer.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/proposalKernel.cu b/plugin/common/kernels/proposalKernel.cu
index 8fcaab14..82f2db9b 100644
--- a/plugin/common/kernels/proposalKernel.cu
+++ b/plugin/common/kernels/proposalKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/proposalsForward.cu b/plugin/common/kernels/proposalsForward.cu
index cab00063..2be3a087 100644
--- a/plugin/common/kernels/proposalsForward.cu
+++ b/plugin/common/kernels/proposalsForward.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/reducedMathPlugin.h b/plugin/common/kernels/reducedMathPlugin.h
index 777a5e51..d7c17f92 100644
--- a/plugin/common/kernels/reducedMathPlugin.h
+++ b/plugin/common/kernels/reducedMathPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/regionForward.cu b/plugin/common/kernels/regionForward.cu
index a948dc4f..b33b9b3f 100644
--- a/plugin/common/kernels/regionForward.cu
+++ b/plugin/common/kernels/regionForward.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/reorgForward.cu b/plugin/common/kernels/reorgForward.cu
index becc87a7..ef5fdb7a 100644
--- a/plugin/common/kernels/reorgForward.cu
+++ b/plugin/common/kernels/reorgForward.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/roiPooling.cu b/plugin/common/kernels/roiPooling.cu
index abac39a2..353173cc 100644
--- a/plugin/common/kernels/roiPooling.cu
+++ b/plugin/common/kernels/roiPooling.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/rproiInferenceFused.cu b/plugin/common/kernels/rproiInferenceFused.cu
index 46d0243b..db1161bb 100644
--- a/plugin/common/kernels/rproiInferenceFused.cu
+++ b/plugin/common/kernels/rproiInferenceFused.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/sortScoresPerClass.cu b/plugin/common/kernels/sortScoresPerClass.cu
index 1ac96086..cd62df64 100644
--- a/plugin/common/kernels/sortScoresPerClass.cu
+++ b/plugin/common/kernels/sortScoresPerClass.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/sortScoresPerImage.cu b/plugin/common/kernels/sortScoresPerImage.cu
index 2137bc09..99749c53 100644
--- a/plugin/common/kernels/sortScoresPerImage.cu
+++ b/plugin/common/kernels/sortScoresPerImage.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/kernels/voxelGeneratorKernels.cu b/plugin/common/kernels/voxelGeneratorKernels.cu
index 785a7e63..57b71798 100644
--- a/plugin/common/kernels/voxelGeneratorKernels.cu
+++ b/plugin/common/kernels/voxelGeneratorKernels.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/mrcnn_config.h b/plugin/common/mrcnn_config.h
index 5b3673ca..88added0 100644
--- a/plugin/common/mrcnn_config.h
+++ b/plugin/common/mrcnn_config.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/nmsUtils.h b/plugin/common/nmsUtils.h
index 28a4aa7e..8dbd03ff 100644
--- a/plugin/common/nmsUtils.h
+++ b/plugin/common/nmsUtils.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/reducedMathPlugin.cpp b/plugin/common/reducedMathPlugin.cpp
index 4e33680a..bedd8d2b 100644
--- a/plugin/common/reducedMathPlugin.cpp
+++ b/plugin/common/reducedMathPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/serialize.hpp b/plugin/common/serialize.hpp
index 8a29dd46..8fcef07f 100644
--- a/plugin/common/serialize.hpp
+++ b/plugin/common/serialize.hpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/templates.h b/plugin/common/templates.h
index 298bb8c2..2870bfd6 100644
--- a/plugin/common/templates.h
+++ b/plugin/common/templates.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/vfcCommon.cpp b/plugin/common/vfcCommon.cpp
index 7122d0d4..8664ab56 100644
--- a/plugin/common/vfcCommon.cpp
+++ b/plugin/common/vfcCommon.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/common/vfcCommon.h b/plugin/common/vfcCommon.h
index ee84dc97..7b7db007 100644
--- a/plugin/common/vfcCommon.h
+++ b/plugin/common/vfcCommon.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/coordConvACPlugin/CMakeLists.txt b/plugin/coordConvACPlugin/CMakeLists.txt
index df2f2da8..0e7b1e6e 100644
--- a/plugin/coordConvACPlugin/CMakeLists.txt
+++ b/plugin/coordConvACPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/coordConvACPlugin/coordConvACPlugin.cpp b/plugin/coordConvACPlugin/coordConvACPlugin.cpp
index 63462fcd..671e06ee 100644
--- a/plugin/coordConvACPlugin/coordConvACPlugin.cpp
+++ b/plugin/coordConvACPlugin/coordConvACPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/coordConvACPlugin/coordConvACPlugin.h b/plugin/coordConvACPlugin/coordConvACPlugin.h
index 1776d6f7..0df045ce 100644
--- a/plugin/coordConvACPlugin/coordConvACPlugin.h
+++ b/plugin/coordConvACPlugin/coordConvACPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/coordConvACPlugin/coordConvACPluginKernels.cu b/plugin/coordConvACPlugin/coordConvACPluginKernels.cu
index a0130a16..8f32aa87 100644
--- a/plugin/coordConvACPlugin/coordConvACPluginKernels.cu
+++ b/plugin/coordConvACPlugin/coordConvACPluginKernels.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/cropAndResizePlugin/CMakeLists.txt b/plugin/cropAndResizePlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/cropAndResizePlugin/CMakeLists.txt
+++ b/plugin/cropAndResizePlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp b/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp
index a0b19fc4..f8d5a731 100644
--- a/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp
+++ b/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/cropAndResizePlugin/cropAndResizePlugin.h b/plugin/cropAndResizePlugin/cropAndResizePlugin.h
index 54c8f16b..c0f9d33d 100644
--- a/plugin/cropAndResizePlugin/cropAndResizePlugin.h
+++ b/plugin/cropAndResizePlugin/cropAndResizePlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/decodeBbox3DPlugin/CMakeLists.txt b/plugin/decodeBbox3DPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/decodeBbox3DPlugin/CMakeLists.txt
+++ b/plugin/decodeBbox3DPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/decodeBbox3DPlugin/decodeBbox3D.cpp b/plugin/decodeBbox3DPlugin/decodeBbox3D.cpp
index f9e9faa5..96884a5b 100644
--- a/plugin/decodeBbox3DPlugin/decodeBbox3D.cpp
+++ b/plugin/decodeBbox3DPlugin/decodeBbox3D.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/decodeBbox3DPlugin/decodeBbox3D.h b/plugin/decodeBbox3DPlugin/decodeBbox3D.h
index ea85785a..65fbb5ae 100644
--- a/plugin/decodeBbox3DPlugin/decodeBbox3D.h
+++ b/plugin/decodeBbox3DPlugin/decodeBbox3D.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/detectionLayerPlugin/CMakeLists.txt b/plugin/detectionLayerPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/detectionLayerPlugin/CMakeLists.txt
+++ b/plugin/detectionLayerPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp b/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp
index 840156cd..cd243c11 100644
--- a/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp
+++ b/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/detectionLayerPlugin/detectionLayerPlugin.h b/plugin/detectionLayerPlugin/detectionLayerPlugin.h
index adbf535d..88ac12f5 100644
--- a/plugin/detectionLayerPlugin/detectionLayerPlugin.h
+++ b/plugin/detectionLayerPlugin/detectionLayerPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/disentangledAttentionPlugin/CMakeLists.txt b/plugin/disentangledAttentionPlugin/CMakeLists.txt
index df2f2da8..0e7b1e6e 100644
--- a/plugin/disentangledAttentionPlugin/CMakeLists.txt
+++ b/plugin/disentangledAttentionPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.cpp b/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.cpp
index c79096a5..d9bf788f 100644
--- a/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.cpp
+++ b/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.h b/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.h
index 7d77a514..f9d01a4c 100644
--- a/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.h
+++ b/plugin/disentangledAttentionPlugin/disentangledAttentionPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/disentangledAttentionPlugin/disentangledKernel.cu b/plugin/disentangledAttentionPlugin/disentangledKernel.cu
index f90a98e6..2636fd8f 100644
--- a/plugin/disentangledAttentionPlugin/disentangledKernel.cu
+++ b/plugin/disentangledAttentionPlugin/disentangledKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/CMakeLists.txt b/plugin/efficientNMSPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/efficientNMSPlugin/CMakeLists.txt
+++ b/plugin/efficientNMSPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/efficientNMSInference.cu b/plugin/efficientNMSPlugin/efficientNMSInference.cu
index ba99cb56..f3eee1a3 100644
--- a/plugin/efficientNMSPlugin/efficientNMSInference.cu
+++ b/plugin/efficientNMSPlugin/efficientNMSInference.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/efficientNMSInference.cuh b/plugin/efficientNMSPlugin/efficientNMSInference.cuh
index bf12c359..c16bdb40 100644
--- a/plugin/efficientNMSPlugin/efficientNMSInference.cuh
+++ b/plugin/efficientNMSPlugin/efficientNMSInference.cuh
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/efficientNMSInference.h b/plugin/efficientNMSPlugin/efficientNMSInference.h
index d9ec3192..fa4749bd 100644
--- a/plugin/efficientNMSPlugin/efficientNMSInference.h
+++ b/plugin/efficientNMSPlugin/efficientNMSInference.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/efficientNMSParameters.h b/plugin/efficientNMSPlugin/efficientNMSParameters.h
index 89829089..c4b6dc51 100644
--- a/plugin/efficientNMSPlugin/efficientNMSParameters.h
+++ b/plugin/efficientNMSPlugin/efficientNMSParameters.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/efficientNMSPlugin.cpp b/plugin/efficientNMSPlugin/efficientNMSPlugin.cpp
index 1a8692ae..71836943 100644
--- a/plugin/efficientNMSPlugin/efficientNMSPlugin.cpp
+++ b/plugin/efficientNMSPlugin/efficientNMSPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/efficientNMSPlugin.h b/plugin/efficientNMSPlugin/efficientNMSPlugin.h
index afceec01..c7248d91 100644
--- a/plugin/efficientNMSPlugin/efficientNMSPlugin.h
+++ b/plugin/efficientNMSPlugin/efficientNMSPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/tftrt/CMakeLists.txt b/plugin/efficientNMSPlugin/tftrt/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/efficientNMSPlugin/tftrt/CMakeLists.txt
+++ b/plugin/efficientNMSPlugin/tftrt/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.cpp b/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.cpp
index f5c86365..3aef2fe6 100644
--- a/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.cpp
+++ b/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.h b/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.h
index e1e98052..2ad7a2f0 100644
--- a/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.h
+++ b/plugin/efficientNMSPlugin/tftrt/efficientNMSExplicitTFTRTPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.cpp b/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.cpp
index 25c8e0ef..af75d75d 100644
--- a/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.cpp
+++ b/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.h b/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.h
index 51b09148..58e07289 100644
--- a/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.h
+++ b/plugin/efficientNMSPlugin/tftrt/efficientNMSImplicitTFTRTPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/CMakeLists.txt b/plugin/embLayerNormPlugin/CMakeLists.txt
index f49d60bd..0fbe405b 100644
--- a/plugin/embLayerNormPlugin/CMakeLists.txt
+++ b/plugin/embLayerNormPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormKernel.cu b/plugin/embLayerNormPlugin/embLayerNormKernel.cu
index a32d14e5..6e6707d7 100644
--- a/plugin/embLayerNormPlugin/embLayerNormKernel.cu
+++ b/plugin/embLayerNormPlugin/embLayerNormKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp b/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp
index 8e392b82..ab523971 100644
--- a/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp
+++ b/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormPlugin.h b/plugin/embLayerNormPlugin/embLayerNormPlugin.h
index eb21d268..5eb40958 100644
--- a/plugin/embLayerNormPlugin/embLayerNormPlugin.h
+++ b/plugin/embLayerNormPlugin/embLayerNormPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelHFace.cu b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelHFace.cu
index db8f6b06..a23f3326 100644
--- a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelHFace.cu
+++ b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelHFace.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelMTron.cu b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelMTron.cu
index 95e45820..2fddfe02 100644
--- a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelMTron.cu
+++ b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenKernelMTron.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.cpp b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.cpp
index 4b6bd72d..4313faa7 100644
--- a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.cpp
+++ b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.h b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.h
index 80a0cc57..d3141a6b 100644
--- a/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.h
+++ b/plugin/embLayerNormPlugin/embLayerNormVarSeqlenPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/exports-vfc_plugin.def b/plugin/exports-vfc_plugin.def
index d47954b3..28a79242 100644
--- a/plugin/exports-vfc_plugin.def
+++ b/plugin/exports-vfc_plugin.def
@@ -1,4 +1,4 @@
-; SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+; SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
; SPDX-License-Identifier: Apache-2.0
;
; Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +13,7 @@
; See the License for the specific language governing permissions and
; limitations under the License.
-LIBRARY nvinfer_vc_plugin
+LIBRARY nvinfer_vc_plugin_10
EXPORTS
setLoggerFinder
getPluginCreators
diff --git a/plugin/exports-vfc_plugin.map b/plugin/exports-vfc_plugin.map
index b90d58ce..7171544b 100644
--- a/plugin/exports-vfc_plugin.map
+++ b/plugin/exports-vfc_plugin.map
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/exports.def b/plugin/exports.def
index 6dac36fe..20503473 100644
--- a/plugin/exports.def
+++ b/plugin/exports.def
@@ -1,4 +1,4 @@
-; SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+; SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
; SPDX-License-Identifier: Apache-2.0
;
; Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +13,7 @@
; See the License for the specific language governing permissions and
; limitations under the License.
-LIBRARY nvinfer_plugin
+LIBRARY nvinfer_plugin_10
EXPORTS
getInferLibVersion
getPluginRegistry
diff --git a/plugin/exports.map b/plugin/exports.map
index 64de08ba..b68b1d16 100644
--- a/plugin/exports.map
+++ b/plugin/exports.map
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/fcPlugin/CMakeLists.txt b/plugin/fcPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/fcPlugin/CMakeLists.txt
+++ b/plugin/fcPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/fcPlugin/fcPlugin.cpp b/plugin/fcPlugin/fcPlugin.cpp
index c98ae433..fd0c1339 100644
--- a/plugin/fcPlugin/fcPlugin.cpp
+++ b/plugin/fcPlugin/fcPlugin.cpp
@@ -140,7 +140,7 @@ void nvinfer1::plugin::bert::LtGemmSearch(cublasLtHandle_t ltHandle, cublasOpera
void const* A, int32_t const& lda, void const* B, int32_t const& ldb, void const* beta, // host pointer
void* C, int32_t const& ldc, void* workSpace, size_t workSpaceSize, cublasComputeType_t computeType,
cudaDataType_t scaleType, cudaDataType_t Atype, cudaDataType_t Btype, cudaDataType_t Ctype,
- std::vector& perfResults)
+ std::vector& perfResults, cudaStream_t stream)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
@@ -153,7 +153,6 @@ void nvinfer1::plugin::bert::LtGemmSearch(cublasLtHandle_t ltHandle, cublasOpera
cudaEvent_t startEvent = nullptr;
cudaEvent_t stopEvent = nullptr;
- cudaStream_t stream = nullptr;
CublasLtWrapper& cublasLtWrapper = getCublasLtWrapper();
@@ -520,13 +519,20 @@ void FCPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int
if (mAlgo.data[0] == 0 && memcmp(mAlgo.data, mAlgo.data + 1, sizeof(mAlgo.data) - sizeof(mAlgo.data[0])) == 0)
{
gLogVerbose << "FCPluginDynamic gemmSearch\n";
+ if (mSharedStream == nullptr)
+ {
+ SharedStream ss{};
+ mSharedStream = static_cast(
+ getPluginRegistry()->acquirePluginResource(kFCPLUGIN_SHARED_STREAM_KEY, &ss))
+ ->mStream;
+ }
if (mType == DataType::kFLOAT)
{
- mAlgo = gemmSearch(mOutDim, mNmax, mK, kMAX_WORKSPACE_BYTES, actualWorkspace);
+ mAlgo = gemmSearch(mOutDim, mNmax, mK, kMAX_WORKSPACE_BYTES, actualWorkspace, mSharedStream);
}
else if (mType == DataType::kHALF)
{
- mAlgo = gemmSearch(mOutDim, mNmax, mK, kMAX_WORKSPACE_BYTES, actualWorkspace);
+ mAlgo = gemmSearch(mOutDim, mNmax, mK, kMAX_WORKSPACE_BYTES, actualWorkspace, mSharedStream);
}
}
@@ -656,6 +662,11 @@ int32_t FCPluginDynamic::initialize() noexcept
void FCPluginDynamic::terminate() noexcept
{
gLogVerbose << "FCPluginDynamic terminate\n";
+ if (mSharedStream)
+ {
+ TRT_UNUSED(getPluginRegistry()->releasePluginResource(kFCPLUGIN_SHARED_STREAM_KEY));
+ mSharedStream = nullptr;
+ }
}
size_t FCPluginDynamic::getSerializationSize() const noexcept
diff --git a/plugin/fcPlugin/fcPlugin.h b/plugin/fcPlugin/fcPlugin.h
index 1ba56f7b..855ce96d 100644
--- a/plugin/fcPlugin/fcPlugin.h
+++ b/plugin/fcPlugin/fcPlugin.h
@@ -31,6 +31,67 @@
namespace nvinfer1
{
+
+namespace pluginInternal
+{
+class SharedStream : public IPluginResource
+{
+public:
+ SharedStream(bool init = false)
+ {
+ if (init)
+ {
+ PLUGIN_CUASSERT(cudaStreamCreate(&mStream));
+ }
+ }
+
+ void free()
+ {
+ if (mStream != nullptr)
+ {
+ PLUGIN_CUASSERT(cudaStreamDestroy(mStream));
+ mStream = nullptr;
+ }
+ }
+
+ int32_t release() noexcept override
+ {
+ try
+ {
+ free();
+ }
+ catch (std::exception const& e)
+ {
+ return -1;
+ }
+ return 0;
+ }
+
+ IPluginResource* clone() noexcept override
+ {
+ std::unique_ptr cloned{};
+ try
+ {
+ cloned = std::make_unique(/* init */ true);
+ }
+ catch (std::exception const& e)
+ {
+ return nullptr;
+ }
+ return cloned.release();
+ }
+
+ ~SharedStream() override
+ {
+ if (mStream)
+ {
+ free();
+ }
+ }
+
+ cudaStream_t mStream{nullptr};
+};
+} // namespace pluginInternal
namespace plugin
{
namespace bert
@@ -41,6 +102,8 @@ struct GemmTypes
{
};
+char const* const kFCPLUGIN_SHARED_STREAM_KEY{"fcPlugin_timing_key"};
+
template <>
struct GemmTypes
{
@@ -174,11 +237,12 @@ void LtGemmSearch(nvinfer1::pluginInternal::cublasLtHandle_t ltHandle,
cudaDataType_t Atype,
cudaDataType_t Btype,
cudaDataType_t Ctype,
- std::vector &perfResults);
+ std::vector &perfResults,
+ cudaStream_t stream);
// clang-format on
template
void LtGemmSearch(nvinfer1::pluginInternal::cublasLtHandle_t ltHandle, Gemm const& g, void* workSpace,
- size_t workSpaceSize, std::vector& perfResults)
+ size_t workSpaceSize, std::vector& perfResults, cudaStream_t stream)
{
// clang-format off
LtGemmSearch(
@@ -203,7 +267,8 @@ void LtGemmSearch(nvinfer1::pluginInternal::cublasLtHandle_t ltHandle, Gemm c
Gemm::Types::cudaTypeI,
Gemm::Types::cudaTypeI,
Gemm::Types::cudaTypeO,
- perfResults
+ perfResults,
+ stream
);
// clang-format on
}
@@ -380,29 +445,30 @@ struct AlgoProps
};
template
-nvinfer1::pluginInternal::cublasLtMatmulAlgo_t gemmSearch(
- int32_t const m, int32_t const n, int32_t const k, size_t const workspaceSize, size_t& actualWorkspace)
+nvinfer1::pluginInternal::cublasLtMatmulAlgo_t gemmSearch(int32_t const m, int32_t const n, int32_t const k,
+ size_t const workspaceSize, size_t& actualWorkspace, cudaStream_t& stream)
{
Gemm g(m, n, k, false, false);
std::vector perfResults(kNB_ALGO_COMBINATIONS);
- PLUGIN_CUASSERT(cudaMalloc(reinterpret_cast(&g.A), g.bytesA));
- PLUGIN_CUASSERT(cudaMalloc(reinterpret_cast(&g.B), g.bytesB));
- PLUGIN_CUASSERT(cudaMalloc(reinterpret_cast(&g.C), g.bytesC));
+ PLUGIN_CUASSERT(cudaMallocAsync(reinterpret_cast(&g.A), g.bytesA, stream));
+ PLUGIN_CUASSERT(cudaMallocAsync(reinterpret_cast(&g.B), g.bytesB, stream));
+ PLUGIN_CUASSERT(cudaMallocAsync(reinterpret_cast(&g.C), g.bytesC, stream));
void* workspace;
- PLUGIN_CUASSERT(cudaMalloc(&workspace, workspaceSize));
+ PLUGIN_CUASSERT(cudaMallocAsync(&workspace, workspaceSize, stream));
nvinfer1::pluginInternal::cublasLtHandle_t lt;
nvinfer1::pluginInternal::CublasLtWrapper& cublasLtWrapper = nvinfer1::pluginInternal::getCublasLtWrapper();
PLUGIN_CUBLASASSERT(cublasLtWrapper.cublasLtCreate(<));
- LtGemmSearch(lt, g, workspace, workspaceSize, perfResults);
- PLUGIN_CUASSERT(cudaDeviceSynchronize());
+
+ LtGemmSearch(lt, g, workspace, workspaceSize, perfResults, stream);
+ PLUGIN_CUASSERT(cudaStreamSynchronize(stream));
PLUGIN_CUBLASASSERT(cublasLtWrapper.cublasLtDestroy(lt));
- PLUGIN_CUASSERT(cudaFree(workspace));
+ PLUGIN_CUASSERT(cudaFreeAsync(workspace, stream));
- PLUGIN_CUASSERT(cudaFree(g.A));
- PLUGIN_CUASSERT(cudaFree(g.B));
- PLUGIN_CUASSERT(cudaFree(g.C));
+ PLUGIN_CUASSERT(cudaFreeAsync(g.A, stream));
+ PLUGIN_CUASSERT(cudaFreeAsync(g.B, stream));
+ PLUGIN_CUASSERT(cudaFreeAsync(g.C, stream));
actualWorkspace = perfResults[0].workspaceSize;
return perfResults[0].algo;
@@ -410,27 +476,28 @@ nvinfer1::pluginInternal::cublasLtMatmulAlgo_t gemmSearch(
template
nvinfer1::pluginInternal::cublasLtMatmulAlgo_t gemmSearch(
- Gemm& g, size_t const workspaceSize, size_t& actualWorkspace)
+ Gemm& g, size_t const workspaceSize, size_t& actualWorkspace, cudaStream_t& stream)
{
std::vector perfResults(kNB_ALGO_COMBINATIONS);
- PLUGIN_CUASSERT(cudaMalloc(&g.A, g.bytesA));
- PLUGIN_CUASSERT(cudaMalloc(&g.B, g.bytesB));
- PLUGIN_CUASSERT(cudaMalloc(&g.C, g.bytesC));
+ PLUGIN_CUASSERT(cudaMallocAsync(&g.A, g.bytesA, stream));
+ PLUGIN_CUASSERT(cudaMallocAsync(&g.B, g.bytesB, stream));
+ PLUGIN_CUASSERT(cudaMallocAsync(&g.C, g.bytesC, stream));
void* workspace;
- PLUGIN_CUASSERT(cudaMalloc(&workspace, workspaceSize));
+ PLUGIN_CUASSERT(cudaMallocAsync(&workspace, workspaceSize, stream));
nvinfer1::pluginInternal::cublasLtHandle_t lt;
nvinfer1::pluginInternal::CublasLtWrapper& cublasLtWrapper = nvinfer1::pluginInternal::getCublasLtWrapper();
PLUGIN_CUBLASASSERT(cublasLtWrapper.cublasLtCreate(<));
- LtGemmSearch(lt, g, workspace, workspaceSize, perfResults);
- PLUGIN_CUASSERT(cudaDeviceSynchronize());
+
+ LtGemmSearch(lt, g, workspace, workspaceSize, perfResults, stream);
+ PLUGIN_CUASSERT(cudaStreamSynchronize(stream));
PLUGIN_CUBLASASSERT(cublasLtWrapper.cublasLtDestroy(lt));
- PLUGIN_CUASSERT(cudaFree(workspace));
+ PLUGIN_CUASSERT(cudaFreeAsync(workspace, stream));
- PLUGIN_CUASSERT(cudaFree(g.A));
- PLUGIN_CUASSERT(cudaFree(g.B));
- PLUGIN_CUASSERT(cudaFree(g.C));
+ PLUGIN_CUASSERT(cudaFreeAsync(g.A, stream));
+ PLUGIN_CUASSERT(cudaFreeAsync(g.B, stream));
+ PLUGIN_CUASSERT(cudaFreeAsync(g.C, stream));
actualWorkspace = perfResults[0].workspaceSize;
return perfResults[0].algo;
@@ -500,6 +567,7 @@ class FCPluginDynamic : public nvinfer1::IPluginV2DynamicExt
bert::cuda_unique_ptr mWdev;
LtContext mLtContext;
+ cudaStream_t mSharedStream{nullptr};
};
class FCPluginDynamicCreator : public nvinfer1::IPluginCreator
@@ -527,6 +595,7 @@ class FCPluginDynamicCreator : public nvinfer1::IPluginCreator
static std::vector mPluginAttributes;
std::string mNamespace;
};
+
} // namespace bert
} // namespace plugin
} // namespace nvinfer1
diff --git a/plugin/flattenConcat/CMakeLists.txt b/plugin/flattenConcat/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/flattenConcat/CMakeLists.txt
+++ b/plugin/flattenConcat/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/geluPlugin/CMakeLists.txt b/plugin/geluPlugin/CMakeLists.txt
index f49d60bd..0fbe405b 100644
--- a/plugin/geluPlugin/CMakeLists.txt
+++ b/plugin/geluPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/geluPlugin/geluKernel.cu b/plugin/geluPlugin/geluKernel.cu
index 823ae803..fd7f8d54 100644
--- a/plugin/geluPlugin/geluKernel.cu
+++ b/plugin/geluPlugin/geluKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/geluPlugin/geluPlugin.cpp b/plugin/geluPlugin/geluPlugin.cpp
index ca0d775f..dc6d48f8 100644
--- a/plugin/geluPlugin/geluPlugin.cpp
+++ b/plugin/geluPlugin/geluPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/geluPlugin/geluPlugin.h b/plugin/geluPlugin/geluPlugin.h
index 14bc0f6a..724d4ee8 100644
--- a/plugin/geluPlugin/geluPlugin.h
+++ b/plugin/geluPlugin/geluPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/generateDetectionPlugin/CMakeLists.txt b/plugin/generateDetectionPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/generateDetectionPlugin/CMakeLists.txt
+++ b/plugin/generateDetectionPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/generateDetectionPlugin/generateDetectionPlugin.cpp b/plugin/generateDetectionPlugin/generateDetectionPlugin.cpp
index 574f2ba2..7c7f5f82 100644
--- a/plugin/generateDetectionPlugin/generateDetectionPlugin.cpp
+++ b/plugin/generateDetectionPlugin/generateDetectionPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/generateDetectionPlugin/generateDetectionPlugin.h b/plugin/generateDetectionPlugin/generateDetectionPlugin.h
index 75dd50f3..f888f8a7 100644
--- a/plugin/generateDetectionPlugin/generateDetectionPlugin.h
+++ b/plugin/generateDetectionPlugin/generateDetectionPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/gridAnchorPlugin/CMakeLists.txt b/plugin/gridAnchorPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/gridAnchorPlugin/CMakeLists.txt
+++ b/plugin/gridAnchorPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/groupNormalizationPlugin/CMakeLists.txt b/plugin/groupNormalizationPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/groupNormalizationPlugin/CMakeLists.txt
+++ b/plugin/groupNormalizationPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/groupNormalizationPlugin/groupNormalizationKernel.cu b/plugin/groupNormalizationPlugin/groupNormalizationKernel.cu
index fc051e7f..4ab6dd12 100644
--- a/plugin/groupNormalizationPlugin/groupNormalizationKernel.cu
+++ b/plugin/groupNormalizationPlugin/groupNormalizationKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/instanceNormalizationPlugin/CMakeLists.txt b/plugin/instanceNormalizationPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/instanceNormalizationPlugin/CMakeLists.txt
+++ b/plugin/instanceNormalizationPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/instanceNormalizationPlugin/instanceNormCommon.h b/plugin/instanceNormalizationPlugin/instanceNormCommon.h
index fb6f5bd0..938ed2cf 100644
--- a/plugin/instanceNormalizationPlugin/instanceNormCommon.h
+++ b/plugin/instanceNormalizationPlugin/instanceNormCommon.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/instanceNormalizationPlugin/instanceNormFwd.h b/plugin/instanceNormalizationPlugin/instanceNormFwd.h
index 5f5901bb..1836eb41 100644
--- a/plugin/instanceNormalizationPlugin/instanceNormFwd.h
+++ b/plugin/instanceNormalizationPlugin/instanceNormFwd.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu b/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu
index b79436e7..3bf35f6b 100644
--- a/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu
+++ b/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/leakyReluPlugin/CMakeLists.txt b/plugin/leakyReluPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/leakyReluPlugin/CMakeLists.txt
+++ b/plugin/leakyReluPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/leakyReluPlugin/lReluPlugin.cpp b/plugin/leakyReluPlugin/lReluPlugin.cpp
index 28148c8b..3acf8f39 100644
--- a/plugin/leakyReluPlugin/lReluPlugin.cpp
+++ b/plugin/leakyReluPlugin/lReluPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/leakyReluPlugin/lReluPlugin.h b/plugin/leakyReluPlugin/lReluPlugin.h
index 60d81029..087b0a0b 100644
--- a/plugin/leakyReluPlugin/lReluPlugin.h
+++ b/plugin/leakyReluPlugin/lReluPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/modulatedDeformConvPlugin/CMakeLists.txt b/plugin/modulatedDeformConvPlugin/CMakeLists.txt
index df2f2da8..0e7b1e6e 100644
--- a/plugin/modulatedDeformConvPlugin/CMakeLists.txt
+++ b/plugin/modulatedDeformConvPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/modulatedDeformConvPlugin/commonCudaHelper.h b/plugin/modulatedDeformConvPlugin/commonCudaHelper.h
index 5466817b..336867cd 100644
--- a/plugin/modulatedDeformConvPlugin/commonCudaHelper.h
+++ b/plugin/modulatedDeformConvPlugin/commonCudaHelper.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelCropAndResizePlugin/CMakeLists.txt b/plugin/multilevelCropAndResizePlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/multilevelCropAndResizePlugin/CMakeLists.txt
+++ b/plugin/multilevelCropAndResizePlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.cpp b/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.cpp
index 8b8c57f0..6ae3186d 100644
--- a/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.cpp
+++ b/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.h b/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.h
index c2f615b4..d30df9cf 100644
--- a/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.h
+++ b/plugin/multilevelCropAndResizePlugin/multilevelCropAndResizePlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelProposeROI/CMakeLists.txt b/plugin/multilevelProposeROI/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/multilevelProposeROI/CMakeLists.txt
+++ b/plugin/multilevelProposeROI/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelProposeROI/multilevelProposeROIPlugin.cpp b/plugin/multilevelProposeROI/multilevelProposeROIPlugin.cpp
index d9ad8add..48d3a359 100644
--- a/plugin/multilevelProposeROI/multilevelProposeROIPlugin.cpp
+++ b/plugin/multilevelProposeROI/multilevelProposeROIPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelProposeROI/multilevelProposeROIPlugin.h b/plugin/multilevelProposeROI/multilevelProposeROIPlugin.h
index 653958e6..e384556f 100644
--- a/plugin/multilevelProposeROI/multilevelProposeROIPlugin.h
+++ b/plugin/multilevelProposeROI/multilevelProposeROIPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multilevelProposeROI/tlt_mrcnn_config.h b/plugin/multilevelProposeROI/tlt_mrcnn_config.h
index 13c8abfe..d85cc9fd 100644
--- a/plugin/multilevelProposeROI/tlt_mrcnn_config.h
+++ b/plugin/multilevelProposeROI/tlt_mrcnn_config.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multiscaleDeformableAttnPlugin/CMakeLists.txt b/plugin/multiscaleDeformableAttnPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/multiscaleDeformableAttnPlugin/CMakeLists.txt
+++ b/plugin/multiscaleDeformableAttnPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.cu b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.cu
index d6843c64..648c83fb 100644
--- a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.cu
+++ b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.h b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.h
index ba29c0bb..50336389 100644
--- a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.h
+++ b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttnPlugin.cpp b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttnPlugin.cpp
index 18b763ba..1a87adb0 100644
--- a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttnPlugin.cpp
+++ b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttnPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableIm2ColCuda.cuh b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableIm2ColCuda.cuh
index 370c4cd1..454b9f03 100644
--- a/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableIm2ColCuda.cuh
+++ b/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableIm2ColCuda.cuh
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/nmsPlugin/CMakeLists.txt b/plugin/nmsPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/nmsPlugin/CMakeLists.txt
+++ b/plugin/nmsPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/nmsPlugin/nmsPlugin.cpp b/plugin/nmsPlugin/nmsPlugin.cpp
index 458c184e..e567f8b9 100644
--- a/plugin/nmsPlugin/nmsPlugin.cpp
+++ b/plugin/nmsPlugin/nmsPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/nmsPlugin/nmsPlugin.h b/plugin/nmsPlugin/nmsPlugin.h
index eccefc37..dc70f2b0 100644
--- a/plugin/nmsPlugin/nmsPlugin.h
+++ b/plugin/nmsPlugin/nmsPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/normalizePlugin/CMakeLists.txt b/plugin/normalizePlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/normalizePlugin/CMakeLists.txt
+++ b/plugin/normalizePlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/nvFasterRCNN/CMakeLists.txt b/plugin/nvFasterRCNN/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/nvFasterRCNN/CMakeLists.txt
+++ b/plugin/nvFasterRCNN/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/pillarScatterPlugin/CMakeLists.txt b/plugin/pillarScatterPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/pillarScatterPlugin/CMakeLists.txt
+++ b/plugin/pillarScatterPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/pillarScatterPlugin/pillarScatter.cpp b/plugin/pillarScatterPlugin/pillarScatter.cpp
index b520347f..fe47b4c0 100644
--- a/plugin/pillarScatterPlugin/pillarScatter.cpp
+++ b/plugin/pillarScatterPlugin/pillarScatter.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/pillarScatterPlugin/pillarScatter.h b/plugin/pillarScatterPlugin/pillarScatter.h
index cdaf0454..6a968b08 100644
--- a/plugin/pillarScatterPlugin/pillarScatter.h
+++ b/plugin/pillarScatterPlugin/pillarScatter.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/priorBoxPlugin/CMakeLists.txt b/plugin/priorBoxPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/priorBoxPlugin/CMakeLists.txt
+++ b/plugin/priorBoxPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/proposalLayerPlugin/CMakeLists.txt b/plugin/proposalLayerPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/proposalLayerPlugin/CMakeLists.txt
+++ b/plugin/proposalLayerPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp b/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp
index 1335ea66..b9847495 100644
--- a/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp
+++ b/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/proposalLayerPlugin/proposalLayerPlugin.h b/plugin/proposalLayerPlugin/proposalLayerPlugin.h
index 68a0d136..d612db29 100644
--- a/plugin/proposalLayerPlugin/proposalLayerPlugin.h
+++ b/plugin/proposalLayerPlugin/proposalLayerPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/proposalPlugin/CMakeLists.txt b/plugin/proposalPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/proposalPlugin/CMakeLists.txt
+++ b/plugin/proposalPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/proposalPlugin/proposalPlugin.cpp b/plugin/proposalPlugin/proposalPlugin.cpp
index e1bd677b..6a6e48c0 100644
--- a/plugin/proposalPlugin/proposalPlugin.cpp
+++ b/plugin/proposalPlugin/proposalPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -514,7 +514,7 @@ void ProposalPlugin::setPluginNamespace(char const* libNamespace) noexcept
{
try
{
- PLUGIN_VALIDATE(libNamespace != nullptr);
+ PLUGIN_VALIDATE(libNamespace != nullptr);
mNamespace = libNamespace;
}
catch (std::exception const& e)
@@ -527,7 +527,7 @@ void ProposalDynamicPlugin::setPluginNamespace(char const* libNamespace) noexcep
{
try
{
- PLUGIN_VALIDATE(libNamespace != nullptr);
+ PLUGIN_VALIDATE(libNamespace != nullptr);
mNamespace = libNamespace;
}
catch (std::exception const& e)
diff --git a/plugin/proposalPlugin/proposalPlugin.h b/plugin/proposalPlugin/proposalPlugin.h
index 05e9508f..025f1dee 100644
--- a/plugin/proposalPlugin/proposalPlugin.h
+++ b/plugin/proposalPlugin/proposalPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/pyramidROIAlignPlugin/CMakeLists.txt b/plugin/pyramidROIAlignPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/pyramidROIAlignPlugin/CMakeLists.txt
+++ b/plugin/pyramidROIAlignPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp
index 141339f1..e1cf5749 100644
--- a/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp
+++ b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h
index dde6a309..9d2cf26e 100644
--- a/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h
+++ b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/regionPlugin/CMakeLists.txt b/plugin/regionPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/regionPlugin/CMakeLists.txt
+++ b/plugin/regionPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/regionPlugin/regionPlugin.cpp b/plugin/regionPlugin/regionPlugin.cpp
index c6f709eb..6a140556 100644
--- a/plugin/regionPlugin/regionPlugin.cpp
+++ b/plugin/regionPlugin/regionPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/regionPlugin/regionPlugin.h b/plugin/regionPlugin/regionPlugin.h
index 7af234f1..66913fc0 100644
--- a/plugin/regionPlugin/regionPlugin.h
+++ b/plugin/regionPlugin/regionPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/reorgPlugin/CMakeLists.txt b/plugin/reorgPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/reorgPlugin/CMakeLists.txt
+++ b/plugin/reorgPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/reorgPlugin/reorgPlugin.cpp b/plugin/reorgPlugin/reorgPlugin.cpp
index 0154580a..227c59d9 100644
--- a/plugin/reorgPlugin/reorgPlugin.cpp
+++ b/plugin/reorgPlugin/reorgPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/reorgPlugin/reorgPlugin.h b/plugin/reorgPlugin/reorgPlugin.h
index 5971e028..f0e4b2e6 100644
--- a/plugin/reorgPlugin/reorgPlugin.h
+++ b/plugin/reorgPlugin/reorgPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/resizeNearestPlugin/CMakeLists.txt b/plugin/resizeNearestPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/resizeNearestPlugin/CMakeLists.txt
+++ b/plugin/resizeNearestPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp b/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp
index d60c91fa..75f0b73a 100644
--- a/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp
+++ b/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/resizeNearestPlugin/resizeNearestPlugin.h b/plugin/resizeNearestPlugin/resizeNearestPlugin.h
index 5db5fc49..3f9f7e3e 100644
--- a/plugin/resizeNearestPlugin/resizeNearestPlugin.h
+++ b/plugin/resizeNearestPlugin/resizeNearestPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/roiAlignPlugin/CMakeLists.txt b/plugin/roiAlignPlugin/CMakeLists.txt
index bd8066f0..a2ac13d7 100644
--- a/plugin/roiAlignPlugin/CMakeLists.txt
+++ b/plugin/roiAlignPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/roiAlignPlugin/roiAlignKernel.h b/plugin/roiAlignPlugin/roiAlignKernel.h
index 890a9822..3be3faaa 100644
--- a/plugin/roiAlignPlugin/roiAlignKernel.h
+++ b/plugin/roiAlignPlugin/roiAlignKernel.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/roiAlignPlugin/roiAlignPlugin.cpp b/plugin/roiAlignPlugin/roiAlignPlugin.cpp
index d5e51638..5681eff5 100644
--- a/plugin/roiAlignPlugin/roiAlignPlugin.cpp
+++ b/plugin/roiAlignPlugin/roiAlignPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/roiAlignPlugin/roiAlignPlugin.h b/plugin/roiAlignPlugin/roiAlignPlugin.h
index f1246c83..e22d2571 100644
--- a/plugin/roiAlignPlugin/roiAlignPlugin.h
+++ b/plugin/roiAlignPlugin/roiAlignPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/CMakeLists.txt b/plugin/scatterElementsPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/scatterElementsPlugin/CMakeLists.txt
+++ b/plugin/scatterElementsPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/TensorInfo.cuh b/plugin/scatterElementsPlugin/TensorInfo.cuh
index 0656756c..fd6dd69d 100644
--- a/plugin/scatterElementsPlugin/TensorInfo.cuh
+++ b/plugin/scatterElementsPlugin/TensorInfo.cuh
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/atomics.cuh b/plugin/scatterElementsPlugin/atomics.cuh
index 90094c22..19c43e48 100644
--- a/plugin/scatterElementsPlugin/atomics.cuh
+++ b/plugin/scatterElementsPlugin/atomics.cuh
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/reducer.cuh b/plugin/scatterElementsPlugin/reducer.cuh
index 7143aa9f..baa13d92 100644
--- a/plugin/scatterElementsPlugin/reducer.cuh
+++ b/plugin/scatterElementsPlugin/reducer.cuh
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp b/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp
index 7910ad55..babbaecc 100644
--- a/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp
+++ b/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/scatterElementsPlugin.h b/plugin/scatterElementsPlugin/scatterElementsPlugin.h
index a49c4448..01c2a73d 100644
--- a/plugin/scatterElementsPlugin/scatterElementsPlugin.h
+++ b/plugin/scatterElementsPlugin/scatterElementsPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/scatterElementsPluginKernel.cu b/plugin/scatterElementsPlugin/scatterElementsPluginKernel.cu
index 7f487725..b09db5ae 100644
--- a/plugin/scatterElementsPlugin/scatterElementsPluginKernel.cu
+++ b/plugin/scatterElementsPlugin/scatterElementsPluginKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h b/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h
index d7fa1f5a..307ef355 100644
--- a/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h
+++ b/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterPlugin/CMakeLists.txt b/plugin/scatterPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/scatterPlugin/CMakeLists.txt
+++ b/plugin/scatterPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/scatterPlugin/scatterLayer.cu b/plugin/scatterPlugin/scatterLayer.cu
index b7409156..55fdef1f 100644
--- a/plugin/scatterPlugin/scatterLayer.cu
+++ b/plugin/scatterPlugin/scatterLayer.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/CMakeLists.txt b/plugin/skipLayerNormPlugin/CMakeLists.txt
index f49d60bd..0fbe405b 100644
--- a/plugin/skipLayerNormPlugin/CMakeLists.txt
+++ b/plugin/skipLayerNormPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelHFace.cu b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelHFace.cu
index 428c7483..b915dfb2 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelHFace.cu
+++ b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelHFace.cu
@@ -1,6 +1,6 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelMTron.cu b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelMTron.cu
index f4c2a39c..7858f3e3 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelMTron.cu
+++ b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedKernelMTron.cu
@@ -1,6 +1,6 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.cpp b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.cpp
index 72061613..1b74f944 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.cpp
+++ b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.h b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.h
index f12cdda8..e858919b 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.h
+++ b/plugin/skipLayerNormPlugin/skipLayerNormInt8InterleavedPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormKernel.cu b/plugin/skipLayerNormPlugin/skipLayerNormKernel.cu
index 5d52c249..da0cee19 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormKernel.cu
+++ b/plugin/skipLayerNormPlugin/skipLayerNormKernel.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormPlugin.cpp b/plugin/skipLayerNormPlugin/skipLayerNormPlugin.cpp
index 04ed5885..c792486b 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormPlugin.cpp
+++ b/plugin/skipLayerNormPlugin/skipLayerNormPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/skipLayerNormPlugin/skipLayerNormPlugin.h b/plugin/skipLayerNormPlugin/skipLayerNormPlugin.h
index b9fb8c50..9b1a783a 100644
--- a/plugin/skipLayerNormPlugin/skipLayerNormPlugin.h
+++ b/plugin/skipLayerNormPlugin/skipLayerNormPlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/specialSlicePlugin/CMakeLists.txt b/plugin/specialSlicePlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/specialSlicePlugin/CMakeLists.txt
+++ b/plugin/specialSlicePlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/specialSlicePlugin/specialSlicePlugin.cpp b/plugin/specialSlicePlugin/specialSlicePlugin.cpp
index f4bdb04c..3730cfcc 100644
--- a/plugin/specialSlicePlugin/specialSlicePlugin.cpp
+++ b/plugin/specialSlicePlugin/specialSlicePlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/specialSlicePlugin/specialSlicePlugin.h b/plugin/specialSlicePlugin/specialSlicePlugin.h
index 0837682f..710bb8b4 100644
--- a/plugin/specialSlicePlugin/specialSlicePlugin.h
+++ b/plugin/specialSlicePlugin/specialSlicePlugin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/splitPlugin/CMakeLists.txt b/plugin/splitPlugin/CMakeLists.txt
index 1f1d4169..f1f6081b 100644
--- a/plugin/splitPlugin/CMakeLists.txt
+++ b/plugin/splitPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/splitPlugin/split.cu b/plugin/splitPlugin/split.cu
index 771e9cba..0afec432 100644
--- a/plugin/splitPlugin/split.cu
+++ b/plugin/splitPlugin/split.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/splitPlugin/split.h b/plugin/splitPlugin/split.h
index cc1916bf..2d7a9bd5 100644
--- a/plugin/splitPlugin/split.h
+++ b/plugin/splitPlugin/split.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/voxelGeneratorPlugin/CMakeLists.txt b/plugin/voxelGeneratorPlugin/CMakeLists.txt
index a240519a..657bfadc 100644
--- a/plugin/voxelGeneratorPlugin/CMakeLists.txt
+++ b/plugin/voxelGeneratorPlugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/voxelGeneratorPlugin/voxelGenerator.cpp b/plugin/voxelGeneratorPlugin/voxelGenerator.cpp
index c27d5193..2fff10cc 100644
--- a/plugin/voxelGeneratorPlugin/voxelGenerator.cpp
+++ b/plugin/voxelGeneratorPlugin/voxelGenerator.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/plugin/voxelGeneratorPlugin/voxelGenerator.h b/plugin/voxelGeneratorPlugin/voxelGenerator.h
index fea96877..9bb4f471 100644
--- a/plugin/voxelGeneratorPlugin/voxelGenerator.h
+++ b/plugin/voxelGeneratorPlugin/voxelGenerator.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1494c1fd..66034f8b 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -91,7 +91,7 @@ if (MSVC)
find_path(PY_LIB_DIR ${PYTHON_LIB_NAME}.lib HINTS ${WIN_EXTERNALS}/${PYTHON} ${EXT_PATH}/${PYTHON} PATH_SUFFIXES lib)
message(STATUS "PY_LIB_DIR: ${PY_LIB_DIR}")
else()
- find_path(PY_INCLUDE Python.h HINTS ${EXT_PATH}/${PYTHON} PATH_SUFFIXES include)
+ find_path(PY_INCLUDE Python.h HINTS ${EXT_PATH}/${PYTHON} /usr/include/${PYTHON} PATH_SUFFIXES include)
endif()
message(STATUS "PY_INCLUDE: ${PY_INCLUDE}")
@@ -133,16 +133,6 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GLIBCXX_USE_CXX11_ABI_FLAG} -fvisibility=hidden -std=c++${CPP_STANDARD} -Wno-deprecated-declarations")
endif()
-# remove md
-# Add the flags to enable MD-TRT.
-if ("${ENABLE_MDTRT}" STREQUAL "1")
- include_directories(${TENSORRT_ROOT}/optimizer)
- include_directories(${TENSORRT_ROOT}/runtime)
- include_directories(${TENSORRT_ROOT}/common)
- include_directories(${TENSORRT_ROOT}/safety)
- add_compile_definitions(ENABLE_MDTRT=1)
-endif()
-
# Update linker
if(${NV_TARGET_OS} MATCHES "wddm2")
if(DEFINED W10_LINKER)
@@ -159,12 +149,26 @@ else()
set(vfc_suffix "")
endif()
+if (MSVC)
+ set(nvinfer_lib_name "nvinfer_${TENSORRT_MAJOR_VERSION}")
+ set(nvinfer_plugin_lib_name "nvinfer_plugin_${TENSORRT_MAJOR_VERSION}")
+ set(nvonnxparser_lib_name "nvonnxparser_${TENSORRT_MAJOR_VERSION}")
+ set(nvinfer_lean_lib_name "nvinfer_lean_${TENSORRT_MAJOR_VERSION}${vfc_suffix}")
+ set(nvinfer_dispatch_lib_name "nvinfer_dispatch_${TENSORRT_MAJOR_VERSION}${vfc_suffix}")
+else()
+ set(nvinfer_lib_name "nvinfer")
+ set(nvinfer_plugin_lib_name "nvinfer_plugin")
+ set(nvonnxparser_lib_name "nvonnxparser")
+ set(nvinfer_lean_lib_name "nvinfer_lean${vfc_suffix}")
+ set(nvinfer_dispatch_lib_name "nvinfer_dispatch${vfc_suffix}")
+endif()
+
if (${TENSORRT_MODULE} STREQUAL "tensorrt")
- set(TRT_LIBS nvinfer nvonnxparser nvinfer_plugin)
+ set(TRT_LIBS ${nvinfer_lib_name} ${nvonnxparser_lib_name} ${nvinfer_plugin_lib_name})
elseif (${TENSORRT_MODULE} STREQUAL "tensorrt_lean")
- set(TRT_LIBS "nvinfer_lean${vfc_suffix}")
+ set(TRT_LIBS ${nvinfer_lean_lib_name})
elseif (${TENSORRT_MODULE} STREQUAL "tensorrt_dispatch")
- set(TRT_LIBS "nvinfer_dispatch${vfc_suffix}")
+ set(TRT_LIBS ${nvinfer_dispatch_lib_name})
else()
message(FATAL_ERROR "Unknown TensorRT module " ${TENSORRT_MODULE})
endif()
diff --git a/python/docstrings/infer/pyAlgorithmSelectorDoc.h b/python/docstrings/infer/pyAlgorithmSelectorDoc.h
index f5814474..78ba454c 100644
--- a/python/docstrings/infer/pyAlgorithmSelectorDoc.h
+++ b/python/docstrings/infer/pyAlgorithmSelectorDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -60,13 +60,7 @@ constexpr const char* descr = R"trtdoc(
:ivar num_inputs: :class:`int` number of inputs of the algorithm.
:ivar num_outputs: :class:`int` number of outputs of the algorithm.
)trtdoc"
-// remove md
-#if ENABLE_MDTRT
- R"trtdoc(
- :ivar instance_id: Read-only. The multi-device instance ID.
-)trtdoc"
-#endif // ENABLE_MDTRT
- ;
+ ;
constexpr const char* get_shape = R"trtdoc(
Get the minimum / optimum / maximum dimensions for a dynamic input tensor.
diff --git a/python/docstrings/infer/pyCoreDoc.h b/python/docstrings/infer/pyCoreDoc.h
index 3586fd9f..d59d6ac0 100644
--- a/python/docstrings/infer/pyCoreDoc.h
+++ b/python/docstrings/infer/pyCoreDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -407,7 +407,7 @@ constexpr char const* descr = R"trtdoc(
:ivar nvtx_verbosity: The NVTX verbosity of the execution context. Building with DETAILED verbosity will generally increase latency in enqueueV3(). Call this method to select NVTX verbosity in this execution context at runtime. The default is the verbosity with which the engine was built, and the verbosity may not be raised above that level. This function does not affect how IEngineInspector interacts with the engine.
:ivar temporary_allocator: :class:`IGpuAllocator` The GPU allocator used for internal temporary storage.
:ivar weight_streaming_budget: Set and get the current weight streaming budget for inference. The budget may be set to -1 disabling weight streaming at runtime, 0 (default) enabling TRT to choose to weight stream or not, or a positive value in the inclusive range [minimum_weight_streaming_budget, streamable_weights_size - 1].
- :ivar minimum_weight_streaming_budget: Returns the minimum weight streaming budget in bytes required to run the network successfully. The engine must have been built with kWEIGHT_STREAMING.
+ :ivar minimum_weight_streaming_budget: Returns the minimum weight streaming budget in bytes required to run the network successfully. The engine must have been built with kWEIGHT_STREAMING.
:ivar streamable_weights_size: Returns the size of the streamable weights in the engine. This may not include all the weights.
)trtdoc";
@@ -731,15 +731,6 @@ constexpr char const* create_execution_context_without_device_memory = R"trtdoc(
:returns: An :class:`IExecutionContext` without device memory allocated.
)trtdoc";
-constexpr char const* get_profile_shape = R"trtdoc(
- Get the minimum/optimum/maximum dimensions for a particular binding under an optimization profile.
-
- :arg profile_index: The index of the profile.
- :arg binding: The binding index or name.
-
- :returns: A ``List[Dims]`` of length 3, containing the minimum, optimum, and maximum shapes, in that order.
-)trtdoc";
-
constexpr char const* get_tensor_profile_values = R"trtdoc(
Get minimum/optimum/maximum values for an input shape binding under an optimization profile. If the specified binding is not an input shape binding, an exception is raised.
@@ -882,7 +873,7 @@ To implement a custom output allocator, ensure that you explicitly instantiate t
def reallocate_output_async(self, tensor_name, memory, size, alignment, stream):
... # Your implementation here
-
+
def notify_shape(self, tensor_name, shape):
... # Your implementation here
@@ -936,7 +927,7 @@ To implement a custom stream reader, ensure that you explicitly instantiate the
def __init__(self):
trt.IStreamReader.__init__(self)
- def read(self, memory, size):
+ def read(self, size: int) -> bytes:
... # Your implementation here
)trtdoc";
@@ -1032,7 +1023,7 @@ constexpr char const* TACTIC_DRAM = R"trtdoc(
cudaGetDeviceProperties.embedded is true, and 100% otherwise.
)trtdoc";
constexpr char const* TACTIC_SHARED_MEMORY = R"trtdoc(
- TACTIC_SHARED_MEMORY defines the maximum shared memory size utilized for executing
+ TACTIC_SHARED_MEMORY defines the maximum shared memory size utilized for driver-reserved memory and for executing
the backend CUDA kernel implementation. Adjust this value to restrict tactics that exceed
the specified threshold en masse. The default value is device max capability. This value must
be less than 1GiB.
@@ -1074,7 +1065,7 @@ constexpr char const* NONE = R"trtdoc(
Do not require hardware compatibility with GPU architectures other than that of the GPU on which the engine was built.
)trtdoc";
constexpr char const* AMPERE_PLUS = R"trtdoc(
- Require that the engine is compatible with Ampere and newer GPUs. This will limit the max shared memory usage to
+ Require that the engine is compatible with Ampere and newer GPUs. This will limit the combined usage of driver-reserved and backend kernel max shared memory to
48KiB, may reduce the number of available tactics for each layer, and may prevent some fusions from occurring.
Thus this can decrease the performance, especially for tf32 models.
This option will disable cuDNN, cuBLAS, and cuBLAS LT as tactic sources.
@@ -1624,7 +1615,7 @@ constexpr char const* deserialize_cuda_engine = R"trtdoc(
constexpr char const* deserialize_cuda_engine_reader = R"trtdoc(
Deserialize an :class:`ICudaEngine` from a stream reader.
- :arg stream_reader: The :class:`PyStreamReader` that will read the serialized :class:`ICudaEngine`. This enables deserialization from a file directly.
+ :arg stream_reader: The :class:`PyStreamReader` that will read the serialized :class:`ICudaEngine`. This enables deserialization from a file directly.
:returns: The :class:`ICudaEngine`, or None if it could not be deserialized.
)trtdoc";
@@ -1794,9 +1785,9 @@ constexpr char const* get_weights_prototype = R"trtdoc(
The dtype and size of weights prototype is the same as weights used for engine building.
The size of the weights prototype is -1 when the name of the weights is None or does not correspond to any refittable weights.
-
+
:arg weights_name: The name of the weights to be refitted.
-
+
:returns: weights prototype associated with the given name.
)trtdoc";
@@ -2033,7 +2024,7 @@ Note that all methods below (allocate, reallocate, deallocate, allocate_async, r
constexpr char const* allocate = R"trtdoc(
[DEPRECATED] Deprecated in TensorRT 10.0. Please use allocate_async instead.
A callback implemented by the application to handle acquisition of GPU memory.
- This is just a wrapper around a syncronous method allocate_async passing the default stream.
+ This is just a wrapper around a synchronous method allocate_async passing the default stream.
If an allocation request of size 0 is made, ``None`` should be returned.
@@ -2052,7 +2043,7 @@ constexpr char const* allocate = R"trtdoc(
constexpr char const* deallocate = R"trtdoc(
[DEPRECATED] Deprecated in TensorRT 10.0. Please use deallocate_async instead.
A callback implemented by the application to handle release of GPU memory.
- This is just a wrapper around a syncronous method deallocate_async passing the default stream.
+ This is just a wrapper around a synchronous method deallocate_async passing the default stream.
TensorRT may pass a 0 to this function if it was previously returned by ``allocate()``.
diff --git a/python/docstrings/infer/pyFoundationalTypesDoc.h b/python/docstrings/infer/pyFoundationalTypesDoc.h
index 0e404631..39ffd53f 100644
--- a/python/docstrings/infer/pyFoundationalTypesDoc.h
+++ b/python/docstrings/infer/pyFoundationalTypesDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -94,8 +94,8 @@ constexpr const char* init_type = R"trtdoc(
constexpr const char* init_ptr = R"trtdoc(
Initializes a Weights object with the specified data.
- :type: A type to initialize the weights with.
- :ptr: A pointer to the data.
+ :type: A type to initialize the weights with.
+ :ptr: A pointer to the data.
:count: The number of weights.
)trtdoc";
@@ -108,7 +108,7 @@ constexpr const char* numpy = R"trtdoc(
Create a numpy array using the underlying buffer of this weights object.
The resulting array is just a view over the existing data, i.e. no deep copy is made.
- If the weights cannot be converted to NumPy (e.g. due to unsupported data type), the original weights are returned.
+ If the weights cannot be converted to NumPy (e.g. due to unsupported data type), the original weights are returned.
:returns: The NumPy array or the original weights.
)trtdoc";
diff --git a/python/docstrings/infer/pyGraphDoc.h b/python/docstrings/infer/pyGraphDoc.h
index 1581ad9c..e9913210 100644
--- a/python/docstrings/infer/pyGraphDoc.h
+++ b/python/docstrings/infer/pyGraphDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -1341,8 +1341,10 @@ constexpr const char* descr = R"trtdoc(
Enumerates bounding box data formats used for the Boxes input tensor in the NMS layer.
)trtdoc";
-constexpr const char* CORNER_PAIRS = R"trtdoc((x1, y1, x2, y2) where (x1, y1) and (x2, y2) are any pair of diagonal corners)trtdoc";
-constexpr const char* CENTER_SIZES = R"trtdoc((x_center, y_center, width, height) where (x_center, y_center) is the center point of the box)trtdoc";
+constexpr const char* CORNER_PAIRS
+ = R"trtdoc((x1, y1, x2, y2) where (x1, y1) and (x2, y2) are any pair of diagonal corners)trtdoc";
+constexpr const char* CENTER_SIZES
+ = R"trtdoc((x_center, y_center, width, height) where (x_center, y_center) is the center point of the box)trtdoc";
} // namespace BoundingBoxFormatDoc
@@ -1422,7 +1424,6 @@ constexpr const char* set_input = R"trtdoc(
} // namespace INMSLayerDoc
-
namespace FillOperationDoc
{
constexpr const char* descr = R"trtdoc(The tensor fill operations that may performed by an Fill layer.)trtdoc";
diff --git a/python/docstrings/infer/pyInt8Doc.h b/python/docstrings/infer/pyInt8Doc.h
index 91b635fe..013c6c75 100644
--- a/python/docstrings/infer/pyInt8Doc.h
+++ b/python/docstrings/infer/pyInt8Doc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/docstrings/infer/pyPluginDoc.h b/python/docstrings/infer/pyPluginDoc.h
index 5df97568..f541a281 100644
--- a/python/docstrings/infer/pyPluginDoc.h
+++ b/python/docstrings/infer/pyPluginDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -183,7 +183,6 @@ constexpr const char* detach_from_context = R"trtdoc(
)trtdoc";
} // namespace IPluginV2ExtDoc
-
namespace IPluginV2DynamicExtDoc
{
constexpr const char* descr = R"trtdoc(
@@ -194,7 +193,7 @@ constexpr const char* descr = R"trtdoc(
Similar to `IPluginV2Ext` (including capability to support different output data types), but with support for dynamic shapes.
This class is made available for the purpose of implementing `IPluginV2DynamicExt` plugins with Python. Inherited
- Python->C++ bindings from `IPluginV2` and `IPluginV2Ext` will continue to work on C++-based `IPluginV2DynamicExt` plugins.
+ Python->C++ bindings from `IPluginV2` and `IPluginV2Ext` will continue to work on C++-based `IPluginV2DynamicExt` plugins.
.. note::
Every attribute except `tensorrt_version` must be explicitly initialized on Python-based plugins. Except `plugin_namespace`,
@@ -212,22 +211,22 @@ constexpr const char* initialize = R"trtdoc(
Initialize the plugin for execution. This is called when the engine is created.
.. note::
- When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `pass`.
+ When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `pass`.
.. warning::
In contrast to the C++ API for `initialize()`, this method must not return an error code. The expected behavior is to throw an appropriate exception
- if an error occurs.
+ if an error occurs.
.. warning::
This `initialize()` method is not available to be called from Python on C++-based plugins.
-
+
)trtdoc";
constexpr const char* terminate = R"trtdoc(
Release resources acquired during plugin layer initialization. This is called when the engine is destroyed.
.. note::
- When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `pass`.
+ When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `pass`.
)trtdoc";
@@ -238,7 +237,7 @@ constexpr const char* get_output_dimensions = R"trtdoc(
This function is called by the implementations of `IBuilder` during analysis of the network.
.. warning::
- This `get_output_dimensions()` method is not available to be called from Python on C++-based plugins
+ This `get_output_dimensions()` method is not available to be called from Python on C++-based plugins
:arg output_index: The index of the output tensor
:arg inputs: Expressions for dimensions of the input tensors
@@ -269,7 +268,7 @@ constexpr const char* configure_plugin = R"trtdoc(
Execution phase: `configure_plugin()` is called when a plugin is being prepared for executing the plugin for specific dimensions. This provides an opportunity for the plugin to change algorithmic choices based on the explicit input dimensions stored in `desc.dims` field.
.. warning::
- This `configure_plugin()` method is not available to be called from Python on C++-based plugins
+ This `configure_plugin()` method is not available to be called from Python on C++-based plugins
:arg in: The input tensors attributes that are used for configuration.
:arg out: The output tensors attributes that are used for configuration.
@@ -299,10 +298,10 @@ constexpr const char* get_workspace_size = R"trtdoc(
This function is called after the plugin is configured, and possibly during execution. The result should be a sufficient workspace size to deal with inputs and outputs of the given size or any smaller problem.
.. note::
- When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `return 0`.
+ When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `return 0`.
.. warning::
- This `get_workspace_size()` method is not available to be called from Python on C++-based plugins
+ This `get_workspace_size()` method is not available to be called from Python on C++-based plugins
:arg input_desc: How to interpret the memory for the input tensors.
:arg output_desc: How to interpret the memory for the output tensors.
@@ -314,7 +313,7 @@ constexpr const char* destroy = R"trtdoc(
Destroy the plugin object. This will be called when the :class:`INetworkDefinition` , :class:`Builder` or :class:`ICudaEngine` is destroyed.
.. note::
- When implementing a Python-based plugin, implementing this method is optional. The default behavior is a `pass`.
+ When implementing a Python-based plugin, implementing this method is optional. The default behavior is a `pass`.
)trtdoc";
@@ -322,13 +321,13 @@ constexpr const char* enqueue = R"trtdoc(
Execute the layer.
`inputs` and `outputs` contains pointers to the corresponding input and output device buffers as their `intptr_t` casts. `stream` also represents an `intptr_t` cast of the CUDA stream in which enqueue should be executed.
-
+
.. warning::
Since input, output, and workspace buffers are created and owned by TRT, care must be taken when writing to them from the Python side.
.. warning::
In contrast to the C++ API for `enqueue()`, this method must not return an error code. The expected behavior is to throw an appropriate exception.
- if an error occurs.
+ if an error occurs.
.. warning::
This `enqueue()` method is not available to be called from Python on C++-based plugins.
@@ -345,7 +344,7 @@ constexpr const char* enqueue = R"trtdoc(
constexpr const char* clone = R"trtdoc(
Clone the plugin object. This copies over internal plugin parameters as well and returns a new plugin object with these parameters.
- If the source plugin is pre-configured with `configure_plugin()`, the returned object should also be pre-configured.
+ If the source plugin is pre-configured with `configure_plugin()`, the returned object should also be pre-configured.
Cloned plugin objects can share the same per-engine immutable resource (e.g. weights) with the source object to avoid duplication.
)trtdoc";
@@ -353,7 +352,7 @@ constexpr const char* get_serialization_size = R"trtdoc(
Return the serialization size (in bytes) required by the plugin.
.. note::
- When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `return len(serialize())`.
+ When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `return len(serialize())`.
)trtdoc";
@@ -392,7 +391,7 @@ constexpr const char* ipluginv3_descr = R"trtdoc(
constexpr const char* iplugincapability_descr = R"trtdoc(
Base class for plugin capability interfaces
-
+
IPluginCapability represents a split in TensorRT V3 plugins to sub-objects that expose different types of capabilites a plugin may have,
as opposed to a single interface which defines all capabilities and behaviors of a plugin.
)trtdoc";
@@ -411,7 +410,7 @@ constexpr const char* ipluginv3onecore_descr = R"trtdoc(
constexpr const char* ipluginv3onebuild_descr = R"trtdoc(
A plugin capability interface that enables the build capability (PluginCapabilityType.BUILD).
-
+
Exposes methods that allow the expression of the build time properties and behavior of a plugin.
.. note::
@@ -423,7 +422,7 @@ constexpr const char* ipluginv3onebuild_descr = R"trtdoc(
constexpr const char* ipluginv3oneruntime_descr = R"trtdoc(
A plugin capability interface that enables the runtime capability (PluginCapabilityType.RUNTIME).
-
+
Exposes methods that allow the expression of the runtime properties and behavior of a plugin.
)trtdoc";
@@ -434,7 +433,7 @@ constexpr const char* get_output_shapes = R"trtdoc(
This function is called by the implementations of `IBuilder` during analysis of the network.
.. warning::
- This `get_output_shapes()` method is not available to be called from Python on C++-based plugins
+ This get_output_shapes() method is not available to be called from Python on C++-based plugins
:arg inputs: Expressions for shapes of the input tensors
:arg shape_inputs: Expressions for shapes of the shape inputs
@@ -445,9 +444,9 @@ constexpr const char* get_output_shapes = R"trtdoc(
constexpr const char* get_output_data_types = R"trtdoc(
- Return `DataType`s of the plugin outputs.
+ Return `DataType` s of the plugin outputs.
- Provide `DataType.FLOAT`s if the layer has no inputs. The data type for any size tensor outputs must be
+ Provide `DataType.FLOAT` s if the layer has no inputs. The data type for any size tensor outputs must be
`DataType.INT32`. The returned data types must each have a format that is supported by the plugin.
:arg input_types: Data types of the inputs.
@@ -458,7 +457,7 @@ constexpr const char* get_output_data_types = R"trtdoc(
constexpr const char* configure_plugin = R"trtdoc(
Configure the plugin.
- This function can be called multiple times in the build phase during creation of an engine by IBuilder.
+ This function can be called multiple times in the build phase during creation of an engine by IBuilder.
Build phase: `configure_plugin()` is called when a plugin is being prepared for profiling but not for any specific input size. This provides an opportunity for the plugin to make algorithmic choices on the basis of input and output formats, along with the bound of possible dimensions. The min, opt and max value of the
`DynamicPluginTensorDesc` correspond to the `MIN`, `OPT` and `MAX` value of the current profile that the plugin is
@@ -467,31 +466,28 @@ constexpr const char* configure_plugin = R"trtdoc(
.. warning::
In contrast to the C++ API for `configurePlugin()`, this method must not return an error code. The expected behavior is to throw an appropriate exception
- if an error occurs.
+ if an error occurs.
.. warning::
- This `configure_plugin()` method is not available to be called from Python on C++-based plugins
+ This `configure_plugin()` method is not available to be called from Python on C++-based plugins
:arg in: The input tensors attributes that are used for configuration.
:arg out: The output tensors attributes that are used for configuration.
)trtdoc";
constexpr const char* on_shape_change = R"trtdoc(
- Called when a plugin is being prepared for execution for specific dimensions. This could happen multiple times in the execution phase, both during creation of an engine by IBuilder and execution of an
- engine by IExecutionContext.
+ Called when a plugin is being prepared for execution for specific dimensions. This could happen multiple times in the execution phase, both during creation of an engine by IBuilder and execution of an
+ engine by IExecutionContext.
- * IBuilder will call this function once per profile, with `in` resolved to the values specified by the
- kOPT field of the current profile.
- * IExecutionContext will call this during the next subsequent instance of enqueue_v2() or execute_v3() if:
- - The optimization profile is changed.
- - An input binding is changed.
+ * IBuilder will call this function once per profile, with `in` resolved to the values specified by the kOPT field of the current profile.
+ * IExecutionContext will call this during the next subsequent instance of enqueue_v2() or execute_v3() if: (1) The optimization profile is changed. (2) An input binding is changed.
.. warning::
In contrast to the C++ API for `onShapeChange()`, this method must not return an error code. The expected behavior is to throw an appropriate exception
- if an error occurs.
+ if an error occurs.
.. warning::
- This `on_shape_change()` method is not available to be called from Python on C++-based plugins
+ This `on_shape_change()` method is not available to be called from Python on C++-based plugins
:arg in: The input tensors attributes that are used for configuration.
:arg out: The output tensors attributes that are used for configuration.
@@ -521,10 +517,10 @@ constexpr const char* get_workspace_size = R"trtdoc(
This function is called after the plugin is configured, and possibly during execution. The result should be a sufficient workspace size to deal with inputs and outputs of the given size or any smaller problem.
.. note::
- When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `return 0`.
+ When implementing a Python-based plugin, implementing this method is optional. The default behavior is equivalent to `return 0`.
.. warning::
- This `get_workspace_size()` method is not available to be called from Python on C++-based plugins
+ This `get_workspace_size()` method is not available to be called from Python on C++-based plugins
:arg input_desc: How to interpret the memory for the input tensors.
:arg output_desc: How to interpret the memory for the output tensors.
@@ -539,7 +535,7 @@ constexpr const char* destroy = R"trtdoc(
There is no direct equivalent to this method in the C++ API.
.. note::
- Implementing this method is optional. The default behavior is a `pass`.
+ Implementing this method is optional. The default behavior is a `pass`.
)trtdoc";
@@ -547,13 +543,13 @@ constexpr const char* enqueue = R"trtdoc(
Execute the layer.
`inputs` and `outputs` contains pointers to the corresponding input and output device buffers as their `intptr_t` casts. `stream` also represents an `intptr_t` cast of the CUDA stream in which enqueue should be executed.
-
+
.. warning::
Since input, output, and workspace buffers are created and owned by TRT, care must be taken when writing to them from the Python side.
.. warning::
In contrast to the C++ API for `enqueue()`, this method must not return an error code. The expected behavior is to throw an appropriate exception.
- if an error occurs.
+ if an error occurs.
.. warning::
This `enqueue()` method is not available to be called from Python on C++-based plugins.
@@ -580,7 +576,7 @@ constexpr const char* get_capability_interface = R"trtdoc(
constexpr const char* clone = R"trtdoc(
Clone the plugin object. This copies over internal plugin parameters as well and returns a new plugin object with these parameters.
- If the source plugin is pre-configured with `configure_plugin()`, the returned object should also be pre-configured.
+ If the source plugin is pre-configured with `configure_plugin()`, the returned object should also be pre-configured.
Cloned plugin objects can share the same per-engine immutable resource (e.g. weights) with the source object to avoid duplication.
)trtdoc";
@@ -602,7 +598,7 @@ constexpr const char* set_tactic = R"trtdoc(
.. warning::
In contrast to the C++ API for `setTactic()`, this method must not return an error code. The expected behavior is to throw an appropriate exception
- if an error occurs.
+ if an error occurs.
.. warning::
This `set_tactic()` method is not available to be called from Python on C++-based plugins.
@@ -611,7 +607,7 @@ constexpr const char* set_tactic = R"trtdoc(
constexpr const char* get_valid_tactics = R"trtdoc(
Return any custom tactics that the plugin intends to use.
-
+
.. note::
The provided tactic values must be unique and positive
@@ -626,9 +622,9 @@ constexpr const char* attach_to_context = R"trtdoc(
This function is called automatically for each plugin when a new execution context is created.
The plugin may use resources provided by the resource_context until the plugin is deleted by TensorRT.
-
+
:arg resource_context: A resource context that exposes methods to get access to execution context specific resources. A different resource context is guaranteed for each different execution context to which the plugin is attached.
-
+
.. note::
This method should clone the entire IPluginV3 object, not just the runtime interface
@@ -660,7 +656,7 @@ constexpr const char* release = R"trtdoc(
constexpr const char* clone = R"trtdoc(
Resource initialization (if any) may be skipped for non-cloned objects since only clones will be
registered by TensorRT.
-
+
)trtdoc";
} // namespace IPluginResourceDoc
@@ -703,7 +699,7 @@ namespace IDimensionExprDoc
{
constexpr const char* descr = R"trtdoc(
An `IDimensionExpr` represents an integer expression constructed from constants, input dimensions, and binary operations.
-
+
These expressions are can be used in overrides of `IPluginV2DynamicExt::get_output_dimensions()` to define output dimensions in terms of input dimensions.
)trtdoc";
@@ -787,7 +783,7 @@ namespace IPluginResourceContextDoc
{
constexpr const char* descr = R"trtdoc(
Interface for plugins to access per context resources provided by TensorRT
-
+
There is no public way to construct an IPluginResourceContext. It appears as an argument to trt.IPluginV3OneRuntime.attach_to_context().
)trtdoc";
} // namespace IPluginResourceContextDoc
@@ -953,7 +949,7 @@ constexpr const char* get_plugin_creator = R"trtdoc(
Return plugin creator based on type, version and namespace
.. warning::
- Returns None if a plugin creator with matching name, version, and namespace is found, but is not a
+ Returns None if a plugin creator with matching name, version, and namespace is found, but is not a
descendent of IPluginCreator
:arg type: The type of the plugin.
@@ -998,12 +994,12 @@ constexpr const char* deregister_library = R"trtdoc(
constexpr const char* acquire_plugin_resource = R"trtdoc(
Get a handle to a plugin resource registered against the provided key.
- :arg: key: Key for identifying the resource.
+ :arg: key: Key for identifying the resource.
:arg: resource: A plugin resource object. The object will only need to be valid until this method returns, as only a clone of this object will be registered by TRT. Cannot be null.
)trtdoc";
constexpr const char* release_plugin_resource = R"trtdoc(
- Decrement reference count for the resource with this key. If reference count goes to zero after decrement, release() will be invoked on the resource,
+ Decrement reference count for the resource with this key. If reference count goes to zero after decrement, release() will be invoked on the resource,
and the key will be deregistered.
:arg: key: Key that was used to register the resource.
diff --git a/python/docstrings/parsers/pyOnnxDoc.h b/python/docstrings/parsers/pyOnnxDoc.h
index 7099a207..17656d27 100644
--- a/python/docstrings/parsers/pyOnnxDoc.h
+++ b/python/docstrings/parsers/pyOnnxDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/docstrings/pyTensorRTDoc.h b/python/docstrings/pyTensorRTDoc.h
index 2ebb0a82..3594d387 100644
--- a/python/docstrings/pyTensorRTDoc.h
+++ b/python/docstrings/pyTensorRTDoc.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/include/ForwardDeclarations.h b/python/include/ForwardDeclarations.h
index c377bf66..d4bed446 100644
--- a/python/include/ForwardDeclarations.h
+++ b/python/include/ForwardDeclarations.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/include/utils.h b/python/include/utils.h
index 0b46743a..2f0d5bdc 100644
--- a/python/include/utils.h
+++ b/python/include/utils.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -162,7 +162,7 @@ void throwPyError(PyObject* type, std::string const& message = "python error");
{ \
utils::throwPyError(PyExc_IndexError, "Out of bounds"); \
} \
- }while(false)
+ } while (false)
#define PY_ASSERT_VALUE_ERROR(assertion, msg) \
do \
diff --git a/python/packaging/bindings_wheel/setup.py b/python/packaging/bindings_wheel/setup.py
index 7bd97517..32b9a730 100644
--- a/python/packaging/bindings_wheel/setup.py
+++ b/python/packaging/bindings_wheel/setup.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/packaging/bindings_wheel/tensorrt/__init__.py b/python/packaging/bindings_wheel/tensorrt/__init__.py
index 01e49480..e82ee1ec 100644
--- a/python/packaging/bindings_wheel/tensorrt/__init__.py
+++ b/python/packaging/bindings_wheel/tensorrt/__init__.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -51,18 +51,18 @@ def find_lib(name):
# Order matters here because of dependencies
LIBRARIES = {
"tensorrt": [
- "nvinfer.dll",
+ "nvinfer_##TENSORRT_MAJOR##.dll",
"cublas64_##CUDA_MAJOR##.dll",
"cublasLt64_##CUDA_MAJOR##.dll",
"cudnn64_##CUDNN_MAJOR##.dll",
- "nvinfer_plugin.dll",
- "nvonnxparser.dll",
+ "nvinfer_plugin_##TENSORRT_MAJOR##.dll",
+ "nvonnxparser_##TENSORRT_MAJOR##.dll",
],
"tensorrt_dispatch": [
- "nvinfer_dispatch.dll",
+ "nvinfer_dispatch_##TENSORRT_MAJOR##.dll",
],
"tensorrt_lean": [
- "nvinfer_lean.dll",
+ "nvinfer_lean_##TENSORRT_MAJOR##.dll",
],
}["##TENSORRT_MODULE##"]
diff --git a/python/packaging/frontend_sdist/setup.py b/python/packaging/frontend_sdist/setup.py
index b593e52c..8c050d20 100644
--- a/python/packaging/frontend_sdist/setup.py
+++ b/python/packaging/frontend_sdist/setup.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -104,14 +104,20 @@ def parent_command_line():
pass
# fall back to shell
try:
- return subprocess.check_output(["ps", "-p", str(pid), "-o", "command", "--no-headers"]).decode()
+ return subprocess.check_output(
+ ["ps", "-p", str(pid), "-o", "command", "--no-headers"]
+ ).decode()
except:
return ""
# use pip-inside-pip hack only if the nvidia index is not set in the environment
install_requires = []
-if disable_internal_pip or nvidia_pip_index_url in parent_command_line() or nvidia_pip_index_url in pip_config_list():
+if (
+ disable_internal_pip
+ or nvidia_pip_index_url in parent_command_line()
+ or nvidia_pip_index_url in pip_config_list()
+):
install_requires.extend(tensorrt_submodules)
cmdclass = {}
else:
diff --git a/python/packaging/frontend_sdist/tensorrt/__init__.py b/python/packaging/frontend_sdist/tensorrt/__init__.py
index d15c89d7..5b7038fd 100644
--- a/python/packaging/frontend_sdist/tensorrt/__init__.py
+++ b/python/packaging/frontend_sdist/tensorrt/__init__.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/packaging/libs_wheel/setup.py b/python/packaging/libs_wheel/setup.py
index b6060e0b..b9f7af76 100644
--- a/python/packaging/libs_wheel/setup.py
+++ b/python/packaging/libs_wheel/setup.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/packaging/libs_wheel/tensorrt_libs/__init__.py b/python/packaging/libs_wheel/tensorrt_libs/__init__.py
index a7d9e91a..0335c921 100644
--- a/python/packaging/libs_wheel/tensorrt_libs/__init__.py
+++ b/python/packaging/libs_wheel/tensorrt_libs/__init__.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -43,7 +43,12 @@ def try_load_libs_from_dir(path):
]
for dep_path in DEPENDENCY_PATHS:
try_load_libs_from_dir(
- os.path.join(CURDIR, os.path.pardir, dep_path, "bin" if sys.platform.startswith("win") else "lib")
+ os.path.join(
+ CURDIR,
+ os.path.pardir,
+ dep_path,
+ "bin" if sys.platform.startswith("win") else "lib",
+ )
)
diff --git a/python/packaging/metapackage/setup.py b/python/packaging/metapackage/setup.py
index b5f8452f..bd673247 100644
--- a/python/packaging/metapackage/setup.py
+++ b/python/packaging/metapackage/setup.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/src/infer/pyAlgorithmSelector.cpp b/python/src/infer/pyAlgorithmSelector.cpp
index 75fe97d2..81984930 100644
--- a/python/src/infer/pyAlgorithmSelector.cpp
+++ b/python/src/infer/pyAlgorithmSelector.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,10 +19,6 @@
#include "ForwardDeclarations.h"
#include "utils.h"
#include
-// remove md
-#if ENABLE_MDTRT
-#include "api/internal.h"
-#endif // ENABLE_MDTRT
#include "infer/pyAlgorithmSelectorDoc.h"
#include
#include
@@ -167,11 +163,7 @@ void bindAlgorithm(py::module& m)
.def("get_shape", lambdas::get_shape, "index"_a, IAlgorithmContextDoc::get_shape)
.def_property_readonly("num_inputs", &IAlgorithmContext::getNbInputs)
.def_property_readonly("num_outputs", &IAlgorithmContext::getNbOutputs)
-// remove md
-#if ENABLE_MDTRT
- .def_property_readonly("instance_id", &nvinfer1AlgorithmGetInstanceID)
-#endif // ENABLE_MDTRT
- ;
+ ;
// IAlgorithm
py::class_>(
diff --git a/python/src/infer/pyCore.cpp b/python/src/infer/pyCore.cpp
index e2d95473..4d6f72e0 100644
--- a/python/src/infer/pyCore.cpp
+++ b/python/src/infer/pyCore.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -55,16 +55,16 @@ static const auto opt_profile_get_shape
return shapes;
};
-static const auto opt_profile_set_shape_input
- = [](IOptimizationProfile& self, std::string const& inputName, std::vector const& min,
- std::vector const& opt, std::vector const& max) {
- PY_ASSERT_RUNTIME_ERROR(self.setShapeValues(inputName.c_str(), OptProfileSelector::kMIN, min.data(), min.size()),
- "min input provided for shape tensor is inconsistent with other inputs.");
- PY_ASSERT_RUNTIME_ERROR(self.setShapeValues(inputName.c_str(), OptProfileSelector::kOPT, opt.data(), opt.size()),
- "opt input provided for shape tensor is inconsistent with other inputs.");
- PY_ASSERT_RUNTIME_ERROR(self.setShapeValues(inputName.c_str(), OptProfileSelector::kMAX, max.data(), max.size()),
- "max input provided for shape tensor is inconsistent with other inputs.");
- };
+static const auto opt_profile_set_shape_input = [](IOptimizationProfile& self, std::string const& inputName,
+ std::vector const& min, std::vector const& opt,
+ std::vector const& max) {
+ PY_ASSERT_RUNTIME_ERROR(self.setShapeValues(inputName.c_str(), OptProfileSelector::kMIN, min.data(), min.size()),
+ "min input provided for shape tensor is inconsistent with other inputs.");
+ PY_ASSERT_RUNTIME_ERROR(self.setShapeValues(inputName.c_str(), OptProfileSelector::kOPT, opt.data(), opt.size()),
+ "opt input provided for shape tensor is inconsistent with other inputs.");
+ PY_ASSERT_RUNTIME_ERROR(self.setShapeValues(inputName.c_str(), OptProfileSelector::kMAX, max.data(), max.size()),
+ "max input provided for shape tensor is inconsistent with other inputs.");
+};
static const auto opt_profile_get_shape_input
= [](IOptimizationProfile& self, std::string const& inputName) -> std::vector> {
@@ -144,7 +144,8 @@ Dims castDimsFromPyIterable(PyIterable& in)
int32_t const maxDims{static_cast(Dims::MAX_DIMS)};
Dims dims{};
dims.nbDims = py::len(in);
- PY_ASSERT_RUNTIME_ERROR(dims.nbDims <= maxDims, "The number of input dims exceeds the maximum allowed number of dimensions");
+ PY_ASSERT_RUNTIME_ERROR(
+ dims.nbDims <= maxDims, "The number of input dims exceeds the maximum allowed number of dimensions");
for (int32_t i = 0; i < dims.nbDims; ++i)
{
dims.d[i] = in[i].template cast();
@@ -182,21 +183,6 @@ std::vector get_tensor_profile_shape(ICudaEngine& self, std::string const&
return shapes;
};
-std::vector engine_get_profile_shape(ICudaEngine& self, int32_t profileIndex, int32_t bindingIndex)
-{
- std::vector shapes{};
- auto const tensorName = self.getIOTensorName(bindingIndex);
- shapes.emplace_back(self.getProfileShape(tensorName, profileIndex, OptProfileSelector::kMIN));
- shapes.emplace_back(self.getProfileShape(tensorName, profileIndex, OptProfileSelector::kOPT));
- shapes.emplace_back(self.getProfileShape(tensorName, profileIndex, OptProfileSelector::kMAX));
- return shapes;
-};
-// Overload to allow using binding names instead of indices.
-std::vector engine_get_profile_shape_str(ICudaEngine& self, int32_t profileIndex, std::string const& bindingName)
-{
- return get_tensor_profile_shape(self, bindingName, profileIndex);
-};
-
std::vector> get_tensor_profile_values(
ICudaEngine& self, int32_t profileIndex, std::string const& tensorName)
{
@@ -618,8 +604,11 @@ class PyStreamReader : public IStreamReader
return 0;
}
- py::object bytesRead = pyFunc(reinterpret_cast(destination), size);
- return bytesRead.cast();
+ py::buffer data = pyFunc(size);
+ py::buffer_info info = data.request();
+ int64_t bytesRead = info.size * info.itemsize;
+ std::memcpy(destination, info.ptr, std::min(bytesRead, size));
+ return bytesRead;
}
catch (std::exception const& e)
{
@@ -1180,10 +1169,6 @@ void bindCore(py::module& m)
.def_property_readonly("name", &ICudaEngine::getName)
.def_property_readonly("num_optimization_profiles", &ICudaEngine::getNbOptimizationProfiles)
.def_property_readonly("engine_capability", &ICudaEngine::getEngineCapability)
- .def("get_profile_shape", utils::deprecate(lambdas::engine_get_profile_shape, "get_tensor_profile_shape"),
- "profile_index"_a, "binding"_a, ICudaEngineDoc::get_profile_shape)
- .def("get_profile_shape", utils::deprecate(lambdas::engine_get_profile_shape_str, "get_tensor_profile_shape"),
- "profile_index"_a, "binding"_a, ICudaEngineDoc::get_profile_shape)
// Start of enqueueV3 related APIs.
.def_property_readonly("num_io_tensors", &ICudaEngine::getNbIOTensors)
.def("get_tensor_name", &ICudaEngine::getIOTensorName, "index"_a, ICudaEngineDoc::get_tensor_name)
@@ -1278,7 +1263,7 @@ void bindCore(py::module& m)
.def_property_readonly("minimum_weight_streaming_budget", &ICudaEngine::getMinimumWeightStreamingBudget)
.def_property_readonly("streamable_weights_size", &ICudaEngine::getStreamableWeightsSize)
.def("is_debug_tensor", &ICudaEngine::isDebugTensor, "name"_a, ICudaEngineDoc::is_debug_tensor)
- .def("__del__", &utils::doNothingDel);
+ .def("__del__", &utils::doNothingDel);
py::enum_(m, "AllocatorFlag", py::arithmetic{}, AllocatorFlagDoc::descr, py::module_local())
.value("RESIZABLE", AllocatorFlag::kRESIZABLE, AllocatorFlagDoc::RESIZABLE);
diff --git a/python/src/infer/pyFoundationalTypes.cpp b/python/src/infer/pyFoundationalTypes.cpp
index e89e020a..6f64f7d4 100644
--- a/python/src/infer/pyFoundationalTypes.cpp
+++ b/python/src/infer/pyFoundationalTypes.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,8 +40,8 @@ static const auto weights_pointer_constructor = [](DataType const& type, size_t
static const auto weights_numpy_constructor = [](py::array& arr) {
arr = py::array::ensure(arr);
// In order to construct a weights object, we must have a contiguous C-style array.
- PY_ASSERT_VALUE_ERROR(arr,
- "Could not convert NumPy array to Weights. Is it using a data type supported by TensorRT?");
+ PY_ASSERT_VALUE_ERROR(
+ arr, "Could not convert NumPy array to Weights. Is it using a data type supported by TensorRT?");
PY_ASSERT_VALUE_ERROR((arr.flags() & py::array::c_style),
"Could not convert non-contiguous NumPy array to Weights. Please use numpy.ascontiguousarray() to fix this.");
return new Weights{utils::type(arr.dtype()), arr.data(), arr.size()};
@@ -105,8 +105,8 @@ static const auto dims_getter = [](Dims const& self, int32_t const pyIndex) -> i
static const auto dims_getter_slice = [](Dims const& self, py::slice slice) {
size_t start, stop, step, slicelength;
- PY_ASSERT_VALUE_ERROR(slice.compute(self.nbDims, &start, &stop, &step, &slicelength),
- "Incorrect getter slice dims");
+ PY_ASSERT_VALUE_ERROR(
+ slice.compute(self.nbDims, &start, &stop, &step, &slicelength), "Incorrect getter slice dims");
// Disallow out-of-bounds things.
PY_ASSERT_INDEX_ERROR(stop <= self.nbDims);
@@ -124,8 +124,8 @@ static const auto dims_setter = [](Dims& self, int32_t const pyIndex, int64_t co
static const auto dims_setter_slice = [](Dims& self, py::slice slice, Dims const& other) {
size_t start, stop, step, slicelength;
- PY_ASSERT_VALUE_ERROR(slice.compute(self.nbDims, &start, &stop, &step, &slicelength),
- "Incorrect setter slice dims");
+ PY_ASSERT_VALUE_ERROR(
+ slice.compute(self.nbDims, &start, &stop, &step, &slicelength), "Incorrect setter slice dims");
// Disallow out-of-bounds things.
PY_ASSERT_INDEX_ERROR(stop < self.nbDims);
diff --git a/python/src/infer/pyGraph.cpp b/python/src/infer/pyGraph.cpp
index 730481ae..ddca1e9d 100644
--- a/python/src/infer/pyGraph.cpp
+++ b/python/src/infer/pyGraph.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/src/infer/pyInt8.cpp b/python/src/infer/pyInt8.cpp
index 5639bcd1..9052f796 100644
--- a/python/src/infer/pyInt8.cpp
+++ b/python/src/infer/pyInt8.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -82,7 +82,8 @@ class pyCalibratorTrampoline : public Derived
{
py::gil_scoped_acquire gil{};
- py::function pyReadCalibrationCache = utils::getOverride(static_cast(this), "read_calibration_cache");
+ py::function pyReadCalibrationCache
+ = utils::getOverride(static_cast(this), "read_calibration_cache");
// Cannot cast `None` to py::buffer.
auto cacheRaw = pyReadCalibrationCache();
@@ -118,7 +119,7 @@ class pyCalibratorTrampoline : public Derived
py::function pyWriteCalibrationCache
= utils::getOverride(static_cast(this), "write_calibration_cache");
- #if PYBIND11_VERSION_MAJOR < 2 || PYBIND11_VERSION_MAJOR == 2 && PYBIND11_VERSION_MINOR < 6
+#if PYBIND11_VERSION_MAJOR < 2 || PYBIND11_VERSION_MAJOR == 2 && PYBIND11_VERSION_MINOR < 6
py::buffer_info info{
const_cast(ptr), /* Pointer to buffer */
sizeof(uint8_t), /* Size of one scalar */
@@ -128,10 +129,10 @@ class pyCalibratorTrampoline : public Derived
{ sizeof(uint8_t) } /* Strides (in bytes) for each index */
};
py::memoryview cache{info};
- #else
+#else
py::memoryview cache{
py::memoryview::from_buffer(static_cast(ptr), {length}, {sizeof(uint8_t)})};
- #endif
+#endif
pyWriteCalibrationCache(cache);
}
catch (std::exception const& e)
@@ -284,7 +285,10 @@ void bindInt8(py::module& m)
py::class_(m, "IInt8Calibrator", IInt8CalibratorDoc::descr, py::module_local())
.def(py::init<>())
- .def("get_batch_size", utils::deprecateMember(&IInt8Calibrator::getBatchSize, "Implicit batch dimensions support has been removed"), IInt8CalibratorDoc::get_batch_size)
+ .def("get_batch_size",
+ utils::deprecateMember(
+ &IInt8Calibrator::getBatchSize, "Implicit batch dimensions support has been removed"),
+ IInt8CalibratorDoc::get_batch_size)
.def("get_algorithm", &IInt8Calibrator::getAlgorithm, IInt8CalibratorDoc::get_algorithm)
// For documentation purposes only
.def("get_batch", docGetBatch, "names"_a, IInt8CalibratorDoc::get_batch)
@@ -296,7 +300,10 @@ void bindInt8(py::module& m)
py::class_(
m, "IInt8LegacyCalibrator", IInt8LegacyCalibratorDoc::descr, py::module_local())
.def(py::init<>())
- .def("get_batch_size", utils::deprecateMember(&IInt8LegacyCalibrator::getBatchSize, "Implicit batch dimensions support has been removed"), IInt8CalibratorDoc::get_batch_size)
+ .def("get_batch_size",
+ utils::deprecateMember(
+ &IInt8LegacyCalibrator::getBatchSize, "Implicit batch dimensions support has been removed"),
+ IInt8CalibratorDoc::get_batch_size)
.def("get_algorithm", &IInt8LegacyCalibrator::getAlgorithm, IInt8LegacyCalibratorDoc::get_algorithm)
// For documentation purposes only
.def("get_batch", docGetBatch, "names"_a, IInt8CalibratorDoc::get_batch)
@@ -308,7 +315,10 @@ void bindInt8(py::module& m)
py::class_>(
m, "IInt8EntropyCalibrator", IInt8EntropyCalibratorDoc::descr, py::module_local())
.def(py::init<>())
- .def("get_batch_size", utils::deprecateMember(&IInt8EntropyCalibrator::getBatchSize, "Implicit batch dimensions support has been removed"), IInt8CalibratorDoc::get_batch_size)
+ .def("get_batch_size",
+ utils::deprecateMember(
+ &IInt8EntropyCalibrator::getBatchSize, "Implicit batch dimensions support has been removed"),
+ IInt8CalibratorDoc::get_batch_size)
.def("get_algorithm", &IInt8EntropyCalibrator::getAlgorithm, IInt8EntropyCalibratorDoc::get_algorithm)
// For documentation purposes only
.def("get_batch", docGetBatch, "names"_a, IInt8CalibratorDoc::get_batch)
@@ -320,7 +330,10 @@ void bindInt8(py::module& m)
py::class_>(
m, "IInt8EntropyCalibrator2", IInt8EntropyCalibrator2Doc::descr, py::module_local())
.def(py::init<>())
- .def("get_batch_size", utils::deprecateMember(&IInt8EntropyCalibrator2::getBatchSize, "Implicit batch dimensions support has been removed"), IInt8CalibratorDoc::get_batch_size)
+ .def("get_batch_size",
+ utils::deprecateMember(
+ &IInt8EntropyCalibrator2::getBatchSize, "Implicit batch dimensions support has been removed"),
+ IInt8CalibratorDoc::get_batch_size)
.def("get_algorithm", &IInt8EntropyCalibrator2::getAlgorithm, IInt8EntropyCalibrator2Doc::get_algorithm)
// For documentation purposes only
.def("get_batch", docGetBatch, "names"_a, IInt8CalibratorDoc::get_batch)
@@ -332,7 +345,10 @@ void bindInt8(py::module& m)
py::class_>(
m, "IInt8MinMaxCalibrator", IInt8MinMaxCalibratorDoc::descr, py::module_local())
.def(py::init<>())
- .def("get_batch_size", utils::deprecateMember(&IInt8MinMaxCalibrator::getBatchSize, "Implicit batch dimensions support has been removed"), IInt8CalibratorDoc::get_batch_size)
+ .def("get_batch_size",
+ utils::deprecateMember(
+ &IInt8MinMaxCalibrator::getBatchSize, "Implicit batch dimensions support has been removed"),
+ IInt8CalibratorDoc::get_batch_size)
.def("get_algorithm", &IInt8MinMaxCalibrator::getAlgorithm, IInt8MinMaxCalibratorDoc::get_algorithm)
// For documentation purposes only
.def("get_batch", docGetBatch, "names"_a, IInt8CalibratorDoc::get_batch)
diff --git a/python/src/infer/pyPlugin.cpp b/python/src/infer/pyPlugin.cpp
index d87a42ec..9fc1b901 100644
--- a/python/src/infer/pyPlugin.cpp
+++ b/python/src/infer/pyPlugin.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -87,14 +87,14 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
public:
using PyIPluginV2DynamicExt::PyIPluginV2DynamicExt;
PyIPluginV2DynamicExtImpl() = default;
- PyIPluginV2DynamicExtImpl(const PyIPluginV2DynamicExt& a) {};
+ PyIPluginV2DynamicExtImpl(const PyIPluginV2DynamicExt& a){};
int32_t getNbOutputs() const noexcept override
{
try
{
py::gil_scoped_acquire gil{};
- if(!mIsNbOutputsInitialized)
+ if (!mIsNbOutputsInitialized)
{
utils::throwPyError(PyExc_AttributeError, "num_outputs not initialized");
}
@@ -104,7 +104,8 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
return -1;
}
- bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override
+ bool supportsFormatCombination(
+ int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override
{
try
{
@@ -118,7 +119,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
}
std::vector inOutVector;
- for(int32_t idx = 0; idx < nbInputs + nbOutputs; ++idx)
+ for (int32_t idx = 0; idx < nbInputs + nbOutputs; ++idx)
{
inOutVector.push_back(*(inOut + idx));
}
@@ -151,10 +152,11 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
return 0;
}
- try{
+ try
+ {
py::object pyResult = pyInitialize();
}
- catch (py::error_already_set &e)
+ catch (py::error_already_set& e)
{
std::cerr << "[ERROR] Exception thrown from initialize() " << e.what() << std::endl;
return -1;
@@ -165,7 +167,8 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
return -1;
}
- void terminate() noexcept override {
+ void terminate() noexcept override
+ {
try
{
py::gil_scoped_acquire gil{};
@@ -173,7 +176,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
py::function pyTerminate = py::get_override(static_cast(this), "terminate");
// if no implementation is provided for terminate(), it is defaulted to `pass`
- if(pyTerminate)
+ if (pyTerminate)
{
pyTerminate();
}
@@ -181,7 +184,8 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
PLUGIN_API_CATCH("terminate")
}
- int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override
+ int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
+ void* const* outputs, void* workspace, cudaStream_t stream) noexcept override
{
try
{
@@ -194,12 +198,12 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
}
std::vector inVector;
- for(int32_t idx = 0; idx < mNbInputs; ++idx)
+ for (int32_t idx = 0; idx < mNbInputs; ++idx)
{
inVector.push_back(*(inputDesc + idx));
}
std::vector outVector;
- for(int32_t idx = 0; idx < mNbOutputs; ++idx)
+ for (int32_t idx = 0; idx < mNbOutputs; ++idx)
{
outVector.push_back(*(outputDesc + idx));
}
@@ -218,10 +222,11 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
intptr_t workspacePtr = reinterpret_cast(workspace);
intptr_t cudaStreamPtr = reinterpret_cast(stream);
- try{
+ try
+ {
pyEnqueue(inVector, outVector, inPtrs, outPtrs, workspacePtr, cudaStreamPtr);
}
- catch (py::error_already_set &e)
+ catch (py::error_already_set& e)
{
std::cerr << "[ERROR] Exception thrown from enqueue() " << e.what() << std::endl;
return -1;
@@ -283,8 +288,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
{
py::gil_scoped_acquire gil{};
- py::function pySerialize
- = utils::getOverride(static_cast(this), "serialize");
+ py::function pySerialize = utils::getOverride(static_cast(this), "serialize");
if (!pySerialize)
{
utils::throwPyError(PyExc_RuntimeError, "no implementation provided for serialize()");
@@ -307,7 +311,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
try
{
py::gil_scoped_acquire gil{};
- if(!mIsPluginTypeInitialized)
+ if (!mIsPluginTypeInitialized)
{
utils::throwPyError(PyExc_AttributeError, "plugin_type not initialized");
}
@@ -322,7 +326,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
try
{
py::gil_scoped_acquire gil{};
- if(!mIsPluginVersionInitialized)
+ if (!mIsPluginVersionInitialized)
{
utils::throwPyError(PyExc_AttributeError, "plugin_version not initialized");
}
@@ -374,7 +378,6 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
// Remove reference to the Python plugin object so that it could be garbage-collected
pyObjVec[this].dec_ref();
-
}
PLUGIN_API_CATCH("destroy")
}
@@ -393,7 +396,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
{
py::gil_scoped_acquire gil{};
// getPluginNamespace() is not passed through to the Python side
- if(!mIsNamespaceInitialized)
+ if (!mIsNamespaceInitialized)
{
utils::throwPyError(PyExc_AttributeError, "plugin_namespace not initialized");
}
@@ -417,7 +420,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
}
std::vector inVector;
- for(int32_t idx = 0; idx < nbInputs; ++idx)
+ for (int32_t idx = 0; idx < nbInputs; ++idx)
{
inVector.push_back(*(inputTypes + idx));
}
@@ -436,8 +439,8 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
return DataType{};
}
-
- DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override
+ DimsExprs getOutputDimensions(
+ int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override
{
try
{
@@ -451,7 +454,7 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
}
std::vector inVector;
- for(int32_t idx = 0; idx < nbInputs; ++idx)
+ for (int32_t idx = 0; idx < nbInputs; ++idx)
{
inVector.push_back(*(inputs + idx));
}
@@ -470,7 +473,8 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
return DimsExprs{};
}
- void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override
+ void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
+ int32_t nbOutputs) noexcept override
{
try
{
@@ -486,13 +490,13 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
}
std::vector inVector;
- for(int32_t idx = 0; idx < nbInputs; ++idx)
+ for (int32_t idx = 0; idx < nbInputs; ++idx)
{
inVector.push_back(*(in + idx));
}
std::vector outVector;
- for(int32_t idx = 0; idx < nbOutputs; ++idx)
+ for (int32_t idx = 0; idx < nbOutputs; ++idx)
{
outVector.push_back(*(out + idx));
}
@@ -502,13 +506,15 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
PLUGIN_API_CATCH("configure_plugin")
}
- size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override
+ size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
+ int32_t nbOutputs) const noexcept override
{
try
{
py::gil_scoped_acquire gil{};
- py::function pyGetWorkspaceSize = py::get_override(static_cast(this), "get_workspace_size");
+ py::function pyGetWorkspaceSize
+ = py::get_override(static_cast(this), "get_workspace_size");
if (!pyGetWorkspaceSize)
{
@@ -517,13 +523,13 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
}
std::vector inVector;
- for(int32_t idx = 0; idx < nbInputs; ++idx)
+ for (int32_t idx = 0; idx < nbInputs; ++idx)
{
inVector.push_back(*(inputs + idx));
}
std::vector outVector;
- for(int32_t idx = 0; idx < nbOutputs; ++idx)
+ for (int32_t idx = 0; idx < nbOutputs; ++idx)
{
outVector.push_back(*(outputs + idx));
}
@@ -559,23 +565,24 @@ class PyIPluginV2DynamicExtImpl : public PyIPluginV2DynamicExt
mPluginVersion = std::move(pluginVersion);
mIsPluginVersionInitialized = true;
}
- private:
- int32_t getTensorRTVersion() const noexcept override
- {
+
+private:
+ int32_t getTensorRTVersion() const noexcept override
+ {
return static_cast((static_cast(PluginVersion::kV2_DYNAMICEXT_PYTHON) << 24U)
| (static_cast(NV_TENSORRT_VERSION) & 0xFFFFFFU));
- }
+ }
- int32_t mNbInputs{};
- int32_t mNbOutputs{};
- std::string mNamespace;
- std::string mPluginType;
- std::string mPluginVersion;
+ int32_t mNbInputs{};
+ int32_t mNbOutputs{};
+ std::string mNamespace;
+ std::string mPluginType;
+ std::string mPluginVersion;
- bool mIsNbOutputsInitialized{false};
- bool mIsNamespaceInitialized{false};
- bool mIsPluginTypeInitialized{false};
- bool mIsPluginVersionInitialized{false};
+ bool mIsNbOutputsInitialized{false};
+ bool mIsNamespaceInitialized{false};
+ bool mIsPluginTypeInitialized{false};
+ bool mIsPluginVersionInitialized{false};
};
class IPluginCreatorImpl : public IPluginCreator
@@ -593,7 +600,7 @@ class IPluginCreatorImpl : public IPluginCreator
try
{
py::gil_scoped_acquire gil{};
- if(!mIsNameInitialized)
+ if (!mIsNameInitialized)
{
utils::throwPyError(PyExc_AttributeError, "name not initialized");
}
@@ -608,7 +615,7 @@ class IPluginCreatorImpl : public IPluginCreator
try
{
py::gil_scoped_acquire gil{};
- if(!mIsPluginVersionInitialized)
+ if (!mIsPluginVersionInitialized)
{
utils::throwPyError(PyExc_AttributeError, "plugin_version not initialized");
}
@@ -623,7 +630,7 @@ class IPluginCreatorImpl : public IPluginCreator
try
{
py::gil_scoped_acquire gil{};
- if(!mIsFCInitialized)
+ if (!mIsFCInitialized)
{
utils::throwPyError(PyExc_AttributeError, "field_names not initialized");
}
@@ -661,8 +668,7 @@ class IPluginCreatorImpl : public IPluginCreator
return nullptr;
}
- IPluginV2* deserializePlugin(
- const char* name, const void* serialData, size_t serialLength) noexcept override
+ IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override
{
try
{
@@ -677,7 +683,9 @@ class IPluginCreatorImpl : public IPluginCreator
std::string nameString{name};
- py::handle handle = pyDeserializePlugin(nameString, py::bytes(static_cast(serialData), serialLength)).release();
+ py::handle handle
+ = pyDeserializePlugin(nameString, py::bytes(static_cast(serialData), serialLength))
+ .release();
try
{
auto result = handle.cast();
@@ -703,7 +711,7 @@ class IPluginCreatorImpl : public IPluginCreator
try
{
py::gil_scoped_acquire gil{};
- if(!mIsNamespaceInitialized)
+ if (!mIsNamespaceInitialized)
{
utils::throwPyError(PyExc_AttributeError, "plugin_namespace not initialized");
}
@@ -1755,9 +1763,10 @@ bool isPython(IVersionedInterface const& versionedInterface)
namespace lambdas
{
// For IPluginV2
-static const auto IPluginV2_get_output_shape = [](IPluginV2& self, int32_t const index, std::vector const& inputShapes) {
- return self.getOutputDimensions(index, inputShapes.data(), inputShapes.size());
-};
+static const auto IPluginV2_get_output_shape
+ = [](IPluginV2& self, int32_t const index, std::vector const& inputShapes) {
+ return self.getOutputDimensions(index, inputShapes.data(), inputShapes.size());
+ };
static const auto IPluginV2_configure_with_format
= [](IPluginV2& self, std::vector const& inputShapes, std::vector const& outputShapes, DataType dtype,
@@ -1789,13 +1798,14 @@ static const auto IPluginV2_serialize = [](IPluginV2& self) {
};
// `const vector::data()` corresponds to `const void* const*` (pointer to const-pointer to const void)
-static const auto IPluginV2_execute_async = [](IPluginV2& self, int32_t batchSize, const std::vector& inputs,
- std::vector& outputs, void* workspace, long stream) {
+static const auto IPluginV2_execute_async = [](IPluginV2& self, int32_t batchSize,
+ const std::vector& inputs, std::vector& outputs,
+ void* workspace, long stream) {
return self.enqueue(batchSize, inputs.data(), outputs.data(), workspace, reinterpret_cast(stream));
};
static const auto IPluginV2_set_num_outputs = [](IPluginV2& self, int32_t numOutputs) {
- if(getPluginVersion(self.getTensorRTVersion()) == PluginVersion::kV2_DYNAMICEXT_PYTHON)
+ if (getPluginVersion(self.getTensorRTVersion()) == PluginVersion::kV2_DYNAMICEXT_PYTHON)
{
auto plugin = static_cast(&self);
plugin->setNbOutputs(numOutputs);
@@ -1805,7 +1815,7 @@ static const auto IPluginV2_set_num_outputs = [](IPluginV2& self, int32_t numOut
};
static const auto IPluginV2_set_plugin_type = [](IPluginV2& self, std::string pluginType) {
- if(getPluginVersion(self.getTensorRTVersion()) == PluginVersion::kV2_DYNAMICEXT_PYTHON)
+ if (getPluginVersion(self.getTensorRTVersion()) == PluginVersion::kV2_DYNAMICEXT_PYTHON)
{
auto plugin = reinterpret_cast(&self);
plugin->setPluginType(std::move(pluginType));
@@ -1815,7 +1825,7 @@ static const auto IPluginV2_set_plugin_type = [](IPluginV2& self, std::string pl
};
static const auto IPluginV2_set_plugin_version = [](IPluginV2& self, std::string pluginVersion) {
- if(getPluginVersion(self.getTensorRTVersion()) == PluginVersion::kV2_DYNAMICEXT_PYTHON)
+ if (getPluginVersion(self.getTensorRTVersion()) == PluginVersion::kV2_DYNAMICEXT_PYTHON)
{
auto plugin = reinterpret_cast(&self);
plugin->setPluginVersion(std::move(pluginVersion));
@@ -1841,8 +1851,8 @@ static std::unique_ptr makeBoolArray(std::vector const& v)
static const auto configure_plugin
= [](IPluginV2Ext& self, std::vector const& inputShapes, std::vector const& outputShapes,
std::vector const& inputTypes, std::vector const& outputTypes,
- std::vector const& inputIsBroadcasted, std::vector const& outputIsBroadcasted, TensorFormat format,
- int32_t maxBatchSize) {
+ std::vector const& inputIsBroadcasted, std::vector const& outputIsBroadcasted,
+ TensorFormat format, int32_t maxBatchSize) {
auto inputBroadcast = makeBoolArray(inputIsBroadcasted);
auto outputBroadcast = makeBoolArray(outputIsBroadcasted);
return self.configurePlugin(inputShapes.data(), inputShapes.size(), outputShapes.data(), outputShapes.size(),
@@ -1984,7 +1994,7 @@ static const auto dimsexprs_vector_constructor = [](std::vector(Dims::MAX_DIMS)};
PY_ASSERT_VALUE_ERROR(in.size() <= maxDims,
- "Input length " + std::to_string(in.size()) + ". Max expected length is " + std::to_string(maxDims));
+ "Input length " + std::to_string(in.size()) + ". Max expected length is " + std::to_string(maxDims));
// Create the Dims object.
DimsExprs* self = new DimsExprs{};
@@ -2300,6 +2310,80 @@ void bindPlugin(py::module& m)
.def_readwrite("opt", &DynamicPluginTensorDesc::opt)
.def_readwrite("max", &DynamicPluginTensorDesc::max);
+ py::enum_(m, "PluginFieldType", PluginFieldTypeDoc::descr, py::module_local())
+ .value("FLOAT16", PluginFieldType::kFLOAT16)
+ .value("FLOAT32", PluginFieldType::kFLOAT32)
+ .value("FLOAT64", PluginFieldType::kFLOAT64)
+ .value("INT8", PluginFieldType::kINT8)
+ .value("INT16", PluginFieldType::kINT16)
+ .value("INT32", PluginFieldType::kINT32)
+ .value("CHAR", PluginFieldType::kCHAR)
+ .value("DIMS", PluginFieldType::kDIMS)
+ .value("UNKNOWN", PluginFieldType::kUNKNOWN)
+ .value("BF16", PluginFieldType::kBF16)
+ .value("INT64", PluginFieldType::kINT64)
+ .value("FP8", PluginFieldType::kFP8);
+
+ py::class_(m, "PluginField", PluginFieldDoc::descr, py::module_local())
+ .def(py::init(lambdas::plugin_field_default_constructor), "name"_a = "", py::keep_alive<1, 2>{})
+ .def(py::init(lambdas::plugin_field_constructor), "name"_a, "data"_a,
+ "type"_a = nvinfer1::PluginFieldType::kUNKNOWN, py::keep_alive<1, 2>{}, py::keep_alive<1, 3>{})
+ .def_property(
+ "name", [](PluginField& self) { return self.name; },
+ py::cpp_function(
+ [](PluginField& self, FallbackString& name) { self.name = name.c_str(); }, py::keep_alive<1, 2>{}))
+ .def_property(
+ "data",
+ [](PluginField& self) {
+ switch (self.type)
+ {
+ case PluginFieldType::kINT32:
+ return py::array(self.length, static_cast(self.data));
+ break;
+ case PluginFieldType::kINT8:
+ return py::array(self.length, static_cast(self.data));
+ break;
+ case PluginFieldType::kINT16:
+ return py::array(self.length, static_cast(self.data));
+ break;
+ case PluginFieldType::kFLOAT16:
+ // TODO: Figure out how to handle float16 correctly here
+ return py::array(self.length, static_cast(self.data));
+ break;
+ case PluginFieldType::kFLOAT32:
+ return py::array(self.length, static_cast(self.data));
+ break;
+ case PluginFieldType::kFLOAT64:
+ return py::array(self.length, static_cast(self.data));
+ break;
+ case PluginFieldType::kCHAR: return py::array(self.length, static_cast(self.data)); break;
+ default: assert(false && "No known conversion for returning data from PluginField"); break;
+ }
+ // should not reach this line
+ return py::array();
+ },
+ py::cpp_function(
+ [](PluginField& self, py::buffer& buffer) {
+ py::buffer_info info = buffer.request();
+ self.data = info.ptr;
+ },
+ py::keep_alive<1, 2>{}))
+ .def_readwrite("type", &PluginField::type)
+ .def_readwrite("size", &PluginField::length);
+
+ // PluginFieldCollection behaves like an iterable, and can be constructed from iterables.
+ py::class_(m, "PluginFieldCollection_", PluginFieldCollectionDoc::descr, py::module_local())
+ .def(py::init<>(lambdas::plugin_field_collection_constructor), py::keep_alive<1, 2>{})
+ .def("__len__", [](PluginFieldCollection& self) { return self.nbFields; })
+ .def("__getitem__", [](PluginFieldCollection& self, int32_t const index) {
+ PY_ASSERT_INDEX_ERROR(index < self.nbFields);
+ return self.fields[index];
+ });
+
+ // Creating a trt.PluginFieldCollection in Python will actually construct a vector,
+ // which can then be converted to an actual C++ PluginFieldCollection.
+ py::implicitly_convertible, PluginFieldCollection>();
+
py::class_(m, "IPluginV2", IPluginV2Doc::descr, py::module_local())
.def_property("num_outputs", &IPluginV2::getNbOutputs, lambdas::IPluginV2_set_num_outputs)
.def_property_readonly("tensorrt_version", &IPluginV2::getTensorRTVersion)
@@ -2337,7 +2421,8 @@ void bindPlugin(py::module& m)
.def("clone", &IPluginV2Ext::clone, IPluginV2ExtDoc::clone);
;
- py::class_>(m, "IPluginV2DynamicExtBase", py::module_local());
+ py::class_>(
+ m, "IPluginV2DynamicExtBase", py::module_local());
py::class_>(
@@ -2366,6 +2451,9 @@ void bindPlugin(py::module& m)
"stream"_a, IPluginV2DynamicExtDoc::enqueue)
.def("clone", &pluginDoc::clone, IPluginV2DynamicExtDoc::clone);
+ py::class_>(
+ m, "IPluginCapability", IPluginV3Doc::iplugincapability_descr, py::module_local());
+
py::class_>(
m, "IPluginV3", IPluginV3Doc::ipluginv3_descr, py::module_local())
.def(py::init<>())
@@ -2375,9 +2463,6 @@ void bindPlugin(py::module& m)
.def("clone", &pluginDoc::cloneV3, IPluginV3Doc::clone)
.def("destroy", &pluginDoc::destroyV3, IPluginV3Doc::destroy);
- py::class_>(
- m, "IPluginCapability", IPluginV3Doc::iplugincapability_descr, py::module_local());
-
py::class_>(
m, "IPluginV3OneCore", IPluginV3Doc::ipluginv3onecore_descr, py::module_local())
@@ -2430,80 +2515,6 @@ void bindPlugin(py::module& m)
"stream"_a, IPluginV3Doc::enqueue)
.def("attach_to_context", &pluginDoc::attachToContext, "resource_context"_a, IPluginV3Doc::attach_to_context);
- py::enum_(m, "PluginFieldType", PluginFieldTypeDoc::descr, py::module_local())
- .value("FLOAT16", PluginFieldType::kFLOAT16)
- .value("FLOAT32", PluginFieldType::kFLOAT32)
- .value("FLOAT64", PluginFieldType::kFLOAT64)
- .value("INT8", PluginFieldType::kINT8)
- .value("INT16", PluginFieldType::kINT16)
- .value("INT32", PluginFieldType::kINT32)
- .value("CHAR", PluginFieldType::kCHAR)
- .value("DIMS", PluginFieldType::kDIMS)
- .value("UNKNOWN", PluginFieldType::kUNKNOWN)
- .value("BF16", PluginFieldType::kBF16)
- .value("INT64", PluginFieldType::kINT64)
- .value("FP8", PluginFieldType::kFP8);
-
- py::class_(m, "PluginField", PluginFieldDoc::descr, py::module_local())
- .def(py::init(lambdas::plugin_field_default_constructor), "name"_a = "", py::keep_alive<1, 2>{})
- .def(py::init(lambdas::plugin_field_constructor), "name"_a, "data"_a,
- "type"_a = nvinfer1::PluginFieldType::kUNKNOWN, py::keep_alive<1, 2>{}, py::keep_alive<1, 3>{})
- .def_property(
- "name", [](PluginField& self) { return self.name; },
- py::cpp_function(
- [](PluginField& self, FallbackString& name) { self.name = name.c_str(); }, py::keep_alive<1, 2>{}))
- .def_property(
- "data",
- [](PluginField& self) {
- switch (self.type)
- {
- case PluginFieldType::kINT32:
- return py::array(self.length, static_cast(self.data));
- break;
- case PluginFieldType::kINT8:
- return py::array(self.length, static_cast(self.data));
- break;
- case PluginFieldType::kINT16:
- return py::array(self.length, static_cast(self.data));
- break;
- case PluginFieldType::kFLOAT16:
- // TODO: Figure out how to handle float16 correctly here
- return py::array(self.length, static_cast(self.data));
- break;
- case PluginFieldType::kFLOAT32:
- return py::array(self.length, static_cast(self.data));
- break;
- case PluginFieldType::kFLOAT64:
- return py::array(self.length, static_cast(self.data));
- break;
- case PluginFieldType::kCHAR: return py::array(self.length, static_cast(self.data)); break;
- default: assert(false && "No known conversion for returning data from PluginField"); break;
- }
- // should not reach this line
- return py::array();
- },
- py::cpp_function(
- [](PluginField& self, py::buffer& buffer) {
- py::buffer_info info = buffer.request();
- self.data = info.ptr;
- },
- py::keep_alive<1, 2>{}))
- .def_readwrite("type", &PluginField::type)
- .def_readwrite("size", &PluginField::length);
-
- // PluginFieldCollection behaves like an iterable, and can be constructed from iterables.
- py::class_(m, "PluginFieldCollection_", PluginFieldCollectionDoc::descr, py::module_local())
- .def(py::init<>(lambdas::plugin_field_collection_constructor), py::keep_alive<1, 2>{})
- .def("__len__", [](PluginFieldCollection& self) { return self.nbFields; })
- .def("__getitem__", [](PluginFieldCollection& self, int32_t const index) {
- PY_ASSERT_INDEX_ERROR(index < self.nbFields);
- return self.fields[index];
- });
-
- // Creating a trt.PluginFieldCollection in Python will actually construct a vector,
- // which can then be converted to an actual C++ PluginFieldCollection.
- py::implicitly_convertible, PluginFieldCollection>();
-
py::class_(
m, "IPluginCreatorInterface", IPluginCreatorInterfaceDoc::descr, py::module_local());
diff --git a/python/src/parsers/pyOnnx.cpp b/python/src/parsers/pyOnnx.cpp
index 122fc219..9059a3b7 100644
--- a/python/src/parsers/pyOnnx.cpp
+++ b/python/src/parsers/pyOnnx.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/src/pyTensorRT.cpp b/python/src/pyTensorRT.cpp
index a7fe0017..c562703a 100644
--- a/python/src/pyTensorRT.cpp
+++ b/python/src/pyTensorRT.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/python/src/utils.cpp b/python/src/utils.cpp
index 46e8b3ba..de601542 100644
--- a/python/src/utils.cpp
+++ b/python/src/utils.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -44,7 +44,7 @@ size_t size(nvinfer1::DataType type)
case nvinfer1::DataType::kUINT8: return 1;
case nvinfer1::DataType::kFP8: return 1;
case nvinfer1::DataType::kBF16: return 2;
- case nvinfer1::DataType::kINT4: break; //TRT-22011 - need to address sub-byte element size
+ case nvinfer1::DataType::kINT4: break; // TRT-22011 - need to address sub-byte element size
}
return -1;
}
diff --git a/quickstart/IntroNotebooks/Additional Examples/helper.py b/quickstart/IntroNotebooks/Additional Examples/helper.py
index 66c4e006..c00ed985 100644
--- a/quickstart/IntroNotebooks/Additional Examples/helper.py
+++ b/quickstart/IntroNotebooks/Additional Examples/helper.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/IntroNotebooks/helper.py b/quickstart/IntroNotebooks/helper.py
index 66c4e006..c00ed985 100644
--- a/quickstart/IntroNotebooks/helper.py
+++ b/quickstart/IntroNotebooks/helper.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/IntroNotebooks/onnx_helper.py b/quickstart/IntroNotebooks/onnx_helper.py
index 6bea97dd..2f3d6767 100644
--- a/quickstart/IntroNotebooks/onnx_helper.py
+++ b/quickstart/IntroNotebooks/onnx_helper.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/Makefile b/quickstart/Makefile
index bf728ff4..1e700e3d 100644
--- a/quickstart/Makefile
+++ b/quickstart/Makefile
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/Makefile.config b/quickstart/Makefile.config
index d81f325d..0d290ea5 100644
--- a/quickstart/Makefile.config
+++ b/quickstart/Makefile.config
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/SemanticSegmentation/Makefile b/quickstart/SemanticSegmentation/Makefile
index 5c1bdea3..3c1f68d0 100644
--- a/quickstart/SemanticSegmentation/Makefile
+++ b/quickstart/SemanticSegmentation/Makefile
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/SemanticSegmentation/export.py b/quickstart/SemanticSegmentation/export.py
index e5168aaa..560e233e 100644
--- a/quickstart/SemanticSegmentation/export.py
+++ b/quickstart/SemanticSegmentation/export.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/SemanticSegmentation/tutorial-runtime.cpp b/quickstart/SemanticSegmentation/tutorial-runtime.cpp
index 7f0854a3..c1f09197 100644
--- a/quickstart/SemanticSegmentation/tutorial-runtime.cpp
+++ b/quickstart/SemanticSegmentation/tutorial-runtime.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/common/logger.cpp b/quickstart/common/logger.cpp
index 2eaccd54..9d07754c 100644
--- a/quickstart/common/logger.cpp
+++ b/quickstart/common/logger.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/common/logger.h b/quickstart/common/logger.h
index 513275c2..35cbf367 100644
--- a/quickstart/common/logger.h
+++ b/quickstart/common/logger.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/common/logging.h b/quickstart/common/logging.h
index f323d22b..d891e168 100644
--- a/quickstart/common/logging.h
+++ b/quickstart/common/logging.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/common/util.cpp b/quickstart/common/util.cpp
index 717b63aa..55ccd630 100644
--- a/quickstart/common/util.cpp
+++ b/quickstart/common/util.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/common/util.h b/quickstart/common/util.h
index 50455e97..55457969 100644
--- a/quickstart/common/util.h
+++ b/quickstart/common/util.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/deploy_to_triton/config.pbtxt b/quickstart/deploy_to_triton/config.pbtxt
index 63046c8d..f65a9c55 100644
--- a/quickstart/deploy_to_triton/config.pbtxt
+++ b/quickstart/deploy_to_triton/config.pbtxt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/deploy_to_triton/export_resnet_to_onnx.py b/quickstart/deploy_to_triton/export_resnet_to_onnx.py
index fba1550a..64d6b137 100644
--- a/quickstart/deploy_to_triton/export_resnet_to_onnx.py
+++ b/quickstart/deploy_to_triton/export_resnet_to_onnx.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/quickstart/deploy_to_triton/triton_client.py b/quickstart/deploy_to_triton/triton_client.py
index a6e7553d..1575e208 100644
--- a/quickstart/deploy_to_triton/triton_client.py
+++ b/quickstart/deploy_to_triton/triton_client.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 1c26cc38..513810d9 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/CMakeSamplesTemplate.txt b/samples/CMakeSamplesTemplate.txt
index d4f78ae5..285e3f99 100644
--- a/samples/CMakeSamplesTemplate.txt
+++ b/samples/CMakeSamplesTemplate.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -62,11 +62,11 @@ add_executable(${TARGET_NAME}
set(DEPS_LIST "")
if(BUILD_PLUGINS)
- list(APPEND DEPS_LIST nvinfer_plugin)
+ list(APPEND DEPS_LIST ${nvinfer_plugin_lib_name})
endif()
if(BUILD_PARSERS)
- list(APPEND DEPS_LIST nvonnxparser)
+ list(APPEND DEPS_LIST ${nvonnxparser_lib_name})
endif()
if(BUILD_PLUGINS OR BUILD_PARSERS)
@@ -93,7 +93,7 @@ target_compile_options(${TARGET_NAME} PUBLIC
set(SAMPLE_DEP_LIBS
${CUDART_LIB}
- ${nvinfer_LIB_PATH}
+ ${${nvinfer_lib_name}_LIB_PATH}
${RT_LIB}
${CMAKE_DL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
@@ -104,17 +104,17 @@ if (NOT MSVC)
endif()
if(${PLUGINS_NEEDED})
- list(APPEND SAMPLE_DEP_LIBS nvinfer_plugin)
+ list(APPEND SAMPLE_DEP_LIBS ${nvinfer_plugin_lib_name})
endif()
if("onnx" IN_LIST SAMPLE_PARSERS)
- list(APPEND SAMPLE_DEP_LIBS nvonnxparser)
+ list(APPEND SAMPLE_DEP_LIBS ${nvonnxparser_lib_name})
endif()
-# Necessary to link nvinfer_plugin library.
+# Necessary to link nvinfer_plugin library. Add unresolved symbols flag for non-Windows platforms.
target_link_libraries(${TARGET_NAME}
${SAMPLE_DEP_LIBS}
- -Wl,--unresolved-symbols=ignore-in-shared-libs
+ $<$<NOT:$<PLATFORM_ID:Windows>>:-Wl,--unresolved-symbols=ignore-in-shared-libs>
)
set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL")
diff --git a/samples/common/BatchStream.h b/samples/common/BatchStream.h
index f6da8d70..c4ab9de0 100644
--- a/samples/common/BatchStream.h
+++ b/samples/common/BatchStream.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/EntropyCalibrator.h b/samples/common/EntropyCalibrator.h
index 936d10e0..67a0130e 100644
--- a/samples/common/EntropyCalibrator.h
+++ b/samples/common/EntropyCalibrator.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/ErrorRecorder.h b/samples/common/ErrorRecorder.h
index cd00f745..bfb857c5 100644
--- a/samples/common/ErrorRecorder.h
+++ b/samples/common/ErrorRecorder.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,7 +17,7 @@
#ifndef ERROR_RECORDER_H
#define ERROR_RECORDER_H
-#include "NvInferRuntimeBase.h"
+#include "NvInferRuntime.h"
#include "logger.h"
#include
#include
diff --git a/samples/common/argsParser.h b/samples/common/argsParser.h
index 745070d9..b302dc47 100644
--- a/samples/common/argsParser.h
+++ b/samples/common/argsParser.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -68,7 +68,7 @@ struct Args
std::vector<std::string> dataDirs;
std::string saveEngine;
std::string loadEngine;
- bool rowMajor{true};
+ bool rowOrder{true};
};
//!
@@ -85,7 +85,7 @@ inline bool parseArgs(Args& args, int32_t argc, char* argv[])
int32_t arg;
static struct option long_options[] = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'}, {"fp16", no_argument, 0, 'f'}, {"bf16", no_argument, 0, 'z'},
- {"columnMajor", no_argument, 0, 'c'}, {"saveEngine", required_argument, 0, 's'},
+ {"columnOrder", no_argument, 0, 'c'}, {"saveEngine", required_argument, 0, 's'},
{"loadEngine", required_argument, 0, 'o'}, {"useDLACore", required_argument, 0, 'u'},
{"batch", required_argument, 0, 'b'}, {nullptr, 0, nullptr, 0}};
int32_t option_index = 0;
@@ -124,7 +124,7 @@ inline bool parseArgs(Args& args, int32_t argc, char* argv[])
case 'i': args.runInInt8 = true; break;
case 'f': args.runInFp16 = true; break;
case 'z': args.runInBf16 = true; break;
- case 'c': args.rowMajor = false; break;
+ case 'c': args.rowOrder = false; break;
case 'u':
if (optarg)
{
diff --git a/samples/common/bfloat16.cpp b/samples/common/bfloat16.cpp
index a9944789..8222826a 100644
--- a/samples/common/bfloat16.cpp
+++ b/samples/common/bfloat16.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/bfloat16.h b/samples/common/bfloat16.h
index 90b77421..0d0ab922 100644
--- a/samples/common/bfloat16.h
+++ b/samples/common/bfloat16.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/buffers.h b/samples/common/buffers.h
index bf40dc9c..e58f2f5c 100644
--- a/samples/common/buffers.h
+++ b/samples/common/buffers.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/common.h b/samples/common/common.h
index 557bd169..0324d2fb 100644
--- a/samples/common/common.h
+++ b/samples/common/common.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/dumpTFWts.py b/samples/common/dumpTFWts.py
index 0b7a0123..70770fbd 100644
--- a/samples/common/dumpTFWts.py
+++ b/samples/common/dumpTFWts.py
@@ -1,6 +1,6 @@
#!/usr/bin/python
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/getOptions.cpp b/samples/common/getOptions.cpp
index 8bcf7958..19cd3281 100644
--- a/samples/common/getOptions.cpp
+++ b/samples/common/getOptions.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/getOptions.h b/samples/common/getOptions.h
index e8460513..4bbf9e27 100644
--- a/samples/common/getOptions.h
+++ b/samples/common/getOptions.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/getoptWin.h b/samples/common/getoptWin.h
index 7e1cf1ba..a1dc6ffa 100644
--- a/samples/common/getoptWin.h
+++ b/samples/common/getoptWin.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/half.h b/samples/common/half.h
index c5ebdb1a..b997e7db 100644
--- a/samples/common/half.h
+++ b/samples/common/half.h
@@ -16,7 +16,7 @@
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/logger.cpp b/samples/common/logger.cpp
index 0592db2c..909ec0bb 100644
--- a/samples/common/logger.cpp
+++ b/samples/common/logger.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/logger.h b/samples/common/logger.h
index ff59bfa9..8205e457 100644
--- a/samples/common/logger.h
+++ b/samples/common/logger.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/logging.h b/samples/common/logging.h
index e61b3687..d2c571d9 100644
--- a/samples/common/logging.h
+++ b/samples/common/logging.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,7 +18,7 @@
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H
-#include "NvInferRuntimeBase.h"
+#include "NvInferRuntime.h"
#include "sampleOptions.h"
#include
#include
diff --git a/samples/common/parserOnnxConfig.h b/samples/common/parserOnnxConfig.h
index ed0a9b55..67ee6c71 100644
--- a/samples/common/parserOnnxConfig.h
+++ b/samples/common/parserOnnxConfig.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/safeCommon.h b/samples/common/safeCommon.h
index fc9f28b0..4cc87a70 100644
--- a/samples/common/safeCommon.h
+++ b/samples/common/safeCommon.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,7 +18,7 @@
#ifndef TENSORRT_SAFE_COMMON_H
#define TENSORRT_SAFE_COMMON_H
-#include "NvInferRuntimeBase.h"
+#include "NvInferSafeRuntime.h"
#include "cuda_runtime.h"
#include "sampleEntrypoints.h"
#include
diff --git a/samples/common/sampleConfig.h b/samples/common/sampleConfig.h
index f60ed363..801a268a 100644
--- a/samples/common/sampleConfig.h
+++ b/samples/common/sampleConfig.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleDevice.cpp b/samples/common/sampleDevice.cpp
index f504fa69..235ad9f0 100644
--- a/samples/common/sampleDevice.cpp
+++ b/samples/common/sampleDevice.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleDevice.h b/samples/common/sampleDevice.h
index ad122180..5e62f6d0 100644
--- a/samples/common/sampleDevice.h
+++ b/samples/common/sampleDevice.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleEngines.cpp b/samples/common/sampleEngines.cpp
index bea07a53..b39d513b 100644
--- a/samples/common/sampleEngines.cpp
+++ b/samples/common/sampleEngines.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -654,7 +654,15 @@ void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build)
}
if (build.tacticSharedMem >= 0)
{
- config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem));
+ if (build.tacticSharedMem >= 0.046 && build.tacticSharedMem <= 0.047)
+ {
+ // 48KB is a common use case but user might not type the exact number 0.046875MB.
+ config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, 48 << 10);
+ }
+ else
+ {
+ config.setMemoryPoolLimit(MemoryPoolType::kTACTIC_SHARED_MEMORY, roundToBytes(build.tacticSharedMem));
+ }
}
}
diff --git a/samples/common/sampleEngines.h b/samples/common/sampleEngines.h
index f6cff080..4c4272b7 100644
--- a/samples/common/sampleEngines.h
+++ b/samples/common/sampleEngines.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleEntrypoints.h b/samples/common/sampleEntrypoints.h
index 70f45dde..cc8bf1b9 100644
--- a/samples/common/sampleEntrypoints.h
+++ b/samples/common/sampleEntrypoints.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleInference.cpp b/samples/common/sampleInference.cpp
index dfc76708..024dd6f6 100644
--- a/samples/common/sampleInference.cpp
+++ b/samples/common/sampleInference.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -620,8 +620,10 @@ class EnqueueExplicit : private Enqueue
try
{
bool const result = mContext.enqueueV3(stream.get());
- // Collecting layer timing info from current profile index of execution context
- if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler())
+ // Collecting layer timing info from current profile index of execution context, except under capturing
+ // mode.
+ if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile()
+ && !mContext.reportToProfiler())
{
gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl;
}
@@ -635,6 +637,14 @@ class EnqueueExplicit : private Enqueue
}
private:
+ // Helper function to check if a stream is in capturing mode.
+ bool isStreamCapturing(TrtCudaStream& stream) const
+ {
+ cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
+ cudaCheck(cudaStreamIsCapturing(stream.get(), &status));
+ return status != cudaStreamCaptureStatusNone;
+ }
+
Bindings const& mBindings;
};
@@ -931,6 +941,8 @@ class Iteration
mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings));
if (inference.graph)
{
+ sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl;
+
TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
// Avoid capturing initialization calls by executing the enqueue function at least
// once before starting CUDA graph capture.
@@ -948,6 +960,7 @@ class Iteration
{
mGraph.endCapture(stream);
mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph));
+ sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl;
}
else
{
diff --git a/samples/common/sampleInference.h b/samples/common/sampleInference.h
index e726cb31..e8e53bb7 100644
--- a/samples/common/sampleInference.h
+++ b/samples/common/sampleInference.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleOptions.cpp b/samples/common/sampleOptions.cpp
index 575668e1..7f2bd9f1 100644
--- a/samples/common/sampleOptions.cpp
+++ b/samples/common/sampleOptions.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleOptions.h b/samples/common/sampleOptions.h
index 00e8b15d..cddbc60d 100644
--- a/samples/common/sampleOptions.h
+++ b/samples/common/sampleOptions.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleReporting.cpp b/samples/common/sampleReporting.cpp
index 3c8efab0..1d3e2ca5 100644
--- a/samples/common/sampleReporting.cpp
+++ b/samples/common/sampleReporting.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleReporting.h b/samples/common/sampleReporting.h
index 8cab62ba..c6813fe6 100644
--- a/samples/common/sampleReporting.h
+++ b/samples/common/sampleReporting.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleUtils.cpp b/samples/common/sampleUtils.cpp
index 7f827bc8..522cde65 100644
--- a/samples/common/sampleUtils.cpp
+++ b/samples/common/sampleUtils.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/sampleUtils.h b/samples/common/sampleUtils.h
index 32d5f1b0..6cd4280b 100644
--- a/samples/common/sampleUtils.h
+++ b/samples/common/sampleUtils.h
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/common/streamReader.h b/samples/common/streamReader.h
index 657e35b8..7d4aa1c6 100644
--- a/samples/common/streamReader.h
+++ b/samples/common/streamReader.h
@@ -18,7 +18,7 @@
#ifndef STREAM_READER_H
#define STREAM_READER_H
-#include "NvInferRuntimeBase.h"
+#include "NvInferRuntime.h"
#include "sampleUtils.h"
#include
diff --git a/samples/python/common.py b/samples/python/common.py
index f289c366..10b2c323 100644
--- a/samples/python/common.py
+++ b/samples/python/common.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,16 +27,21 @@
except NameError:
FileNotFoundError = IOError
+
def GiB(val):
return val * 1 << 30
def add_help(description):
- parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(
+ description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
args, _ = parser.parse_known_args()
-def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
+def find_sample_data(
+ description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""
+):
"""
Parses sample arguments.
@@ -51,7 +56,9 @@ def find_sample_data(description="Runs a TensorRT Python sample", subfolder="",
# Standard command-line arguments for all samples.
kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
- parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(
+ description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
parser.add_argument(
"-d",
"--datadir",
@@ -66,7 +73,13 @@ def get_data_path(data_dir):
data_path = os.path.join(data_dir, subfolder)
if not os.path.exists(data_path):
if data_dir != kDEFAULT_DATA_ROOT:
- print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
+ print(
+ "WARNING: "
+ + data_path
+ + " does not exist. Trying "
+ + data_dir
+ + " instead."
+ )
data_path = data_dir
# Make sure data directory exists.
if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
@@ -109,10 +122,13 @@ def locate_files(data_paths, filenames, err_msg=""):
for f, filename in zip(found_files, filenames):
if not f or not os.path.exists(f):
raise FileNotFoundError(
- "Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg)
+ "Could not find {:}. Searched in data paths: {:}\n{:}".format(
+ filename, data_paths, err_msg
+ )
)
return found_files
+
# Sets up the builder to use the timing cache file, and creates it if it does not already exist
def setup_timing_cache(config: trt.IBuilderConfig, timing_cache_path: os.PathLike):
buffer = b""
@@ -122,8 +138,9 @@ def setup_timing_cache(config: trt.IBuilderConfig, timing_cache_path: os.PathLik
timing_cache: trt.ITimingCache = config.create_timing_cache(buffer)
config.set_timing_cache(timing_cache, True)
+
# Saves the config's timing cache to file
def save_timing_cache(config: trt.IBuilderConfig, timing_cache_path: os.PathLike):
timing_cache: trt.ITimingCache = config.get_timing_cache()
- with open(timing_cache_path, 'wb') as timing_cache_file:
+ with open(timing_cache_path, "wb") as timing_cache_file:
timing_cache_file.write(memoryview(timing_cache.serialize()))
diff --git a/samples/python/detectron2/build_engine.py b/samples/python/detectron2/build_engine.py
index aa6f5795..c62b941c 100644
--- a/samples/python/detectron2/build_engine.py
+++ b/samples/python/detectron2/build_engine.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,6 +33,7 @@
sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
import common
+
class EngineCalibrator(trt.IInt8MinMaxCalibrator):
"""
Implements the INT8 MinMax Calibrator.
@@ -55,7 +56,10 @@ def set_image_batcher(self, image_batcher: ImageBatcher):
:param image_batcher: The ImageBatcher object
"""
self.image_batcher = image_batcher
- self.size = int(np.dtype(self.image_batcher.dtype).itemsize * np.prod(self.image_batcher.shape))
+ self.size = int(
+ np.dtype(self.image_batcher.dtype).itemsize
+ * np.prod(self.image_batcher.shape)
+ )
self.batch_allocation = common.cuda_call(cudart.cudaMalloc(self.size))
self.batch_generator = self.image_batcher.get_batch()
@@ -80,8 +84,14 @@ def get_batch(self, names):
return None
try:
batch, _, _ = next(self.batch_generator)
- log.info("Calibrating image {} / {}".format(self.image_batcher.image_index, self.image_batcher.num_images))
- common.memcpy_host_to_device(self.batch_allocation, np.ascontiguousarray(batch))
+ log.info(
+ "Calibrating image {} / {}".format(
+ self.image_batcher.image_index, self.image_batcher.num_images
+ )
+ )
+ common.memcpy_host_to_device(
+ self.batch_allocation, np.ascontiguousarray(batch)
+ )
return [int(self.batch_allocation)]
except StopIteration:
@@ -130,7 +140,9 @@ def __init__(self, verbose=False, workspace=8):
self.builder = trt.Builder(self.trt_logger)
self.config = self.builder.create_builder_config()
- self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace * (2 ** 30))
+ self.config.set_memory_pool_limit(
+ trt.MemoryPoolType.WORKSPACE, workspace * (2**30)
+ )
self.batch_size = None
self.network = None
@@ -158,13 +170,29 @@ def create_network(self, onnx_path):
log.info("Network Description")
for input in inputs:
self.batch_size = input.shape[0]
- log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
+ log.info(
+ "Input '{}' with shape {} and dtype {}".format(
+ input.name, input.shape, input.dtype
+ )
+ )
for output in outputs:
- log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
+ log.info(
+ "Output '{}' with shape {} and dtype {}".format(
+ output.name, output.shape, output.dtype
+ )
+ )
assert self.batch_size > 0
- def create_engine(self, engine_path, precision, config_file, calib_input=None, calib_cache=None, calib_num_images=5000,
- calib_batch_size=8):
+ def create_engine(
+ self,
+ engine_path,
+ precision,
+ config_file,
+ calib_input=None,
+ calib_cache=None,
+ calib_num_images=5000,
+ calib_batch_size=8,
+ ):
"""
Build the TensorRT engine and serialize it to disk.
:param engine_path: The path where to serialize the engine to.
@@ -194,8 +222,15 @@ def create_engine(self, engine_path, precision, config_file, calib_input=None, c
calib_shape = [calib_batch_size] + list(inputs[0].shape[1:])
calib_dtype = trt.nptype(inputs[0].dtype)
self.config.int8_calibrator.set_image_batcher(
- ImageBatcher(calib_input, calib_shape, calib_dtype, max_num_images=calib_num_images,
- exact_batches=True, config_file=config_file))
+ ImageBatcher(
+ calib_input,
+ calib_shape,
+ calib_dtype,
+ max_num_images=calib_num_images,
+ exact_batches=True,
+ config_file=config_file,
+ )
+ )
engine_bytes = self.builder.build_serialized_network(self.network, self.config)
if engine_bytes is None:
@@ -210,34 +245,76 @@ def create_engine(self, engine_path, precision, config_file, calib_input=None, c
def main(args):
builder = EngineBuilder(args.verbose, args.workspace)
builder.create_network(args.onnx)
- builder.create_engine(args.engine, args.precision, args.det2_config, args.calib_input, args.calib_cache, args.calib_num_images,
- args.calib_batch_size)
+ builder.create_engine(
+ args.engine,
+ args.precision,
+ args.det2_config,
+ args.calib_input,
+ args.calib_cache,
+ args.calib_num_images,
+ args.calib_batch_size,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--onnx", help="The input ONNX model file to load")
parser.add_argument("-e", "--engine", help="The output path for the TRT engine")
- parser.add_argument("-c", "--det2_config", default=None, help="The Detectron 2 config file (.yaml) for the model", type=str)
- parser.add_argument("-p", "--precision", default="fp16", choices=["fp32", "fp16", "int8"],
- help="The precision mode to build in, either fp32/fp16/int8, default: 'fp16'")
- parser.add_argument("-v", "--verbose", action="store_true", help="Enable more verbose log output")
- parser.add_argument("-w", "--workspace", default=1, type=int, help="The max memory workspace size to allow in Gb, "
- "default: 1")
- parser.add_argument("--calib_input", help="The directory holding images to use for calibration")
- parser.add_argument("--calib_cache", default="./calibration.cache",
- help="The file path for INT8 calibration cache to use, default: ./calibration.cache")
- parser.add_argument("--calib_num_images", default=5000, type=int,
- help="The maximum number of images to use for calibration, default: 5000")
- parser.add_argument("--calib_batch_size", default=8, type=int,
- help="The batch size for the calibration process, default: 8")
+ parser.add_argument(
+ "-c",
+ "--det2_config",
+ default=None,
+ help="The Detectron 2 config file (.yaml) for the model",
+ type=str,
+ )
+ parser.add_argument(
+ "-p",
+ "--precision",
+ default="fp16",
+ choices=["fp32", "fp16", "int8"],
+ help="The precision mode to build in, either fp32/fp16/int8, default: 'fp16'",
+ )
+ parser.add_argument(
+ "-v", "--verbose", action="store_true", help="Enable more verbose log output"
+ )
+ parser.add_argument(
+ "-w",
+ "--workspace",
+ default=1,
+ type=int,
+ help="The max memory workspace size to allow in Gb, " "default: 1",
+ )
+ parser.add_argument(
+ "--calib_input", help="The directory holding images to use for calibration"
+ )
+ parser.add_argument(
+ "--calib_cache",
+ default="./calibration.cache",
+ help="The file path for INT8 calibration cache to use, default: ./calibration.cache",
+ )
+ parser.add_argument(
+ "--calib_num_images",
+ default=5000,
+ type=int,
+ help="The maximum number of images to use for calibration, default: 5000",
+ )
+ parser.add_argument(
+ "--calib_batch_size",
+ default=8,
+ type=int,
+ help="The batch size for the calibration process, default: 8",
+ )
args = parser.parse_args()
if not all([args.onnx, args.engine]):
parser.print_help()
log.error("These arguments are required: --onnx and --engine")
sys.exit(1)
- if args.precision in ["int8"] and not (args.calib_input or os.path.exists(args.calib_cache)):
+ if args.precision in ["int8"] and not (
+ args.calib_input or os.path.exists(args.calib_cache)
+ ):
parser.print_help()
- log.error("When building in int8 precision, --calib_input or an existing --calib_cache file is required")
+ log.error(
+ "When building in int8 precision, --calib_input or an existing --calib_cache file is required"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/detectron2/create_onnx.py b/samples/python/detectron2/create_onnx.py
index 38538464..478ead75 100644
--- a/samples/python/detectron2/create_onnx.py
+++ b/samples/python/detectron2/create_onnx.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -34,7 +34,9 @@
from detectron2.structures import ImageList
except ImportError:
print("Could not import Detectron 2 modules. Maybe you did not install Detectron 2")
- print("Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md")
+ print(
+ "Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md"
+ )
sys.exit(1)
import onnx_utils
@@ -81,14 +83,24 @@ def det2_setup(config_file, weights):
self.first_NMS_max_proposals = self.det2_cfg.MODEL.RPN.POST_NMS_TOPK_TEST
self.first_NMS_iou_threshold = self.det2_cfg.MODEL.RPN.NMS_THRESH
self.first_NMS_score_threshold = 0.01
- self.first_ROIAlign_pooled_size = self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
- self.first_ROIAlign_sampling_ratio = self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+ self.first_ROIAlign_pooled_size = (
+ self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+ )
+ self.first_ROIAlign_sampling_ratio = (
+ self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+ )
self.first_ROIAlign_type = self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
self.second_NMS_max_proposals = self.det2_cfg.TEST.DETECTIONS_PER_IMAGE
self.second_NMS_iou_threshold = self.det2_cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
- self.second_NMS_score_threshold = self.det2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
- self.second_ROIAlign_pooled_size = self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
- self.second_ROIAlign_sampling_ratio = self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
+ self.second_NMS_score_threshold = (
+ self.det2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
+ )
+ self.second_ROIAlign_pooled_size = (
+ self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
+ )
+ self.second_ROIAlign_sampling_ratio = (
+ self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
+ )
self.second_ROIAlign_type = self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
self.mask_out_res = 28
@@ -97,17 +109,37 @@ def det2_setup(config_file, weights):
log.info("Number of classes is {}".format(self.num_classes))
log.info("First NMS max proposals is {}".format(self.first_NMS_max_proposals))
log.info("First NMS iou threshold is {}".format(self.first_NMS_iou_threshold))
- log.info("First NMS score threshold is {}".format(self.first_NMS_score_threshold))
+ log.info(
+ "First NMS score threshold is {}".format(self.first_NMS_score_threshold)
+ )
log.info("First ROIAlign type is {}".format(self.first_ROIAlign_type))
- log.info("First ROIAlign pooled size is {}".format(self.first_ROIAlign_pooled_size))
- log.info("First ROIAlign sampling ratio is {}".format(self.first_ROIAlign_sampling_ratio))
+ log.info(
+ "First ROIAlign pooled size is {}".format(self.first_ROIAlign_pooled_size)
+ )
+ log.info(
+ "First ROIAlign sampling ratio is {}".format(
+ self.first_ROIAlign_sampling_ratio
+ )
+ )
log.info("Second NMS max proposals is {}".format(self.second_NMS_max_proposals))
log.info("Second NMS iou threshold is {}".format(self.second_NMS_iou_threshold))
- log.info("Second NMS score threshold is {}".format(self.second_NMS_score_threshold))
+ log.info(
+ "Second NMS score threshold is {}".format(self.second_NMS_score_threshold)
+ )
log.info("Second ROIAlign type is {}".format(self.second_ROIAlign_type))
- log.info("Second ROIAlign pooled size is {}".format(self.second_ROIAlign_pooled_size))
- log.info("Second ROIAlign sampling ratio is {}".format(self.second_ROIAlign_sampling_ratio))
- log.info("Individual mask output resolution is {}x{}".format(self.mask_out_res, self.mask_out_res))
+ log.info(
+ "Second ROIAlign pooled size is {}".format(self.second_ROIAlign_pooled_size)
+ )
+ log.info(
+ "Second ROIAlign sampling ratio is {}".format(
+ self.second_ROIAlign_sampling_ratio
+ )
+ )
+ log.info(
+ "Individual mask output resolution is {}x{}".format(
+ self.mask_out_res, self.mask_out_res
+ )
+ )
self.batch_size = None
@@ -128,12 +160,16 @@ def sanitize(self):
model = shape_inference.infer_shapes(model)
self.graph = gs.import_onnx(model)
except Exception as e:
- log.info("Shape inference could not be performed at this time:\n{}".format(e))
+ log.info(
+ "Shape inference could not be performed at this time:\n{}".format(e)
+ )
try:
self.graph.fold_constants(fold_shapes=True)
except TypeError as e:
- log.error("This version of ONNX GraphSurgeon does not support folding shapes, please upgrade your "
- "onnx_graphsurgeon module. Error:\n{}".format(e))
+ log.error(
+ "This version of ONNX GraphSurgeon does not support folding shapes, please upgrade your "
+ "onnx_graphsurgeon module. Error:\n{}".format(e)
+ )
raise
count_after = len(self.graph.nodes)
@@ -182,7 +218,9 @@ def get_anchors(self, sample_image):
p4_anchors = det2_anchors[2].tensor.detach().cpu().numpy()
p5_anchors = det2_anchors[3].tensor.detach().cpu().numpy()
p6_anchors = det2_anchors[4].tensor.detach().cpu().numpy()
- final_anchors = np.concatenate((p2_anchors,p3_anchors,p4_anchors,p5_anchors,p6_anchors))
+ final_anchors = np.concatenate(
+ (p2_anchors, p3_anchors, p4_anchors, p5_anchors, p6_anchors)
+ )
return final_anchors
@@ -214,18 +252,29 @@ def update_preprocessor(self, batch_size):
self.graph.inputs[0].name = "input_tensor"
self.sanitize()
- log.info("ONNX graph input shape: {} [NCHW format set]".format(self.graph.inputs[0].shape))
+ log.info(
+ "ONNX graph input shape: {} [NCHW format set]".format(
+ self.graph.inputs[0].shape
+ )
+ )
# Find the initial nodes of the graph, whatever the input is first connected to, and disconnect them.
- for node in [node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs]:
+ for node in [
+ node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs
+ ]:
node.inputs.clear()
# Get input tensor.
input_tensor = self.graph.inputs[0]
# Create preprocessing Sub node and connect input tensor to it.
- sub_const = np.expand_dims(np.asarray([255 * 0.406, 255 * 0.456, 255 * 0.485], dtype=np.float32), axis=(1, 2))
- sub_out = self.graph.op_with_const("Sub", "preprocessor/mean", input_tensor, sub_const)
+ sub_const = np.expand_dims(
+ np.asarray([255 * 0.406, 255 * 0.456, 255 * 0.485], dtype=np.float32),
+ axis=(1, 2),
+ )
+ sub_out = self.graph.op_with_const(
+ "Sub", "preprocessor/mean", input_tensor, sub_const
+ )
# Find first Div node and connect to output of Sub node.
div_node = self.graph.find_node_by_op("Div")
@@ -242,7 +291,19 @@ def update_preprocessor(self, batch_size):
if type(node.inputs[1]) == gs.Constant and node.inputs[1].values[0] == 1:
node.inputs[1].values[0] = self.batch_size
- def NMS(self, boxes, scores, anchors, background_class, score_activation, max_proposals, iou_threshold, nms_score_threshold, user_threshold, nms_name=None):
+ def NMS(
+ self,
+ boxes,
+ scores,
+ anchors,
+ background_class,
+ score_activation,
+ max_proposals,
+ iou_threshold,
+ nms_score_threshold,
+ user_threshold,
+ nms_name=None,
+ ):
# Helper function to create the NMS Plugin node with the selected inputs.
# EfficientNMS_TRT TensorRT Plugin is suitable for our use case.
# :param boxes: The box predictions from the Box Net.
@@ -263,41 +324,71 @@ def NMS(self, boxes, scores, anchors, background_class, score_activation, max_pr
nms_name = "_" + nms_name
# Set score threshold.
- score_threshold = nms_score_threshold if user_threshold is None else user_threshold
+ score_threshold = (
+ nms_score_threshold if user_threshold is None else user_threshold
+ )
# NMS Outputs.
- nms_output_num_detections = gs.Variable(name="num_detections"+nms_name, dtype=np.int32, shape=[self.batch_size, 1])
- nms_output_boxes = gs.Variable(name="detection_boxes"+nms_name, dtype=np.float32,
- shape=[self.batch_size, max_proposals, 4])
- nms_output_scores = gs.Variable(name="detection_scores"+nms_name, dtype=np.float32,
- shape=[self.batch_size, max_proposals])
- nms_output_classes = gs.Variable(name="detection_classes"+nms_name, dtype=np.int32,
- shape=[self.batch_size, max_proposals])
+ nms_output_num_detections = gs.Variable(
+ name="num_detections" + nms_name, dtype=np.int32, shape=[self.batch_size, 1]
+ )
+ nms_output_boxes = gs.Variable(
+ name="detection_boxes" + nms_name,
+ dtype=np.float32,
+ shape=[self.batch_size, max_proposals, 4],
+ )
+ nms_output_scores = gs.Variable(
+ name="detection_scores" + nms_name,
+ dtype=np.float32,
+ shape=[self.batch_size, max_proposals],
+ )
+ nms_output_classes = gs.Variable(
+ name="detection_classes" + nms_name,
+ dtype=np.int32,
+ shape=[self.batch_size, max_proposals],
+ )
- nms_outputs = [nms_output_num_detections, nms_output_boxes, nms_output_scores, nms_output_classes]
+ nms_outputs = [
+ nms_output_num_detections,
+ nms_output_boxes,
+ nms_output_scores,
+ nms_output_classes,
+ ]
# Plugin.
self.graph.plugin(
op="EfficientNMS_TRT",
- name="nms"+nms_name,
+ name="nms" + nms_name,
inputs=[boxes, scores, anchors],
outputs=nms_outputs,
attrs={
- 'plugin_version': "1",
- 'background_class': background_class,
- 'max_output_boxes': max_proposals,
- 'score_threshold': max(0.01, score_threshold),
- 'iou_threshold': iou_threshold,
- 'score_activation': score_activation,
- 'class_agnostic': False,
- 'box_coding': 1,
- }
+ "plugin_version": "1",
+ "background_class": background_class,
+ "max_output_boxes": max_proposals,
+ "score_threshold": max(0.01, score_threshold),
+ "iou_threshold": iou_threshold,
+ "score_activation": score_activation,
+ "class_agnostic": False,
+ "box_coding": 1,
+ },
)
log.info("Created nms{} with EfficientNMS_TRT plugin".format(nms_name))
return nms_outputs
- def ROIAlign(self, rois, p2, p3, p4, p5, pooled_size, sampling_ratio, roi_align_type, num_rois, ra_name):
+ def ROIAlign(
+ self,
+ rois,
+ p2,
+ p3,
+ p4,
+ p5,
+ pooled_size,
+ sampling_ratio,
+ roi_align_type,
+ num_rois,
+ ra_name,
+ ):
# Helper function to create the ROIAlign Plugin node with the selected inputs.
# PyramidROIAlign_TRT TensorRT Plugin is suitable for our use case.
# :param rois: Regions of interest/detection boxes outputs from preceding NMS node.
@@ -318,31 +409,42 @@ def ROIAlign(self, rois, p2, p3, p4, p5, pooled_size, sampling_ratio, roi_align_
roi_coords_transform = 0
# ROIAlign outputs.
- roi_align_output = gs.Variable(name="roi_align/output_"+ra_name, dtype=np.float32,
- shape=[self.batch_size, num_rois, self.fpn_out_channels, pooled_size, pooled_size])
+ roi_align_output = gs.Variable(
+ name="roi_align/output_" + ra_name,
+ dtype=np.float32,
+ shape=[
+ self.batch_size,
+ num_rois,
+ self.fpn_out_channels,
+ pooled_size,
+ pooled_size,
+ ],
+ )
# Plugin.
self.graph.plugin(
op="PyramidROIAlign_TRT",
- name="roi_align_"+ra_name,
+ name="roi_align_" + ra_name,
inputs=[rois, p2, p3, p4, p5],
outputs=[roi_align_output],
attrs={
- 'plugin_version': "1",
- 'fpn_scale': 224,
- 'pooled_size': pooled_size,
- 'image_size': [self.height, self.width],
- 'roi_coords_absolute': 0,
- 'roi_coords_swap': 0,
- 'roi_coords_transform': roi_coords_transform,
- 'sampling_ratio': sampling_ratio,
- }
+ "plugin_version": "1",
+ "fpn_scale": 224,
+ "pooled_size": pooled_size,
+ "image_size": [self.height, self.width],
+ "roi_coords_absolute": 0,
+ "roi_coords_swap": 0,
+ "roi_coords_transform": roi_coords_transform,
+ "sampling_ratio": sampling_ratio,
+ },
)
log.info("Created {} with PyramidROIAlign_TRT plugin".format(ra_name))
return roi_align_output
- def process_graph(self, anchors, first_nms_threshold=None, second_nms_threshold=None):
+ def process_graph(
+ self, anchors, first_nms_threshold=None, second_nms_threshold=None
+ ):
"""
Processes the graph to replace the GenerateProposals and BoxWithNMSLimit operations with EfficientNMS_TRT
TensorRT plugin nodes and ROIAlign operations with PyramidROIAlign_TRT plugin nodes.
@@ -351,6 +453,7 @@ def process_graph(self, anchors, first_nms_threshold=None, second_nms_threshold=
:param first_nms_threshold: Override the 1st NMS score threshold value. If set to None, use the value in the graph.
:param second_nms_threshold: Override the 2nd NMS score threshold value. If set to None, use the value in the graph.
"""
+
def backbone():
"""
Updates the graph to replace all ResizeNearest ops with ResizeNearest plugins in backbone.
@@ -361,7 +464,6 @@ def backbone():
p4 = self.graph.find_node_by_op_name("Conv", "/backbone/fpn_output4/Conv")
p5 = self.graph.find_node_by_op_name("Conv", "/backbone/fpn_output5/Conv")
-
return p2.outputs[0], p3.outputs[0], p4.outputs[0], p5.outputs[0]
def proposal_generator(anchors, first_nms_threshold):
@@ -372,38 +474,101 @@ def proposal_generator(anchors, first_nms_threshold):
:param first_nms_threshold: Override the 1st NMS score threshold value. If set to None, use the value in the graph.
"""
# Get nodes containing final objectness logits.
- p2_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten")
- p3_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_1")
- p4_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_2")
- p5_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_3")
- p6_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_4")
+ p2_logits = self.graph.find_node_by_op_name(
+ "Flatten", "/proposal_generator/Flatten"
+ )
+ p3_logits = self.graph.find_node_by_op_name(
+ "Flatten", "/proposal_generator/Flatten_1"
+ )
+ p4_logits = self.graph.find_node_by_op_name(
+ "Flatten", "/proposal_generator/Flatten_2"
+ )
+ p5_logits = self.graph.find_node_by_op_name(
+ "Flatten", "/proposal_generator/Flatten_3"
+ )
+ p6_logits = self.graph.find_node_by_op_name(
+ "Flatten", "/proposal_generator/Flatten_4"
+ )
# Get nodes containing final anchor_deltas.
- p2_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_1")
- p3_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_3")
- p4_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_5")
- p5_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_7")
- p6_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_9")
+ p2_anchors = self.graph.find_node_by_op_name(
+ "Reshape", "/proposal_generator/Reshape_1"
+ )
+ p3_anchors = self.graph.find_node_by_op_name(
+ "Reshape", "/proposal_generator/Reshape_3"
+ )
+ p4_anchors = self.graph.find_node_by_op_name(
+ "Reshape", "/proposal_generator/Reshape_5"
+ )
+ p5_anchors = self.graph.find_node_by_op_name(
+ "Reshape", "/proposal_generator/Reshape_7"
+ )
+ p6_anchors = self.graph.find_node_by_op_name(
+ "Reshape", "/proposal_generator/Reshape_9"
+ )
# Concatenate all objectness logits/scores data.
- scores_inputs = [p2_logits.outputs[0], p3_logits.outputs[0], p4_logits.outputs[0], p5_logits.outputs[0], p6_logits.outputs[0]]
- scores_tensor = self.graph.layer(name="scores", op="Concat", inputs=scores_inputs, outputs=['scores'], attrs={'axis': 1})[0]
+ scores_inputs = [
+ p2_logits.outputs[0],
+ p3_logits.outputs[0],
+ p4_logits.outputs[0],
+ p5_logits.outputs[0],
+ p6_logits.outputs[0],
+ ]
+ scores_tensor = self.graph.layer(
+ name="scores",
+ op="Concat",
+ inputs=scores_inputs,
+ outputs=["scores"],
+ attrs={"axis": 1},
+ )[0]
# Unsqueeze to add 3rd dimension of 1 to match tensor dimensions of boxes tensor.
scores = self.graph.unsqueeze("scores_unsqueeze", scores_tensor, [2])[0]
# Concatenate all boxes/anchor_delta data.
- boxes_inputs = [p2_anchors.outputs[0], p3_anchors.outputs[0], p4_anchors.outputs[0], p5_anchors.outputs[0], p6_anchors.outputs[0]]
- boxes = self.graph.layer(name="boxes", op="Concat", inputs=boxes_inputs, outputs=['anchors'], attrs={'axis': 1})[0]
+ boxes_inputs = [
+ p2_anchors.outputs[0],
+ p3_anchors.outputs[0],
+ p4_anchors.outputs[0],
+ p5_anchors.outputs[0],
+ p6_anchors.outputs[0],
+ ]
+ boxes = self.graph.layer(
+ name="boxes",
+ op="Concat",
+ inputs=boxes_inputs,
+ outputs=["anchors"],
+ attrs={"axis": 1},
+ )[0]
# Convert the anchors from Corners to CenterSize encoding.
- anchors = np.matmul(anchors, [[0.5, 0, -1, 0], [0, 0.5, 0, -1], [0.5, 0, 1, 0], [0, 0.5, 0, 1]])
- anchors = anchors / [self.width, self.height, self.width, self.height] # Normalize anchors to [0-1] range
+ anchors = np.matmul(
+ anchors,
+ [[0.5, 0, -1, 0], [0, 0.5, 0, -1], [0.5, 0, 1, 0], [0, 0.5, 0, 1]],
+ )
+ anchors = anchors / [
+ self.width,
+ self.height,
+ self.width,
+ self.height,
+ ] # Normalize anchors to [0-1] range
anchors = np.expand_dims(anchors, axis=0)
anchors = anchors.astype(np.float32)
anchors = gs.Constant(name="default_anchors", values=anchors)
# Create NMS node.
- nms_outputs = self.NMS(boxes, scores, anchors, -1, False, self.first_NMS_max_proposals, self.first_NMS_iou_threshold, self.first_NMS_score_threshold, first_nms_threshold, 'rpn')
+ nms_outputs = self.NMS(
+ boxes,
+ scores,
+ anchors,
+ -1,
+ False,
+ self.first_NMS_max_proposals,
+ self.first_NMS_iou_threshold,
+ self.first_NMS_score_threshold,
+ first_nms_threshold,
+ "rpn",
+ )
return nms_outputs
@@ -422,63 +587,149 @@ def roi_heads(rpn_outputs, p2, p3, p4, p5, second_nms_threshold):
:param second_nms_threshold: Override the 2nd NMS score threshold value. If set to None, use the value in the graph.
"""
# Create ROIAlign node.
- box_pooler_output = self.ROIAlign(rpn_outputs[1], p2, p3, p4, p5, self.first_ROIAlign_pooled_size, self.first_ROIAlign_sampling_ratio, self.first_ROIAlign_type, self.first_NMS_max_proposals, 'box_pooler')
+ box_pooler_output = self.ROIAlign(
+ rpn_outputs[1],
+ p2,
+ p3,
+ p4,
+ p5,
+ self.first_ROIAlign_pooled_size,
+ self.first_ROIAlign_sampling_ratio,
+ self.first_ROIAlign_type,
+ self.first_NMS_max_proposals,
+ "box_pooler",
+ )
# Reshape node that prepares ROIAlign/box pooler output for Gemm node that comes next.
- box_pooler_shape = np.asarray([-1, self.fpn_out_channels*self.first_ROIAlign_pooled_size*self.first_ROIAlign_pooled_size], dtype=np.int64)
- box_pooler_reshape = self.graph.op_with_const("Reshape", "box_pooler/reshape", box_pooler_output, box_pooler_shape)
+ box_pooler_shape = np.asarray(
+ [
+ -1,
+ self.fpn_out_channels
+ * self.first_ROIAlign_pooled_size
+ * self.first_ROIAlign_pooled_size,
+ ],
+ dtype=np.int64,
+ )
+ box_pooler_reshape = self.graph.op_with_const(
+ "Reshape", "box_pooler/reshape", box_pooler_output, box_pooler_shape
+ )
# Get first Gemm op of box head and connect box pooler to it.
- first_box_head_gemm = self.graph.find_node_by_op_name("Gemm", "/roi_heads/box_head/fc1/Gemm")
+ first_box_head_gemm = self.graph.find_node_by_op_name(
+ "Gemm", "/roi_heads/box_head/fc1/Gemm"
+ )
first_box_head_gemm.inputs[0] = box_pooler_reshape[0]
# Get final two nodes of box predictor. Softmax op for cls_score, Gemm op for bbox_pred.
cls_score = self.graph.find_node_by_op_name("Softmax", "/roi_heads/Softmax")
- bbox_pred = self.graph.find_node_by_op_name("Gemm", "/roi_heads/box_predictor/bbox_pred/Gemm")
+ bbox_pred = self.graph.find_node_by_op_name(
+ "Gemm", "/roi_heads/box_predictor/bbox_pred/Gemm"
+ )
# Linear transformation to convert box coordinates from (TopLeft, BottomRight) Corner encoding
# to CenterSize encoding. 1st NMS boxes are multiplied by transformation matrix in order to
# encode it into CenterSize format.
- matmul_const = np.matrix('0.5 0 -1 0; 0 0.5 0 -1; 0.5 0 1 0; 0 0.5 0 1', dtype=np.float32)
- matmul_out = self.graph.matmul("RPN_NMS/detection_boxes_conversion", rpn_outputs[1], matmul_const)
+ matmul_const = np.matrix(
+ "0.5 0 -1 0; 0 0.5 0 -1; 0.5 0 1 0; 0 0.5 0 1", dtype=np.float32
+ )
+ matmul_out = self.graph.matmul(
+ "RPN_NMS/detection_boxes_conversion", rpn_outputs[1], matmul_const
+ )
# Reshape node that prepares bbox_pred for scaling and second NMS.
- bbox_pred_shape = np.asarray([self.batch_size, self.first_NMS_max_proposals, self.num_classes, 4], dtype=np.int64)
- bbox_pred_reshape = self.graph.op_with_const("Reshape", "bbox_pred/reshape", bbox_pred.outputs[0], bbox_pred_shape)
+ bbox_pred_shape = np.asarray(
+ [self.batch_size, self.first_NMS_max_proposals, self.num_classes, 4],
+ dtype=np.int64,
+ )
+ bbox_pred_reshape = self.graph.op_with_const(
+ "Reshape", "bbox_pred/reshape", bbox_pred.outputs[0], bbox_pred_shape
+ )
# 0.1, 0.1, 0.2, 0.2 are localization head variance numbers, they scale bbox_pred_reshape, in order to get accurate coordinates.
- scale_adj = np.expand_dims(np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1))
- final_bbox_pred = self.graph.op_with_const("Mul", "bbox_pred/scale", bbox_pred_reshape[0], scale_adj)
+ scale_adj = np.expand_dims(
+ np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1)
+ )
+ final_bbox_pred = self.graph.op_with_const(
+ "Mul", "bbox_pred/scale", bbox_pred_reshape[0], scale_adj
+ )
# Reshape node that prepares cls_score for slicing and second NMS.
- cls_score_shape = np.array([self.batch_size, self.first_NMS_max_proposals, self.num_classes+1], dtype=np.int64)
- cls_score_reshape = self.graph.op_with_const("Reshape", "cls_score/reshape", cls_score.outputs[0], cls_score_shape)
+ cls_score_shape = np.array(
+ [self.batch_size, self.first_NMS_max_proposals, self.num_classes + 1],
+ dtype=np.int64,
+ )
+ cls_score_reshape = self.graph.op_with_const(
+ "Reshape", "cls_score/reshape", cls_score.outputs[0], cls_score_shape
+ )
# Slice operation to adjust third dimension of cls_score tensor, deletion of background class (81 in Detectron 2).
- final_cls_score = self.graph.slice("cls_score/slicer", cls_score_reshape[0], 0, self.num_classes, 2)
+ final_cls_score = self.graph.slice(
+ "cls_score/slicer", cls_score_reshape[0], 0, self.num_classes, 2
+ )
# Create NMS node.
- nms_outputs = self.NMS(final_bbox_pred[0], final_cls_score[0], matmul_out[0], -1, False, self.second_NMS_max_proposals, self.second_NMS_iou_threshold, self.second_NMS_score_threshold, second_nms_threshold, 'box_outputs')
+ nms_outputs = self.NMS(
+ final_bbox_pred[0],
+ final_cls_score[0],
+ matmul_out[0],
+ -1,
+ False,
+ self.second_NMS_max_proposals,
+ self.second_NMS_iou_threshold,
+ self.second_NMS_score_threshold,
+ second_nms_threshold,
+ "box_outputs",
+ )
# Create ROIAlign node.
- mask_pooler_output = self.ROIAlign(nms_outputs[1], p2, p3, p4, p5, self.second_ROIAlign_pooled_size, self.second_ROIAlign_sampling_ratio, self.second_ROIAlign_type, self.second_NMS_max_proposals, 'mask_pooler')
+ mask_pooler_output = self.ROIAlign(
+ nms_outputs[1],
+ p2,
+ p3,
+ p4,
+ p5,
+ self.second_ROIAlign_pooled_size,
+ self.second_ROIAlign_sampling_ratio,
+ self.second_ROIAlign_type,
+ self.second_NMS_max_proposals,
+ "mask_pooler",
+ )
# Reshape mask pooler output.
- mask_pooler_shape = np.asarray([self.second_NMS_max_proposals*self.batch_size, self.fpn_out_channels, self.second_ROIAlign_pooled_size, self.second_ROIAlign_pooled_size], dtype=np.int64)
- mask_pooler_reshape_node = self.graph.op_with_const("Reshape", "mask_pooler/reshape", mask_pooler_output, mask_pooler_shape)
+ mask_pooler_shape = np.asarray(
+ [
+ self.second_NMS_max_proposals * self.batch_size,
+ self.fpn_out_channels,
+ self.second_ROIAlign_pooled_size,
+ self.second_ROIAlign_pooled_size,
+ ],
+ dtype=np.int64,
+ )
+ mask_pooler_reshape_node = self.graph.op_with_const(
+ "Reshape", "mask_pooler/reshape", mask_pooler_output, mask_pooler_shape
+ )
# Get first Conv op in mask head and connect ROIAlign's squeezed output to it.
- mask_head_conv = self.graph.find_node_by_op_name("Conv", "/roi_heads/mask_head/mask_fcn1/Conv")
+ mask_head_conv = self.graph.find_node_by_op_name(
+ "Conv", "/roi_heads/mask_head/mask_fcn1/Conv"
+ )
mask_head_conv.inputs[0] = mask_pooler_reshape_node[0]
# Reshape node that is preparing 2nd NMS class outputs for Add node that comes next.
- classes_reshape_shape = np.asarray([self.second_NMS_max_proposals*self.batch_size], dtype=np.int64)
- classes_reshape_node = self.graph.op_with_const("Reshape", "box_outputs/reshape_classes", nms_outputs[3], classes_reshape_shape)
+ classes_reshape_shape = np.asarray(
+ [self.second_NMS_max_proposals * self.batch_size], dtype=np.int64
+ )
+ classes_reshape_node = self.graph.op_with_const(
+ "Reshape",
+ "box_outputs/reshape_classes",
+ nms_outputs[3],
+ classes_reshape_shape,
+ )
# This loop will generate an array used in Add node, which eventually will help Gather node to pick the single
# class of interest per bounding box, instead of creating 80 masks for every single bounding box.
add_array = []
- for i in range(self.second_NMS_max_proposals*self.batch_size):
+ for i in range(self.second_NMS_max_proposals * self.batch_size):
if i == 0:
start_pos = 0
else:
@@ -488,23 +739,59 @@ def roi_heads(rpn_outputs, p2, p3, p4, p5, second_nms_threshold):
# This Add node is one of the Gather node inputs, Gather node performs gather on 0th axis of data tensor
# and requires indices that set tensors to be withing bounds, this Add node provides the bounds for Gather.
add_array = np.asarray(add_array, dtype=np.int32)
- classes_add_node = self.graph.op_with_const("Add", "box_outputs/add", classes_reshape_node[0], add_array)
+ classes_add_node = self.graph.op_with_const(
+ "Add", "box_outputs/add", classes_reshape_node[0], add_array
+ )
# Get the last Conv op in mask head and reshape it to correctly gather class of interest's masks.
- last_conv = self.graph.find_node_by_op_name("Conv", "/roi_heads/mask_head/predictor/Conv")
- last_conv_reshape_shape = np.asarray([self.second_NMS_max_proposals*self.num_classes*self.batch_size, self.mask_out_res, self.mask_out_res], dtype=np.int64)
- last_conv_reshape_node = self.graph.op_with_const("Reshape", "mask_head/reshape_all_masks", last_conv.outputs[0], last_conv_reshape_shape)
+ last_conv = self.graph.find_node_by_op_name(
+ "Conv", "/roi_heads/mask_head/predictor/Conv"
+ )
+ last_conv_reshape_shape = np.asarray(
+ [
+ self.second_NMS_max_proposals * self.num_classes * self.batch_size,
+ self.mask_out_res,
+ self.mask_out_res,
+ ],
+ dtype=np.int64,
+ )
+ last_conv_reshape_node = self.graph.op_with_const(
+ "Reshape",
+ "mask_head/reshape_all_masks",
+ last_conv.outputs[0],
+ last_conv_reshape_shape,
+ )
# Gather node that selects only masks belonging to detected class, 79 other masks are discarded.
- final_gather = self.graph.gather("mask_head/final_gather", last_conv_reshape_node[0], classes_add_node[0], 0)
+ final_gather = self.graph.gather(
+ "mask_head/final_gather",
+ last_conv_reshape_node[0],
+ classes_add_node[0],
+ 0,
+ )
# Get last Sigmoid node and connect Gather node to it.
- mask_head_sigmoid = self.graph.find_node_by_op_name("Sigmoid", "/roi_heads/mask_head/Sigmoid")
+ mask_head_sigmoid = self.graph.find_node_by_op_name(
+ "Sigmoid", "/roi_heads/mask_head/Sigmoid"
+ )
mask_head_sigmoid.inputs[0] = final_gather[0]
# Final Reshape node, reshapes output of Sigmoid, important for various batch_size support (not tested yet).
- final_graph_reshape_shape = np.asarray([self.batch_size, self.second_NMS_max_proposals, self.mask_out_res, self.mask_out_res], dtype=np.int64)
- final_graph_reshape_node = self.graph.op_with_const("Reshape", "mask_head/final_reshape", mask_head_sigmoid.outputs[0], final_graph_reshape_shape)
+ final_graph_reshape_shape = np.asarray(
+ [
+ self.batch_size,
+ self.second_NMS_max_proposals,
+ self.mask_out_res,
+ self.mask_out_res,
+ ],
+ dtype=np.int64,
+ )
+ final_graph_reshape_node = self.graph.op_with_const(
+ "Reshape",
+ "mask_head/final_reshape",
+ mask_head_sigmoid.outputs[0],
+ final_graph_reshape_shape,
+ )
final_graph_reshape_node[0].dtype = np.float32
final_graph_reshape_node[0].name = "detection_masks"
@@ -513,7 +800,9 @@ def roi_heads(rpn_outputs, p2, p3, p4, p5, second_nms_threshold):
# Only Detectron 2's Mask-RCNN R50-FPN 3x is supported currently.
p2, p3, p4, p5 = backbone()
rpn_outputs = proposal_generator(anchors, first_nms_threshold)
- box_head_outputs, mask_head_output = roi_heads(rpn_outputs, p2, p3, p4, p5, second_nms_threshold)
+ box_head_outputs, mask_head_output = roi_heads(
+ rpn_outputs, p2, p3, p4, p5, second_nms_threshold
+ )
# Append segmentation head output.
box_head_outputs.append(mask_head_output)
# Set graph outputs, both bbox and segmentation heads.
@@ -531,17 +820,55 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-i", "--exported_onnx", help="The exported to ONNX Detectron 2 Mask R-CNN", type=str)
- parser.add_argument("-o", "--onnx", help="The output ONNX model file to write", type=str)
- parser.add_argument("-c", "--det2_config", help="The Detectron 2 config file (.yaml) for the model", type=str)
- parser.add_argument("-w", "--det2_weights", help="The Detectron 2 model weights (.pkl)", type=str)
- parser.add_argument("-s", "--sample_image", help="Sample image for anchors generation", type=str)
- parser.add_argument("-b", "--batch_size", help="Batch size for the model", type=int, default=1)
- parser.add_argument("-t1", "--first_nms_threshold", help="Override the score threshold for the 1st NMS operation", type=float)
- parser.add_argument("-t2", "--second_nms_threshold", help="Override the score threshold for the 2nd NMS operation", type=float)
+ parser.add_argument(
+ "-i",
+ "--exported_onnx",
+ help="The exported to ONNX Detectron 2 Mask R-CNN",
+ type=str,
+ )
+ parser.add_argument(
+ "-o", "--onnx", help="The output ONNX model file to write", type=str
+ )
+ parser.add_argument(
+ "-c",
+ "--det2_config",
+ help="The Detectron 2 config file (.yaml) for the model",
+ type=str,
+ )
+ parser.add_argument(
+ "-w", "--det2_weights", help="The Detectron 2 model weights (.pkl)", type=str
+ )
+ parser.add_argument(
+ "-s", "--sample_image", help="Sample image for anchors generation", type=str
+ )
+ parser.add_argument(
+ "-b", "--batch_size", help="Batch size for the model", type=int, default=1
+ )
+ parser.add_argument(
+ "-t1",
+ "--first_nms_threshold",
+ help="Override the score threshold for the 1st NMS operation",
+ type=float,
+ )
+ parser.add_argument(
+ "-t2",
+ "--second_nms_threshold",
+ help="Override the score threshold for the 2nd NMS operation",
+ type=float,
+ )
args = parser.parse_args()
- if not all([args.exported_onnx, args.onnx, args.det2_config, args.det2_weights, args.sample_image]):
+ if not all(
+ [
+ args.exported_onnx,
+ args.onnx,
+ args.det2_config,
+ args.det2_weights,
+ args.sample_image,
+ ]
+ ):
parser.print_help()
- print("\nThese arguments are required: --exported_onnx --onnx --det2_config --det2_weights and --sample_image")
+ print(
+ "\nThese arguments are required: --exported_onnx --onnx --det2_config --det2_weights and --sample_image"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/detectron2/eval_coco.py b/samples/python/detectron2/eval_coco.py
index 828413d4..7afb6116 100644
--- a/samples/python/detectron2/eval_coco.py
+++ b/samples/python/detectron2/eval_coco.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,9 +31,12 @@
from detectron2.structures import Instances, Boxes, ROIMasks
except ImportError:
print("Could not import Detectron 2 modules. Maybe you did not install Detectron 2")
- print("Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md")
+ print(
+ "Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md"
+ )
sys.exit(1)
+
def build_evaluator(dataset_name):
"""
Create evaluator for a COCO dataset.
@@ -45,6 +48,7 @@ def build_evaluator(dataset_name):
else:
raise NotImplementedError("Evaluator type is not supported")
+
def setup(config_file, weights):
"""
Create config and perform basic setup.
@@ -55,6 +59,7 @@ def setup(config_file, weights):
cfg.freeze()
return cfg
+
def main(args):
# Set up Detectron 2 config and build evaluator.
cfg = setup(args.det2_config, args.det2_weights)
@@ -63,10 +68,15 @@ def main(args):
evaluator.reset()
trt_infer = TensorRTInfer(args.engine)
- batcher = ImageBatcher(args.input, *trt_infer.input_spec(), config_file=args.det2_config)
+ batcher = ImageBatcher(
+ args.input, *trt_infer.input_spec(), config_file=args.det2_config
+ )
for batch, images, scales in batcher.get_batch():
- print("Processing Image {} / {}".format(batcher.image_index, batcher.num_images), end="\r")
+ print(
+ "Processing Image {} / {}".format(batcher.image_index, batcher.num_images),
+ end="\r",
+ )
detections = trt_infer.infer(batch, scales, args.nms_threshold)
for i in range(len(images)):
# Get inference image resolution.
@@ -85,13 +95,13 @@ def main(args):
for n in range(num_instances):
det = detections[i][n]
# Append box coordinates data.
- pred_boxes.append([det['ymin'], det['xmin'], det['ymax'], det['xmax']])
+ pred_boxes.append([det["ymin"], det["xmin"], det["ymax"], det["xmax"]])
# Append score.
- scores.append(det['score'])
+ scores.append(det["score"])
# Append class.
- pred_classes.append(det['class'])
+ pred_classes.append(det["class"])
# Append mask.
- pred_masks[n] = det['mask']
+ pred_masks[n] = det["mask"]
# Create new Instances object required for Detectron 2 evalutions and add:
# boxes, scores, pred_classes, pred_masks.
image_shape = (im_height, im_width)
@@ -100,10 +110,12 @@ def main(args):
instances.scores = torch.tensor(scores)
instances.pred_classes = torch.tensor(pred_classes)
roi_masks = ROIMasks(torch.tensor(pred_masks))
- instances.pred_masks = roi_masks.to_bitmasks(instances.pred_boxes, im_height, im_width, args.iou_threshold).tensor
+ instances.pred_masks = roi_masks.to_bitmasks(
+ instances.pred_boxes, im_height, im_width, args.iou_threshold
+ ).tensor
# Process evaluations per image.
- image_dict = [{'instances': instances}]
- input_dict = [{'image_id': source_id}]
+ image_dict = [{"instances": instances}]
+ input_dict = [{"image_id": source_id}]
evaluator.process(input_dict, image_dict)
# Final evaluations, generation of mAP accuracy performance.
@@ -113,17 +125,37 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with.")
- parser.add_argument("-i", "--input",
- help="The input to infer, either a single image path, or a directory of images.")
- parser.add_argument("-c", "--det2_config", help="The Detectron 2 config file (.yaml) for the model", type=str)
- parser.add_argument("-w", "--det2_weights", help="The Detectron 2 model weights (.pkl)", type=str)
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.")
- parser.add_argument("--iou_threshold", default=0.5, type=float,
- help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0.")
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images.",
+ )
+ parser.add_argument(
+ "-c",
+ "--det2_config",
+ help="The Detectron 2 config file (.yaml) for the model",
+ type=str,
+ )
+ parser.add_argument(
+ "-w", "--det2_weights", help="The Detectron 2 model weights (.pkl)", type=str
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.",
+ )
+ parser.add_argument(
+ "--iou_threshold",
+ default=0.5,
+ type=float,
+ help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0.",
+ )
args = parser.parse_args()
if not all([args.engine, args.input, args.det2_config, args.det2_weights]):
parser.print_help()
- print("\nThese arguments are required: --engine --input --det2_config and --det2_weights")
+ print(
+ "\nThese arguments are required: --engine --input --det2_config and --det2_weights"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/detectron2/image_batcher.py b/samples/python/detectron2/image_batcher.py
index 228798ad..0fb1d90a 100644
--- a/samples/python/detectron2/image_batcher.py
+++ b/samples/python/detectron2/image_batcher.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,15 +24,26 @@
from detectron2.config import get_cfg
except ImportError:
print("Could not import Detectron 2 modules. Maybe you did not install Detectron 2")
- print("Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md")
+ print(
+ "Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md"
+ )
sys.exit(1)
+
class ImageBatcher:
"""
Creates batches of pre-processed images.
"""
- def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False, config_file=None):
+ def __init__(
+ self,
+ input,
+ shape,
+ dtype,
+ max_num_images=None,
+ exact_batches=False,
+ config_file=None,
+ ):
"""
:param input: The input directory to read images from.
:param shape: The tensor shape of the batch to prepare, either in NCHW or NHWC format.
@@ -68,10 +79,16 @@ def det2_setup(config_file):
extensions = [".jpg", ".jpeg", ".png", ".bmp", ".ppm"]
def is_image(path):
- return os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ return (
+ os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ )
if os.path.isdir(input):
- self.images = [os.path.join(input, f) for f in os.listdir(input) if is_image(os.path.join(input, f))]
+ self.images = [
+ os.path.join(input, f)
+ for f in os.listdir(input)
+ if is_image(os.path.join(input, f))
+ ]
self.images.sort()
elif os.path.isfile(input):
if is_image(input):
@@ -108,7 +125,7 @@ def is_image(path):
if self.num_images < 1:
print("Not enough images to create batches")
sys.exit(1)
- self.images = self.images[0:self.num_images]
+ self.images = self.images[0 : self.num_images]
# Subdivide the list of images into batches.
self.num_batches = 1 + int((self.num_images - 1) / self.batch_size)
@@ -122,7 +139,6 @@ def is_image(path):
self.image_index = 0
self.batch_index = 0
-
def preprocess_image(self, image_path):
"""
The image preprocessor loads an image from disk and prepares it as needed for batching. This includes padding,
@@ -165,7 +181,7 @@ def resize_pad(image, pad_color=(0, 0, 0)):
newh = int(newh + 0.5)
# Scaling factor for normalized box coordinates scaling in post-processing.
- scaling = max(newh/height, neww/width)
+ scaling = max(newh / height, neww / width)
# Padding.
image = image.resize((neww, newh), resample=Image.BILINEAR)
@@ -176,7 +192,7 @@ def resize_pad(image, pad_color=(0, 0, 0)):
scale = None
image = Image.open(image_path)
- image = image.convert(mode='RGB')
+ image = image.convert(mode="RGB")
# Pad with mean values of COCO dataset, since padding is applied before actual model's
# preprocessor steps (Sub, Div ops), we need to pad with mean values in order to reverse
# the effects of Sub and Div, so that padding after model's preprocessor will be with actual 0s.
@@ -185,7 +201,7 @@ def resize_pad(image, pad_color=(0, 0, 0)):
# Change HWC -> CHW.
image = np.transpose(image, (2, 0, 1))
# Change RGB -> BGR.
- return image[[2,1,0]], scale
+ return image[[2, 1, 0]], scale
def get_batch(self):
"""
diff --git a/samples/python/detectron2/infer.py b/samples/python/detectron2/infer.py
index db7c83b6..d086fb76 100644
--- a/samples/python/detectron2/infer.py
+++ b/samples/python/detectron2/infer.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,6 +27,7 @@
sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
import common
+
class TensorRTInfer:
"""
Implements inference for the Model TensorRT engine.
@@ -65,12 +66,12 @@ def __init__(self, engine_path):
size *= s
allocation = common.cuda_call(cudart.cudaMalloc(size))
binding = {
- 'index': i,
- 'name': name,
- 'dtype': np.dtype(trt.nptype(dtype)),
- 'shape': list(shape),
- 'allocation': allocation,
- 'size': size
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(trt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ "size": size,
}
self.allocations.append(allocation)
if is_input:
@@ -88,7 +89,7 @@ def input_spec(self):
Get the specs for the input tensor of the network. Useful to prepare memory allocations.
:return: Two items, the shape of the input tensor and its (numpy) datatype.
"""
- return self.inputs[0]['shape'], self.inputs[0]['dtype']
+ return self.inputs[0]["shape"], self.inputs[0]["dtype"]
def output_spec(self):
"""
@@ -97,7 +98,7 @@ def output_spec(self):
"""
specs = []
for o in self.outputs:
- specs.append((o['shape'], o['dtype']))
+ specs.append((o["shape"], o["dtype"]))
return specs
def infer(self, batch, scales=None, nms_threshold=None):
@@ -115,11 +116,13 @@ def infer(self, batch, scales=None, nms_threshold=None):
outputs.append(np.zeros(shape, dtype))
# Process I/O and execute the network.
- common.memcpy_host_to_device(self.inputs[0]['allocation'], np.ascontiguousarray(batch))
+ common.memcpy_host_to_device(
+ self.inputs[0]["allocation"], np.ascontiguousarray(batch)
+ )
self.context.execute_v2(self.allocations)
for o in range(len(outputs)):
- common.memcpy_device_to_host(outputs[o], self.outputs[o]['allocation'])
+ common.memcpy_device_to_host(outputs[o], self.outputs[o]["allocation"])
# Process the results.
nums = outputs[0]
@@ -136,7 +139,7 @@ def infer(self, batch, scales=None, nms_threshold=None):
mask = masks[i][n]
# Calculate scaling values for bboxes.
- scale = self.inputs[0]['shape'][2]
+ scale = self.inputs[0]["shape"][2]
scale /= scales[i]
scale_y = scale
scale_x = scale
@@ -144,15 +147,17 @@ def infer(self, batch, scales=None, nms_threshold=None):
if nms_threshold and scores[i][n] < nms_threshold:
continue
# Append to detections
- detections[i].append({
- 'ymin': boxes[i][n][0] * scale_y,
- 'xmin': boxes[i][n][1] * scale_x,
- 'ymax': boxes[i][n][2] * scale_y,
- 'xmax': boxes[i][n][3] * scale_x,
- 'score': scores[i][n],
- 'class': int(pred_classes[i][n]),
- 'mask': mask,
- })
+ detections[i].append(
+ {
+ "ymin": boxes[i][n][0] * scale_y,
+ "xmin": boxes[i][n][1] * scale_x,
+ "ymax": boxes[i][n][2] * scale_y,
+ "xmax": boxes[i][n][3] * scale_x,
+ "score": scores[i][n],
+ "class": int(pred_classes[i][n]),
+ "mask": mask,
+ }
+ )
return detections
@@ -160,22 +165,117 @@ def main(args):
output_dir = os.path.realpath(args.output)
os.makedirs(output_dir, exist_ok=True)
- labels = ["person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier", "toothbrush"]
+ labels = [
+ "person",
+ "bicycle",
+ "car",
+ "motorcycle",
+ "airplane",
+ "bus",
+ "train",
+ "truck",
+ "boat",
+ "traffic light",
+ "fire hydrant",
+ "stop sign",
+ "parking meter",
+ "bench",
+ "bird",
+ "cat",
+ "dog",
+ "horse",
+ "sheep",
+ "cow",
+ "elephant",
+ "bear",
+ "zebra",
+ "giraffe",
+ "backpack",
+ "umbrella",
+ "handbag",
+ "tie",
+ "suitcase",
+ "frisbee",
+ "skis",
+ "snowboard",
+ "sports ball",
+ "kite",
+ "baseball bat",
+ "baseball glove",
+ "skateboard",
+ "surfboard",
+ "tennis racket",
+ "bottle",
+ "wine glass",
+ "cup",
+ "fork",
+ "knife",
+ "spoon",
+ "bowl",
+ "banana",
+ "apple",
+ "sandwich",
+ "orange",
+ "broccoli",
+ "carrot",
+ "hot dog",
+ "pizza",
+ "donut",
+ "cake",
+ "chair",
+ "couch",
+ "potted plant",
+ "bed",
+ "dining table",
+ "toilet",
+ "tv",
+ "laptop",
+ "mouse",
+ "remote",
+ "keyboard",
+ "cell phone",
+ "microwave",
+ "oven",
+ "toaster",
+ "sink",
+ "refrigerator",
+ "book",
+ "clock",
+ "vase",
+ "scissors",
+ "teddy bear",
+ "hair drier",
+ "toothbrush",
+ ]
trt_infer = TensorRTInfer(args.engine)
- batcher = ImageBatcher(args.input, *trt_infer.input_spec(), config_file=args.det2_config)
+ batcher = ImageBatcher(
+ args.input, *trt_infer.input_spec(), config_file=args.det2_config
+ )
for batch, images, scales in batcher.get_batch():
- print("Processing Image {} / {}".format(batcher.image_index, batcher.num_images), end="\r")
+ print(
+ "Processing Image {} / {}".format(batcher.image_index, batcher.num_images),
+ end="\r",
+ )
detections = trt_infer.infer(batch, scales, args.nms_threshold)
for i in range(len(images)):
basename = os.path.splitext(os.path.basename(images[i]))[0]
# Image Visualizations
output_path = os.path.join(output_dir, "{}.png".format(basename))
- visualize_detections(images[i], output_path, detections[i], labels, args.iou_threshold)
+ visualize_detections(
+ images[i], output_path, detections[i], labels, args.iou_threshold
+ )
# Text Results
output_results = ""
for d in detections[i]:
- line = [d['xmin'], d['ymin'], d['xmax'], d['ymax'], d['score'], d['class']]
+ line = [
+ d["xmin"],
+ d["ymin"],
+ d["xmax"],
+ d["ymax"],
+ d["score"],
+ d["class"],
+ ]
output_results += "\t".join([str(f) for f in line]) + "\n"
with open(os.path.join(args.output, "{}.txt".format(basename)), "w") as f:
f.write(output_results)
@@ -185,17 +285,41 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-e", "--engine", default=None, help="The serialized TensorRT engine")
- parser.add_argument("-i", "--input", default=None, help="Path to the image or directory to process")
- parser.add_argument("-c", "--det2_config", help="The Detectron 2 config file (.yaml) for the model", type=str)
- parser.add_argument("-o", "--output", default=None, help="Directory where to save the visualization results")
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.")
- parser.add_argument("--iou_threshold", default=0.5, type=float,
- help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0")
+ parser.add_argument(
+ "-e", "--engine", default=None, help="The serialized TensorRT engine"
+ )
+ parser.add_argument(
+ "-i", "--input", default=None, help="Path to the image or directory to process"
+ )
+ parser.add_argument(
+ "-c",
+ "--det2_config",
+ help="The Detectron 2 config file (.yaml) for the model",
+ type=str,
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ default=None,
+ help="Directory where to save the visualization results",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.",
+ )
+ parser.add_argument(
+ "--iou_threshold",
+ default=0.5,
+ type=float,
+ help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0",
+ )
args = parser.parse_args()
if not all([args.engine, args.input, args.output, args.det2_config]):
parser.print_help()
- print("\nThese arguments are required: --engine --input --output and --det2_config")
+ print(
+ "\nThese arguments are required: --engine --input --output and --det2_config"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/detectron2/onnx_utils.py b/samples/python/detectron2/onnx_utils.py
index 56d280fa..2144fea0 100644
--- a/samples/python/detectron2/onnx_utils.py
+++ b/samples/python/detectron2/onnx_utils.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,6 +23,7 @@
logging.getLogger("ModelHelper").setLevel(logging.INFO)
log = logging.getLogger("ModelHelper")
+
@gs.Graph.register()
def op_with_const(self, op, name, input, value):
"""
@@ -35,7 +36,10 @@ def op_with_const(self, op, name, input, value):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}': {}".format(op, name, value.squeeze()))
const = gs.Constant(name="{}_value:0".format(name), values=value)
- return self.layer(name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"]
+ )
+
@gs.Graph.register()
def matmul(self, name, input, value):
@@ -48,7 +52,10 @@ def matmul(self, name, input, value):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}': {}".format("MatMul", name, value.squeeze()))
const = gs.Constant(name="{}_value:0".format(name), values=value)
- return self.layer(name=name, op="MatMul", inputs=[input_tensor, const], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op="MatMul", inputs=[input_tensor, const], outputs=[name + ":0"]
+ )
+
@gs.Graph.register()
def clip(self, name, input, clip_min, clip_max):
@@ -61,9 +68,19 @@ def clip(self, name, input, clip_min, clip_max):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}".format("Clip", name))
- const_min = gs.Constant(name="{}_value:0".format(name), values=np.asarray([clip_min], dtype=np.float32))
- const_max = gs.Constant(name="{}_value:1".format(name), values=np.asarray([clip_max], dtype=np.float32))
- return self.layer(name=name, op="Clip", inputs=[input_tensor, const_min, const_max], outputs=[name + ":0"])
+ const_min = gs.Constant(
+ name="{}_value:0".format(name), values=np.asarray([clip_min], dtype=np.float32)
+ )
+ const_max = gs.Constant(
+ name="{}_value:1".format(name), values=np.asarray([clip_max], dtype=np.float32)
+ )
+ return self.layer(
+ name=name,
+ op="Clip",
+ inputs=[input_tensor, const_min, const_max],
+ outputs=[name + ":0"],
+ )
+
@gs.Graph.register()
def slice(self, name, input, starts, ends, axes):
@@ -79,10 +96,22 @@ def slice(self, name, input, starts, ends, axes):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}".format("Slice", name))
- const_start = gs.Constant(name="{}_value:0".format(name), values=np.asarray([starts], dtype=np.int64))
- const_end = gs.Constant(name="{}_value:1".format(name), values=np.asarray([ends], dtype=np.int64))
- const_axes = gs.Constant(name="{}_value:2".format(name), values=np.asarray([axes], dtype=np.int64))
- return self.layer(name=name, op="Slice", inputs=[input_tensor, const_start, const_end, const_axes], outputs=[name + ":0"])
+ const_start = gs.Constant(
+ name="{}_value:0".format(name), values=np.asarray([starts], dtype=np.int64)
+ )
+ const_end = gs.Constant(
+ name="{}_value:1".format(name), values=np.asarray([ends], dtype=np.int64)
+ )
+ const_axes = gs.Constant(
+ name="{}_value:2".format(name), values=np.asarray([axes], dtype=np.int64)
+ )
+ return self.layer(
+ name=name,
+ op="Slice",
+ inputs=[input_tensor, const_start, const_end, const_axes],
+ outputs=[name + ":0"],
+ )
+
@gs.Graph.register()
def unsqueeze(self, name, input, axes=[3]):
@@ -96,7 +125,14 @@ def unsqueeze(self, name, input, axes=[3]):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Unsqueeze node '{}': {}".format(name, axes))
- return self.layer(name=name, op="Unsqueeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={'axes': axes})
+ return self.layer(
+ name=name,
+ op="Unsqueeze",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"axes": axes},
+ )
+
@gs.Graph.register()
def squeeze(self, name, input, axes=[2]):
@@ -110,7 +146,14 @@ def squeeze(self, name, input, axes=[2]):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Squeeze node '{}': {}".format(name, axes))
- return self.layer(name=name, op="Squeeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={'axes': axes})
+ return self.layer(
+ name=name,
+ op="Squeeze",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"axes": axes},
+ )
+
@gs.Graph.register()
def gather(self, name, data, indices, axes=0):
@@ -125,7 +168,14 @@ def gather(self, name, data, indices, axes=0):
data_tensor = data if type(data) is gs.Variable else data[0]
indices_tensor = indices if type(indices) is gs.Variable else indices[0]
log.debug("Created Gather node '{}': {}".format(name, axes))
- return self.layer(name=name, op="Gather", inputs=[data_tensor, indices_tensor], outputs=[name + ":0"], attrs={'axes': axes})
+ return self.layer(
+ name=name,
+ op="Gather",
+ inputs=[data_tensor, indices_tensor],
+ outputs=[name + ":0"],
+ attrs={"axes": axes},
+ )
+
@gs.Graph.register()
def transpose(self, name, input, perm):
@@ -139,7 +189,14 @@ def transpose(self, name, input, perm):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Transpose node '{}': {}".format(name, perm))
- return self.layer(name=name, op="Transpose", inputs=[input_tensor], outputs=[name + ":0"], attrs={'perm': perm})
+ return self.layer(
+ name=name,
+ op="Transpose",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"perm": perm},
+ )
+
@gs.Graph.register()
def sigmoid(self, name, input):
@@ -152,7 +209,10 @@ def sigmoid(self, name, input):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Sigmoid node '{}'".format(name))
- return self.layer(name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"]
+ )
+
@gs.Graph.register()
def plugin(self, op, name, inputs: list, outputs: list, attrs):
@@ -170,6 +230,7 @@ def plugin(self, op, name, inputs: list, outputs: list, attrs):
log.debug("Created TRT Plugin node '{}': {}".format(name, attrs))
return self.layer(op=op, name=name, inputs=inputs, outputs=outputs, attrs=attrs)
+
@gs.Graph.register()
def find_node_by_op(self, op):
"""
@@ -183,6 +244,7 @@ def find_node_by_op(self, op):
return node
return None
+
@gs.Graph.register()
def find_node_by_op_name(self, op, name):
"""
@@ -197,8 +259,11 @@ def find_node_by_op_name(self, op, name):
return node
return None
+
@gs.Graph.register()
-def find_node_by_op_input_output_name(self, op, input_name, output_name, input_pos=0, output_pos=0):
+def find_node_by_op_input_output_name(
+ self, op, input_name, output_name, input_pos=0, output_pos=0
+):
"""
Finds the first node in the graph with the given operation name.
:param self: The gs.Graph object being extended.
@@ -210,10 +275,15 @@ def find_node_by_op_input_output_name(self, op, input_name, output_name, input_p
:return: The first node matching that performs that op.
"""
for node in self.nodes:
- if node.op == op and node.inputs[input_pos].name == input_name and node.outputs[output_pos].name == output_name:
+ if (
+ node.op == op
+ and node.inputs[input_pos].name == input_name
+ and node.outputs[output_pos].name == output_name
+ ):
return node
return None
+
@gs.Graph.register()
def find_descendant_by_op(self, node, op, depth=10):
"""
@@ -237,6 +307,7 @@ def find_descendant_by_op(self, node, op, depth=10):
queue.append(child)
return None
+
@gs.Graph.register()
def find_ancestor_by_op(self, node, op, depth=10):
"""
diff --git a/samples/python/detectron2/visualize.py b/samples/python/detectron2/visualize.py
index dd8b6ead..00e930f1 100644
--- a/samples/python/detectron2/visualize.py
+++ b/samples/python/detectron2/visualize.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,55 +22,177 @@
import PIL.ImageFilter as ImageFilter
-COLORS = ['GoldenRod', 'MediumTurquoise', 'GreenYellow', 'SteelBlue', 'DarkSeaGreen', 'SeaShell', 'LightGrey',
- 'IndianRed', 'DarkKhaki', 'LawnGreen', 'WhiteSmoke', 'Peru', 'LightCoral', 'FireBrick', 'OldLace',
- 'LightBlue', 'SlateGray', 'OliveDrab', 'NavajoWhite', 'PaleVioletRed', 'SpringGreen', 'AliceBlue', 'Violet',
- 'DeepSkyBlue', 'Red', 'MediumVioletRed', 'PaleTurquoise', 'Tomato', 'Azure', 'Yellow', 'Cornsilk',
- 'Aquamarine', 'CadetBlue', 'CornflowerBlue', 'DodgerBlue', 'Olive', 'Orchid', 'LemonChiffon', 'Sienna',
- 'OrangeRed', 'Orange', 'DarkSalmon', 'Magenta', 'Wheat', 'Lime', 'GhostWhite', 'SlateBlue', 'Aqua',
- 'MediumAquaMarine', 'LightSlateGrey', 'MediumSeaGreen', 'SandyBrown', 'YellowGreen', 'Plum', 'FloralWhite',
- 'LightPink', 'Thistle', 'DarkViolet', 'Pink', 'Crimson', 'Chocolate', 'DarkGrey', 'Ivory', 'PaleGreen',
- 'DarkGoldenRod', 'LavenderBlush', 'SlateGrey', 'DeepPink', 'Gold', 'Cyan', 'LightSteelBlue', 'MediumPurple',
- 'ForestGreen', 'DarkOrange', 'Tan', 'Salmon', 'PaleGoldenRod', 'LightGreen', 'LightSlateGray', 'HoneyDew',
- 'Fuchsia', 'LightSeaGreen', 'DarkOrchid', 'Green', 'Chartreuse', 'LimeGreen', 'AntiqueWhite', 'Beige',
- 'Gainsboro', 'Bisque', 'SaddleBrown', 'Silver', 'Lavender', 'Teal', 'LightCyan', 'PapayaWhip', 'Purple',
- 'Coral', 'BurlyWood', 'LightGray', 'Snow', 'MistyRose', 'PowderBlue', 'DarkCyan', 'White', 'Turquoise',
- 'MediumSlateBlue', 'PeachPuff', 'Moccasin', 'LightSalmon', 'SkyBlue', 'Khaki', 'MediumSpringGreen',
- 'BlueViolet', 'MintCream', 'Linen', 'SeaGreen', 'HotPink', 'LightYellow', 'BlanchedAlmond', 'RoyalBlue',
- 'RosyBrown', 'MediumOrchid', 'DarkTurquoise', 'LightGoldenRodYellow', 'LightSkyBlue']
+COLORS = [
+ "GoldenRod",
+ "MediumTurquoise",
+ "GreenYellow",
+ "SteelBlue",
+ "DarkSeaGreen",
+ "SeaShell",
+ "LightGrey",
+ "IndianRed",
+ "DarkKhaki",
+ "LawnGreen",
+ "WhiteSmoke",
+ "Peru",
+ "LightCoral",
+ "FireBrick",
+ "OldLace",
+ "LightBlue",
+ "SlateGray",
+ "OliveDrab",
+ "NavajoWhite",
+ "PaleVioletRed",
+ "SpringGreen",
+ "AliceBlue",
+ "Violet",
+ "DeepSkyBlue",
+ "Red",
+ "MediumVioletRed",
+ "PaleTurquoise",
+ "Tomato",
+ "Azure",
+ "Yellow",
+ "Cornsilk",
+ "Aquamarine",
+ "CadetBlue",
+ "CornflowerBlue",
+ "DodgerBlue",
+ "Olive",
+ "Orchid",
+ "LemonChiffon",
+ "Sienna",
+ "OrangeRed",
+ "Orange",
+ "DarkSalmon",
+ "Magenta",
+ "Wheat",
+ "Lime",
+ "GhostWhite",
+ "SlateBlue",
+ "Aqua",
+ "MediumAquaMarine",
+ "LightSlateGrey",
+ "MediumSeaGreen",
+ "SandyBrown",
+ "YellowGreen",
+ "Plum",
+ "FloralWhite",
+ "LightPink",
+ "Thistle",
+ "DarkViolet",
+ "Pink",
+ "Crimson",
+ "Chocolate",
+ "DarkGrey",
+ "Ivory",
+ "PaleGreen",
+ "DarkGoldenRod",
+ "LavenderBlush",
+ "SlateGrey",
+ "DeepPink",
+ "Gold",
+ "Cyan",
+ "LightSteelBlue",
+ "MediumPurple",
+ "ForestGreen",
+ "DarkOrange",
+ "Tan",
+ "Salmon",
+ "PaleGoldenRod",
+ "LightGreen",
+ "LightSlateGray",
+ "HoneyDew",
+ "Fuchsia",
+ "LightSeaGreen",
+ "DarkOrchid",
+ "Green",
+ "Chartreuse",
+ "LimeGreen",
+ "AntiqueWhite",
+ "Beige",
+ "Gainsboro",
+ "Bisque",
+ "SaddleBrown",
+ "Silver",
+ "Lavender",
+ "Teal",
+ "LightCyan",
+ "PapayaWhip",
+ "Purple",
+ "Coral",
+ "BurlyWood",
+ "LightGray",
+ "Snow",
+ "MistyRose",
+ "PowderBlue",
+ "DarkCyan",
+ "White",
+ "Turquoise",
+ "MediumSlateBlue",
+ "PeachPuff",
+ "Moccasin",
+ "LightSalmon",
+ "SkyBlue",
+ "Khaki",
+ "MediumSpringGreen",
+ "BlueViolet",
+ "MintCream",
+ "Linen",
+ "SeaGreen",
+ "HotPink",
+ "LightYellow",
+ "BlanchedAlmond",
+ "RoyalBlue",
+ "RosyBrown",
+ "MediumOrchid",
+ "DarkTurquoise",
+ "LightGoldenRodYellow",
+ "LightSkyBlue",
+]
-#Overlay mask with transparency on top of the image.
+# Overlay mask with transparency on top of the image.
def overlay(image, mask, color, alpha_transparency=0.5):
for channel in range(3):
- image[:, :, channel] = np.where(mask == 1,
- image[:, :, channel] *
- (1 - alpha_transparency) + alpha_transparency * color[channel] * 255,
- image[:, :, channel])
+ image[:, :, channel] = np.where(
+ mask == 1,
+ image[:, :, channel] * (1 - alpha_transparency)
+ + alpha_transparency * color[channel] * 255,
+ image[:, :, channel],
+ )
return image
-def visualize_detections(image_path, output_path, detections, labels=[], iou_threshold=0.5):
- image = Image.open(image_path).convert(mode='RGB')
+
+def visualize_detections(
+ image_path, output_path, detections, labels=[], iou_threshold=0.5
+):
+ image = Image.open(image_path).convert(mode="RGB")
# Get image dimensions.
im_width, im_height = image.size
line_width = 2
font = ImageFont.load_default()
for d in detections:
- color = COLORS[d['class'] % len(COLORS)]
+ color = COLORS[d["class"] % len(COLORS)]
# Dynamically convert PIL color into RGB numpy array.
- pixel_color = Image.new("RGB",(1, 1), color)
+ pixel_color = Image.new("RGB", (1, 1), color)
# Normalize.
- np_color = (np.asarray(pixel_color)[0][0])/255
+ np_color = (np.asarray(pixel_color)[0][0]) / 255
# TRT instance segmentation masks.
- if isinstance(d['mask'], np.ndarray) and d['mask'].shape == (28, 28):
+ if isinstance(d["mask"], np.ndarray) and d["mask"].shape == (28, 28):
# PyTorch uses [x1,y1,x2,y2] format instead of regular [y1,x1,y2,x2].
- d['ymin'], d['xmin'], d['ymax'], d['xmax'] = d['xmin'], d['ymin'], d['xmax'], d['ymax']
+ d["ymin"], d["xmin"], d["ymax"], d["xmax"] = (
+ d["xmin"],
+ d["ymin"],
+ d["xmax"],
+ d["ymax"],
+ )
# Get detection bbox resolution.
- det_width = round(d['xmax'] - d['xmin'])
- det_height = round(d['ymax'] - d['ymin'])
+ det_width = round(d["xmax"] - d["xmin"])
+ det_height = round(d["ymax"] - d["ymin"])
# Slight scaling, to get binary masks after float32 -> uint8
# conversion, if not scaled all pixels are zero.
- mask = d['mask'] > iou_threshold
+ mask = d["mask"] > iou_threshold
# Convert float32 -> uint8.
mask = mask.astype(np.uint8)
# Create an image out of predicted mask array.
@@ -80,10 +202,10 @@ def visualize_detections(image_path, output_path, detections, labels=[], iou_thr
# Create an original image sized template for correct mask placement.
pad = Image.new("L", (im_width, im_height))
# Place your mask according to detection bbox placement.
- pad.paste(mask, (round(d['xmin']), (round(d['ymin']))))
+ pad.paste(mask, (round(d["xmin"]), (round(d["ymin"]))))
# Reconvert mask into numpy array for evaluation.
padded_mask = np.array(pad)
- #Creat np.array from original image, copy in order to modify.
+ # Create np.array from original image, copy in order to modify.
image_copy = np.asarray(image).copy()
# Image with overlaid mask.
masked_image = overlay(image_copy, padded_mask, np_color)
@@ -92,23 +214,42 @@ def visualize_detections(image_path, output_path, detections, labels=[], iou_thr
# Bbox lines.
draw = ImageDraw.Draw(image)
- draw.line([(d['xmin'], d['ymin']), (d['xmin'], d['ymax']), (d['xmax'], d['ymax']), (d['xmax'], d['ymin']),
- (d['xmin'], d['ymin'])], width=line_width, fill=color)
- label = "Class {}".format(d['class'])
- if d['class'] < len(labels):
- label = "{}".format(labels[d['class']])
- score = d['score']
+ draw.line(
+ [
+ (d["xmin"], d["ymin"]),
+ (d["xmin"], d["ymax"]),
+ (d["xmax"], d["ymax"]),
+ (d["xmax"], d["ymin"]),
+ (d["xmin"], d["ymin"]),
+ ],
+ width=line_width,
+ fill=color,
+ )
+ label = "Class {}".format(d["class"])
+ if d["class"] < len(labels):
+ label = "{}".format(labels[d["class"]])
+ score = d["score"]
text = "{}: {}%".format(label, int(100 * score))
if score < 0:
text = label
left, top, right, bottom = font.getbbox(text)
text_width, text_height = right - left, bottom - top
- text_bottom = max(text_height, d['ymin'])
- text_left = d['xmin']
+ text_bottom = max(text_height, d["ymin"])
+ text_left = d["xmin"]
margin = np.ceil(0.05 * text_height)
- draw.rectangle([(text_left, text_bottom - text_height - 2 * margin), (text_left + text_width, text_bottom)],
- fill=color)
- draw.text((text_left + margin, text_bottom - text_height - margin), text, fill='black', font=font)
+ draw.rectangle(
+ [
+ (text_left, text_bottom - text_height - 2 * margin),
+ (text_left + text_width, text_bottom),
+ ],
+ fill=color,
+ )
+ draw.text(
+ (text_left + margin, text_bottom - text_height - margin),
+ text,
+ fill="black",
+ font=font,
+ )
if output_path is None:
return image
image.save(output_path)
diff --git a/samples/python/downloader.py b/samples/python/downloader.py
index b4a436e2..5e8be202 100755
--- a/samples/python/downloader.py
+++ b/samples/python/downloader.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -87,8 +87,13 @@ def download(data_dir, yaml_path, overwrite=False):
def _downloadFile(path, url):
logger.info("Downloading %s from %s", path, url)
import requests
+ from requests.adapters import HTTPAdapter, Retry
+
+ session = requests.Session()
+ retries = Retry(total=5, backoff_factor=0.5)
+ session.mount("http://", HTTPAdapter(max_retries=retries))
+ r = session.get(url, stream=True, timeout=10)
- r = requests.get(url, stream=True, timeout=5)
size = int(r.headers.get("content-length", 0))
from tqdm import tqdm
@@ -124,7 +129,9 @@ def _downloadFile(path, url):
def _parseArgs():
- parser = argparse.ArgumentParser(description="Downloader of TensorRT sample data files.")
+ parser = argparse.ArgumentParser(
+ description="Downloader of TensorRT sample data files."
+ )
parser.add_argument(
"-d",
"--data",
@@ -137,7 +144,11 @@ def _parseArgs():
default="download.yml",
)
parser.add_argument(
- "-o", "--overwrite", help="Force to overwrite if MD5 check failed", action="store_true", default=False
+ "-o",
+ "--overwrite",
+ help="Force to overwrite if MD5 check failed",
+ action="store_true",
+ default=False,
)
parser.add_argument(
"-v",
@@ -150,7 +161,9 @@ def _parseArgs():
args, _ = parser.parse_known_args()
data = os.environ.get("TRT_DATA_DIR", None) if args.data is None else args.data
if data is None:
- raise ValueError("Data directory must be specified by either `-d $DATA` or environment variable $TRT_DATA_DIR.")
+ raise ValueError(
+ "Data directory must be specified by either `-d $DATA` or environment variable $TRT_DATA_DIR."
+ )
return data, args
@@ -209,16 +222,22 @@ def getFilePath(path):
"""
global TRT_DATA_DIR
if not TRT_DATA_DIR:
- parser = argparse.ArgumentParser(description="Helper of data file download tool")
+ parser = argparse.ArgumentParser(
+ description="Helper of data file download tool"
+ )
parser.add_argument(
"-d",
"--data",
help="Specify the data directory where it is saved in. $TRT_DATA_DIR will be overwritten by this argument.",
)
args, _ = parser.parse_known_args()
- TRT_DATA_DIR = os.environ.get("TRT_DATA_DIR", None) if args.data is None else args.data
+ TRT_DATA_DIR = (
+ os.environ.get("TRT_DATA_DIR", None) if args.data is None else args.data
+ )
if TRT_DATA_DIR is None:
- raise ValueError("Data directory must be specified by either `-d $DATA` or environment variable $TRT_DATA_DIR.")
+ raise ValueError(
+ "Data directory must be specified by either `-d $DATA` or environment variable $TRT_DATA_DIR."
+ )
fullpath = os.path.join(TRT_DATA_DIR, path)
if not os.path.exists(fullpath):
diff --git a/samples/python/efficientdet/build_engine.py b/samples/python/efficientdet/build_engine.py
index 77143aad..58dd6d5c 100644
--- a/samples/python/efficientdet/build_engine.py
+++ b/samples/python/efficientdet/build_engine.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -56,7 +56,10 @@ def set_image_batcher(self, image_batcher: ImageBatcher):
:param image_batcher: The ImageBatcher object
"""
self.image_batcher = image_batcher
- size = int(np.dtype(self.image_batcher.dtype).itemsize * np.prod(self.image_batcher.shape))
+ size = int(
+ np.dtype(self.image_batcher.dtype).itemsize
+ * np.prod(self.image_batcher.shape)
+ )
self.batch_allocation = common.cuda_call(cudart.cudaMalloc(size))
self.batch_generator = self.image_batcher.get_batch()
@@ -81,8 +84,14 @@ def get_batch(self, names):
return None
try:
batch, _, _ = next(self.batch_generator)
- log.info("Calibrating image {} / {}".format(self.image_batcher.image_index, self.image_batcher.num_images))
- common.memcpy_host_to_device(self.batch_allocation, np.ascontiguousarray(batch))
+ log.info(
+ "Calibrating image {} / {}".format(
+ self.image_batcher.image_index, self.image_batcher.num_images
+ )
+ )
+ common.memcpy_host_to_device(
+ self.batch_allocation, np.ascontiguousarray(batch)
+ )
return [int(self.batch_allocation)]
except StopIteration:
log.info("Finished calibration batches")
@@ -130,7 +139,9 @@ def __init__(self, verbose=False, workspace=8):
self.builder = trt.Builder(self.trt_logger)
self.config = self.builder.create_builder_config()
- self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace * (2 ** 30))
+ self.config.set_memory_pool_limit(
+ trt.MemoryPoolType.WORKSPACE, workspace * (2**30)
+ )
self.network = None
self.parser = None
@@ -161,29 +172,46 @@ def create_network(self, onnx_path, batch_size, dynamic_batch_size=None):
profile = self.builder.create_optimization_profile()
dynamic_inputs = False
for input in inputs:
- log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
+ log.info(
+ "Input '{}' with shape {} and dtype {}".format(
+ input.name, input.shape, input.dtype
+ )
+ )
if input.shape[0] == -1:
dynamic_inputs = True
if dynamic_batch_size:
if type(dynamic_batch_size) is str:
- dynamic_batch_size = [int(v) for v in dynamic_batch_size.split(",")]
+ dynamic_batch_size = [
+ int(v) for v in dynamic_batch_size.split(",")
+ ]
assert len(dynamic_batch_size) == 3
min_shape = [dynamic_batch_size[0]] + list(input.shape[1:])
opt_shape = [dynamic_batch_size[1]] + list(input.shape[1:])
max_shape = [dynamic_batch_size[2]] + list(input.shape[1:])
profile.set_shape(input.name, min_shape, opt_shape, max_shape)
- log.info("Input '{}' Optimization Profile with shape MIN {} / OPT {} / MAX {}".format(
- input.name, min_shape, opt_shape, max_shape))
+ log.info(
+ "Input '{}' Optimization Profile with shape MIN {} / OPT {} / MAX {}".format(
+ input.name, min_shape, opt_shape, max_shape
+ )
+ )
else:
shape = [batch_size] + list(input.shape[1:])
profile.set_shape(input.name, shape, shape, shape)
- log.info("Input '{}' Optimization Profile with shape {}".format(input.name, shape))
+ log.info(
+ "Input '{}' Optimization Profile with shape {}".format(
+ input.name, shape
+ )
+ )
if dynamic_inputs:
self.config.add_optimization_profile(profile)
outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)]
for output in outputs:
- log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
+ log.info(
+ "Output '{}' with shape {} and dtype {}".format(
+ output.name, output.shape, output.dtype
+ )
+ )
def set_mixed_precision(self):
"""
@@ -202,7 +230,8 @@ def set_mixed_precision(self):
# add or remove blocks.
for i in range(self.network.num_layers):
layer = self.network.get_layer(i)
- if layer.type == trt.LayerType.CONVOLUTION and any([
+ if layer.type == trt.LayerType.CONVOLUTION and any(
+ [
# AutoML Layer Names:
"/stem/" in layer.name,
"/blocks_0/" in layer.name,
@@ -213,12 +242,24 @@ def set_mixed_precision(self):
"/stack_0/block_0/" in layer.name,
"/stack_1/block_0/" in layer.name,
"/stack_1/block_1/" in layer.name,
- ]):
+ ]
+ ):
self.network.get_layer(i).precision = trt.DataType.HALF
- log.info("Mixed-Precision Layer {} set to HALF STRICT data type".format(layer.name))
-
- def create_engine(self, engine_path, precision, calib_input=None, calib_cache=None, calib_num_images=5000,
- calib_batch_size=8):
+ log.info(
+ "Mixed-Precision Layer {} set to HALF STRICT data type".format(
+ layer.name
+ )
+ )
+
+ def create_engine(
+ self,
+ engine_path,
+ precision,
+ calib_input=None,
+ calib_cache=None,
+ calib_num_images=5000,
+ calib_batch_size=8,
+ ):
"""
Build the TensorRT engine and serialize it to disk.
:param engine_path: The path where to serialize the engine to.
@@ -251,8 +292,15 @@ def create_engine(self, engine_path, precision, calib_input=None, calib_cache=No
calib_shape = [calib_batch_size] + list(inputs[0].shape[1:])
calib_dtype = trt.nptype(inputs[0].dtype)
self.config.int8_calibrator.set_image_batcher(
- ImageBatcher(calib_input, calib_shape, calib_dtype, max_num_images=calib_num_images,
- exact_batches=True, shuffle_files=True))
+ ImageBatcher(
+ calib_input,
+ calib_shape,
+ calib_dtype,
+ max_num_images=calib_num_images,
+ exact_batches=True,
+ shuffle_files=True,
+ )
+ )
engine_bytes = self.builder.build_serialized_network(self.network, self.config)
if engine_bytes is None:
@@ -272,41 +320,88 @@ def main(args):
builder.create_network(args.onnx, args.batch_size, args.dynamic_batch_size)
if args.precision == "mixed":
builder.set_mixed_precision()
- builder.create_engine(args.engine, args.precision, args.calib_input, args.calib_cache, args.calib_num_images,
- args.calib_batch_size)
+ builder.create_engine(
+ args.engine,
+ args.precision,
+ args.calib_input,
+ args.calib_cache,
+ args.calib_num_images,
+ args.calib_batch_size,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-o", "--onnx", required=True,
- help="The input ONNX model file to load")
- parser.add_argument("-e", "--engine", required=True,
- help="The output path for the TRT engine")
- parser.add_argument("-b", "--batch_size", default=1, type=int,
- help="The static batch size to build the engine with, default: 1")
- parser.add_argument("-d", "--dynamic_batch_size", default=None,
- help="Enable dynamic batch size by providing a comma-separated MIN,OPT,MAX batch size, "
- "if this option is set, --batch_size is ignored, example: -d 1,16,32, "
- "default: None, build static engine")
- parser.add_argument("-p", "--precision", default="fp16", choices=["fp32", "fp16", "int8", "mixed"],
- help="The precision mode to build in, either fp32/fp16/int8/mixed, default: fp16")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="Enable more verbose log output")
- parser.add_argument("-w", "--workspace", default=8, type=int,
- help="The max memory workspace size to allow in Gb, default: 8")
- parser.add_argument("--calib_input",
- help="The directory holding images to use for calibration")
- parser.add_argument("--calib_cache", default=None,
- help="The file path for INT8 calibration cache to use, default: ./calibration.cache")
- parser.add_argument("--calib_num_images", default=5000, type=int,
- help="The maximum number of images to use for calibration, default: 5000")
- parser.add_argument("--calib_batch_size", default=8, type=int,
- help="The batch size for the calibration process, default: 8")
- parser.add_argument("--timing_cache", default="./timing.cache",
- help="The file path for timing cache, default: ./timing.cache")
+ parser.add_argument(
+ "-o", "--onnx", required=True, help="The input ONNX model file to load"
+ )
+ parser.add_argument(
+ "-e", "--engine", required=True, help="The output path for the TRT engine"
+ )
+ parser.add_argument(
+ "-b",
+ "--batch_size",
+ default=1,
+ type=int,
+ help="The static batch size to build the engine with, default: 1",
+ )
+ parser.add_argument(
+ "-d",
+ "--dynamic_batch_size",
+ default=None,
+ help="Enable dynamic batch size by providing a comma-separated MIN,OPT,MAX batch size, "
+ "if this option is set, --batch_size is ignored, example: -d 1,16,32, "
+ "default: None, build static engine",
+ )
+ parser.add_argument(
+ "-p",
+ "--precision",
+ default="fp16",
+ choices=["fp32", "fp16", "int8", "mixed"],
+ help="The precision mode to build in, either fp32/fp16/int8/mixed, default: fp16",
+ )
+ parser.add_argument(
+ "-v", "--verbose", action="store_true", help="Enable more verbose log output"
+ )
+ parser.add_argument(
+ "-w",
+ "--workspace",
+ default=8,
+ type=int,
+ help="The max memory workspace size to allow in Gb, default: 8",
+ )
+ parser.add_argument(
+ "--calib_input", help="The directory holding images to use for calibration"
+ )
+ parser.add_argument(
+ "--calib_cache",
+ default=None,
+ help="The file path for INT8 calibration cache to use, default: ./calibration.cache",
+ )
+ parser.add_argument(
+ "--calib_num_images",
+ default=5000,
+ type=int,
+ help="The maximum number of images to use for calibration, default: 5000",
+ )
+ parser.add_argument(
+ "--calib_batch_size",
+ default=8,
+ type=int,
+ help="The batch size for the calibration process, default: 8",
+ )
+ parser.add_argument(
+ "--timing_cache",
+ default="./timing.cache",
+ help="The file path for timing cache, default: ./timing.cache",
+ )
args = parser.parse_args()
- if args.precision in ["int8", "mixed"] and not (args.calib_input or os.path.exists(args.calib_cache)):
+ if args.precision in ["int8", "mixed"] and not (
+ args.calib_input or os.path.exists(args.calib_cache)
+ ):
parser.print_help()
- log.error("When building in int8 or mixed precision, --calib_input or an existing --calib_cache file is required")
+ log.error(
+ "When building in int8 or mixed precision, --calib_input or an existing --calib_cache file is required"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/efficientdet/compare_tf.py b/samples/python/efficientdet/compare_tf.py
index 54c356cd..4e4b91fc 100644
--- a/samples/python/efficientdet/compare_tf.py
+++ b/samples/python/efficientdet/compare_tf.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -35,7 +35,12 @@ def run(batcher, inferer, framework, nms_threshold=None):
for batch, images, scales in batcher.get_batch():
res_detections += inferer.process(batch, scales, nms_threshold)
res_images += images
- print("Processing {} / {} images ({})".format(batcher.image_index, batcher.num_images, framework), end="\r")
+ print(
+ "Processing {} / {} images ({})".format(
+ batcher.image_index, batcher.num_images, framework
+ ),
+ end="\r",
+ )
print()
return res_images, res_detections
@@ -62,7 +67,15 @@ def parse_annotations(annotations_path):
return annotations
-def compare_images(tf_images, tf_detections, trt_images, trt_detections, output_dir, annotations_path, labels_path):
+def compare_images(
+ tf_images,
+ tf_detections,
+ trt_images,
+ trt_detections,
+ output_dir,
+ annotations_path,
+ labels_path,
+):
labels = []
if labels_path and os.path.exists(labels_path):
with open(labels_path) as f:
@@ -72,7 +85,9 @@ def compare_images(tf_images, tf_detections, trt_images, trt_detections, output_
annotations = parse_annotations(annotations_path)
count = 1
- for tf_img, tf_det, trt_img, trt_det in zip(tf_images, tf_detections, trt_images, trt_detections):
+ for tf_img, tf_det, trt_img, trt_det in zip(
+ tf_images, tf_detections, trt_images, trt_detections
+ ):
vis = []
names = []
colors = []
@@ -90,18 +105,27 @@ def compare_images(tf_images, tf_detections, trt_images, trt_detections, output_
if img_id.isnumeric():
img_id = int(img_id)
if img_id in annotations.keys():
- vis.append(visualize_detections(trt_img, None, annotations[img_id], labels))
+ vis.append(
+ visualize_detections(trt_img, None, annotations[img_id], labels)
+ )
names.append("Ground Truth")
colors.append("RoyalBlue")
else:
- print("Image {} does not have a COCO annotation, skipping ground truth visualization".format(trt_img))
+ print(
+ "Image {} does not have a COCO annotation, skipping ground truth visualization".format(
+ trt_img
+ )
+ )
basename = os.path.splitext(os.path.basename(tf_img))[0]
output_path = os.path.join(output_dir, "{}.compare.png".format(basename))
os.makedirs(output_dir, exist_ok=True)
concat_visualizations(vis, names, colors, output_path)
- print("Processing {} / {} images (Visualization)".format(count, len(tf_images)), end="\r")
+ print(
+ "Processing {} / {} images (Visualization)".format(count, len(tf_images)),
+ end="\r",
+ )
count += 1
print()
@@ -110,32 +134,80 @@ def main(args):
tf_infer = TensorFlowInfer(args.saved_model)
trt_infer = TensorRTInfer(args.engine)
- trt_batcher = ImageBatcher(args.input, *trt_infer.input_spec(), max_num_images=args.num_images)
- tf_infer.override_input_shape(0, [1, trt_batcher.height, trt_batcher.width, 3]) # Same size input in TF as TRT
- tf_batcher = ImageBatcher(args.input, *tf_infer.input_spec(), max_num_images=args.num_images)
-
- tf_images, tf_detections = run(tf_batcher, tf_infer, "TensorFlow", args.nms_threshold)
- trt_images, trt_detections = run(trt_batcher, trt_infer, "TensorRT", args.nms_threshold)
-
- compare_images(tf_images, tf_detections, trt_images, trt_detections, args.output, args.annotations, args.labels)
+ trt_batcher = ImageBatcher(
+ args.input, *trt_infer.input_spec(), max_num_images=args.num_images
+ )
+ tf_infer.override_input_shape(
+ 0, [1, trt_batcher.height, trt_batcher.width, 3]
+ ) # Same size input in TF as TRT
+ tf_batcher = ImageBatcher(
+ args.input, *tf_infer.input_spec(), max_num_images=args.num_images
+ )
+
+ tf_images, tf_detections = run(
+ tf_batcher, tf_infer, "TensorFlow", args.nms_threshold
+ )
+ trt_images, trt_detections = run(
+ trt_batcher, trt_infer, "TensorRT", args.nms_threshold
+ )
+
+ compare_images(
+ tf_images,
+ tf_detections,
+ trt_images,
+ trt_detections,
+ args.output,
+ args.annotations,
+ args.labels,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
- parser.add_argument("-m", "--saved_model", help="The TensorFlow saved model path to validate against")
- parser.add_argument("-i", "--input",
- help="The input to infer, either a single image path, or a directory of images")
- parser.add_argument("-o", "--output", default=None, help="Directory where to save the visualization results")
- parser.add_argument("-l", "--labels", default="./labels_coco.txt",
- help="File to use for reading the class labels from, default: ./labels_coco.txt")
- parser.add_argument("-a", "--annotations", default=None,
- help="Set the path to the 'instances_val2017.json' file to use for COCO annotations, in which "
- "case --input should point to the COCO val2017 dataset, default: not used")
- parser.add_argument("-n", "--num_images", default=100, type=int,
- help="The maximum number of images to visualize, default: 100")
- parser.add_argument("-t", "--nms_threshold", type=float, help="Override the score threshold for the NMS operation, "
- "if higher than the threshold in the model/engine.")
+ parser.add_argument(
+ "-m",
+ "--saved_model",
+ help="The TensorFlow saved model path to validate against",
+ )
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images",
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ default=None,
+ help="Directory where to save the visualization results",
+ )
+ parser.add_argument(
+ "-l",
+ "--labels",
+ default="./labels_coco.txt",
+ help="File to use for reading the class labels from, default: ./labels_coco.txt",
+ )
+ parser.add_argument(
+ "-a",
+ "--annotations",
+ default=None,
+ help="Set the path to the 'instances_val2017.json' file to use for COCO annotations, in which "
+ "case --input should point to the COCO val2017 dataset, default: not used",
+ )
+ parser.add_argument(
+ "-n",
+ "--num_images",
+ default=100,
+ type=int,
+ help="The maximum number of images to visualize, default: 100",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, "
+ "if higher than the threshold in the model/engine.",
+ )
args = parser.parse_args()
if not all([args.engine, args.saved_model, args.input, args.output]):
parser.print_help()
diff --git a/samples/python/efficientdet/create_onnx.py b/samples/python/efficientdet/create_onnx.py
index 17fee5f6..ffe83094 100644
--- a/samples/python/efficientdet/create_onnx.py
+++ b/samples/python/efficientdet/create_onnx.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -52,8 +52,12 @@ def __init__(self, saved_model_path):
with tf.Graph().as_default() as tf_graph:
tf.import_graph_def(graph_def, name="")
with tf_loader.tf_session(graph=tf_graph):
- onnx_graph = tfonnx.process_tf_graph(tf_graph, input_names=inputs, output_names=outputs, opset=11)
- onnx_model = optimizer.optimize_graph(onnx_graph).make_model("Converted from {}".format(saved_model_path))
+ onnx_graph = tfonnx.process_tf_graph(
+ tf_graph, input_names=inputs, output_names=outputs, opset=11
+ )
+ onnx_model = optimizer.optimize_graph(onnx_graph).make_model(
+ "Converted from {}".format(saved_model_path)
+ )
self.graph = gs.import_onnx(onnx_model)
assert self.graph
log.info("TF2ONNX graph created successfully")
@@ -65,7 +69,16 @@ def __init__(self, saved_model_path):
self.api = None
if len([node for node in self.graph.nodes if "class_net/" in node.name]) > 0:
self.api = "AutoML"
- elif len([node for node in self.graph.nodes if "/WeightSharedConvolutionalClassHead/" in node.name]) > 0:
+ elif (
+ len(
+ [
+ node
+ for node in self.graph.nodes
+ if "/WeightSharedConvolutionalClassHead/" in node.name
+ ]
+ )
+ > 0
+ ):
self.api = "TFOD"
assert self.api
log.info("Graph was detected as {}".format(self.api))
@@ -87,7 +100,9 @@ def sanitize(self):
model = shape_inference.infer_shapes(model)
self.graph = gs.import_onnx(model)
except Exception as e:
- log.info("Shape inference could not be performed at this time:\n{}".format(e))
+ log.info(
+ "Shape inference could not be performed at this time:\n{}".format(e)
+ )
try:
self.graph.fold_constants(fold_shapes=True)
except TypeError as e:
@@ -130,41 +145,63 @@ def update_preprocessor(self, input_format, input_size, preprocessor="imagenet")
assert input_size[i] >= 1
assert input_format in ["NCHW", "NHWC"]
if input_format == "NCHW":
- self.graph.inputs[0].shape = ['N', 3, input_size[0], input_size[1]]
+ self.graph.inputs[0].shape = ["N", 3, input_size[0], input_size[1]]
if input_format == "NHWC":
- self.graph.inputs[0].shape = ['N', input_size[0], input_size[1], 3]
+ self.graph.inputs[0].shape = ["N", input_size[0], input_size[1], 3]
self.graph.inputs[0].dtype = np.float32
self.graph.inputs[0].name = "input"
- log.info("ONNX graph input shape: {} [{} format]".format(self.graph.inputs[0].shape, input_format))
+ log.info(
+ "ONNX graph input shape: {} [{} format]".format(
+ self.graph.inputs[0].shape, input_format
+ )
+ )
self.sanitize()
# Find the initial nodes of the graph, whatever the input is first connected to, and disconnect them
- for node in [node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs]:
+ for node in [
+ node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs
+ ]:
node.inputs.clear()
# Convert to NCHW format if needed
input_tensor = self.graph.inputs[0]
if input_format == "NHWC":
- input_tensor = self.graph.transpose("preprocessor/transpose", input_tensor, [0, 3, 1, 2])
+ input_tensor = self.graph.transpose(
+ "preprocessor/transpose", input_tensor, [0, 3, 1, 2]
+ )
assert preprocessor in ["imagenet", "scale_range"]
preprocessed_tensor = None
if preprocessor == "imagenet":
# RGB Normalizers. The per-channel values are given with shape [1, 3, 1, 1] for proper NCHW shape broadcasting
scale_val = 1 / np.asarray([255], dtype=np.float32)
- mean_val = -1 * np.expand_dims(np.asarray([0.485, 0.456, 0.406], dtype=np.float32), axis=(0, 2, 3))
- stddev_val = 1 / np.expand_dims(np.asarray([0.229, 0.224, 0.225], dtype=np.float32), axis=(0, 2, 3))
+ mean_val = -1 * np.expand_dims(
+ np.asarray([0.485, 0.456, 0.406], dtype=np.float32), axis=(0, 2, 3)
+ )
+ stddev_val = 1 / np.expand_dims(
+ np.asarray([0.229, 0.224, 0.225], dtype=np.float32), axis=(0, 2, 3)
+ )
# y = (x * scale + mean) * stddev --> y = x * scale * stddev + mean * stddev
- scale_out = self.graph.elt_const("Mul", "preprocessor/scale", input_tensor, scale_val * stddev_val)
- mean_out = self.graph.elt_const("Add", "preprocessor/mean", scale_out, mean_val * stddev_val)
+ scale_out = self.graph.elt_const(
+ "Mul", "preprocessor/scale", input_tensor, scale_val * stddev_val
+ )
+ mean_out = self.graph.elt_const(
+ "Add", "preprocessor/mean", scale_out, mean_val * stddev_val
+ )
preprocessed_tensor = mean_out[0]
if preprocessor == "scale_range":
# RGB Normalizers. The per-channel values are given with shape [1, 3, 1, 1] for proper NCHW shape broadcasting
scale_val = 2 / np.asarray([255], dtype=np.float32)
- offset_val = np.expand_dims(np.asarray([-1, -1, -1], dtype=np.float32), axis=(0, 2, 3))
+ offset_val = np.expand_dims(
+ np.asarray([-1, -1, -1], dtype=np.float32), axis=(0, 2, 3)
+ )
# y = (x * scale + mean) * stddev --> y = x * scale * stddev + mean * stddev
- scale_out = self.graph.elt_const("Mul", "preprocessor/scale", input_tensor, scale_val)
- range_out = self.graph.elt_const("Add", "preprocessor/range", scale_out, offset_val)
+ scale_out = self.graph.elt_const(
+ "Mul", "preprocessor/scale", input_tensor, scale_val
+ )
+ range_out = self.graph.elt_const(
+ "Add", "preprocessor/range", scale_out, offset_val
+ )
preprocessed_tensor = range_out[0]
# Find the first stem conv node of the graph, and connect the normalizer directly to it
@@ -173,7 +210,11 @@ def update_preprocessor(self, input_format, input_size, preprocessor="imagenet")
stem_name = "/stem/"
if self.api == "TFOD":
stem_name = "/stem_conv2d/"
- stem = [node for node in self.graph.nodes if node.op == "Conv" and stem_name in node.name][0]
+ stem = [
+ node
+ for node in self.graph.nodes
+ if node.op == "Conv" and stem_name in node.name
+ ][0]
log.info("Found {} node '{}' as stem entry".format(stem.op, stem.name))
stem.inputs[0] = preprocessed_tensor
@@ -184,7 +225,10 @@ def update_shapes(self):
# Output-Head reshapes use [1, -1, C], corrected reshape value should be [-1, V, C]
for node in [node for node in self.graph.nodes if node.op == "Reshape"]:
shape_in = node.inputs[0].shape
- if shape_in is None or len(shape_in) not in [4,5]: # TFOD graphs have 5-dim inputs on this Reshape
+ if shape_in is None or len(shape_in) not in [
+ 4,
+ 5,
+ ]: # TFOD graphs have 5-dim inputs on this Reshape
continue
if type(node.inputs[1]) != gs.Constant:
continue
@@ -195,15 +239,29 @@ def update_shapes(self):
if len(shape_in) == 5:
volume *= shape_in[4]
shape_corrected = np.asarray([-1, volume, shape_out[2]], dtype=np.int64)
- node.inputs[1] = gs.Constant("{}_shape".format(node.name), values=shape_corrected)
- log.info("Updating Output-Head Reshape node {} to {}".format(node.name, node.inputs[1].values))
+ node.inputs[1] = gs.Constant(
+ "{}_shape".format(node.name), values=shape_corrected
+ )
+ log.info(
+ "Updating Output-Head Reshape node {} to {}".format(
+ node.name, node.inputs[1].values
+ )
+ )
# Other Reshapes only need to change the first dim to -1, as long as there are no -1's already
for node in [node for node in self.graph.nodes if node.op == "Reshape"]:
- if type(node.inputs[1]) != gs.Constant or node.inputs[1].values[0] != 1 or -1 in node.inputs[1].values:
+ if (
+ type(node.inputs[1]) != gs.Constant
+ or node.inputs[1].values[0] != 1
+ or -1 in node.inputs[1].values
+ ):
continue
node.inputs[1].values[0] = -1
- log.info("Updating Reshape node {} to {}".format(node.name, node.inputs[1].values))
+ log.info(
+ "Updating Reshape node {} to {}".format(
+ node.name, node.inputs[1].values
+ )
+ )
# Resize nodes try to calculate the output shape dynamically, it's more optimal to pre-compute the shape
if self.api == "AutoML":
@@ -223,13 +281,18 @@ def update_shapes(self):
concat = node.i(3)
if concat.op != "Concat":
continue
- if type(concat.inputs[1]) != gs.Constant or len(concat.inputs[1].values) != 2:
+ if (
+ type(concat.inputs[1]) != gs.Constant
+ or len(concat.inputs[1].values) != 2
+ ):
continue
scale_h = concat.inputs[1].values[0] / node.inputs[0].shape[2]
scale_w = concat.inputs[1].values[1] / node.inputs[0].shape[3]
scales = np.asarray([1, 1, scale_h, scale_w], dtype=np.float32)
del node.inputs[3]
- node.inputs[2] = gs.Constant(name="{}_scales".format(node.name), values=scales)
+ node.inputs[2] = gs.Constant(
+ name="{}_scales".format(node.name), values=scales
+ )
log.info("Updating Resize node {} to {}".format(node.name, scales))
self.sanitize()
@@ -241,7 +304,9 @@ def update_network(self):
"""
if self.api == "TFOD":
- for reduce in [node for node in self.graph.nodes if node.op == "ReduceMean"]:
+ for reduce in [
+ node for node in self.graph.nodes if node.op == "ReduceMean"
+ ]:
# TFOD models have their ReduceMean nodes applied with some redundant transposes that can be
# optimized away for better performance
# Make sure the correct subgraph is being replaced, basically search for this:
@@ -249,19 +314,30 @@ def update_network(self):
# And change to this:
# X > ReduceMean (2,3) > Conv > Y
transpose = reduce.i()
- if transpose.op != "Transpose" or transpose.attrs['perm'] != [0, 2, 3, 1]:
+ if transpose.op != "Transpose" or transpose.attrs["perm"] != [
+ 0,
+ 2,
+ 3,
+ 1,
+ ]:
continue
- if len(reduce.attrs['axes']) != 2 or reduce.attrs['axes'] != [1, 2]:
+ if len(reduce.attrs["axes"]) != 2 or reduce.attrs["axes"] != [1, 2]:
continue
reshape1 = reduce.o()
if reshape1.op != "Reshape" or len(reshape1.inputs[1].values) != 4:
continue
- if reshape1.inputs[1].values[1] != 1 or reshape1.inputs[1].values[2] != 1:
+ if (
+ reshape1.inputs[1].values[1] != 1
+ or reshape1.inputs[1].values[2] != 1
+ ):
continue
reshape2 = reshape1.o()
if reshape2.op != "Reshape" or len(reshape2.inputs[1].values) != 4:
continue
- if reshape2.inputs[1].values[2] != 1 or reshape2.inputs[1].values[3] != 1:
+ if (
+ reshape2.inputs[1].values[2] != 1
+ or reshape2.inputs[1].values[3] != 1
+ ):
continue
conv = reshape2.o()
if conv.op != "Conv":
@@ -269,12 +345,21 @@ def update_network(self):
# If all the checks above pass, then this node sequence can be optimized by just the ReduceMean itself
# operating on a different set of axes
input_tensor = transpose.inputs[0] # Input tensor to the Transpose
- reduce.inputs[0] = input_tensor # Forward the Transpose input to the ReduceMean node
+ reduce.inputs[0] = (
+ input_tensor # Forward the Transpose input to the ReduceMean node
+ )
output_tensor = reduce.outputs[0] # Output tensor of the ReduceMean
- conv.inputs[0] = output_tensor # Forward the ReduceMean output to the Conv node
- reduce.attrs["axes"] = [2, 3] # Update the axes that ReduceMean operates on
+ conv.inputs[0] = (
+ output_tensor # Forward the ReduceMean output to the Conv node
+ )
+ reduce.attrs["axes"] = [
+ 2,
+ 3,
+ ] # Update the axes that ReduceMean operates on
reduce.attrs["keepdims"] = 1 # Keep the reduced dimensions
- log.info("Optimized subgraph around ReduceMean node '{}'".format(reduce.name))
+ log.info(
+ "Optimized subgraph around ReduceMean node '{}'".format(reduce.name)
+ )
def update_nms(self, threshold=None, detections=None):
"""
@@ -290,10 +375,18 @@ def find_head_concat(name_scope):
# and the concatenated Box Net node has the shape [batch_size, num_anchors, 4].
# These concatenation nodes can be be found by searching for all Concat's and checking if the node two
# steps above in the graph has a name that begins with either "box_net/..." or "class_net/...".
- for node in [node for node in self.graph.nodes if node.op == "Transpose" and name_scope in node.name]:
+ for node in [
+ node
+ for node in self.graph.nodes
+ if node.op == "Transpose" and name_scope in node.name
+ ]:
concat = self.graph.find_descendant_by_op(node, "Concat")
assert concat and len(concat.inputs) == 5
- log.info("Found {} node '{}' as the tip of {}".format(concat.op, concat.name, name_scope))
+ log.info(
+ "Found {} node '{}' as the tip of {}".format(
+ concat.op, concat.name, name_scope
+ )
+ )
return concat
def extract_anchors_tensor(split):
@@ -319,7 +412,9 @@ def get_anchor_np(output_idx, op):
anchors_x = get_anchor_np(1, "Add")
anchors_h = get_anchor_np(2, "Mul")
anchors_w = get_anchor_np(3, "Mul")
- anchors = np.concatenate([anchors_y, anchors_x, anchors_h, anchors_w], axis=2)
+ anchors = np.concatenate(
+ [anchors_y, anchors_x, anchors_h, anchors_w], axis=2
+ )
return gs.Constant(name="nms/anchors:0", values=anchors)
self.sanitize()
@@ -328,7 +423,10 @@ def get_anchor_np(output_idx, op):
if self.api == "AutoML":
head_names = ["class_net/", "box_net/"]
if self.api == "TFOD":
- head_names = ["/WeightSharedConvolutionalClassHead/", "/WeightSharedConvolutionalBoxHead/"]
+ head_names = [
+ "/WeightSharedConvolutionalClassHead/",
+ "/WeightSharedConvolutionalBoxHead/",
+ ]
# There are five nodes at the bottom of the graph that provide important connection points:
@@ -353,9 +451,13 @@ def get_anchor_np(output_idx, op):
nms_node = self.graph.find_node_by_op("NonMaxSuppression")
# Extract NMS Configuration
- num_detections = int(nms_node.inputs[2].values) if detections is None else detections
+ num_detections = (
+ int(nms_node.inputs[2].values) if detections is None else detections
+ )
iou_threshold = float(nms_node.inputs[3].values)
- score_threshold = float(nms_node.inputs[4].values) if threshold is None else threshold
+ score_threshold = (
+ float(nms_node.inputs[4].values) if threshold is None else threshold
+ )
num_classes = class_net.i().inputs[1].values[-1]
normalized = True if self.api == "TFOD" else False
@@ -380,27 +482,41 @@ def get_anchor_np(output_idx, op):
nms_inputs = [box_net_tensor, class_net_tensor, anchors_tensor]
nms_op = "EfficientNMS_TRT"
nms_attrs = {
- 'plugin_version': "1",
- 'background_class': -1,
- 'max_output_boxes': num_detections,
- 'score_threshold': max(0.01, score_threshold), # Keep threshold to at least 0.01 for better efficiency
- 'iou_threshold': iou_threshold,
- 'score_activation': True,
- 'class_agnostic': False,
- 'box_coding': 1,
+ "plugin_version": "1",
+ "background_class": -1,
+ "max_output_boxes": num_detections,
+ "score_threshold": max(
+ 0.01, score_threshold
+ ), # Keep threshold to at least 0.01 for better efficiency
+ "iou_threshold": iou_threshold,
+ "score_activation": True,
+ "class_agnostic": False,
+ "box_coding": 1,
}
nms_output_classes_dtype = np.int32
# NMS Outputs
- nms_output_num_detections = gs.Variable(name="num_detections", dtype=np.int32, shape=['N', 1])
- nms_output_boxes = gs.Variable(name="detection_boxes", dtype=np.float32,
- shape=['N', num_detections, 4])
- nms_output_scores = gs.Variable(name="detection_scores", dtype=np.float32,
- shape=['N', num_detections])
- nms_output_classes = gs.Variable(name="detection_classes", dtype=nms_output_classes_dtype,
- shape=['N', num_detections])
+ nms_output_num_detections = gs.Variable(
+ name="num_detections", dtype=np.int32, shape=["N", 1]
+ )
+ nms_output_boxes = gs.Variable(
+ name="detection_boxes", dtype=np.float32, shape=["N", num_detections, 4]
+ )
+ nms_output_scores = gs.Variable(
+ name="detection_scores", dtype=np.float32, shape=["N", num_detections]
+ )
+ nms_output_classes = gs.Variable(
+ name="detection_classes",
+ dtype=nms_output_classes_dtype,
+ shape=["N", num_detections],
+ )
- nms_outputs = [nms_output_num_detections, nms_output_boxes, nms_output_scores, nms_output_classes]
+ nms_outputs = [
+ nms_output_num_detections,
+ nms_output_boxes,
+ nms_output_scores,
+ nms_output_classes,
+ ]
# Create the NMS Plugin node with the selected inputs. The outputs of the node will also become the final
# outputs of the graph.
@@ -409,8 +525,11 @@ def get_anchor_np(output_idx, op):
name="nms/non_maximum_suppression",
inputs=nms_inputs,
outputs=nms_outputs,
- attrs=nms_attrs)
- log.info("Created NMS plugin '{}' with attributes: {}".format(nms_op, nms_attrs))
+ attrs=nms_attrs,
+ )
+ log.info(
+ "Created NMS plugin '{}' with attributes: {}".format(nms_op, nms_attrs)
+ )
self.graph.outputs = nms_outputs
@@ -430,25 +549,54 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-m", "--saved_model", required=True,
- help="The TensorFlow saved model directory to load")
- parser.add_argument("-o", "--onnx", required=True,
- help="The output ONNX model file to write")
- parser.add_argument("-f", "--input_format", default="NHWC", choices=["NHWC", "NCHW"],
- help="Set the input data format of the graph, either NCHW or NHWC, default: NHWC")
- parser.add_argument("-i", "--input_size", default="512,512",
- help="Set the input shape of the graph, as a comma-separated dimensions in H,W format, "
- "default: 512,512")
- parser.add_argument("-p", "--preprocessor", default="imagenet", choices=["imagenet", "scale_range"],
- help="Set the preprocessor to apply on the graph, either 'imagenet' for standard mean "
- "subtraction and stdev normalization, or 'scale_range' for uniform [-1,+1] "
- "normalization as is used in the AdvProp models, default: imagenet")
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the NMS score threshold, default: use the original value in the model")
- parser.add_argument("-d", "--nms_detections", type=int,
- help="Override the NMS max detections, default: use the original value in the model")
- parser.add_argument("--tf2onnx",
- help="The path where to save the intermediate ONNX graph generated by tf2onnx, useful"
- "for graph debugging purposes, default: not saved")
+ parser.add_argument(
+ "-m",
+ "--saved_model",
+ required=True,
+ help="The TensorFlow saved model directory to load",
+ )
+ parser.add_argument(
+ "-o", "--onnx", required=True, help="The output ONNX model file to write"
+ )
+ parser.add_argument(
+ "-f",
+ "--input_format",
+ default="NHWC",
+ choices=["NHWC", "NCHW"],
+ help="Set the input data format of the graph, either NCHW or NHWC, default: NHWC",
+ )
+ parser.add_argument(
+ "-i",
+ "--input_size",
+ default="512,512",
+ help="Set the input shape of the graph, as a comma-separated dimensions in H,W format, "
+ "default: 512,512",
+ )
+ parser.add_argument(
+ "-p",
+ "--preprocessor",
+ default="imagenet",
+ choices=["imagenet", "scale_range"],
+ help="Set the preprocessor to apply on the graph, either 'imagenet' for standard mean "
+ "subtraction and stdev normalization, or 'scale_range' for uniform [-1,+1] "
+ "normalization as is used in the AdvProp models, default: imagenet",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the NMS score threshold, default: use the original value in the model",
+ )
+ parser.add_argument(
+ "-d",
+ "--nms_detections",
+ type=int,
+ help="Override the NMS max detections, default: use the original value in the model",
+ )
+ parser.add_argument(
+ "--tf2onnx",
+ help="The path where to save the intermediate ONNX graph generated by tf2onnx, useful"
+ "for graph debugging purposes, default: not saved",
+ )
args = parser.parse_args()
main(args)
diff --git a/samples/python/efficientdet/eval_coco.py b/samples/python/efficientdet/eval_coco.py
index 966f49be..d6796ac0 100644
--- a/samples/python/efficientdet/eval_coco.py
+++ b/samples/python/efficientdet/eval_coco.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,15 +31,24 @@ def main(args):
try:
import coco_metric
except ImportError:
- print("Could not import the 'coco_metric' module from AutoML. Searching in: {}".format(automl_path))
- print("Please clone the repository https://github.com/google/automl and provide its path with --automl_path.")
+ print(
+ "Could not import the 'coco_metric' module from AutoML. Searching in: {}".format(
+ automl_path
+ )
+ )
+ print(
+ "Please clone the repository https://github.com/google/automl and provide its path with --automl_path."
+ )
sys.exit(1)
trt_infer = TensorRTInfer(args.engine)
batcher = ImageBatcher(args.input, *trt_infer.input_spec())
evaluator = coco_metric.EvaluationMetric(filename=args.annotations)
for batch, images, scales in batcher.get_batch():
- print("Processing Image {} / {}".format(batcher.image_index, batcher.num_images), end="\r")
+ print(
+ "Processing Image {} / {}".format(batcher.image_index, batcher.num_images),
+ end="\r",
+ )
detections = trt_infer.process(batch, scales, args.nms_threshold)
coco_det = np.zeros((len(images), max([len(d) for d in detections]), 7))
coco_det[:, :, -1] = -1
@@ -54,7 +63,8 @@ def main(args):
det["xmax"] - det["xmin"],
det["ymax"] - det["ymin"],
det["score"],
- det["class"] + 1, # The COCO evaluator expects class 0 to be background, so offset by 1
+ det["class"]
+ + 1, # The COCO evaluator expects class 0 to be background, so offset by 1
]
evaluator.update_state(None, coco_det)
print()
@@ -64,14 +74,30 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
- parser.add_argument("-i", "--input",
- help="The input to infer, either a single image path, or a directory of images")
- parser.add_argument("-a", "--annotations", help="Set the path to the COCO 'instances_val2017.json' file")
- parser.add_argument("-p", "--automl_path", default="./automl",
- help="Set the path where to find the AutoML repository, from "
- "https://github.com/google/automl. Default: ./automl")
- parser.add_argument("-t", "--nms_threshold", type=float, help="Override the score threshold for the NMS operation, "
- "if higher than the threshold in the engine.")
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images",
+ )
+ parser.add_argument(
+ "-a",
+ "--annotations",
+ help="Set the path to the COCO 'instances_val2017.json' file",
+ )
+ parser.add_argument(
+ "-p",
+ "--automl_path",
+ default="./automl",
+ help="Set the path where to find the AutoML repository, from "
+ "https://github.com/google/automl. Default: ./automl",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, "
+ "if higher than the threshold in the engine.",
+ )
args = parser.parse_args()
if not all([args.engine, args.input, args.annotations]):
parser.print_help()
diff --git a/samples/python/efficientdet/image_batcher.py b/samples/python/efficientdet/image_batcher.py
index e519a5db..11b94c24 100644
--- a/samples/python/efficientdet/image_batcher.py
+++ b/samples/python/efficientdet/image_batcher.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,7 +28,16 @@ class ImageBatcher:
Creates batches of pre-processed images.
"""
- def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False, preprocessor="EfficientDet", shuffle_files=False):
+ def __init__(
+ self,
+ input,
+ shape,
+ dtype,
+ max_num_images=None,
+ exact_batches=False,
+ preprocessor="EfficientDet",
+ shuffle_files=False,
+ ):
"""
:param input: The input directory to read images from.
:param shape: The tensor shape of the batch to prepare, either in NCHW or NHWC format.
@@ -47,10 +56,16 @@ def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False
extensions = [".jpg", ".jpeg", ".png", ".bmp"]
def is_image(path):
- return os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ return (
+ os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ )
if os.path.isdir(input):
- self.images = [os.path.join(input, f) for f in os.listdir(input) if is_image(os.path.join(input, f))]
+ self.images = [
+ os.path.join(input, f)
+ for f in os.listdir(input)
+ if is_image(os.path.join(input, f))
+ ]
self.images.sort()
if shuffle_files:
random.seed(47)
@@ -129,7 +144,9 @@ def resize_pad(image, pad_color=(0, 0, 0)):
width_scale = width / self.width
height_scale = height / self.height
scale = 1.0 / max(width_scale, height_scale)
- image = image.resize((round(width * scale), round(height * scale)), resample=Image.BILINEAR)
+ image = image.resize(
+ (round(width * scale), round(height * scale)), resample=Image.BILINEAR
+ )
pad = Image.new("RGB", (self.width, self.height))
pad.paste(pad_color, [0, 0, self.width, self.height])
pad.paste(image)
diff --git a/samples/python/efficientdet/infer.py b/samples/python/efficientdet/infer.py
index 25bd28de..5308cf47 100644
--- a/samples/python/efficientdet/infer.py
+++ b/samples/python/efficientdet/infer.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -62,7 +62,7 @@ def __init__(self, engine_path):
shape = self.context.get_tensor_shape(name)
if is_input and shape[0] < 0:
assert self.engine.num_optimization_profiles > 0
- profile_shape = self.engine.get_profile_shape(0, name)
+ profile_shape = self.engine.get_tensor_profile_shape(name, 0)
assert len(profile_shape) == 3 # min,opt,max
# Set the *max* profile as binding shape
self.context.set_input_shape(name, profile_shape[2])
@@ -87,9 +87,14 @@ def __init__(self, engine_path):
self.inputs.append(binding)
else:
self.outputs.append(binding)
- print("{} '{}' with shape {} and dtype {}".format(
- "Input" if is_input else "Output",
- binding['name'], binding['shape'], binding['dtype']))
+ print(
+ "{} '{}' with shape {} and dtype {}".format(
+ "Input" if is_input else "Output",
+ binding["name"],
+ binding["shape"],
+ binding["dtype"],
+ )
+ )
assert self.batch_size > 0
assert len(self.inputs) > 0
@@ -101,7 +106,7 @@ def input_spec(self):
Get the specs for the input tensor of the network. Useful to prepare memory allocations.
:return: Two items, the shape of the input tensor and its (numpy) datatype.
"""
- return self.inputs[0]['shape'], self.inputs[0]['dtype']
+ return self.inputs[0]["shape"], self.inputs[0]["dtype"]
def output_spec(self):
"""
@@ -110,7 +115,7 @@ def output_spec(self):
"""
specs = []
for o in self.outputs:
- specs.append((o['shape'], o['dtype']))
+ specs.append((o["shape"], o["dtype"]))
return specs
def infer(self, batch):
@@ -120,11 +125,13 @@ def infer(self, batch):
:return A list of outputs as numpy arrays.
"""
# Copy I/O and Execute
- common.memcpy_host_to_device(self.inputs[0]['allocation'], batch)
+ common.memcpy_host_to_device(self.inputs[0]["allocation"], batch)
self.context.execute_v2(self.allocations)
for o in range(len(self.outputs)):
- common.memcpy_device_to_host(self.outputs[o]['host_allocation'], self.outputs[o]['allocation'])
- return [o['host_allocation'] for o in self.outputs]
+ common.memcpy_device_to_host(
+ self.outputs[o]["host_allocation"], self.outputs[o]["allocation"]
+ )
+ return [o["host_allocation"] for o in self.outputs]
def process(self, batch, scales=None, nms_threshold=None):
"""
@@ -143,11 +150,11 @@ def process(self, batch, scales=None, nms_threshold=None):
scores = outputs[2]
classes = outputs[3]
detections = []
- normalized = (np.max(boxes) < 2.0)
+ normalized = np.max(boxes) < 2.0
for i in range(self.batch_size):
detections.append([])
for n in range(int(nums[i])):
- scale = self.inputs[0]['shape'][2] if normalized else 1.0
+ scale = self.inputs[0]["shape"][2] if normalized else 1.0
if scales and i < len(scales):
scale /= scales[i]
if nms_threshold and scores[i][n] < nms_threshold:
@@ -181,7 +188,12 @@ def main(args):
print("Inferring data in {}".format(args.input))
batcher = ImageBatcher(args.input, *trt_infer.input_spec())
for batch, images, scales in batcher.get_batch():
- print("Processing Image {} / {}".format(batcher.image_index, batcher.num_images), end="\r")
+ print(
+ "Processing Image {} / {}".format(
+ batcher.image_index, batcher.num_images
+ ),
+ end="\r",
+ )
detections = trt_infer.process(batch, scales, args.nms_threshold)
if args.output:
for i in range(len(images)):
@@ -192,9 +204,18 @@ def main(args):
# Text Results
output_results = ""
for d in detections[i]:
- line = [d['xmin'], d['ymin'], d['xmax'], d['ymax'], d['score'], d['class']]
+ line = [
+ d["xmin"],
+ d["ymin"],
+ d["xmax"],
+ d["ymax"],
+ d["score"],
+ d["class"],
+ ]
output_results += "\t".join([str(f) for f in line]) + "\n"
- with open(os.path.join(output_dir, "{}.txt".format(basename)), "w") as f:
+ with open(
+ os.path.join(output_dir, "{}.txt".format(basename)), "w"
+ ) as f:
f.write(output_results)
else:
print("No input provided, running in benchmark mode")
@@ -210,10 +231,12 @@ def main(args):
times.append(time.time() - start)
print("Iteration {} / {}".format(i + 1, iterations), end="\r")
print("Benchmark results include time for H2D and D2H memory copies")
- print("Average Latency: {:.3f} ms".format(
- 1000 * np.average(times)))
- print("Average Throughput: {:.1f} ips".format(
- trt_infer.batch_size / np.average(times)))
+ print("Average Latency: {:.3f} ms".format(1000 * np.average(times)))
+ print(
+ "Average Throughput: {:.1f} ips".format(
+ trt_infer.batch_size / np.average(times)
+ )
+ )
print()
print("Finished Processing")
@@ -221,15 +244,33 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-e", "--engine", default=None, required=True,
- help="The serialized TensorRT engine")
- parser.add_argument("-i", "--input", default=None,
- help="Path to the image or directory to process")
- parser.add_argument("-o", "--output", default=None,
- help="Directory where to save the visualization results")
- parser.add_argument("-l", "--labels", default="./labels_coco.txt",
- help="File to use for reading the class labels from, default: ./labels_coco.txt")
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the score threshold for the NMS operation, if higher than the built-in threshold")
+ parser.add_argument(
+ "-e",
+ "--engine",
+ default=None,
+ required=True,
+ help="The serialized TensorRT engine",
+ )
+ parser.add_argument(
+ "-i", "--input", default=None, help="Path to the image or directory to process"
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ default=None,
+ help="Directory where to save the visualization results",
+ )
+ parser.add_argument(
+ "-l",
+ "--labels",
+ default="./labels_coco.txt",
+ help="File to use for reading the class labels from, default: ./labels_coco.txt",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, if higher than the built-in threshold",
+ )
args = parser.parse_args()
main(args)
diff --git a/samples/python/efficientdet/infer_tf.py b/samples/python/efficientdet/infer_tf.py
index a02f87ee..a2ecbd93 100644
--- a/samples/python/efficientdet/infer_tf.py
+++ b/samples/python/efficientdet/infer_tf.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,47 +30,51 @@ class TensorFlowInfer:
"""
def __init__(self, saved_model_path):
- gpus = tf.config.experimental.list_physical_devices('GPU')
+ gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
self.model = tf.saved_model.load(saved_model_path)
- self.pred_fn = self.model.signatures['serving_default']
+ self.pred_fn = self.model.signatures["serving_default"]
# Setup I/O bindings
self.batch_size = 1
self.inputs = []
fn_inputs = self.pred_fn.structured_input_signature[1]
for i, input in enumerate(list(fn_inputs.values())):
- self.inputs.append({
- 'index': i,
- 'name': input.name,
- 'dtype': np.dtype(input.dtype.as_numpy_dtype()),
- 'shape': [1, 512, 512, 3], # This can be overridden later
- })
+ self.inputs.append(
+ {
+ "index": i,
+ "name": input.name,
+ "dtype": np.dtype(input.dtype.as_numpy_dtype()),
+ "shape": [1, 512, 512, 3], # This can be overridden later
+ }
+ )
self.outputs = []
fn_outputs = self.pred_fn.structured_outputs
for i, output in enumerate(list(fn_outputs.values())):
- self.outputs.append({
- 'index': i,
- 'name': output.name,
- 'dtype': np.dtype(output.dtype.as_numpy_dtype()),
- 'shape': output.shape.as_list(),
- })
+ self.outputs.append(
+ {
+ "index": i,
+ "name": output.name,
+ "dtype": np.dtype(output.dtype.as_numpy_dtype()),
+ "shape": output.shape.as_list(),
+ }
+ )
def override_input_shape(self, input, shape):
- self.inputs[input]['shape'] = shape
+ self.inputs[input]["shape"] = shape
self.batch_size = shape[0]
def input_spec(self):
- return self.inputs[0]['shape'], self.inputs[0]['dtype']
+ return self.inputs[0]["shape"], self.inputs[0]["dtype"]
def output_spec(self):
- return self.outputs[0]['shape'], self.outputs[0]['dtype']
+ return self.outputs[0]["shape"], self.outputs[0]["dtype"]
def infer(self, batch):
# Process I/O and execute the network
- input = {self.inputs[0]['name']: tf.convert_to_tensor(batch)}
+ input = {self.inputs[0]["name"]: tf.convert_to_tensor(batch)}
output = self.pred_fn(**input)
return output
@@ -84,38 +88,42 @@ def process(self, batch, scales=None, nms_threshold=None):
classes = None
if len(self.outputs) == 1:
# Detected as AutoML Saved Model
- assert len(self.outputs[0]['shape']) == 3 and self.outputs[0]['shape'][2] == 7
- results = output[self.outputs[0]['name']].numpy()
+ assert (
+ len(self.outputs[0]["shape"]) == 3 and self.outputs[0]["shape"][2] == 7
+ )
+ results = output[self.outputs[0]["name"]].numpy()
boxes = results[:, :, 1:5]
scores = results[:, :, 5]
classes = results[:, :, 6].astype(np.int32)
elif len(self.outputs) >= 4:
# Detected as TFOD Saved Model
- assert output['num_detections']
- num = int(output['num_detections'].numpy().flatten()[0])
- boxes = output['detection_boxes'].numpy()[:, 0:num, :]
- scores = output['detection_scores'].numpy()[:, 0:num]
- classes = output['detection_classes'].numpy()[:, 0:num]
+ assert output["num_detections"]
+ num = int(output["num_detections"].numpy().flatten()[0])
+ boxes = output["detection_boxes"].numpy()[:, 0:num, :]
+ scores = output["detection_scores"].numpy()[:, 0:num]
+ classes = output["detection_classes"].numpy()[:, 0:num]
# Process the results
detections = [[]]
- normalized = (np.max(boxes) < 2.0)
+ normalized = np.max(boxes) < 2.0
for n in range(scores.shape[1]):
if scores[0][n] == 0.0:
break
- scale = self.inputs[0]['shape'][2] if normalized else 1.0
+ scale = self.inputs[0]["shape"][2] if normalized else 1.0
if scales:
scale /= scales[0]
if nms_threshold and scores[0][n] < nms_threshold:
continue
- detections[0].append({
- 'ymin': boxes[0][n][0] * scale,
- 'xmin': boxes[0][n][1] * scale,
- 'ymax': boxes[0][n][2] * scale,
- 'xmax': boxes[0][n][3] * scale,
- 'score': scores[0][n],
- 'class': int(classes[0][n]) - 1,
- })
+ detections[0].append(
+ {
+ "ymin": boxes[0][n][0] * scale,
+ "xmin": boxes[0][n][1] * scale,
+ "ymax": boxes[0][n][2] * scale,
+ "xmax": boxes[0][n][3] * scale,
+ "score": scores[0][n],
+ "class": int(classes[0][n]) - 1,
+ }
+ )
return detections
@@ -137,10 +145,10 @@ def main(args):
times.append(time.time() - start)
print("Iteration {} / {}".format(i + 1, iterations), end="\r")
print("Benchmark results include TensorFlow host overhead")
- print("Average Latency: {:.3f} ms".format(
- 1000 * np.average(times)))
- print("Average Throughput: {:.1f} ips".format(
- tf_infer.batch_size / np.average(times)))
+ print("Average Latency: {:.3f} ms".format(1000 * np.average(times)))
+ print(
+ "Average Throughput: {:.1f} ips".format(tf_infer.batch_size / np.average(times))
+ )
print()
print("Finished Processing")
@@ -148,11 +156,24 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-m", "--saved_model", required=True,
- help="The TensorFlow saved model path to validate against")
- parser.add_argument("-i", "--input_size", default="512,512",
- help="The input size to run the model with, in HEIGHT,WIDTH format")
- parser.add_argument("-b", "--batch_size", default=1, type=int,
- help="The batch size to run the model with")
+ parser.add_argument(
+ "-m",
+ "--saved_model",
+ required=True,
+ help="The TensorFlow saved model path to validate against",
+ )
+ parser.add_argument(
+ "-i",
+ "--input_size",
+ default="512,512",
+ help="The input size to run the model with, in HEIGHT,WIDTH format",
+ )
+ parser.add_argument(
+ "-b",
+ "--batch_size",
+ default=1,
+ type=int,
+ help="The batch size to run the model with",
+ )
args = parser.parse_args()
main(args)
diff --git a/samples/python/efficientdet/onnx_utils.py b/samples/python/efficientdet/onnx_utils.py
index e55f7e11..a98c3a7c 100644
--- a/samples/python/efficientdet/onnx_utils.py
+++ b/samples/python/efficientdet/onnx_utils.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,7 +36,9 @@ def elt_const(self, op, name, input, value):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}': {}".format(op, name, value.squeeze()))
const = gs.Constant(name="{}_value:0".format(name), values=value)
- return self.layer(name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"]
+ )
@gs.Graph.register()
@@ -51,7 +53,13 @@ def unsqueeze(self, name, input, axes=[-1]):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Unsqueeze node '{}': {}".format(name, axes))
- return self.layer(name=name, op="Unsqueeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={"axes": axes})
+ return self.layer(
+ name=name,
+ op="Unsqueeze",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"axes": axes},
+ )
@gs.Graph.register()
@@ -66,7 +74,13 @@ def transpose(self, name, input, perm):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Transpose node '{}': {}".format(name, perm))
- return self.layer(name=name, op="Transpose", inputs=[input_tensor], outputs=[name + ":0"], attrs={"perm": perm})
+ return self.layer(
+ name=name,
+ op="Transpose",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"perm": perm},
+ )
@gs.Graph.register()
@@ -80,7 +94,9 @@ def sigmoid(self, name, input):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Sigmoid node '{}'".format(name))
- return self.layer(name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"]
+ )
@gs.Graph.register()
@@ -98,7 +114,9 @@ def plugin(self, op, name, inputs, outputs, attrs):
"""
input_tensors = inputs if type(inputs) is list else [inputs]
log.debug("Created TRT Plugin node '{}': {}".format(name, attrs))
- return self.layer(op=op, name=name, inputs=input_tensors, outputs=outputs, attrs=attrs)
+ return self.layer(
+ op=op, name=name, inputs=input_tensors, outputs=outputs, attrs=attrs
+ )
@gs.Graph.register()
diff --git a/samples/python/efficientdet/visualize.py b/samples/python/efficientdet/visualize.py
index 4366f9e0..3fb982ef 100644
--- a/samples/python/efficientdet/visualize.py
+++ b/samples/python/efficientdet/visualize.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -182,9 +182,18 @@ def visualize_detections(image_path, output_path, detections, labels=[]):
text_left = d["xmin"]
margin = np.ceil(0.05 * text_height)
draw.rectangle(
- [(text_left, text_bottom - text_height - 2 * margin), (text_left + text_width, text_bottom)], fill=color
+ [
+ (text_left, text_bottom - text_height - 2 * margin),
+ (text_left + text_width, text_bottom),
+ ],
+ fill=color,
+ )
+ draw.text(
+ (text_left + margin, text_bottom - text_height - margin),
+ text,
+ fill="black",
+ font=font,
)
- draw.text((text_left + margin, text_bottom - text_height - margin), text, fill="black", font=font)
if output_path is None:
return image
image.save(output_path)
@@ -195,7 +204,12 @@ def draw_text(draw, font, text, width, bar_height, offset, color):
left, top, right, bottom = font.getbbox(text)
text_width, text_height = right - left, bottom - top
draw.rectangle([(offset, 0), (offset + width, bar_height)], fill=color)
- draw.text((offset + (width - text_width) / 2, text_height - text_height / 2), text, fill="black", font=font)
+ draw.text(
+ (offset + (width - text_width) / 2, text_height - text_height / 2),
+ text,
+ fill="black",
+ font=font,
+ )
bar_height = 18
width = 0
diff --git a/samples/python/efficientnet/build_engine.py b/samples/python/efficientnet/build_engine.py
index a4d75552..683c567c 100644
--- a/samples/python/efficientnet/build_engine.py
+++ b/samples/python/efficientnet/build_engine.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -56,7 +56,10 @@ def set_image_batcher(self, image_batcher: ImageBatcher):
:param image_batcher: The ImageBatcher object
"""
self.image_batcher = image_batcher
- size = int(np.dtype(self.image_batcher.dtype).itemsize * np.prod(self.image_batcher.shape))
+ size = int(
+ np.dtype(self.image_batcher.dtype).itemsize
+ * np.prod(self.image_batcher.shape)
+ )
self.batch_allocation = common.cuda_call(cudart.cudaMalloc(size))
self.batch_generator = self.image_batcher.get_batch()
@@ -81,8 +84,14 @@ def get_batch(self, names):
return None
try:
batch, _ = next(self.batch_generator)
- log.info("Calibrating image {} / {}".format(self.image_batcher.image_index, self.image_batcher.num_images))
- common.memcpy_host_to_device(self.batch_allocation, np.ascontiguousarray(batch))
+ log.info(
+ "Calibrating image {} / {}".format(
+ self.image_batcher.image_index, self.image_batcher.num_images
+ )
+ )
+ common.memcpy_host_to_device(
+ self.batch_allocation, np.ascontiguousarray(batch)
+ )
return [int(self.batch_allocation)]
except StopIteration:
log.info("Finished calibration batches")
@@ -127,7 +136,9 @@ def __init__(self, verbose=False):
self.builder = trt.Builder(self.trt_logger)
self.config = self.builder.create_builder_config()
- self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 * (2 ** 30)) # 8 GB
+ self.config.set_memory_pool_limit(
+ trt.MemoryPoolType.WORKSPACE, 8 * (2**30)
+ ) # 8 GB
self.batch_size = None
self.network = None
@@ -156,9 +167,17 @@ def create_network(self, onnx_path):
log.info("Network Description")
for input in inputs:
self.batch_size = input.shape[0]
- log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
+ log.info(
+ "Input '{}' with shape {} and dtype {}".format(
+ input.name, input.shape, input.dtype
+ )
+ )
for output in outputs:
- log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
+ log.info(
+ "Output '{}' with shape {} and dtype {}".format(
+ output.name, output.shape, output.dtype
+ )
+ )
assert self.batch_size > 0
def create_engine(
@@ -254,8 +273,12 @@ def main(args):
choices=["fp32", "fp16", "int8"],
help="The precision mode to build in, either 'fp32', 'fp16' or 'int8', default: 'fp16'",
)
- parser.add_argument("-v", "--verbose", action="store_true", help="Enable more verbose log output")
- parser.add_argument("--calib_input", help="The directory holding images to use for calibration")
+ parser.add_argument(
+ "-v", "--verbose", action="store_true", help="Enable more verbose log output"
+ )
+ parser.add_argument(
+ "--calib_input", help="The directory holding images to use for calibration"
+ )
parser.add_argument(
"--calib_cache",
default="./calibration.cache",
@@ -268,7 +291,10 @@ def main(args):
help="The maximum number of images to use for calibration, default: 25000",
)
parser.add_argument(
- "--calib_batch_size", default=8, type=int, help="The batch size for the calibration process, default: 1"
+ "--calib_batch_size",
+ default=8,
+ type=int,
+ help="The batch size for the calibration process, default: 8",
)
parser.add_argument(
"--calib_preprocessor",
@@ -288,6 +314,8 @@ def main(args):
sys.exit(1)
if args.precision == "int8" and not any([args.calib_input, args.calib_cache]):
parser.print_help()
- log.error("When building in int8 precision, either --calib_input or --calib_cache are required")
+ log.error(
+ "When building in int8 precision, either --calib_input or --calib_cache are required"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/efficientnet/compare_tf.py b/samples/python/efficientnet/compare_tf.py
index 6d9ad88f..2671572e 100644
--- a/samples/python/efficientnet/compare_tf.py
+++ b/samples/python/efficientnet/compare_tf.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -91,7 +91,10 @@ def main(args):
trt_infer = TensorRTInfer(args.engine)
batcher = ImageBatcher(
- args.input, *trt_infer.input_spec(), max_num_images=args.num_images, preprocessor=args.preprocessor
+ args.input,
+ *trt_infer.input_spec(),
+ max_num_images=args.num_images,
+ preprocessor=args.preprocessor
)
# Make sure both systems use the same input spec, so we can use the exact same image batches with both
@@ -101,14 +104,20 @@ def main(args):
print("Input datatype does not match")
print("TRT Engine Input Dtype: {} {}".format(trt_dtype))
print("TF Saved Model Input Dtype: {} {}".format(tf_dtype))
- print("Please use the same TensorFlow saved model that the TensorRT engine was built with")
+ print(
+ "Please use the same TensorFlow saved model that the TensorRT engine was built with"
+ )
sys.exit(1)
- if (tf_shape[1] and trt_shape[1] != tf_shape[1]) or (tf_shape[2] and trt_shape[2] != tf_shape[2]):
+ if (tf_shape[1] and trt_shape[1] != tf_shape[1]) or (
+ tf_shape[2] and trt_shape[2] != tf_shape[2]
+ ):
print("Input shapes do not match")
print("TRT Engine Input Shape: {} {}".format(trt_shape[1:]))
print("TF Saved Model Input Shape: {} {}".format(tf_shape[1:]))
- print("Please use the same TensorFlow saved model that the TensorRT engine was built with")
+ print(
+ "Please use the same TensorFlow saved model that the TensorRT engine was built with"
+ )
sys.exit(1)
match = 0
@@ -131,24 +140,40 @@ def main(args):
print(
"Processing {} / {} images: {:.2f}% match ".format(
- batcher.image_index, batcher.num_images, (100 * (match / batcher.image_index))
+ batcher.image_index,
+ batcher.num_images,
+ (100 * (match / batcher.image_index)),
),
end="\r",
)
print()
pc = 100 * (match / batcher.num_images)
- print("Matching Top-1 class predictions for {} out of {} images: {:.2f}%".format(match, batcher.num_images, pc))
+ print(
+ "Matching Top-1 class predictions for {} out of {} images: {:.2f}%".format(
+ match, batcher.num_images, pc
+ )
+ )
avgerror = np.sqrt(error / batcher.num_images)
- print("RMSE between TensorFlow and TensorRT confidence scores: {:.3f}".format(avgerror))
+ print(
+ "RMSE between TensorFlow and TensorRT confidence scores: {:.3f}".format(
+ avgerror
+ )
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
- parser.add_argument("-m", "--saved_model", help="The TensorFlow saved model path to validate against")
parser.add_argument(
- "-i", "--input", help="The input to infer, either a single image path, or a directory of images"
+ "-m",
+ "--saved_model",
+ help="The TensorFlow saved model path to validate against",
+ )
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images",
)
parser.add_argument(
"-n",
diff --git a/samples/python/efficientnet/create_onnx.py b/samples/python/efficientnet/create_onnx.py
index b98fd137..c0e7d109 100644
--- a/samples/python/efficientnet/create_onnx.py
+++ b/samples/python/efficientnet/create_onnx.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,12 +32,18 @@ def main(args):
# Load saved model
saved_model_path = os.path.realpath(args.saved_model)
assert os.path.isdir(saved_model_path)
- graph_def, inputs, outputs = tf_loader.from_saved_model(saved_model_path, None, None, "serve", ["serving_default"])
+ graph_def, inputs, outputs = tf_loader.from_saved_model(
+ saved_model_path, None, None, "serve", ["serving_default"]
+ )
with tf.Graph().as_default() as tf_graph:
tf.import_graph_def(graph_def, name="")
with tf_loader.tf_session(graph=tf_graph):
- onnx_graph = tfonnx.process_tf_graph(tf_graph, input_names=inputs, output_names=outputs, opset=11)
- onnx_model = optimizer.optimize_graph(onnx_graph).make_model("Converted from {}".format(saved_model_path))
+ onnx_graph = tfonnx.process_tf_graph(
+ tf_graph, input_names=inputs, output_names=outputs, opset=11
+ )
+ onnx_model = optimizer.optimize_graph(onnx_graph).make_model(
+ "Converted from {}".format(saved_model_path)
+ )
graph = gs.import_onnx(onnx_model)
assert graph
print()
@@ -55,11 +61,21 @@ def main(args):
# Format NCHW
graph.inputs[0].shape[2] = args.input_size
graph.inputs[0].shape[3] = args.input_size
- print("ONNX input named '{}' with shape {}".format(graph.inputs[0].name, graph.inputs[0].shape))
- print("ONNX output named '{}' with shape {}".format(graph.outputs[0].name, graph.outputs[0].shape))
+ print(
+ "ONNX input named '{}' with shape {}".format(
+ graph.inputs[0].name, graph.inputs[0].shape
+ )
+ )
+ print(
+ "ONNX output named '{}' with shape {}".format(
+ graph.outputs[0].name, graph.outputs[0].shape
+ )
+ )
for i in range(4):
if type(graph.inputs[0].shape[i]) != int or graph.inputs[0].shape[i] <= 0:
- print("The input shape of the graph is invalid, try overriding it by giving a fixed size with --input_size")
+ print(
+ "The input shape of the graph is invalid, try overriding it by giving a fixed size with --input_size"
+ )
sys.exit(1)
# Fix Clip Nodes (ReLU6)
@@ -85,9 +101,13 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-m", "--saved_model", help="The TensorFlow saved model directory to load")
+ parser.add_argument(
+ "-m", "--saved_model", help="The TensorFlow saved model directory to load"
+ )
parser.add_argument("-o", "--onnx", help="The output ONNX model file to write")
- parser.add_argument("-b", "--batch_size", type=int, default=1, help="Set the batch size, default: 1")
+ parser.add_argument(
+ "-b", "--batch_size", type=int, default=1, help="Set the batch size, default: 1"
+ )
parser.add_argument(
"-i",
"--input_size",
diff --git a/samples/python/efficientnet/eval_gt.py b/samples/python/efficientnet/eval_gt.py
index 14d5a8d1..9f57aaa5 100644
--- a/samples/python/efficientnet/eval_gt.py
+++ b/samples/python/efficientnet/eval_gt.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,17 +24,25 @@
from infer import TensorRTInfer
from image_batcher import ImageBatcher
+
def main(args):
annotations = {}
for line in open(args.annotations, "r"):
line = line.strip().split(args.separator)
if len(line) < 2 or not line[1].isnumeric():
- print("Could not parse the annotations file correctly, make sure the correct separator is used")
+ print(
+ "Could not parse the annotations file correctly, make sure the correct separator is used"
+ )
sys.exit(1)
annotations[os.path.basename(line[0])] = int(line[1])
trt_infer = TensorRTInfer(args.engine)
- batcher = ImageBatcher(args.input, *trt_infer.input_spec(), max_num_images=args.num_images, preprocessor=args.preprocessor)
+ batcher = ImageBatcher(
+ args.input,
+ *trt_infer.input_spec(),
+ max_num_images=args.num_images,
+ preprocessor=args.preprocessor
+ )
top1 = 0
top5 = 0
total = 0
@@ -70,9 +78,15 @@ def main(args):
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
parser.add_argument(
- "-i", "--input", help="The input to infer, either a single image path, or a directory of images"
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images",
+ )
+ parser.add_argument(
+ "-a",
+ "--annotations",
+ help="Set the file to use for classification ground truth annotations",
)
- parser.add_argument("-a", "--annotations", help="Set the file to use for classification ground truth annotations")
parser.add_argument(
"-s",
"--separator",
diff --git a/samples/python/efficientnet/image_batcher.py b/samples/python/efficientnet/image_batcher.py
index 996a72a3..63d37784 100644
--- a/samples/python/efficientnet/image_batcher.py
+++ b/samples/python/efficientnet/image_batcher.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,7 +27,15 @@ class ImageBatcher:
Creates batches of pre-processed images.
"""
- def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False, preprocessor="V2"):
+ def __init__(
+ self,
+ input,
+ shape,
+ dtype,
+ max_num_images=None,
+ exact_batches=False,
+ preprocessor="V2",
+ ):
"""
:param input: The input directory to read images from.
:param shape: The tensor shape of the batch to prepare, either in NCHW or NHWC format.
@@ -45,10 +53,16 @@ def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False
extensions = [".jpg", ".jpeg", ".png", ".bmp"]
def is_image(path):
- return os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ return (
+ os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ )
if os.path.isdir(input):
- self.images = [os.path.join(input, f) for f in os.listdir(input) if is_image(os.path.join(input, f))]
+ self.images = [
+ os.path.join(input, f)
+ for f in os.listdir(input)
+ if is_image(os.path.join(input, f))
+ ]
self.images.sort()
elif os.path.isfile(input):
if is_image(input):
diff --git a/samples/python/efficientnet/infer.py b/samples/python/efficientnet/infer.py
index 2c70b14e..cc18e1c8 100644
--- a/samples/python/efficientnet/infer.py
+++ b/samples/python/efficientnet/infer.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -110,7 +110,9 @@ def infer(self, batch, top=1):
output = np.zeros(*self.output_spec())
# Process I/O and execute the network
- common.memcpy_host_to_device(self.inputs[0]["allocation"], np.ascontiguousarray(batch))
+ common.memcpy_host_to_device(
+ self.inputs[0]["allocation"], np.ascontiguousarray(batch)
+ )
self.context.execute_v2(self.allocations)
common.memcpy_device_to_host(output, self.outputs[0]["allocation"])
@@ -126,7 +128,9 @@ def infer(self, batch, top=1):
def main(args):
trt_infer = TensorRTInfer(args.engine)
- batcher = ImageBatcher(args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor)
+ batcher = ImageBatcher(
+ args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor
+ )
for batch, images in batcher.get_batch():
classes, scores, top = trt_infer.infer(batch)
for i in range(len(images)):
@@ -146,10 +150,16 @@ def main(args):
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
parser.add_argument(
- "-i", "--input", help="The input to infer, either a single image path, or a directory of images"
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images",
)
parser.add_argument(
- "-t", "--top", default=1, type=int, help="The amount of top classes and scores to output per image, default: 1"
+ "-t",
+ "--top",
+ default=1,
+ type=int,
+ help="The amount of top classes and scores to output per image, default: 1",
)
parser.add_argument(
"-s",
diff --git a/samples/python/engine_refit_onnx_bidaf/build_and_refit_engine.py b/samples/python/engine_refit_onnx_bidaf/build_and_refit_engine.py
index 268a5cf5..240f1295 100644
--- a/samples/python/engine_refit_onnx_bidaf/build_and_refit_engine.py
+++ b/samples/python/engine_refit_onnx_bidaf/build_and_refit_engine.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,25 +20,25 @@
import sys
import numpy as np
-
+import argparse
import tensorrt as trt
-from data_processing import get_inputs, preprocess
sys.path.insert(1, os.path.join(sys.path[0], ".."))
-import common
+from cuda import cudart
TRT_LOGGER = trt.Logger()
-def get_engine(onnx_file_path, engine_file_path):
+def get_plan(onnx_file_path, engine_file_path, version_compatible):
"""Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
- def build_engine():
+ def build_plan():
"""Takes an ONNX file and creates a TensorRT engine to run inference with"""
+ import tensorrt as trt
+
builder = trt.Builder(TRT_LOGGER)
- network = builder.create_network(common.EXPLICIT_BATCH)
+ network = builder.create_network(0)
parser = trt.OnnxParser(network, TRT_LOGGER)
- runtime = trt.Runtime(TRT_LOGGER)
# Parse model file
print("Loading ONNX file from path {}...".format(onnx_file_path))
@@ -59,8 +59,8 @@ def build_engine():
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.REFIT)
- config.max_workspace_size = 1 << 28 # 256MiB
-
+ if version_compatible:
+ config.set_flag(trt.BuilderFlag.VERSION_COMPATIBLE)
for opt in [6, 10]:
profile = builder.create_optimization_profile()
@@ -68,47 +68,119 @@ def build_engine():
input0_min = (1, 1)
input0_opt = (opt, 1)
input0_max = (15, 1)
- profile.set_shape(network.get_input(0).name, min=input0_min, opt=input0_opt, max=input0_max)
+ profile.set_shape(
+ network.get_input(0).name,
+ min=input0_min,
+ opt=input0_opt,
+ max=input0_max,
+ )
input1_min = (1, 1, 1, 16)
input1_opt = (opt, 1, 1, 16)
input1_max = (15, 1, 1, 16)
- profile.set_shape(network.get_input(1).name, min=input1_min, opt=input1_opt, max=input1_max)
+ profile.set_shape(
+ network.get_input(1).name,
+ min=input1_min,
+ opt=input1_opt,
+ max=input1_max,
+ )
input2_min = (1, 1)
input2_opt = (opt, 1)
input2_max = (15, 1)
- profile.set_shape(network.get_input(2).name, min=input2_min, opt=input2_opt, max=input2_max)
+ profile.set_shape(
+ network.get_input(2).name,
+ min=input2_min,
+ opt=input2_opt,
+ max=input2_max,
+ )
input3_min = (1, 1, 1, 16)
input3_opt = (opt, 1, 1, 16)
input3_max = (15, 1, 1, 16)
- profile.set_shape(network.get_input(3).name, min=input3_min, opt=input3_opt, max=input3_max)
+ profile.set_shape(
+ network.get_input(3).name,
+ min=input3_min,
+ opt=input3_opt,
+ max=input3_max,
+ )
config.add_optimization_profile(profile)
- print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
+ print(
+ "Building an engine from file {}; this may take a while...".format(
+ onnx_file_path
+ )
+ )
plan = builder.build_serialized_network(network, config)
- engine = runtime.deserialize_cuda_engine(plan)
print("Completed creating Engine")
with open(engine_file_path, "wb") as f:
f.write(plan)
- return engine
+ return plan
if os.path.exists(engine_file_path):
# If a serialized engine exists, use it instead of building an engine.
- print("Reading engine from file {}".format(engine_file_path))
- with open(engine_file_path, "rb") as f:
- runtime = trt.Runtime(TRT_LOGGER)
- return runtime.deserialize_cuda_engine(f.read())
- else:
- return build_engine()
+ print("Reading engine from file {}...".format(engine_file_path))
+ with open(engine_file_path, "rb") as f:
+     return f.read()
+ return build_plan()
def main():
+ global trt
+ global TRT_LOGGER
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-l",
+ "--weights-location",
+ dest="weights_location",
+ default="GPU",
+ choices=["GPU", "CPU"],
+ help="The location for weights passed to refitter, either GPU/CPU, default: GPU",
+ )
+ parser.add_argument(
+ "--version-compatible",
+ dest="version_compatible",
+ action="store_true",
+ help="Build a version compatible engine for refitting",
+ )
+ args = parser.parse_args()
+
onnx_file_path = "bidaf-modified.onnx"
- engine_file_path = "bidaf.trt"
+ engine_file_path = "bidaf{}.trt".format("-vc" if args.version_compatible else "")
+
+ plan = get_plan(onnx_file_path, engine_file_path, args.version_compatible)
+
+ if args.version_compatible:
+ # Try using dispatch runtime for refitting and inference. If failed, fallback to full runtime.
+ try:
+ del sys.modules["tensorrt"]
+ sys.modules["tensorrt"] = __import__("tensorrt_dispatch")
+ sys.modules["trt"] = sys.modules["tensorrt"]
+ import tensorrt_dispatch as trt
+
+ print(
+ "Importing tensorrt_dispatch instead of full tensorrt for refitting and running vc engines."
+ )
+ except Exception:
+ print(
+ "Failed to import tensorrt_dispatch for refitting and running vc engines. Please install the package first!"
+ )
+ sys.modules["tensorrt"] = __import__("tensorrt")
+ TRT_LOGGER = trt.Logger()
+
+ engine = None
+ with open(engine_file_path, "rb") as f:
+ runtime = trt.Runtime(TRT_LOGGER)
+ if args.version_compatible:
+ runtime.engine_host_code_allowed = True
+ engine = runtime.deserialize_cuda_engine(plan)
+
+ # should be after get_plan and the engine deserialization above
+ from data_processing import get_inputs, preprocess
+ import common_runtime as common
# input
context = "A quick brown fox jumps over the lazy dog."
@@ -119,50 +191,93 @@ def main():
# Do inference with TensorRT
weights_names = ["Parameter576_B_0", "W_0"]
- refit_weights_dict = {name: np.load("{}.npy".format(name)) for name in weights_names}
- fake_weights_dict = {name: np.ones_like(weights) for name, weights in refit_weights_dict.items()}
- engine = get_engine(onnx_file_path, engine_file_path)
+ refit_weights_dict = {
+ name: np.load("{}.npy".format(name)) for name in weights_names
+ }
+ fake_weights_dict = {
+ name: np.ones_like(weights) for name, weights in refit_weights_dict.items()
+ }
+ device_mem_dict = {}
+ if args.weights_location == "GPU":
+ for name, weights in refit_weights_dict.items():
+ nbytes = weights.size * weights.itemsize
+ device_mem_dict[name] = common.cuda_call(cudart.cudaMalloc(nbytes))
+
+ execution_context = engine.create_execution_context()
refitter = trt.Refitter(engine, TRT_LOGGER)
- for weights_dict, answer_correct in [(fake_weights_dict, False), (refit_weights_dict, True)]:
- print("Refitting engine...")
- # To get a list of all refittable weights' names
- # in the network, use refitter.get_all_weights().
-
+ # Skip weights validation since we are confident that the new weights are similar to the weights used to build engine.
+ refitter.weights_validation = False
+ # To get a list of all refittable weights' names
+ # in the network, use refitter.get_all_weights().
+
+ if args.weights_location == "GPU":
+ for name, device_mem in device_mem_dict.items():
+ device_weights = trt.Weights(
+ trt.DataType.FLOAT, device_mem, refit_weights_dict[name].size
+ )
+ weights_prototype = refitter.get_weights_prototype(name)
+ assert device_weights.dtype == weights_prototype.dtype
+ assert device_weights.size == weights_prototype.size
+ refitter.set_named_weights(name, device_weights, trt.TensorLocation.DEVICE)
+
+ for weights_dict, answer_correct in [
+ (fake_weights_dict, False),
+ (refit_weights_dict, True),
+ ]:
+ import time
+
+ T1 = time.perf_counter()
+ device_mem_list = []
# Refit named weights via set_named_weights
for name in weights_names:
- refitter.set_named_weights(name, weights_dict[name])
-
- # Get missing weights names. This should return empty
- # lists in this case.
+ host_weights = weights_dict[name]
+ if args.weights_location == "CPU":
+ weights = host_weights
+ location = trt.TensorLocation.HOST
+ refitter.set_named_weights(name, weights, location)
+ else:
+ common.memcpy_host_to_device(device_mem_dict[name], host_weights)
+
+ # Get missing weights names. This should return empty lists in this case.
missing_weights = refitter.get_missing_weights()
assert (
len(missing_weights) == 0
), "Refitter found missing weights. Call set_named_weights() or set_weights() for all missing weights"
- # Refit the engine with the new weights. This will return True if
- # the refit operation succeeded.
+
+ print(f"Refitting engine from {args.weights_location} weights...")
+ # Refit the engine with the new weights. This will return True if the refit operation succeeded.
assert refitter.refit_cuda_engine()
+ T2 = time.perf_counter()
+ print("Engine refitted in {:.2f} ms.".format((T2 - T1) * 1000))
+
for profile_idx in range(engine.num_optimization_profiles):
print("Doing inference...")
# Do inference
- inputs, outputs, bindings, stream = common.allocate_buffers(engine, profile_idx)
+ inputs, outputs, bindings, stream = common.allocate_buffers(
+ engine, profile_idx
+ )
padding_bindings = [0] * (len(bindings) * profile_idx)
new_bindings = padding_bindings + bindings
- # Set host input. The common.do_inference_v2 function will copy the input to the GPU before executing.
+ # Set host input. The common.do_inference function will copy the input to the GPU before executing.
inputs[0].host = cw
inputs[1].host = cc
inputs[2].host = qw
inputs[3].host = qc
- execution_context = engine.create_execution_context()
execution_context.set_optimization_profile_async(profile_idx, stream)
execution_context.set_input_shape("CategoryMapper_4", (10, 1))
execution_context.set_input_shape("CategoryMapper_5", (10, 1, 1, 16))
execution_context.set_input_shape("CategoryMapper_6", (6, 1))
execution_context.set_input_shape("CategoryMapper_7", (6, 1, 1, 16))
- trt_outputs = common.do_inference_v2(
- execution_context, bindings=new_bindings, inputs=inputs, outputs=outputs, stream=stream
+ trt_outputs = common.do_inference(
+ execution_context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
)
start = trt_outputs[0].item()
@@ -170,6 +285,10 @@ def main():
answer = [w.encode() for w in cw_str[start : end + 1].reshape(-1)]
assert answer_correct == (answer == [b"brown"]), answer
common.free_buffers(inputs, outputs, stream)
+
+ for _, device_mem in device_mem_dict.items():
+ common.cuda_call(cudart.cudaFree(device_mem))
+
print("Passed")
diff --git a/samples/python/engine_refit_onnx_bidaf/data_processing.py b/samples/python/engine_refit_onnx_bidaf/data_processing.py
index 6eb90fa0..f6740bc5 100644
--- a/samples/python/engine_refit_onnx_bidaf/data_processing.py
+++ b/samples/python/engine_refit_onnx_bidaf/data_processing.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,7 +40,9 @@ def preprocess(text):
def get_map_func(filepath):
file = open(filepath)
category_map = json.load(file)
- category_mapper = dict(zip(category_map["cats_strings"], category_map["cats_int64s"]))
+ category_mapper = dict(
+ zip(category_map["cats_strings"], category_map["cats_int64s"])
+ )
default_int64 = category_map["default_int64"]
func = lambda s: category_mapper.get(s, default_int64)
return np.vectorize(func)
diff --git a/samples/python/engine_refit_onnx_bidaf/prepare_model.py b/samples/python/engine_refit_onnx_bidaf/prepare_model.py
index cbeb6a92..eb45226e 100644
--- a/samples/python/engine_refit_onnx_bidaf/prepare_model.py
+++ b/samples/python/engine_refit_onnx_bidaf/prepare_model.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -82,7 +82,9 @@ def save_weights_for_refitting(graph):
def main():
- org_model_file_path = getFilePath("samples/python/engine_refit_onnx_bidaf/bidaf-original.onnx")
+ org_model_file_path = getFilePath(
+ "samples/python/engine_refit_onnx_bidaf/bidaf-original.onnx"
+ )
print("Modifying the ONNX model ...")
original_model = onnx.load(org_model_file_path)
diff --git a/samples/python/introductory_parser_samples/onnx_resnet50.py b/samples/python/introductory_parser_samples/onnx_resnet50.py
index f07e99ff..fd69cc48 100644
--- a/samples/python/introductory_parser_samples/onnx_resnet50.py
+++ b/samples/python/introductory_parser_samples/onnx_resnet50.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,6 +40,7 @@ class ModelData(object):
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file):
builder = trt.Builder(TRT_LOGGER)
@@ -111,7 +112,14 @@ def main():
test_case = load_normalized_test_case(test_image, inputs[0].host)
# Run the engine. The output will be a 1D tensor of length 1000, where each value represents the
# probability that the image corresponds to that label
- trt_outputs = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+ trt_outputs = common.do_inference(
+ context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
+ )
# We use the highest probability as our prediction. Its index corresponds to the predicted label.
pred = labels[np.argmax(trt_outputs[0])]
common.free_buffers(inputs, outputs, stream)
diff --git a/samples/python/network_api_pytorch_mnist/model.py b/samples/python/network_api_pytorch_mnist/model.py
index 3f1a4fe4..53371989 100644
--- a/samples/python/network_api_pytorch_mnist/model.py
+++ b/samples/python/network_api_pytorch_mnist/model.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,7 +59,9 @@ def __init__(self):
"/tmp/mnist/data",
train=True,
download=True,
- transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]),
+ transform=transforms.Compose(
+ [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
+ ),
),
batch_size=self.batch_size,
shuffle=True,
@@ -70,7 +72,9 @@ def __init__(self):
datasets.MNIST(
"/tmp/mnist/data",
train=False,
- transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]),
+ transform=transforms.Compose(
+ [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
+ ),
),
batch_size=self.test_batch_size,
shuffle=True,
@@ -86,7 +90,11 @@ def learn(self, num_epochs=2):
# Train the network for a single epoch
def train(epoch):
self.network.train()
- optimizer = optim.SGD(self.network.parameters(), lr=self.learning_rate, momentum=self.sgd_momentum)
+ optimizer = optim.SGD(
+ self.network.parameters(),
+ lr=self.learning_rate,
+ momentum=self.sgd_momentum,
+ )
for batch, (data, target) in enumerate(self.train_loader):
if torch.cuda.is_available():
data = data.to("cuda")
@@ -126,7 +134,10 @@ def test(epoch):
test_loss /= len(self.test_loader)
print(
"\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
- test_loss, correct, len(self.test_loader.dataset), 100.0 * correct / len(self.test_loader.dataset)
+ test_loss,
+ correct,
+ len(self.test_loader.dataset),
+ 100.0 * correct / len(self.test_loader.dataset),
)
)
diff --git a/samples/python/network_api_pytorch_mnist/sample.py b/samples/python/network_api_pytorch_mnist/sample.py
index 1f634443..a695ee9a 100644
--- a/samples/python/network_api_pytorch_mnist/sample.py
+++ b/samples/python/network_api_pytorch_mnist/sample.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -41,7 +41,9 @@ class ModelData(object):
def populate_network(network, weights):
# Configure the network layers based on the weights provided.
- input_tensor = network.add_input(name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE)
+ input_tensor = network.add_input(
+ name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE
+ )
def add_matmul_as_fc(net, input, outputs, w, b):
assert len(input.shape) >= 3
@@ -64,7 +66,9 @@ def add_matmul_as_fc(net, input, outputs, w, b):
)
bias_const = net.add_constant(trt.Dims2(1, n), b)
- bias_add = net.add_elementwise(mm.get_output(0), bias_const.get_output(0), trt.ElementWiseOperation.SUM)
+ bias_add = net.add_elementwise(
+ mm.get_output(0), bias_const.get_output(0), trt.ElementWiseOperation.SUM
+ )
output_reshape = net.add_shuffle(bias_add.get_output(0))
output_reshape.reshape_dims = trt.Dims4(m, n, 1, 1)
@@ -73,16 +77,24 @@ def add_matmul_as_fc(net, input, outputs, w, b):
conv1_w = weights["conv1.weight"].cpu().numpy()
conv1_b = weights["conv1.bias"].cpu().numpy()
conv1 = network.add_convolution_nd(
- input=input_tensor, num_output_maps=20, kernel_shape=(5, 5), kernel=conv1_w, bias=conv1_b
+ input=input_tensor,
+ num_output_maps=20,
+ kernel_shape=(5, 5),
+ kernel=conv1_w,
+ bias=conv1_b,
)
conv1.stride_nd = (1, 1)
- pool1 = network.add_pooling_nd(input=conv1.get_output(0), type=trt.PoolingType.MAX, window_size=(2, 2))
+ pool1 = network.add_pooling_nd(
+ input=conv1.get_output(0), type=trt.PoolingType.MAX, window_size=(2, 2)
+ )
pool1.stride_nd = trt.Dims2(2, 2)
conv2_w = weights["conv2.weight"].cpu().numpy()
conv2_b = weights["conv2.bias"].cpu().numpy()
- conv2 = network.add_convolution_nd(pool1.get_output(0), 50, (5, 5), conv2_w, conv2_b)
+ conv2 = network.add_convolution_nd(
+ pool1.get_output(0), 50, (5, 5), conv2_w, conv2_b
+ )
conv2.stride_nd = (1, 1)
pool2 = network.add_pooling_nd(conv2.get_output(0), trt.PoolingType.MAX, (2, 2))
@@ -92,11 +104,15 @@ def add_matmul_as_fc(net, input, outputs, w, b):
fc1_b = weights["fc1.bias"].cpu().numpy()
fc1 = add_matmul_as_fc(network, pool2.get_output(0), 500, fc1_w, fc1_b)
- relu1 = network.add_activation(input=fc1.get_output(0), type=trt.ActivationType.RELU)
+ relu1 = network.add_activation(
+ input=fc1.get_output(0), type=trt.ActivationType.RELU
+ )
fc2_w = weights["fc2.weight"].cpu().numpy()
fc2_b = weights["fc2.bias"].cpu().numpy()
- fc2 = add_matmul_as_fc(network, relu1.get_output(0), ModelData.OUTPUT_SIZE, fc2_w, fc2_b)
+ fc2 = add_matmul_as_fc(
+ network, relu1.get_output(0), ModelData.OUTPUT_SIZE, fc2_w, fc2_b
+ )
fc2.get_output(0).name = ModelData.OUTPUT_NAME
network.mark_output(tensor=fc2.get_output(0))
@@ -143,7 +159,14 @@ def main():
case_num = load_random_test_case(mnist_model, pagelocked_buffer=inputs[0].host)
# For more information on performing inference, refer to the introductory samples.
# The common.do_inference function will return a list of outputs - we only have one in this case.
- [output] = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+ [output] = common.do_inference(
+ context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
+ )
pred = np.argmax(output)
common.free_buffers(inputs, outputs, stream)
print("Test Case: " + str(case_num))
diff --git a/samples/python/onnx_custom_plugin/CMakeLists.txt b/samples/python/onnx_custom_plugin/CMakeLists.txt
index 75f69af4..f00bcd31 100644
--- a/samples/python/onnx_custom_plugin/CMakeLists.txt
+++ b/samples/python/onnx_custom_plugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/python/onnx_custom_plugin/load_plugin_lib.py b/samples/python/onnx_custom_plugin/load_plugin_lib.py
index 0a85f18e..a3feaa37 100644
--- a/samples/python/onnx_custom_plugin/load_plugin_lib.py
+++ b/samples/python/onnx_custom_plugin/load_plugin_lib.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,7 +18,9 @@
import os
import ctypes
-WORKING_DIR = os.environ.get("TRT_WORKING_DIR") or os.path.dirname(os.path.realpath(__file__))
+WORKING_DIR = os.environ.get("TRT_WORKING_DIR") or os.path.dirname(
+ os.path.realpath(__file__)
+)
IS_WINDOWS = os.name == "nt"
if IS_WINDOWS:
HARDMAX_PLUGIN_LIBRARY_NAME = "customHardmaxPlugin.dll"
@@ -28,7 +30,10 @@
]
else:
HARDMAX_PLUGIN_LIBRARY_NAME = "libcustomHardmaxPlugin.so"
- HARDMAX_PLUGIN_LIBRARY = [os.path.join(WORKING_DIR, "build", HARDMAX_PLUGIN_LIBRARY_NAME)]
+ HARDMAX_PLUGIN_LIBRARY = [
+ os.path.join(WORKING_DIR, "build", HARDMAX_PLUGIN_LIBRARY_NAME)
+ ]
+
def load_plugin_lib():
for plugin_lib in HARDMAX_PLUGIN_LIBRARY:
diff --git a/samples/python/onnx_custom_plugin/model.py b/samples/python/onnx_custom_plugin/model.py
index cde029c5..53b2a96e 100644
--- a/samples/python/onnx_custom_plugin/model.py
+++ b/samples/python/onnx_custom_plugin/model.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,18 +24,21 @@
MODEL_URL = "https://github.com/onnx/models/raw/e77240a62df68ed13e3138a5812553a552b857bb/text/machine_comprehension/bidirectional_attention_flow/model/bidaf-9.onnx"
-WORKING_DIR = os.environ.get("TRT_WORKING_DIR") or os.path.dirname(os.path.realpath(__file__))
-MODEL_DIR = os.path.join(WORKING_DIR, "models")
+WORKING_DIR = os.environ.get("TRT_WORKING_DIR") or os.path.dirname(
+ os.path.realpath(__file__)
+)
+MODEL_DIR = os.path.join(WORKING_DIR, "models")
RAW_MODEL_PATH = os.path.join(MODEL_DIR, "bidaf-9.onnx")
TRT_MODEL_PATH = os.path.join(MODEL_DIR, "bidaf-9-trt.onnx")
+
def _do_graph_surgery(raw_model_path, trt_model_path):
graph = gs.import_onnx(onnx.load(raw_model_path))
# Replace unsupported Hardmax with our CustomHardmax op
for node in graph.nodes:
- if node.op == 'Hardmax':
- node.op = 'CustomHardmax'
+ if node.op == "Hardmax":
+ node.op = "CustomHardmax"
hardmax_node = node
# The original onnx model also uses another unsupported op called "Compress".
@@ -47,16 +50,16 @@ def _do_graph_surgery(raw_model_path, trt_model_path):
#
# So, we will replace the subgraph Compress(Transpose_29, Cast(Reshape(Hardmax)))
# with the subgraph Einsum(Transpose_29, Hardmax) where the equation in Einsum takes the dot product.
- node_by_name = {node.name : node for node in graph.nodes}
- transpose_node = node_by_name['Transpose_29']
- compress_node = node_by_name['Compress_31']
+ node_by_name = {node.name: node for node in graph.nodes}
+ transpose_node = node_by_name["Transpose_29"]
+ compress_node = node_by_name["Compress_31"]
einsum_node = gs.Node(
- 'Einsum',
- 'Dot_of_Hardmax_and_Transpose',
- attrs={'equation': 'ij,ij->i'}, # "Dot product" of 2d tensors
+ "Einsum",
+ "Dot_of_Hardmax_and_Transpose",
+ attrs={"equation": "ij,ij->i"}, # "Dot product" of 2d tensors
inputs=[hardmax_node.outputs[0], transpose_node.outputs[0]],
- outputs=[compress_node.outputs[0]]
+ outputs=[compress_node.outputs[0]],
)
graph.nodes.append(einsum_node)
@@ -80,7 +83,9 @@ def _do_graph_surgery(raw_model_path, trt_model_path):
#
# Later we will feed the model the integer tokens directly.
# Note: list conversion is necessary because we modify graph.nodes in the for loop.
- category_mapper_nodes = [node for node in graph.nodes if node.op == 'CategoryMapper']
+ category_mapper_nodes = [
+ node for node in graph.nodes if node.op == "CategoryMapper"
+ ]
for node in category_mapper_nodes:
# Remove CategoryMapper node from onnx graph
graph.nodes.remove(node)
diff --git a/samples/python/onnx_custom_plugin/sample.py b/samples/python/onnx_custom_plugin/sample.py
index 7026f0e5..25d4ca36 100644
--- a/samples/python/onnx_custom_plugin/sample.py
+++ b/samples/python/onnx_custom_plugin/sample.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,7 +30,7 @@
# Reuse some BiDAF-specific methods
# ../engine_refit_onnx_bidaf/data_processing.py
-sys.path.insert(1, os.path.join(parent_dir, 'engine_refit_onnx_bidaf'))
+sys.path.insert(1, os.path.join(parent_dir, "engine_refit_onnx_bidaf"))
from engine_refit_onnx_bidaf.data_processing import preprocess, get_inputs
# Maxmimum number of words in context or query text.
@@ -38,10 +38,12 @@
# Adjustable.
MAX_TEXT_LENGTH = 64
-WORKING_DIR = os.environ.get("TRT_WORKING_DIR") or os.path.dirname(os.path.realpath(__file__))
+WORKING_DIR = os.environ.get("TRT_WORKING_DIR") or os.path.dirname(
+ os.path.realpath(__file__)
+)
# Path to which trained model will be saved (check README.md)
-ENGINE_FILE_PATH = os.path.join(WORKING_DIR, 'bidaf.trt')
+ENGINE_FILE_PATH = os.path.join(WORKING_DIR, "bidaf.trt")
# Define global logger object (it should be a singleton,
# available for TensorRT from anywhere in code).
@@ -49,13 +51,16 @@
# (or lower to display more messages)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
# Builds TensorRT Engine
def build_engine(model_path):
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(0)
config = builder.create_builder_config()
- config.set_tactic_sources(config.get_tactic_sources() | 1 << int(trt.TacticSource.CUBLAS))
+ config.set_tactic_sources(
+ config.get_tactic_sources() | 1 << int(trt.TacticSource.CUBLAS)
+ )
parser = trt.OnnxParser(network, TRT_LOGGER)
runtime = trt.Runtime(TRT_LOGGER)
@@ -90,17 +95,20 @@ def build_engine(model_path):
f.write(plan)
return engine
+
def load_test_case(inputs, context_text, query_text, trt_context):
# Part 1: Specify Input shapes
cw, cc = preprocess(context_text)
qw, qc = preprocess(query_text)
for arr in (cw, cc, qw, qc):
- assert arr.shape[0] <= MAX_TEXT_LENGTH, "Input context or query is too long! " + \
- "Either decrease the input length or increase MAX_TEXT_LENGTH"
- trt_context.set_input_shape('CategoryMapper_4', cw.shape)
- trt_context.set_input_shape('CategoryMapper_5', cc.shape)
- trt_context.set_input_shape('CategoryMapper_6', qw.shape)
- trt_context.set_input_shape('CategoryMapper_7', qc.shape)
+ assert arr.shape[0] <= MAX_TEXT_LENGTH, (
+ "Input context or query is too long! "
+ + "Either decrease the input length or increase MAX_TEXT_LENGTH"
+ )
+ trt_context.set_input_shape("CategoryMapper_4", cw.shape)
+ trt_context.set_input_shape("CategoryMapper_5", cc.shape)
+ trt_context.set_input_shape("CategoryMapper_6", qw.shape)
+ trt_context.set_input_shape("CategoryMapper_7", qc.shape)
# Part 2: load input data
cw_flat, cc_flat, qw_flat, qc_flat = get_inputs(context_text, query_text)
@@ -138,20 +146,23 @@ def main():
inputs, outputs, bindings, stream = common.allocate_buffers(engine, profile_idx=0)
testcases = [
- ('Garry the lion is 5 years old. He lives in the savanna.', 'Where does the lion live?'),
- ('A quick brown fox jumps over the lazy dog.', 'What color is the fox?')
+ (
+ "Garry the lion is 5 years old. He lives in the savanna.",
+ "Where does the lion live?",
+ ),
+ ("A quick brown fox jumps over the lazy dog.", "What color is the fox?"),
]
print("\n=== Testing ===")
- interactive = '--interactive' in sys.argv
+ interactive = "--interactive" in sys.argv
if interactive:
context_text = input("Enter context: ")
query_text = input("Enter query: ")
testcases = [(context_text, query_text)]
trt_context = engine.create_execution_context()
- for (context_text, query_text) in testcases:
+ for context_text, query_text in testcases:
context_words, _ = preprocess(context_text)
@@ -159,7 +170,14 @@ def main():
if not interactive:
print(f"Input context: {context_text}")
print(f"Input query: {query_text}")
- trt_outputs = common.do_inference(trt_context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+ trt_outputs = common.do_inference(
+ trt_context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
+ )
start = trt_outputs[1].item()
end = trt_outputs[0].item()
answer = context_words[start : end + 1].flatten()
@@ -168,5 +186,6 @@ def main():
common.free_buffers(inputs, outputs, stream)
print("Passed")
+
if __name__ == "__main__":
main()
diff --git a/samples/python/onnx_custom_plugin/test_custom_hardmax_plugin.py b/samples/python/onnx_custom_plugin/test_custom_hardmax_plugin.py
index c99b78d1..59b08b06 100644
--- a/samples/python/onnx_custom_plugin/test_custom_hardmax_plugin.py
+++ b/samples/python/onnx_custom_plugin/test_custom_hardmax_plugin.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,27 +29,35 @@
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+
def hardmax_reference_impl(arr, axis):
one_hot = np.zeros(arr.shape, dtype=arr.dtype)
argmax = np.expand_dims(np.argmax(arr, axis), axis)
- np.put_along_axis(one_hot,argmax,1,axis=axis)
+ np.put_along_axis(one_hot, argmax, 1, axis=axis)
return one_hot
+
def make_trt_network_and_engine(input_shape, axis):
registry = trt.get_plugin_registry()
plugin_creator = registry.get_plugin_creator("CustomHardmax", "1")
axis_buffer = np.array([axis])
axis_attr = trt.PluginField("axis", axis_buffer, type=trt.PluginFieldType.INT32)
field_collection = trt.PluginFieldCollection([axis_attr])
- plugin = plugin_creator.create_plugin(name="CustomHardmax", field_collection=field_collection)
+ plugin = plugin_creator.create_plugin(
+ name="CustomHardmax", field_collection=field_collection
+ )
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(0)
config = builder.create_builder_config()
- config.set_tactic_sources(config.get_tactic_sources() | 1 << int(trt.TacticSource.CUBLAS))
+ config.set_tactic_sources(
+ config.get_tactic_sources() | 1 << int(trt.TacticSource.CUBLAS)
+ )
runtime = trt.Runtime(TRT_LOGGER)
- input_layer = network.add_input(name="input_layer", dtype=trt.float32, shape=input_shape)
+ input_layer = network.add_input(
+ name="input_layer", dtype=trt.float32, shape=input_shape
+ )
hardmax = network.add_plugin_v2(inputs=[input_layer], plugin=plugin)
network.mark_output(hardmax.get_output(0))
@@ -58,15 +66,24 @@ def make_trt_network_and_engine(input_shape, axis):
return engine
+
def custom_plugin_impl(input_arr, engine):
inputs, outputs, bindings, stream = common.allocate_buffers(engine)
context = engine.create_execution_context()
inputs[0].host = input_arr.astype(trt.nptype(trt.float32))
- trt_outputs = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+ trt_outputs = common.do_inference(
+ context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
+ )
output = trt_outputs[0].copy()
common.free_buffers(inputs, outputs, stream)
return output
+
def main():
load_plugin_lib()
for num_dims in range(1, 8):
@@ -80,5 +97,6 @@ def main():
assert np.all(res1 == res2), f"Test failed for shape={shape}, axis={axis}"
print("Passed")
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/samples/python/onnx_packnet/convert_to_onnx.py b/samples/python/onnx_packnet/convert_to_onnx.py
index df604f96..72c31b72 100644
--- a/samples/python/onnx_packnet/convert_to_onnx.py
+++ b/samples/python/onnx_packnet/convert_to_onnx.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -63,17 +63,29 @@ def build_packnet(model_file, args):
model_pyt = PackNet01(version="1A")
# Convert the model into ONNX
- torch.onnx.export(model_pyt, input_pyt, model_file, verbose=args.verbose, opset_version=args.opset)
+ torch.onnx.export(
+ model_pyt, input_pyt, model_file, verbose=args.verbose, opset_version=args.opset
+ )
def main():
parser = argparse.ArgumentParser(
description="Exports PackNet01 to ONNX, and post-processes it to insert TensorRT plugins"
)
- parser.add_argument("-o", "--output", help="Path to save the generated ONNX model", default="model.onnx")
- parser.add_argument("-op", "--opset", type=int, help="ONNX opset to use", default=11)
parser.add_argument(
- "-v", "--verbose", action="store_true", help="Flag to enable verbose logging for torch.onnx.export"
+ "-o",
+ "--output",
+ help="Path to save the generated ONNX model",
+ default="model.onnx",
+ )
+ parser.add_argument(
+ "-op", "--opset", type=int, help="ONNX opset to use", default=11
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ help="Flag to enable verbose logging for torch.onnx.export",
)
args = parser.parse_args()
diff --git a/samples/python/onnx_packnet/post_processing.py b/samples/python/onnx_packnet/post_processing.py
index 887834c7..33adcf1d 100644
--- a/samples/python/onnx_packnet/post_processing.py
+++ b/samples/python/onnx_packnet/post_processing.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,6 +22,7 @@
import numpy as np
import torch
+
# Pad layer subgraph structure in ONNX (specific to opset 11):
# Constant
# |
@@ -65,7 +66,9 @@ def process_pad_nodes(graph):
def fold_pad_inputs(node, graph):
# Gather the amount of padding in each dimension from pytorch graph.
if torch.__version__ < "1.5.0":
- pad_values_pyt = node.i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(0).attrs["value"].values
+ pad_values_pyt = (
+ node.i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(0).attrs["value"].values
+ )
elif torch.__version__ < "2.0.0":
pad_values_pyt = node.i(1).i(0).i(0).i(0).i(0).i(0).inputs[0].values
else:
@@ -80,7 +83,9 @@ def fold_pad_inputs(node, graph):
j -= 1
# Change the existing pad tensor to the new onnx_pad values tensor
- pads_folded_tensor = gs.Constant(name=node.inputs[1].name, values=np.array(onnx_pad_values))
+ pads_folded_tensor = gs.Constant(
+ name=node.inputs[1].name, values=np.array(onnx_pad_values)
+ )
node.inputs[1] = pads_folded_tensor
@@ -134,7 +139,9 @@ def fold_upsample_inputs(upsample, graph, opset=11):
if opset == 9:
# Gather the scale factor from mul op in the upsample input subgraph
- scale_factor = upsample.i(1).i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(1).attrs["value"].values
+ scale_factor = (
+ upsample.i(1).i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(1).attrs["value"].values
+ )
# Create the new scales tensor
scales = np.array([1.0, 1.0, scale_factor, scale_factor], dtype=np.float32)
@@ -148,7 +155,9 @@ def fold_upsample_inputs(upsample, graph, opset=11):
sizes_tensor_name = upsample.inputs[3].name
# Create the new scales tensor
- scale_factor = upsample.i(3).i(1).i().i().i().i().i(0).i(1).attrs["value"].values
+ scale_factor = (
+ upsample.i(3).i(1).i().i().i().i().i(0).i(1).attrs["value"].values
+ )
scales = np.array([1.0, 1.0, scale_factor, scale_factor], dtype=np.float32)
scale_tensor = gs.Constant(name=sizes_tensor_name, values=scales)
diff --git a/samples/python/python_plugin/CMakeLists.txt b/samples/python/python_plugin/CMakeLists.txt
index 3b8fc1f3..6338ea50 100644
--- a/samples/python/python_plugin/CMakeLists.txt
+++ b/samples/python/python_plugin/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/python/python_plugin/circ_pad_plugin_cpp.py b/samples/python/python_plugin/circ_pad_plugin_cpp.py
index a820399f..a7cb8d2f 100644
--- a/samples/python/python_plugin/circ_pad_plugin_cpp.py
+++ b/samples/python/python_plugin/circ_pad_plugin_cpp.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,14 +29,29 @@
TrtRunner,
)
+
def parseArgs():
- parser = argparse.ArgumentParser(description="Options for Circular Padding plugin C++ example")
+ parser = argparse.ArgumentParser(
+ description="Options for Circular Padding plugin C++ example"
+ )
- parser.add_argument('--precision', type=str, default="fp32", choices=["fp32", "fp16"], help="Precision to use for plugin")
- parser.add_argument('--plugin-lib', type=str, help="Path to the Circular Padding plugin lib", required=True)
+ parser.add_argument(
+ "--precision",
+ type=str,
+ default="fp32",
+ choices=["fp32", "fp16"],
+ help="Precision to use for plugin",
+ )
+ parser.add_argument(
+ "--plugin-lib",
+ type=str,
+ help="Path to the Circular Padding plugin lib",
+ required=True,
+ )
return parser.parse_args()
+
if __name__ == "__main__":
args = parseArgs()
@@ -67,15 +82,15 @@ def parseArgs():
# build engine
build_engine = EngineFromNetwork(
- NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision==np.float16)
+ NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision == np.float16)
)
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(build_engine, "trt_runner")as runner:
+ with TrtRunner(build_engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
-
+
if np.allclose(Y, Y_ref):
print("Inference result correct!")
else:
diff --git a/samples/python/python_plugin/circ_pad_plugin_cuda_python.py b/samples/python/python_plugin/circ_pad_plugin_cuda_python.py
index 88ad1ff7..212e3e74 100644
--- a/samples/python/python_plugin/circ_pad_plugin_cuda_python.py
+++ b/samples/python/python_plugin/circ_pad_plugin_cuda_python.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,14 +24,14 @@
CreateConfig,
EngineFromNetwork,
NetworkFromOnnxPath,
- TrtRunner
+ TrtRunner,
)
from polygraphy.json import to_json, from_json
from utils import checkCudaErrors, KernelHelper, parseArgs, CudaCtxManager
from cuda import cuda
-circ_pad_half_kernel = r'''
+circ_pad_half_kernel = r"""
#include
extern "C" __global__
void circ_pad_half(half const* X, int const* all_pads, int const* orig_dims, half* Y, int const* Y_shape, int Y_len) {
@@ -58,9 +58,9 @@
];
}
}
-'''
+"""
-circ_pad_float_kernel = r'''
+circ_pad_float_kernel = r"""
extern "C" __global__
void circ_pad_float(float const* X, int const* all_pads, int const* orig_dims, float* Y, int const* Y_shape, int Y_len) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -86,7 +86,8 @@
];
}
}
-'''
+"""
+
class CircPadPlugin(trt.IPluginV2DynamicExt):
def __init__(self, fc=None):
@@ -107,7 +108,9 @@ def __init__(self, fc=None):
self.cuDevice = None
if fc is not None:
- assert set([f.name for f in fc]) == set(["pads", "N"]), "Field collection invalid"
+ assert set([f.name for f in fc]) == set(
+ ["pads", "N"]
+ ), "Field collection invalid"
for f in fc:
if f.name == "pads":
self.pads = f.data
@@ -116,11 +119,17 @@ def __init__(self, fc=None):
def initialize(self):
err, self.cuDevice = cuda.cuDeviceGet(0)
- trt.get_plugin_registry().acquire_plugin_resource("cuda_ctx", CudaCtxManager(self.cuDevice))
- self.all_pads_d = checkCudaErrors(cuda.cuMemAlloc(np.int32().itemsize * self.N * 2))
- self.orig_dims_d = checkCudaErrors(cuda.cuMemAlloc(np.int32().itemsize * self.N))
+ trt.get_plugin_registry().acquire_plugin_resource(
+ "cuda_ctx", CudaCtxManager(self.cuDevice)
+ )
+ self.all_pads_d = checkCudaErrors(
+ cuda.cuMemAlloc(np.int32().itemsize * self.N * 2)
+ )
+ self.orig_dims_d = checkCudaErrors(
+ cuda.cuMemAlloc(np.int32().itemsize * self.N)
+ )
self.Y_shape_d = checkCudaErrors(cuda.cuMemAlloc(np.int32().itemsize * self.N))
-
+
def get_output_datatype(self, index, input_types):
return input_types[0]
@@ -157,11 +166,17 @@ def configure_plugin(self, inp, out):
# Copy vectors from host memory to device memory
if self.all_pads_d:
- checkCudaErrors(cuda.cuMemcpyHtoD(self.all_pads_d, all_pads, all_pads.nbytes))
+ checkCudaErrors(
+ cuda.cuMemcpyHtoD(self.all_pads_d, all_pads, all_pads.nbytes)
+ )
if self.orig_dims_d:
- checkCudaErrors(cuda.cuMemcpyHtoD(self.orig_dims_d, orig_dims, orig_dims.nbytes))
+ checkCudaErrors(
+ cuda.cuMemcpyHtoD(self.orig_dims_d, orig_dims, orig_dims.nbytes)
+ )
if self.Y_shape_d:
- checkCudaErrors(cuda.cuMemcpyHtoD(self.Y_shape_d, out_dims, out_dims.nbytes))
+ checkCudaErrors(
+ cuda.cuMemcpyHtoD(self.Y_shape_d, out_dims, out_dims.nbytes)
+ )
self.Y_len_d = np.prod(out_dims)
@@ -205,25 +220,43 @@ def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
if inp_dtype == np.float32:
kernelHelper = KernelHelper(circ_pad_float_kernel, int(self.cuDevice))
- _circ_pad_float_kernel = kernelHelper.getFunction(b'circ_pad_float')
- checkCudaErrors(cuda.cuLaunchKernel(_circ_pad_float_kernel,
- numBlocks, 1, 1,
- blockSize, 1, 1,
- 0,
- stream_ptr,
- kernelArgs, 0))
+ _circ_pad_float_kernel = kernelHelper.getFunction(b"circ_pad_float")
+ checkCudaErrors(
+ cuda.cuLaunchKernel(
+ _circ_pad_float_kernel,
+ numBlocks,
+ 1,
+ 1,
+ blockSize,
+ 1,
+ 1,
+ 0,
+ stream_ptr,
+ kernelArgs,
+ 0,
+ )
+ )
elif inp_dtype == np.float16:
kernelHelper = KernelHelper(circ_pad_half_kernel, int(self.cuDevice))
- _circ_pad_half_kernel = kernelHelper.getFunction(b'circ_pad_half')
- checkCudaErrors(cuda.cuLaunchKernel(_circ_pad_half_kernel,
- numBlocks, 1, 1,
- blockSize, 1, 1,
- 0,
- stream_ptr,
- kernelArgs, 0))
+ _circ_pad_half_kernel = kernelHelper.getFunction(b"circ_pad_half")
+ checkCudaErrors(
+ cuda.cuLaunchKernel(
+ _circ_pad_half_kernel,
+ numBlocks,
+ 1,
+ 1,
+ blockSize,
+ 1,
+ 1,
+ 0,
+ stream_ptr,
+ kernelArgs,
+ 0,
+ )
+ )
else:
raise ValueError("inp_dtype not valid")
-
+
def clone(self):
cloned_plugin = CircPadPlugin()
cloned_plugin.__dict__.update(self.__dict__)
@@ -239,7 +272,7 @@ def terminate(self):
trt.get_plugin_registry().release_plugin_resource("cuda_ctx")
- #
+ #
# The following defaults take effect since the respective methods are not overriden
#
@@ -248,7 +281,7 @@ def terminate(self):
# def get_workspace_size(self, input_desc, output_desc):
# return 0
-
+
# def destroy(self):
# pass
@@ -259,10 +292,12 @@ def __init__(self):
self.name = "CircPadPlugin"
self.plugin_namespace = ""
self.plugin_version = "1"
- self.field_names = trt.PluginFieldCollection([
- trt.PluginField("pads", np.array([]), trt.PluginFieldType.INT32),
- trt.PluginField("N", np.array([]), trt.PluginFieldType.INT32)
- ])
+ self.field_names = trt.PluginFieldCollection(
+ [
+ trt.PluginField("pads", np.array([]), trt.PluginFieldType.INT32),
+ trt.PluginField("N", np.array([]), trt.PluginFieldType.INT32),
+ ]
+ )
def create_plugin(self, name, fc):
return CircPadPlugin(fc)
@@ -273,12 +308,13 @@ def deserialize_plugin(self, name, data):
deserialized.__dict__.update(j)
return deserialized
+
if __name__ == "__main__":
args = parseArgs()
# Initialize CUDA Driver API
- err, = cuda.cuInit(0)
+ (err,) = cuda.cuInit(0)
# Retrieve handle for device 0
err, cuDevice = cuda.cuDeviceGet(0)
@@ -319,12 +355,12 @@ def deserialize_plugin(self, name, data):
# build engine
build_engine = EngineFromNetwork(
- NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision==np.float16)
+ NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision == np.float16)
)
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(build_engine, "trt_runner")as runner:
+ with TrtRunner(build_engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
diff --git a/samples/python/python_plugin/circ_pad_plugin_cupy.py b/samples/python/python_plugin/circ_pad_plugin_cupy.py
index 82b271cc..19545a11 100644
--- a/samples/python/python_plugin/circ_pad_plugin_cupy.py
+++ b/samples/python/python_plugin/circ_pad_plugin_cupy.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,14 +27,15 @@
CreateConfig,
EngineFromNetwork,
NetworkFromOnnxPath,
- TrtRunner
+ TrtRunner,
)
from polygraphy.json import to_json, from_json
from utils import volume, parseArgs
-circ_pad_half_kernel = cp.RawKernel(r'''
+circ_pad_half_kernel = cp.RawKernel(
+ r"""
#include
extern "C" __global__
void circ_pad_half(half const* X, int const* all_pads, int const* orig_dims, half* Y, int const* Y_shape, int const* Y_len) {
@@ -61,9 +62,12 @@
];
}
}
-''', 'circ_pad_half')
+""",
+ "circ_pad_half",
+)
-circ_pad_float_kernel = cp.RawKernel(r'''
+circ_pad_float_kernel = cp.RawKernel(
+ r"""
extern "C" __global__
void circ_pad_float(float const* X, int const* all_pads, int const* orig_dims, float* Y, int const* Y_shape, int const* Y_len) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -89,14 +93,17 @@
];
}
}
-''', 'circ_pad_float')
+""",
+ "circ_pad_float",
+)
+
class CircPadPlugin(trt.IPluginV2DynamicExt):
def __init__(self, fc=None):
trt.IPluginV2DynamicExt.__init__(self)
self.pads = []
self.X_shape = []
-
+
self.num_outputs = 1
self.plugin_namespace = ""
self.plugin_type = "CircPadPlugin"
@@ -190,9 +197,31 @@ def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
with cuda_stream:
if inp_dtype == np.float32:
- circ_pad_float_kernel((numBlocks,), (blockSize,), (a, self.all_pads_d, self.orig_dims_d, c, self.Y_shape_d, self.Y_len_d))
+ circ_pad_float_kernel(
+ (numBlocks,),
+ (blockSize,),
+ (
+ a,
+ self.all_pads_d,
+ self.orig_dims_d,
+ c,
+ self.Y_shape_d,
+ self.Y_len_d,
+ ),
+ )
elif inp_dtype == np.float16:
- circ_pad_half_kernel((numBlocks,), (blockSize,), (a, self.all_pads_d, self.orig_dims_d, c, self.Y_shape_d, self.Y_len_d))
+ circ_pad_half_kernel(
+ (numBlocks,),
+ (blockSize,),
+ (
+ a,
+ self.all_pads_d,
+ self.orig_dims_d,
+ c,
+ self.Y_shape_d,
+ self.Y_len_d,
+ ),
+ )
else:
raise ValueError("inp_dtype not valid")
@@ -201,7 +230,7 @@ def clone(self):
cloned_plugin.__dict__.update(self.__dict__)
return cloned_plugin
- #
+ #
# The following defaults take effect since the respective methods are not overriden
#
@@ -213,17 +242,18 @@ def clone(self):
# def get_workspace_size(self, input_desc, output_desc):
# return 0
-
+
# def destroy(self):
# pass
# def terminate(self):
# pass
+
class CircPadPluginCreator(trt.IPluginCreator):
def __init__(self):
trt.IPluginCreator.__init__(self)
-
+
self.name = "CircPadPlugin"
self.plugin_namespace = ""
self.plugin_version = "1"
@@ -233,13 +263,14 @@ def __init__(self):
def create_plugin(self, name, fc):
return CircPadPlugin(fc)
-
+
def deserialize_plugin(self, name, data):
j = dict(from_json(data.decode("utf-8")))
deserialized = CircPadPlugin()
deserialized.__dict__.update(j)
return deserialized
+
if __name__ == "__main__":
args = parseArgs()
@@ -275,12 +306,12 @@ def deserialize_plugin(self, name, data):
# build engine
build_engine = EngineFromNetwork(
- NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision==np.float16)
+ NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision == np.float16)
)
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(build_engine, "trt_runner")as runner:
+ with TrtRunner(build_engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
diff --git a/samples/python/python_plugin/circ_pad_plugin_inetdef_cuda_python.py b/samples/python/python_plugin/circ_pad_plugin_inetdef_cuda_python.py
index 60208ab3..6abf526f 100644
--- a/samples/python/python_plugin/circ_pad_plugin_inetdef_cuda_python.py
+++ b/samples/python/python_plugin/circ_pad_plugin_inetdef_cuda_python.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,7 +23,7 @@
CreateConfig,
TrtRunner,
create_network,
- engine_from_network
+ engine_from_network,
)
from polygraphy.json import to_json, from_json
@@ -31,7 +31,7 @@
from utils import checkCudaErrors, KernelHelper, parseArgs, CudaCtxManager
from cuda import cuda
-circ_pad_half_kernel = r'''
+circ_pad_half_kernel = r"""
#include
extern "C" __global__
void circ_pad_half(half const* X, int const* all_pads, int const* orig_dims, half* Y, int const* Y_shape, int Y_len) {
@@ -58,9 +58,9 @@
];
}
}
-'''
+"""
-circ_pad_float_kernel = r'''
+circ_pad_float_kernel = r"""
extern "C" __global__
void circ_pad_float(float const* X, int const* all_pads, int const* orig_dims, float* Y, int const* Y_shape, int Y_len) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -86,7 +86,8 @@
];
}
}
-'''
+"""
+
class CircPadPlugin(trt.IPluginV2DynamicExt):
def __init__(self, fc=None):
@@ -107,7 +108,9 @@ def __init__(self, fc=None):
self.cuDevice = None
if fc is not None:
- assert set([f.name for f in fc]) == set(["pads", "N"]), "Field collection invalid"
+ assert set([f.name for f in fc]) == set(
+ ["pads", "N"]
+ ), "Field collection invalid"
for f in fc:
if f.name == "pads":
self.pads = f.data
@@ -116,11 +119,17 @@ def __init__(self, fc=None):
def initialize(self):
err, self.cuDevice = cuda.cuDeviceGet(0)
- trt.get_plugin_registry().acquire_plugin_resource("cuda_ctx", CudaCtxManager(self.cuDevice))
- self.all_pads_d = checkCudaErrors(cuda.cuMemAlloc(np.int32().itemsize * self.N * 2))
- self.orig_dims_d = checkCudaErrors(cuda.cuMemAlloc(np.int32().itemsize * self.N))
+ trt.get_plugin_registry().acquire_plugin_resource(
+ "cuda_ctx", CudaCtxManager(self.cuDevice)
+ )
+ self.all_pads_d = checkCudaErrors(
+ cuda.cuMemAlloc(np.int32().itemsize * self.N * 2)
+ )
+ self.orig_dims_d = checkCudaErrors(
+ cuda.cuMemAlloc(np.int32().itemsize * self.N)
+ )
self.Y_shape_d = checkCudaErrors(cuda.cuMemAlloc(np.int32().itemsize * self.N))
-
+
def get_output_datatype(self, index, input_types):
return input_types[0]
@@ -157,11 +166,17 @@ def configure_plugin(self, inp, out):
# Copy vectors from host memory to device memory
if self.all_pads_d:
- checkCudaErrors(cuda.cuMemcpyHtoD(self.all_pads_d, all_pads, all_pads.nbytes))
+ checkCudaErrors(
+ cuda.cuMemcpyHtoD(self.all_pads_d, all_pads, all_pads.nbytes)
+ )
if self.orig_dims_d:
- checkCudaErrors(cuda.cuMemcpyHtoD(self.orig_dims_d, orig_dims, orig_dims.nbytes))
+ checkCudaErrors(
+ cuda.cuMemcpyHtoD(self.orig_dims_d, orig_dims, orig_dims.nbytes)
+ )
if self.Y_shape_d:
- checkCudaErrors(cuda.cuMemcpyHtoD(self.Y_shape_d, out_dims, out_dims.nbytes))
+ checkCudaErrors(
+ cuda.cuMemcpyHtoD(self.Y_shape_d, out_dims, out_dims.nbytes)
+ )
self.Y_len_d = np.prod(out_dims)
@@ -205,25 +220,43 @@ def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
if inp_dtype == np.float32:
kernelHelper = KernelHelper(circ_pad_float_kernel, int(self.cuDevice))
- _circ_pad_float_kernel = kernelHelper.getFunction(b'circ_pad_float')
- checkCudaErrors(cuda.cuLaunchKernel(_circ_pad_float_kernel,
- numBlocks, 1, 1,
- blockSize, 1, 1,
- 0,
- stream_ptr,
- kernelArgs, 0))
+ _circ_pad_float_kernel = kernelHelper.getFunction(b"circ_pad_float")
+ checkCudaErrors(
+ cuda.cuLaunchKernel(
+ _circ_pad_float_kernel,
+ numBlocks,
+ 1,
+ 1,
+ blockSize,
+ 1,
+ 1,
+ 0,
+ stream_ptr,
+ kernelArgs,
+ 0,
+ )
+ )
elif inp_dtype == np.float16:
kernelHelper = KernelHelper(circ_pad_half_kernel, int(self.cuDevice))
- _circ_pad_half_kernel = kernelHelper.getFunction(b'circ_pad_half')
- checkCudaErrors(cuda.cuLaunchKernel(_circ_pad_half_kernel,
- numBlocks, 1, 1,
- blockSize, 1, 1,
- 0,
- stream_ptr,
- kernelArgs, 0))
+ _circ_pad_half_kernel = kernelHelper.getFunction(b"circ_pad_half")
+ checkCudaErrors(
+ cuda.cuLaunchKernel(
+ _circ_pad_half_kernel,
+ numBlocks,
+ 1,
+ 1,
+ blockSize,
+ 1,
+ 1,
+ 0,
+ stream_ptr,
+ kernelArgs,
+ 0,
+ )
+ )
else:
raise ValueError("inp_dtype not valid")
-
+
def clone(self):
cloned_plugin = CircPadPlugin()
cloned_plugin.__dict__.update(self.__dict__)
@@ -239,7 +272,7 @@ def terminate(self):
plg_registry.release_plugin_resource("cuda_ctx")
- #
+ #
# The following defaults take effect since the respective methods are not overriden
#
@@ -248,7 +281,7 @@ def terminate(self):
# def get_workspace_size(self, input_desc, output_desc):
# return 0
-
+
# def destroy(self):
# pass
@@ -259,10 +292,12 @@ def __init__(self):
self.name = "CircPadPlugin"
self.plugin_namespace = ""
self.plugin_version = "1"
- self.field_names = trt.PluginFieldCollection([
- trt.PluginField("pads", np.array([]), trt.PluginFieldType.INT32),
- trt.PluginField("N", np.array([]), trt.PluginFieldType.INT32)
- ])
+ self.field_names = trt.PluginFieldCollection(
+ [
+ trt.PluginField("pads", np.array([]), trt.PluginFieldType.INT32),
+ trt.PluginField("N", np.array([]), trt.PluginFieldType.INT32),
+ ]
+ )
def create_plugin(self, name, fc):
return CircPadPlugin(fc)
@@ -273,13 +308,14 @@ def deserialize_plugin(self, name, data):
deserialized.__dict__.update(j)
return deserialized
+
if __name__ == "__main__":
args = parseArgs()
precision = np.float32 if args.precision == "fp32" else np.float16
# Initialize CUDA Driver API
- err, = cuda.cuInit(0)
+ (err,) = cuda.cuInit(0)
# Retrieve handle for device 0
err, cuDevice = cuda.cuDeviceGet(0)
@@ -306,28 +342,36 @@ def deserialize_plugin(self, name, data):
builder, network = create_network()
plg_creator = plg_registry.get_plugin_creator("CircPadPlugin", "1", "")
plugin_fields_list = [
- trt.PluginField("pads", np.array(pads, dtype=np.int32), trt.PluginFieldType.INT32),
+ trt.PluginField(
+ "pads", np.array(pads, dtype=np.int32), trt.PluginFieldType.INT32
+ ),
trt.PluginField("N", np.array([4], dtype=np.int32), trt.PluginFieldType.INT32),
]
pfc = trt.PluginFieldCollection(plugin_fields_list)
plugin = plg_creator.create_plugin("CircPadPlugin", pfc)
# Populate network
- input_X = network.add_input(name="X", dtype=trt.float32 if precision==np.float32 else trt.float16, shape=X.shape)
+ input_X = network.add_input(
+ name="X",
+ dtype=trt.float32 if precision == np.float32 else trt.float16,
+ shape=X.shape,
+ )
out = network.add_plugin_v2([input_X], plugin)
out.get_output(0).name = "Y"
network.mark_output(tensor=out.get_output(0))
# Build engine
config = builder.create_builder_config()
- engine = engine_from_network((builder, network), CreateConfig(fp16=precision==trt.float16))
+ engine = engine_from_network(
+ (builder, network), CreateConfig(fp16=precision == trt.float16)
+ )
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(engine, "trt_runner")as runner:
+ with TrtRunner(engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
-
+
if np.allclose(Y, Y_ref):
print("Inference result correct!")
else:
diff --git a/samples/python/python_plugin/circ_pad_plugin_numba.py b/samples/python/python_plugin/circ_pad_plugin_numba.py
index 2cc0bfab..d568419d 100644
--- a/samples/python/python_plugin/circ_pad_plugin_numba.py
+++ b/samples/python/python_plugin/circ_pad_plugin_numba.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,6 +32,7 @@
from polygraphy.json import to_json, from_json
from utils import volume, parseArgs
+
@cuda.jit
def circ_pad(X, all_pads, orig_dims, Y, Y_shape, Y_len):
index = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
@@ -57,6 +58,7 @@ def circ_pad(X, all_pads, orig_dims, Y, Y_shape, Y_len):
)
]
+
class CircPadPlugin(trt.IPluginV2DynamicExt):
def __init__(self, fc=None):
trt.IPluginV2DynamicExt.__init__(self)
@@ -76,7 +78,7 @@ def get_output_datatype(self, index, input_types):
return input_types[0]
def get_output_dimensions(self, output_index, inputs, exprBuilder):
-
+
output_dims = trt.DimsExprs(inputs[0])
for i in range(np.size(self.pads) // 2):
@@ -163,8 +165,8 @@ def clone(self):
cloned_plugin = CircPadPlugin()
cloned_plugin.__dict__.update(self.__dict__)
return cloned_plugin
-
- #
+
+ #
# The following defaults take effect since the respective methods are not overriden
#
@@ -176,7 +178,7 @@ def clone(self):
# def get_workspace_size(self, input_desc, output_desc):
# return 0
-
+
# def destroy(self):
# pass
@@ -203,6 +205,7 @@ def deserialize_plugin(self, name, data):
deserialized.__dict__.update(j)
return deserialized
+
if __name__ == "__main__":
args = parseArgs()
@@ -234,12 +237,12 @@ def deserialize_plugin(self, name, data):
# build engine
build_engine = EngineFromNetwork(
- NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision==np.float16)
+ NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision == np.float16)
)
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(build_engine, "trt_runner")as runner:
+ with TrtRunner(build_engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
diff --git a/samples/python/python_plugin/circ_pad_plugin_torch.py b/samples/python/python_plugin/circ_pad_plugin_torch.py
index 8b036469..76e8cc41 100644
--- a/samples/python/python_plugin/circ_pad_plugin_torch.py
+++ b/samples/python/python_plugin/circ_pad_plugin_torch.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,12 +33,13 @@
from utils import volume, parseArgs
+
class CircPadPlugin(trt.IPluginV2DynamicExt):
def __init__(self, fc=None):
trt.IPluginV2DynamicExt.__init__(self)
self.pads = []
self.X_shape = []
-
+
self.num_outputs = 1
self.plugin_namespace = ""
self.plugin_type = "CircPadPlugin"
@@ -110,10 +111,10 @@ def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
a_d = cp.ndarray(tuple(input_desc[0].dims), dtype=inp_dtype, memptr=a_ptr)
c_d = cp.ndarray((volume(output_desc[0].dims)), dtype=inp_dtype, memptr=c_ptr)
- a_t = torch.as_tensor(a_d, device='cuda')
+ a_t = torch.as_tensor(a_d, device="cuda")
# Use PyTorch functional op - no need to write kernel
- out = torch.nn.functional.pad(a_t, self.pads.tolist(), mode='circular')
+ out = torch.nn.functional.pad(a_t, self.pads.tolist(), mode="circular")
cp.copyto(c_d, cp.reshape(cp.asarray(out), (-1,)))
return 0
@@ -123,7 +124,7 @@ def clone(self):
cloned_plugin.__dict__.update(self.__dict__)
return cloned_plugin
- #
+ #
# The following defaults take effect since the respective methods are not overriden
#
@@ -135,7 +136,7 @@ def clone(self):
# def get_workspace_size(self, input_desc, output_desc):
# return 0
-
+
# def destroy(self):
# pass
@@ -162,6 +163,7 @@ def deserialize_plugin(self, name, data):
deserialized.__dict__.update(j)
return deserialized
+
if __name__ == "__main__":
args = parseArgs()
@@ -193,12 +195,12 @@ def deserialize_plugin(self, name, data):
# build engine
build_engine = EngineFromNetwork(
- NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision==np.float16)
+ NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision == np.float16)
)
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(build_engine, "trt_runner")as runner:
+ with TrtRunner(build_engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
diff --git a/samples/python/python_plugin/circ_pad_plugin_triton.py b/samples/python/python_plugin/circ_pad_plugin_triton.py
index 93b5f0fd..686d4e5c 100644
--- a/samples/python/python_plugin/circ_pad_plugin_triton.py
+++ b/samples/python/python_plugin/circ_pad_plugin_triton.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,13 +36,26 @@
from utils import volume, parseArgs
+
@triton.jit
-def circ_pad(X,
- all_pads_0, all_pads_2, all_pads_4, all_pads_6,
- orig_dims_0, orig_dims_1, orig_dims_2, orig_dims_3,
- Y,
- Y_shape_1, Y_shape_2, Y_shape_3,
- X_len, Y_len, BLOCK_SIZE: tl.constexpr,):
+def circ_pad(
+ X,
+ all_pads_0,
+ all_pads_2,
+ all_pads_4,
+ all_pads_6,
+ orig_dims_0,
+ orig_dims_1,
+ orig_dims_2,
+ orig_dims_3,
+ Y,
+ Y_shape_1,
+ Y_shape_2,
+ Y_shape_3,
+ X_len,
+ Y_len,
+ BLOCK_SIZE: tl.constexpr,
+):
pid = tl.program_id(0)
i = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
@@ -58,7 +71,12 @@ def circ_pad(X,
j2 = (i2 - all_pads_4 + orig_dims_2) % orig_dims_2
j3 = (i3 - all_pads_6 + orig_dims_3) % orig_dims_3
- load_idx = orig_dims_3 * orig_dims_2 * orig_dims_1 * j0 + orig_dims_3 * orig_dims_2 * j1 + orig_dims_3 * j2 + j3
+ load_idx = (
+ orig_dims_3 * orig_dims_2 * orig_dims_1 * j0
+ + orig_dims_3 * orig_dims_2 * j1
+ + orig_dims_3 * j2
+ + j3
+ )
mask_x = load_idx < X_len
x = tl.load(X + load_idx, mask=mask_x)
@@ -143,8 +161,8 @@ def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
a_d = cp.ndarray((volume(input_desc[0].dims)), dtype=inp_dtype, memptr=a_ptr)
c_d = cp.ndarray((volume(output_desc[0].dims)), dtype=inp_dtype, memptr=c_ptr)
- a_t = torch.as_tensor(a_d, device='cuda')
- c_t = torch.as_tensor(c_d, device='cuda')
+ a_t = torch.as_tensor(a_d, device="cuda")
+ c_t = torch.as_tensor(c_d, device="cuda")
N = len(self.X_shape)
all_pads = np.zeros((N * 2,), dtype=np.int32)
@@ -163,12 +181,23 @@ def enqueue(self, input_desc, output_desc, inputs, outputs, workspace, stream):
blockSize = 256
numBlocks = (int((np.prod(out_dims) + blockSize - 1) // blockSize),)
- circ_pad[numBlocks](a_t,
- all_pads[0], all_pads[2], all_pads[4], all_pads[6],
- orig_dims[0], orig_dims[1], orig_dims[2], orig_dims[3],
+ circ_pad[numBlocks](
+ a_t,
+ all_pads[0],
+ all_pads[2],
+ all_pads[4],
+ all_pads[6],
+ orig_dims[0],
+ orig_dims[1],
+ orig_dims[2],
+ orig_dims[3],
c_t,
- out_dims[1], out_dims[2], out_dims[3],
- int(np.prod(orig_dims)), int(np.prod(out_dims)), BLOCK_SIZE=256
+ out_dims[1],
+ out_dims[2],
+ out_dims[3],
+ int(np.prod(orig_dims)),
+ int(np.prod(out_dims)),
+ BLOCK_SIZE=256,
)
return 0
@@ -178,7 +207,7 @@ def clone(self):
cloned_plugin.__dict__.update(self.__dict__)
return cloned_plugin
- #
+ #
# The following defaults take effect since the respective methods are not overriden
#
@@ -190,7 +219,7 @@ def clone(self):
# def get_workspace_size(self, input_desc, output_desc):
# return 0
-
+
# def destroy(self):
# pass
@@ -217,6 +246,7 @@ def deserialize_plugin(self, name, data):
deserialized.__dict__.update(j)
return deserialized
+
if __name__ == "__main__":
args = parseArgs()
@@ -248,12 +278,12 @@ def deserialize_plugin(self, name, data):
# build engine
build_engine = EngineFromNetwork(
- NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision==np.float16)
+ NetworkFromOnnxPath(onnx_path), CreateConfig(fp16=precision == np.float16)
)
Y_ref = np.pad(X, [[0, 0], [0, 0], [pads[0], pads[1]], [pads[2], pads[3]]], "wrap")
# Run
- with TrtRunner(build_engine, "trt_runner")as runner:
+ with TrtRunner(build_engine, "trt_runner") as runner:
outputs = runner.infer({"X": X})
Y = outputs["Y"]
diff --git a/samples/python/python_plugin/circ_plugin_cpp/circ_pad_plugin.cu b/samples/python/python_plugin/circ_plugin_cpp/circ_pad_plugin.cu
index 8e06a025..0bcffd56 100644
--- a/samples/python/python_plugin/circ_plugin_cpp/circ_pad_plugin.cu
+++ b/samples/python/python_plugin/circ_plugin_cpp/circ_pad_plugin.cu
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -109,8 +109,7 @@ __global__ void circPadKernel(
int32_t j2 = (i2 - allPads[4] + origDims[2]) % origDims[2];
int32_t j3 = (i3 - allPads[6] + origDims[3]) % origDims[3];
- y[i] = x[origDims[3] * origDims[2] * origDims[1] * j0 + origDims[3] * origDims[2] * j1 + origDims[3] * j2
- + j3];
+ y[i] = x[origDims[3] * origDims[2] * origDims[1] * j0 + origDims[3] * origDims[2] * j1 + origDims[3] * j2 + j3];
}
}
diff --git a/samples/python/python_plugin/utils.py b/samples/python/python_plugin/utils.py
index 4015b72c..1a1aa16c 100644
--- a/samples/python/python_plugin/utils.py
+++ b/samples/python/python_plugin/utils.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,15 +23,26 @@
import tensorrt as trt
+
def parseArgs():
- parser = argparse.ArgumentParser(description="Options for Circular Padding plugin C++ example")
- parser.add_argument('--precision', type=str, default="fp32", choices=["fp32", "fp16"], help="Precision to use for plugin")
+ parser = argparse.ArgumentParser(
+ description="Options for Circular Padding plugin C++ example"
+ )
+ parser.add_argument(
+ "--precision",
+ type=str,
+ default="fp32",
+ choices=["fp32", "fp16"],
+ help="Precision to use for plugin",
+ )
return parser.parse_args()
+
def volume(d):
return np.prod(d)
+
# Taken from https://github.com/NVIDIA/cuda-python/blob/main/examples/common/helper_cuda.py
def checkCudaErrors(result):
def _cudaGetErrorEnum(error):
@@ -43,9 +54,14 @@ def _cudaGetErrorEnum(error):
elif isinstance(error, nvrtc.nvrtcResult):
return nvrtc.nvrtcGetErrorString(error)[1]
else:
- raise RuntimeError('Unknown error type: {}'.format(error))
+ raise RuntimeError("Unknown error type: {}".format(error))
+
if result[0].value:
- raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
+ raise RuntimeError(
+ "CUDA error code={}({})".format(
+ result[0].value, _cudaGetErrorEnum(result[0])
+ )
+ )
if len(result) == 1:
return None
elif len(result) == 2:
@@ -53,34 +69,50 @@ def _cudaGetErrorEnum(error):
else:
return result[1:]
+
# Taken from https://github.com/NVIDIA/cuda-python/blob/main/examples/common/common.py
class KernelHelper:
def __init__(self, code, devID):
- prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b'sourceCode.cu', 0, [], []))
- CUDA_HOME = os.getenv('CUDA_HOME')
+ prog = checkCudaErrors(
+ nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, [], [])
+ )
+ CUDA_HOME = os.getenv("CUDA_HOME")
if CUDA_HOME == None:
- CUDA_HOME = os.getenv('CUDA_PATH')
+ CUDA_HOME = os.getenv("CUDA_PATH")
if CUDA_HOME == None:
- raise RuntimeError('Environment variable CUDA_HOME or CUDA_PATH is not set')
- include_dirs = os.path.join(CUDA_HOME, 'include')
+ raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set")
+ include_dirs = os.path.join(CUDA_HOME, "include")
# Initialize CUDA
checkCudaErrors(cudart.cudaFree(0))
- major = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID))
- minor = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID))
+ major = checkCudaErrors(
+ cudart.cudaDeviceGetAttribute(
+ cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID
+ )
+ )
+ minor = checkCudaErrors(
+ cudart.cudaDeviceGetAttribute(
+ cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID
+ )
+ )
_, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion())
- use_cubin = (nvrtc_minor >= 1)
- prefix = 'sm' if use_cubin else 'compute'
- arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii')
+ use_cubin = nvrtc_minor >= 1
+ prefix = "sm" if use_cubin else "compute"
+ arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
try:
- opts = [b'--fmad=true', arch_arg, '--include-path={}'.format(include_dirs).encode('UTF-8'),
- b'--std=c++11', b'-default-device']
+ opts = [
+ b"--fmad=true",
+ arch_arg,
+ "--include-path={}".format(include_dirs).encode("UTF-8"),
+ b"--std=c++11",
+ b"-default-device",
+ ]
checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts))
except RuntimeError as err:
logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
- log = b' ' * logSize
+ log = b" " * logSize
checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
print(log.decode())
print(err)
@@ -88,11 +120,11 @@ def __init__(self, code, devID):
if use_cubin:
dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog))
- data = b' ' * dataSize
+ data = b" " * dataSize
checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data))
else:
dataSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
- data = b' ' * dataSize
+ data = b" " * dataSize
checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data))
self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data)))
@@ -100,8 +132,9 @@ def __init__(self, code, devID):
def getFunction(self, name):
return checkCudaErrors(cuda.cuModuleGetFunction(self.module, name))
+
class CudaCtxManager(trt.IPluginResource):
- def __init__(self, device = None):
+ def __init__(self, device=None):
trt.IPluginResource.__init__(self)
self.device = device
self.cuda_ctx = None
diff --git a/samples/python/scripts/download_mnist_data.sh b/samples/python/scripts/download_mnist_data.sh
index 809bcbc9..196ddd4e 100755
--- a/samples/python/scripts/download_mnist_data.sh
+++ b/samples/python/scripts/download_mnist_data.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/python/scripts/download_mnist_pgms.py b/samples/python/scripts/download_mnist_pgms.py
index a1ee0cba..dee877fe 100644
--- a/samples/python/scripts/download_mnist_pgms.py
+++ b/samples/python/scripts/download_mnist_pgms.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/python/simple_progress_monitor/simple_progress_monitor.py b/samples/python/simple_progress_monitor/simple_progress_monitor.py
index 9ed6c6ba..fe54f720 100644
--- a/samples/python/simple_progress_monitor/simple_progress_monitor.py
+++ b/samples/python/simple_progress_monitor/simple_progress_monitor.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,6 +36,7 @@ class ModelData(object):
# We can convert TensorRT data types to numpy types with trt.nptype().
DTYPE = trt.float32
+
# This is a simple ASCII-art progress monitor comparable to the C++ version in sample_progress_monitor.
class SimpleProgressMonitor(trt.IProgressMonitor):
def __init__(self):
@@ -46,10 +47,15 @@ def __init__(self):
def phase_start(self, phase_name, parent_phase, num_steps):
try:
if parent_phase is not None:
- nbIndents = 1 + self._active_phases[parent_phase]['nbIndents']
+ nbIndents = 1 + self._active_phases[parent_phase]["nbIndents"]
else:
nbIndents = 0
- self._active_phases[phase_name] = { 'title': phase_name, 'steps': 0, 'num_steps': num_steps, 'nbIndents': nbIndents }
+ self._active_phases[phase_name] = {
+ "title": phase_name,
+ "steps": 0,
+ "num_steps": num_steps,
+ "nbIndents": nbIndents,
+ }
self._redraw()
except KeyboardInterrupt:
# The phase_start callback cannot directly cancel the build, so request the cancellation from within step_complete.
@@ -58,13 +64,13 @@ def phase_start(self, phase_name, parent_phase, num_steps):
def phase_finish(self, phase_name):
try:
del self._active_phases[phase_name]
- self._redraw(blank_lines=1) # Clear the removed phase.
+ self._redraw(blank_lines=1) # Clear the removed phase.
except KeyboardInterrupt:
_step_result = False
def step_complete(self, phase_name, step):
try:
- self._active_phases[phase_name]['steps'] = step
+ self._active_phases[phase_name]["steps"] = step
self._redraw()
return self._step_result
except KeyboardInterrupt:
@@ -75,32 +81,35 @@ def _redraw(self, *, blank_lines=0):
# The Python curses module is not widely available on Windows platforms.
# Instead, this function uses raw terminal escape sequences. See the sample documentation for references.
def clear_line():
- print('\x1B[2K', end='')
+ print("\x1B[2K", end="")
+
def move_to_start_of_line():
- print('\x1B[0G', end='')
+ print("\x1B[0G", end="")
+
def move_cursor_up(lines):
- print('\x1B[{}A'.format(lines), end='')
+ print("\x1B[{}A".format(lines), end="")
def progress_bar(steps, num_steps):
INNER_WIDTH = 10
completed_bar_chars = int(INNER_WIDTH * steps / float(num_steps))
- return '[{}{}]'.format(
- '=' * completed_bar_chars,
- '-' * (INNER_WIDTH - completed_bar_chars))
+ return "[{}{}]".format(
+ "=" * completed_bar_chars, "-" * (INNER_WIDTH - completed_bar_chars)
+ )
# Set max_cols to a default of 200 if not run in interactive mode.
max_cols = os.get_terminal_size().columns if sys.stdout.isatty() else 200
move_to_start_of_line()
for phase in self._active_phases.values():
- phase_prefix = '{indent}{bar} {title}'.format(
- indent = ' ' * phase['nbIndents'],
- bar = progress_bar(phase['steps'], phase['num_steps']),
- title = phase['title'])
- phase_suffix = '{steps}/{num_steps}'.format(**phase)
+ phase_prefix = "{indent}{bar} {title}".format(
+ indent=" " * phase["nbIndents"],
+ bar=progress_bar(phase["steps"], phase["num_steps"]),
+ title=phase["title"],
+ )
+ phase_suffix = "{steps}/{num_steps}".format(**phase)
allowable_prefix_chars = max_cols - len(phase_suffix) - 2
if allowable_prefix_chars < len(phase_prefix):
- phase_prefix = phase_prefix[0:allowable_prefix_chars-3] + '...'
+ phase_prefix = phase_prefix[0 : allowable_prefix_chars - 3] + "..."
clear_line()
print(phase_prefix, phase_suffix)
for line in range(blank_lines):
@@ -109,16 +118,20 @@ def progress_bar(steps, num_steps):
move_cursor_up(len(self._active_phases) + blank_lines)
sys.stdout.flush()
+
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file):
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(0)
config = builder.create_builder_config()
if not sys.stdout.isatty():
- print("Warning: This sample should be run from an interactive terminal in order to showcase the progress monitor correctly.")
+ print(
+ "Warning: This sample should be run from an interactive terminal in order to showcase the progress monitor correctly."
+ )
config.progress_monitor = SimpleProgressMonitor()
parser = trt.OnnxParser(network, TRT_LOGGER)
@@ -186,7 +199,14 @@ def main():
test_case = load_normalized_test_case(test_image, inputs[0].host)
# Run the engine. The output will be a 1D tensor of length 1000, where each value represents the
# probability that the image corresponds to that label
- trt_outputs = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+ trt_outputs = common.do_inference(
+ context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
+ )
# We use the highest probability as our prediction. Its index corresponds to the predicted label.
pred = labels[np.argmax(trt_outputs[0])]
common.free_buffers(inputs, outputs, stream)
@@ -195,5 +215,6 @@ def main():
else:
print("Incorrectly recognized " + test_case + " as " + pred)
+
if __name__ == "__main__":
main()
diff --git a/samples/python/tensorflow_object_detection_api/build_engine.py b/samples/python/tensorflow_object_detection_api/build_engine.py
index 0a0d6238..9bbf5f7c 100644
--- a/samples/python/tensorflow_object_detection_api/build_engine.py
+++ b/samples/python/tensorflow_object_detection_api/build_engine.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -56,7 +56,10 @@ def set_image_batcher(self, image_batcher: ImageBatcher):
:param image_batcher: The ImageBatcher object
"""
self.image_batcher = image_batcher
- size = int(np.dtype(self.image_batcher.dtype).itemsize * np.prod(self.image_batcher.shape))
+ size = int(
+ np.dtype(self.image_batcher.dtype).itemsize
+ * np.prod(self.image_batcher.shape)
+ )
self.batch_allocation = common.cuda_call(cudart.cudaMalloc(size))
self.batch_generator = self.image_batcher.get_batch()
@@ -81,8 +84,14 @@ def get_batch(self, names):
return None
try:
batch, _, _ = next(self.batch_generator)
- log.info("Calibrating image {} / {}".format(self.image_batcher.image_index, self.image_batcher.num_images))
- common.memcpy_host_to_device(self.batch_allocation, np.ascontiguousarray(batch))
+ log.info(
+ "Calibrating image {} / {}".format(
+ self.image_batcher.image_index, self.image_batcher.num_images
+ )
+ )
+ common.memcpy_host_to_device(
+ self.batch_allocation, np.ascontiguousarray(batch)
+ )
return [int(self.batch_allocation)]
except StopIteration:
log.info("Finished calibration batches")
@@ -128,7 +137,9 @@ def __init__(self, verbose=False, workspace=8):
self.builder = trt.Builder(self.trt_logger)
self.config = self.builder.create_builder_config()
- self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace * (2 ** 30))
+ self.config.set_memory_pool_limit(
+ trt.MemoryPoolType.WORKSPACE, workspace * (2**30)
+ )
self.batch_size = None
self.network = None
@@ -157,9 +168,17 @@ def create_network(self, onnx_path):
log.info("Network Description")
for input in inputs:
self.batch_size = input.shape[0]
- log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
+ log.info(
+ "Input '{}' with shape {} and dtype {}".format(
+ input.name, input.shape, input.dtype
+ )
+ )
for output in outputs:
- log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
+ log.info(
+ "Output '{}' with shape {} and dtype {}".format(
+ output.name, output.shape, output.dtype
+ )
+ )
assert self.batch_size > 0
# TODO: These overrides are to improve fp16/int8 performance on FRCNN models
@@ -167,17 +186,25 @@ def create_network(self, onnx_path):
# type on the two NMS plugins. To be determined.
for i in range(self.network.num_layers):
if self.network.get_layer(i).name in [
- "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/squeeze",
- "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/scale_value:0",
- "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/scale",
- "nms/anchors:0"]:
+ "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/squeeze",
+ "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/scale_value:0",
+ "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/scale",
+ "nms/anchors:0",
+ ]:
self.network.get_layer(i).precision = trt.DataType.FLOAT
- self.network.get_layer(i-1).precision = trt.DataType.FLOAT
+ self.network.get_layer(i - 1).precision = trt.DataType.FLOAT
if self.network.get_layer(i).name == "FirstNMS/detection_boxes_conversion":
self.network.get_layer(i).precision = trt.DataType.FLOAT
- def create_engine(self, engine_path, precision, calib_input=None, calib_cache=None, calib_num_images=5000,
- calib_batch_size=8):
+ def create_engine(
+ self,
+ engine_path,
+ precision,
+ calib_input=None,
+ calib_cache=None,
+ calib_num_images=5000,
+ calib_batch_size=8,
+ ):
"""
Build the TensorRT engine and serialize it to disk.
:param engine_path: The path where to serialize the engine to.
@@ -218,8 +245,14 @@ def create_engine(self, engine_path, precision, calib_input=None, calib_cache=No
calib_shape = [calib_batch_size] + list(inputs[0].shape[1:])
calib_dtype = trt.nptype(inputs[0].dtype)
self.config.int8_calibrator.set_image_batcher(
- ImageBatcher(calib_input, calib_shape, calib_dtype, max_num_images=calib_num_images,
- exact_batches=True))
+ ImageBatcher(
+ calib_input,
+ calib_shape,
+ calib_dtype,
+ max_num_images=calib_num_images,
+ exact_batches=True,
+ )
+ )
engine_bytes = self.builder.build_serialized_network(self.network, self.config)
if engine_bytes is None:
@@ -234,33 +267,68 @@ def create_engine(self, engine_path, precision, calib_input=None, calib_cache=No
def main(args):
builder = EngineBuilder(args.verbose, args.workspace)
builder.create_network(args.onnx)
- builder.create_engine(args.engine, args.precision, args.calib_input, args.calib_cache, args.calib_num_images,
- args.calib_batch_size)
+ builder.create_engine(
+ args.engine,
+ args.precision,
+ args.calib_input,
+ args.calib_cache,
+ args.calib_num_images,
+ args.calib_batch_size,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--onnx", help="The input ONNX model file to load")
parser.add_argument("-e", "--engine", help="The output path for the TRT engine")
- parser.add_argument("-p", "--precision", default="fp16", choices=["fp32", "fp16", "int8"],
- help="The precision mode to build in, either 'fp32', 'fp16' or 'int8', default: 'fp16'")
- parser.add_argument("-v", "--verbose", action="store_true", help="Enable more verbose log output")
- parser.add_argument("-w", "--workspace", default=1, type=int, help="The max memory workspace size to allow in Gb, "
- "default: 1")
- parser.add_argument("--calib_input", help="The directory holding images to use for calibration")
- parser.add_argument("--calib_cache", default="./calibration.cache",
- help="The file path for INT8 calibration cache to use, default: ./calibration.cache")
- parser.add_argument("--calib_num_images", default=5000, type=int,
- help="The maximum number of images to use for calibration, default: 5000")
- parser.add_argument("--calib_batch_size", default=8, type=int,
- help="The batch size for the calibration process, default: 8")
+ parser.add_argument(
+ "-p",
+ "--precision",
+ default="fp16",
+ choices=["fp32", "fp16", "int8"],
+ help="The precision mode to build in, either 'fp32', 'fp16' or 'int8', default: 'fp16'",
+ )
+ parser.add_argument(
+ "-v", "--verbose", action="store_true", help="Enable more verbose log output"
+ )
+ parser.add_argument(
+ "-w",
+ "--workspace",
+ default=1,
+ type=int,
+ help="The max memory workspace size to allow in Gb, " "default: 1",
+ )
+ parser.add_argument(
+ "--calib_input", help="The directory holding images to use for calibration"
+ )
+ parser.add_argument(
+ "--calib_cache",
+ default="./calibration.cache",
+ help="The file path for INT8 calibration cache to use, default: ./calibration.cache",
+ )
+ parser.add_argument(
+ "--calib_num_images",
+ default=5000,
+ type=int,
+ help="The maximum number of images to use for calibration, default: 5000",
+ )
+ parser.add_argument(
+ "--calib_batch_size",
+ default=8,
+ type=int,
+ help="The batch size for the calibration process, default: 8",
+ )
args = parser.parse_args()
if not all([args.onnx, args.engine]):
parser.print_help()
log.error("These arguments are required: --onnx and --engine")
sys.exit(1)
- if args.precision == "int8" and not (args.calib_input or os.path.exists(args.calib_cache)):
+ if args.precision == "int8" and not (
+ args.calib_input or os.path.exists(args.calib_cache)
+ ):
parser.print_help()
- log.error("When building in int8 precision, --calib_input or an existing --calib_cache file is required")
+ log.error(
+ "When building in int8 precision, --calib_input or an existing --calib_cache file is required"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/tensorflow_object_detection_api/compare_tf.py b/samples/python/tensorflow_object_detection_api/compare_tf.py
index 409aec6b..ae5168eb 100644
--- a/samples/python/tensorflow_object_detection_api/compare_tf.py
+++ b/samples/python/tensorflow_object_detection_api/compare_tf.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,6 +27,7 @@
from image_batcher import ImageBatcher
from visualize import visualize_detections, concat_visualizations
+
class TensorFlowInfer:
"""
Implements TensorFlow inference of a saved model, following the same API as the TensorRTInfer class.
@@ -36,45 +37,49 @@ def __init__(self, saved_model_path, preprocessor, detection_type, iou_threshold
self.preprocessor = preprocessor
self.detection_type = detection_type
self.iou_threshold = iou_threshold
- gpus = tf.config.experimental.list_physical_devices('GPU')
+ gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
self.model = tf.saved_model.load(saved_model_path)
- self.pred_fn = self.model.signatures['serving_default']
+ self.pred_fn = self.model.signatures["serving_default"]
# Setup I/O bindings
self.inputs = []
fn_inputs = self.pred_fn.structured_input_signature[1]
for i, input in enumerate(list(fn_inputs.values())):
- self.inputs.append({
- 'index': i,
- 'name': input.name,
- 'dtype': np.dtype(input.dtype.as_numpy_dtype()),
- 'shape': [1, 512, 512, 3], # This can be overridden later
- })
+ self.inputs.append(
+ {
+ "index": i,
+ "name": input.name,
+ "dtype": np.dtype(input.dtype.as_numpy_dtype()),
+ "shape": [1, 512, 512, 3], # This can be overridden later
+ }
+ )
self.outputs = []
fn_outputs = self.pred_fn.structured_outputs
for i, output in enumerate(list(fn_outputs.values())):
- self.outputs.append({
- 'index': i,
- 'name': output.name,
- 'dtype': np.dtype(output.dtype.as_numpy_dtype()),
- 'shape': output.shape.as_list(),
- })
+ self.outputs.append(
+ {
+ "index": i,
+ "name": output.name,
+ "dtype": np.dtype(output.dtype.as_numpy_dtype()),
+ "shape": output.shape.as_list(),
+ }
+ )
def override_input_shape(self, input, shape):
- self.inputs[input]['shape'] = shape
+ self.inputs[input]["shape"] = shape
def input_spec(self):
- return self.inputs[0]['shape'], self.inputs[0]['dtype']
+ return self.inputs[0]["shape"], self.inputs[0]["dtype"]
def output_spec(self):
- return self.outputs[0]['shape'], self.outputs[0]['dtype']
+ return self.outputs[0]["shape"], self.outputs[0]["dtype"]
def infer(self, batch, scales=None, nms_threshold=None):
# Process I/O and execute the network
- input = {self.inputs[0]['name']: tf.convert_to_tensor(batch)}
+ input = {self.inputs[0]["name"]: tf.convert_to_tensor(batch)}
output = self.pred_fn(**input)
# Extract the results depending on what kind of saved model this is
@@ -82,24 +87,24 @@ def infer(self, batch, scales=None, nms_threshold=None):
scores = None
classes = None
- assert output['num_detections']
- num = int(output['num_detections'].numpy().flatten()[0])
- boxes = output['detection_boxes'].numpy()[:, 0:num, :]
- scores = output['detection_scores'].numpy()[:, 0:num]
- classes = output['detection_classes'].numpy()[:, 0:num]
+ assert output["num_detections"]
+ num = int(output["num_detections"].numpy().flatten()[0])
+ boxes = output["detection_boxes"].numpy()[:, 0:num, :]
+ scores = output["detection_scores"].numpy()[:, 0:num]
+ classes = output["detection_classes"].numpy()[:, 0:num]
# One additional output for segmentation masks
if "detection_masks" in output:
- masks = output['detection_masks'].numpy()[:, 0:num]
+ masks = output["detection_masks"].numpy()[:, 0:num]
# Process the results
detections = [[]]
- normalized = (np.max(boxes) < 2.0)
+ normalized = np.max(boxes) < 2.0
for n in range(scores.shape[1]):
# Depending on preprocessor, box scaling will be slightly different.
if self.preprocessor == "fixed_shape_resizer":
if scores[0][n] == 0.0:
break
- scale_x = self.inputs[0]['shape'][1] if normalized else 1.0
- scale_y = self.inputs[0]['shape'][2] if normalized else 1.0
+ scale_x = self.inputs[0]["shape"][1] if normalized else 1.0
+ scale_y = self.inputs[0]["shape"][2] if normalized else 1.0
if scales:
scale_x /= scales[0][0]
@@ -107,11 +112,11 @@ def infer(self, batch, scales=None, nms_threshold=None):
if nms_threshold and scores[0][n] < nms_threshold:
continue
# Depending on detection type you need slightly different data.
- if self.detection_type == 'bbox':
+ if self.detection_type == "bbox":
mask = None
# Segmentation is only supported with Mask R-CNN, which has
# fixed_shape_resizer as image_resizer (lookup pipeline.config)
- elif self.detection_type == 'segmentation':
+ elif self.detection_type == "segmentation":
# Select a mask
mask = masks[0][n]
# Slight scaling, to get binary masks after float32 -> uint8
@@ -124,7 +129,7 @@ def infer(self, batch, scales=None, nms_threshold=None):
mask = None
if scores[0][n] == 0.0:
break
- scale = self.inputs[0]['shape'][2] if normalized else 1.0
+ scale = self.inputs[0]["shape"][2] if normalized else 1.0
if scales:
scale /= scales[0]
scale_y = scale
@@ -132,15 +137,17 @@ def infer(self, batch, scales=None, nms_threshold=None):
if nms_threshold and scores[0][n] < nms_threshold:
continue
# Append to detections
- detections[0].append({
- 'ymin': boxes[0][n][0] * scale_y,
- 'xmin': boxes[0][n][1] * scale_x,
- 'ymax': boxes[0][n][2] * scale_y,
- 'xmax': boxes[0][n][3] * scale_x,
- 'score': scores[0][n],
- 'class': int(classes[0][n]) - 1,
- 'mask': mask,
- })
+ detections[0].append(
+ {
+ "ymin": boxes[0][n][0] * scale_y,
+ "xmin": boxes[0][n][1] * scale_x,
+ "ymax": boxes[0][n][2] * scale_y,
+ "xmax": boxes[0][n][3] * scale_x,
+ "score": scores[0][n],
+ "class": int(classes[0][n]) - 1,
+ "mask": mask,
+ }
+ )
return detections
@@ -150,7 +157,12 @@ def run(batcher, inferer, framework, nms_threshold=None):
for batch, images, scales in batcher.get_batch():
res_detections += inferer.infer(batch, scales, nms_threshold)
res_images += images
- print("Processing {} / {} images ({})".format(batcher.image_index, batcher.num_images, framework), end="\r")
+ print(
+ "Processing {} / {} images ({})".format(
+ batcher.image_index, batcher.num_images, framework
+ ),
+ end="\r",
+ )
print()
return res_images, res_detections
@@ -159,34 +171,45 @@ def parse_annotations(annotations_path, detection_type):
annotations = {}
if annotations_path and os.path.exists(annotations_path):
# Load annotations as coco, to extract segmentation masks
- coco=COCO(annotations_path)
+ coco = COCO(annotations_path)
with open(annotations_path) as f:
ann_json = json.load(f)
- for ann in ann_json['annotations']:
- img_id = ann['image_id']
+ for ann in ann_json["annotations"]:
+ img_id = ann["image_id"]
if img_id not in annotations.keys():
annotations[img_id] = []
# Depending on detection type you need slightly different data.
- if detection_type == 'bbox':
+ if detection_type == "bbox":
mask = None
# Segmentation is only supported with Mask R-CNN, which has
# fixed_shape_resizer as image_resizer (lookup pipeline.config)
- elif detection_type == 'segmentation':
+ elif detection_type == "segmentation":
# Get np.array segmentation mask from annotation
mask = coco.annToMask(ann)
- annotations[img_id].append({
- 'ymin': ann['bbox'][1],
- 'xmin': ann['bbox'][0],
- 'ymax': ann['bbox'][1] + ann['bbox'][3],
- 'xmax': ann['bbox'][0] + ann['bbox'][2],
- 'score': -1,
- 'class': ann['category_id'] - 1,
- 'mask': mask,
- })
+ annotations[img_id].append(
+ {
+ "ymin": ann["bbox"][1],
+ "xmin": ann["bbox"][0],
+ "ymax": ann["bbox"][1] + ann["bbox"][3],
+ "xmax": ann["bbox"][0] + ann["bbox"][2],
+ "score": -1,
+ "class": ann["category_id"] - 1,
+ "mask": mask,
+ }
+ )
return annotations
-def compare_images(tf_images, tf_detections, trt_images, trt_detections, output_dir, annotations_path, labels_path, detection_type):
+def compare_images(
+ tf_images,
+ tf_detections,
+ trt_images,
+ trt_detections,
+ output_dir,
+ annotations_path,
+ labels_path,
+ detection_type,
+):
labels = []
if labels_path and os.path.exists(labels_path):
with open(labels_path) as f:
@@ -196,7 +219,9 @@ def compare_images(tf_images, tf_detections, trt_images, trt_detections, output_
annotations = parse_annotations(annotations_path, detection_type)
count = 1
- for tf_img, tf_det, trt_img, trt_det in zip(tf_images, tf_detections, trt_images, trt_detections):
+ for tf_img, tf_det, trt_img, trt_det in zip(
+ tf_images, tf_detections, trt_images, trt_detections
+ ):
vis = []
names = []
colors = []
@@ -214,60 +239,142 @@ def compare_images(tf_images, tf_detections, trt_images, trt_detections, output_
if img_id.isnumeric():
img_id = int(img_id)
if img_id in annotations.keys():
- vis.append(visualize_detections(trt_img, None, annotations[img_id], labels))
+ vis.append(
+ visualize_detections(trt_img, None, annotations[img_id], labels)
+ )
names.append("Ground Truth")
colors.append("RoyalBlue")
else:
- print("Image {} does not have a COCO annotation, skipping ground truth visualization".format(trt_img))
+ print(
+ "Image {} does not have a COCO annotation, skipping ground truth visualization".format(
+ trt_img
+ )
+ )
basename = os.path.splitext(os.path.basename(tf_img))[0]
output_path = os.path.join(output_dir, "{}.compare.png".format(basename))
os.makedirs(output_dir, exist_ok=True)
concat_visualizations(vis, names, colors, output_path)
- print("Processing {} / {} images (Visualization)".format(count, len(tf_images)), end="\r")
+ print(
+ "Processing {} / {} images (Visualization)".format(count, len(tf_images)),
+ end="\r",
+ )
count += 1
print()
def main(args):
- tf_infer = TensorFlowInfer(args.saved_model, args.preprocessor, args.detection_type, args.iou_threshold)
- trt_infer = TensorRTInfer(args.engine, args.preprocessor, args.detection_type, args.iou_threshold)
-
- trt_batcher = ImageBatcher(args.input, *trt_infer.input_spec(), max_num_images=args.num_images, preprocessor=args.preprocessor)
- tf_infer.override_input_shape(0, [1, trt_batcher.height, trt_batcher.width, 3]) # Same size input in TF as TRT
- tf_batcher = ImageBatcher(args.input, *tf_infer.input_spec(), max_num_images=args.num_images, preprocessor=args.preprocessor)
-
- tf_images, tf_detections = run(tf_batcher, tf_infer, "TensorFlow", args.nms_threshold)
- trt_images, trt_detections = run(trt_batcher, trt_infer, "TensorRT", args.nms_threshold)
-
- compare_images(tf_images, tf_detections, trt_images, trt_detections, args.output, args.annotations, args.labels, args.detection_type)
+ tf_infer = TensorFlowInfer(
+ args.saved_model, args.preprocessor, args.detection_type, args.iou_threshold
+ )
+ trt_infer = TensorRTInfer(
+ args.engine, args.preprocessor, args.detection_type, args.iou_threshold
+ )
+
+ trt_batcher = ImageBatcher(
+ args.input,
+ *trt_infer.input_spec(),
+ max_num_images=args.num_images,
+ preprocessor=args.preprocessor
+ )
+ tf_infer.override_input_shape(
+ 0, [1, trt_batcher.height, trt_batcher.width, 3]
+ ) # Same size input in TF as TRT
+ tf_batcher = ImageBatcher(
+ args.input,
+ *tf_infer.input_spec(),
+ max_num_images=args.num_images,
+ preprocessor=args.preprocessor
+ )
+
+ tf_images, tf_detections = run(
+ tf_batcher, tf_infer, "TensorFlow", args.nms_threshold
+ )
+ trt_images, trt_detections = run(
+ trt_batcher, trt_infer, "TensorRT", args.nms_threshold
+ )
+
+ compare_images(
+ tf_images,
+ tf_detections,
+ trt_images,
+ trt_detections,
+ args.output,
+ args.annotations,
+ args.labels,
+ args.detection_type,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with")
- parser.add_argument("-m", "--saved_model", help="The TensorFlow saved model path to validate against")
- parser.add_argument("-i", "--input",
- help="The input to infer, either a single image path, or a directory of images")
- parser.add_argument("-o", "--output", default=None, help="Directory where to save the visualization results")
- parser.add_argument("-l", "--labels", default="./labels_coco.txt",
- help="File to use for reading the class labels from, default: ./labels_coco.txt")
- parser.add_argument("-a", "--annotations", default=None,
- help="Set the path to the 'instances_val2017.json' file to use for COCO annotations, in which "
- "case --input should point to the COCO val2017 dataset, default: not used")
- parser.add_argument("-n", "--num_images", default=100, type=int,
- help="The maximum number of images to visualize, default: 100")
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the score threshold for the NMS operation, if higher than the threshold in the model/engine.")
- parser.add_argument("--iou_threshold", default=0.5, type=float,
- help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0")
- parser.add_argument("-d", "--detection_type", default="bbox", choices=["bbox", "segmentation"],
- help="Detection type for COCO, either bbox or if you are using Mask R-CNN's instance segmentation - segmentation")
- parser.add_argument("--preprocessor", default="fixed_shape_resizer", choices=["fixed_shape_resizer", "keep_aspect_ratio_resizer"],
- help="Select the image preprocessor to use based on your pipeline.config, either 'fixed_shape_resizer' or 'keep_aspect_ratio_resizer', default: fixed_shape_resizer")
+ parser.add_argument(
+ "-m",
+ "--saved_model",
+ help="The TensorFlow saved model path to validate against",
+ )
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images",
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ default=None,
+ help="Directory where to save the visualization results",
+ )
+ parser.add_argument(
+ "-l",
+ "--labels",
+ default="./labels_coco.txt",
+ help="File to use for reading the class labels from, default: ./labels_coco.txt",
+ )
+ parser.add_argument(
+ "-a",
+ "--annotations",
+ default=None,
+ help="Set the path to the 'instances_val2017.json' file to use for COCO annotations, in which "
+ "case --input should point to the COCO val2017 dataset, default: not used",
+ )
+ parser.add_argument(
+ "-n",
+ "--num_images",
+ default=100,
+ type=int,
+ help="The maximum number of images to visualize, default: 100",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, if higher than the threshold in the model/engine.",
+ )
+ parser.add_argument(
+ "--iou_threshold",
+ default=0.5,
+ type=float,
+ help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0",
+ )
+ parser.add_argument(
+ "-d",
+ "--detection_type",
+ default="bbox",
+ choices=["bbox", "segmentation"],
+ help="Detection type for COCO, either bbox or if you are using Mask R-CNN's instance segmentation - segmentation",
+ )
+ parser.add_argument(
+ "--preprocessor",
+ default="fixed_shape_resizer",
+ choices=["fixed_shape_resizer", "keep_aspect_ratio_resizer"],
+ help="Select the image preprocessor to use based on your pipeline.config, either 'fixed_shape_resizer' or 'keep_aspect_ratio_resizer', default: fixed_shape_resizer",
+ )
args = parser.parse_args()
- if not all([args.engine, args.saved_model, args.input, args.output, args.preprocessor]):
+ if not all(
+ [args.engine, args.saved_model, args.input, args.output, args.preprocessor]
+ ):
parser.print_help()
sys.exit(1)
main(args)
diff --git a/samples/python/tensorflow_object_detection_api/create_onnx.py b/samples/python/tensorflow_object_detection_api/create_onnx.py
index 919cc8e6..fc75fa17 100644
--- a/samples/python/tensorflow_object_detection_api/create_onnx.py
+++ b/samples/python/tensorflow_object_detection_api/create_onnx.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,7 +32,9 @@
from object_detection.utils import config_util
except ImportError:
print("Could not import TFOD modules. Maybe you did not install TFOD API")
- print("Please install TensorFlow 2 Object Detection API, check https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2.md")
+ print(
+ "Please install TensorFlow 2 Object Detection API, check https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2.md"
+ )
sys.exit(1)
import onnx_utils
@@ -55,14 +57,19 @@ def __init__(self, saved_model_path, pipeline_config_path):
assert os.path.exists(saved_model_path)
# Use tf2onnx to convert saved model to an initial ONNX graph.
- graph_def, inputs, outputs = tf_loader.from_saved_model(saved_model_path, None, None, "serve",
- ["serving_default"])
+ graph_def, inputs, outputs = tf_loader.from_saved_model(
+ saved_model_path, None, None, "serve", ["serving_default"]
+ )
log.info("Loaded saved model from {}".format(saved_model_path))
with tf.Graph().as_default() as tf_graph:
tf.import_graph_def(graph_def, name="")
with tf_loader.tf_session(graph=tf_graph):
- onnx_graph = tfonnx.process_tf_graph(tf_graph, input_names=inputs, output_names=outputs, opset=11)
- onnx_model = optimizer.optimize_graph(onnx_graph).make_model("Converted from {}".format(saved_model_path))
+ onnx_graph = tfonnx.process_tf_graph(
+ tf_graph, input_names=inputs, output_names=outputs, opset=11
+ )
+ onnx_model = optimizer.optimize_graph(onnx_graph).make_model(
+ "Converted from {}".format(saved_model_path)
+ )
self.graph = gs.import_onnx(onnx_model)
assert self.graph
log.info("TF2ONNX graph created successfully")
@@ -71,61 +78,140 @@ def __init__(self, saved_model_path, pipeline_config_path):
self.graph.fold_constants()
# Pipeline config parsing.
- pipeline_config = config_util.get_configs_from_pipeline_file(pipeline_config_path)
+ pipeline_config = config_util.get_configs_from_pipeline_file(
+ pipeline_config_path
+ )
# Get input resolution.
- self.height, self.width = config_util.get_spatial_image_size(config_util.get_image_resizer_config(pipeline_config["model"]))
+ self.height, self.width = config_util.get_spatial_image_size(
+ config_util.get_image_resizer_config(pipeline_config["model"])
+ )
# If your model is SSD, get characteristics accordingly from pipeline.config file.
if pipeline_config["model"].HasField("ssd"):
# Getting model characteristics.
self.model = str(pipeline_config["model"].ssd.feature_extractor.type)
- self.first_stage_nms_score_threshold = float(pipeline_config["model"].ssd.post_processing.batch_non_max_suppression.score_threshold)
- self.first_stage_nms_iou_threshold = float(pipeline_config["model"].ssd.post_processing.batch_non_max_suppression.iou_threshold)
- self.first_stage_max_proposals = int(pipeline_config["model"].ssd.post_processing.batch_non_max_suppression.max_detections_per_class)
+ self.first_stage_nms_score_threshold = float(
+ pipeline_config[
+ "model"
+ ].ssd.post_processing.batch_non_max_suppression.score_threshold
+ )
+ self.first_stage_nms_iou_threshold = float(
+ pipeline_config[
+ "model"
+ ].ssd.post_processing.batch_non_max_suppression.iou_threshold
+ )
+ self.first_stage_max_proposals = int(
+ pipeline_config[
+ "model"
+ ].ssd.post_processing.batch_non_max_suppression.max_detections_per_class
+ )
# If your model is Faster R-CNN get it's characteristics from pipeline.config file.
elif pipeline_config["model"].HasField("faster_rcnn"):
# Getting model characteristics.
- self.model = str(pipeline_config["model"].faster_rcnn.feature_extractor.type)
+ self.model = str(
+ pipeline_config["model"].faster_rcnn.feature_extractor.type
+ )
self.num_classes = pipeline_config["model"].faster_rcnn.num_classes
- self.first_stage_nms_score_threshold = float(pipeline_config["model"].faster_rcnn.first_stage_nms_score_threshold)
- self.first_stage_nms_iou_threshold = float(pipeline_config["model"].faster_rcnn.first_stage_nms_iou_threshold)
- self.first_stage_max_proposals = int(pipeline_config["model"].faster_rcnn.first_stage_max_proposals)
- self.first_stage_crop_size = int(pipeline_config["model"].faster_rcnn.initial_crop_size)
- self.second_stage_nms_score_threshold = float(pipeline_config["model"].faster_rcnn.second_stage_post_processing.batch_non_max_suppression.score_threshold)
- self.second_stage_iou_threshold = float(pipeline_config["model"].faster_rcnn.second_stage_post_processing.batch_non_max_suppression.iou_threshold)
+ self.first_stage_nms_score_threshold = float(
+ pipeline_config["model"].faster_rcnn.first_stage_nms_score_threshold
+ )
+ self.first_stage_nms_iou_threshold = float(
+ pipeline_config["model"].faster_rcnn.first_stage_nms_iou_threshold
+ )
+ self.first_stage_max_proposals = int(
+ pipeline_config["model"].faster_rcnn.first_stage_max_proposals
+ )
+ self.first_stage_crop_size = int(
+ pipeline_config["model"].faster_rcnn.initial_crop_size
+ )
+ self.second_stage_nms_score_threshold = float(
+ pipeline_config[
+ "model"
+ ].faster_rcnn.second_stage_post_processing.batch_non_max_suppression.score_threshold
+ )
+ self.second_stage_iou_threshold = float(
+ pipeline_config[
+ "model"
+ ].faster_rcnn.second_stage_post_processing.batch_non_max_suppression.iou_threshold
+ )
self.mask_height = None
self.mask_width = None
self.matmul_crop_and_resize = False
# Check what kind of Crop and Resize operation is used
- if pipeline_config["model"].faster_rcnn.HasField("use_matmul_crop_and_resize"):
- self.matmul_crop_and_resize = pipeline_config["model"].faster_rcnn.use_matmul_crop_and_resize
+ if pipeline_config["model"].faster_rcnn.HasField(
+ "use_matmul_crop_and_resize"
+ ):
+ self.matmul_crop_and_resize = pipeline_config[
+ "model"
+ ].faster_rcnn.use_matmul_crop_and_resize
# If model is Mask R-CNN, get final instance segmentation masks resolution.
- if pipeline_config["model"].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.HasField("mask_height") and pipeline_config["model"].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.HasField("mask_width"):
- self.mask_height = int(pipeline_config["model"].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.mask_height)
- self.mask_width = int(pipeline_config["model"].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.mask_width)
+ if pipeline_config[
+ "model"
+ ].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.HasField(
+ "mask_height"
+ ) and pipeline_config[
+ "model"
+ ].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.HasField(
+ "mask_width"
+ ):
+ self.mask_height = int(
+ pipeline_config[
+ "model"
+ ].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.mask_height
+ )
+ self.mask_width = int(
+ pipeline_config[
+ "model"
+ ].faster_rcnn.second_stage_box_predictor.mask_rcnn_box_predictor.mask_width
+ )
else:
log.info("Given Model type is not supported")
sys.exit(1)
# List of supported models.
- supported_models = ["ssd_mobilenet_v2_keras", "ssd_mobilenet_v1_fpn_keras", "ssd_mobilenet_v2_fpn_keras", "ssd_resnet50_v1_fpn_keras",
- "ssd_resnet101_v1_fpn_keras", "ssd_resnet152_v1_fpn_keras", "faster_rcnn_resnet50_keras", "faster_rcnn_resnet101_keras",
- "faster_rcnn_resnet152_keras", "faster_rcnn_inception_resnet_v2_keras"]
+ supported_models = [
+ "ssd_mobilenet_v2_keras",
+ "ssd_mobilenet_v1_fpn_keras",
+ "ssd_mobilenet_v2_fpn_keras",
+ "ssd_resnet50_v1_fpn_keras",
+ "ssd_resnet101_v1_fpn_keras",
+ "ssd_resnet152_v1_fpn_keras",
+ "faster_rcnn_resnet50_keras",
+ "faster_rcnn_resnet101_keras",
+ "faster_rcnn_resnet152_keras",
+ "faster_rcnn_inception_resnet_v2_keras",
+ ]
assert self.model in supported_models
# Model characteristics.
log.info("Model is {}".format(self.model))
log.info("Height is {}".format(self.height))
log.info("Width is {}".format(self.width))
- log.info("First NMS score threshold is {}".format(self.first_stage_nms_score_threshold))
- log.info("First NMS iou threshold is {}".format(self.first_stage_nms_iou_threshold))
+ log.info(
+ "First NMS score threshold is {}".format(
+ self.first_stage_nms_score_threshold
+ )
+ )
+ log.info(
+ "First NMS iou threshold is {}".format(self.first_stage_nms_iou_threshold)
+ )
log.info("First NMS max proposals is {}".format(self.first_stage_max_proposals))
if "faster_rcnn" in self.model:
log.info("Number of classes is {}".format(self.num_classes))
- log.info("Crop and Resize output size is {}".format(self.first_stage_crop_size))
- log.info("Second NMS score threshold is {}".format(self.second_stage_nms_score_threshold))
- log.info("Second NMS iou threshold is {}".format(self.second_stage_iou_threshold))
- log.info("Using MatMul Crop and Resize: {}".format(self.matmul_crop_and_resize))
+ log.info(
+ "Crop and Resize output size is {}".format(self.first_stage_crop_size)
+ )
+ log.info(
+ "Second NMS score threshold is {}".format(
+ self.second_stage_nms_score_threshold
+ )
+ )
+ log.info(
+ "Second NMS iou threshold is {}".format(self.second_stage_iou_threshold)
+ )
+ log.info(
+ "Using MatMul Crop and Resize: {}".format(self.matmul_crop_and_resize)
+ )
if not (self.mask_height is None and self.mask_width is None):
log.info("Mask height is {}".format(self.mask_height))
log.info("Mask width is {}".format(self.mask_width))
@@ -155,12 +241,16 @@ def sanitize(self):
model = shape_inference.infer_shapes(model)
self.graph = gs.import_onnx(model)
except Exception as e:
- log.info("Shape inference could not be performed at this time:\n{}".format(e))
+ log.info(
+ "Shape inference could not be performed at this time:\n{}".format(e)
+ )
try:
self.graph.fold_constants(fold_shapes=True)
except TypeError as e:
- log.error("This version of ONNX GraphSurgeon does not support folding shapes, please upgrade your "
- "onnx_graphsurgeon module. Error:\n{}".format(e))
+ log.error(
+ "This version of ONNX GraphSurgeon does not support folding shapes, please upgrade your "
+ "onnx_graphsurgeon module. Error:\n{}".format(e)
+ )
raise
count_after = len(self.graph.nodes)
@@ -189,11 +279,22 @@ def add_debug_output(self, debug):
for n, name in enumerate(debug):
if name not in tensors:
log.warning("Could not find tensor '{}'".format(name))
- debug_tensor = gs.Variable(name="debug:{}".format(n), dtype=tensors[name].dtype)
- debug_node = gs.Node(op="Identity", name="debug_{}".format(n), inputs=[tensors[name]], outputs=[debug_tensor])
+ debug_tensor = gs.Variable(
+ name="debug:{}".format(n), dtype=tensors[name].dtype
+ )
+ debug_node = gs.Node(
+ op="Identity",
+ name="debug_{}".format(n),
+ inputs=[tensors[name]],
+ outputs=[debug_tensor],
+ )
self.graph.nodes.append(debug_node)
self.graph.outputs.append(debug_tensor)
- log.info("Adding debug output '{}' for graph tensor '{}'".format(debug_tensor.name, name))
+ log.info(
+ "Adding debug output '{}' for graph tensor '{}'".format(
+ debug_tensor.name, name
+ )
+ )
def update_preprocessor(self, batch_size, input_format):
"""
@@ -208,46 +309,71 @@ def update_preprocessor(self, batch_size, input_format):
assert input_format in ["NCHW", "NHWC"]
input_shape = [None] * 4
if input_format == "NHWC":
- input_shape = [self.batch_size, self.height, self.width, 3]
+ input_shape = [self.batch_size, self.height, self.width, 3]
if input_format == "NCHW":
- input_shape = [self.batch_size, 3, self.height, self.width]
+ input_shape = [self.batch_size, 3, self.height, self.width]
self.graph.inputs[0].shape = input_shape
self.graph.inputs[0].dtype = np.float32
self.graph.inputs[0].name = "input_tensor"
self.sanitize()
- log.info("ONNX graph input shape: {} [NCHW format set]".format(self.graph.inputs[0].shape))
+ log.info(
+ "ONNX graph input shape: {} [NCHW format set]".format(
+ self.graph.inputs[0].shape
+ )
+ )
# Find the initial nodes of the graph, whatever the input is first connected to, and disconnect them.
- for node in [node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs]:
+ for node in [
+ node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs
+ ]:
node.inputs.clear()
# Get input tensor.
# Convert to NCHW format if needed.
input_tensor = self.graph.inputs[0]
if input_format == "NHWC":
- input_tensor = self.graph.transpose("preprocessor/transpose", input_tensor, [0, 3, 1, 2])
+ input_tensor = self.graph.transpose(
+ "preprocessor/transpose", input_tensor, [0, 3, 1, 2]
+ )
# Mobilenets' and inception's backbones preprocessor.
- if 'mobilenet' in self.model or 'inception_resnet' in self.model:
- mul_const = np.expand_dims(np.asarray([2 / 255], dtype=np.float32), axis=(0, 2, 3))
- sub_const = np.expand_dims(np.asarray([1], dtype=np.float32), axis=(0, 2, 3))
- mul_out = self.graph.op_with_const("Mul", "preprocessor/scale", input_tensor, mul_const)
- sub_out = self.graph.op_with_const("Sub", "preprocessor/mean", mul_out, sub_const)
+ if "mobilenet" in self.model or "inception_resnet" in self.model:
+ mul_const = np.expand_dims(
+ np.asarray([2 / 255], dtype=np.float32), axis=(0, 2, 3)
+ )
+ sub_const = np.expand_dims(
+ np.asarray([1], dtype=np.float32), axis=(0, 2, 3)
+ )
+ mul_out = self.graph.op_with_const(
+ "Mul", "preprocessor/scale", input_tensor, mul_const
+ )
+ sub_out = self.graph.op_with_const(
+ "Sub", "preprocessor/mean", mul_out, sub_const
+ )
# Resnet backbones' preprocessor.
- elif 'resnet' in self.model:
- sub_const = np.expand_dims(np.asarray([255 * 0.485, 255 * 0.456, 255 * 0.406], dtype=np.float32), axis=(0, 2, 3))
- sub_out = self.graph.op_with_const("Sub", "preprocessor/mean", input_tensor, sub_const)
+ elif "resnet" in self.model:
+ sub_const = np.expand_dims(
+ np.asarray([255 * 0.485, 255 * 0.456, 255 * 0.406], dtype=np.float32),
+ axis=(0, 2, 3),
+ )
+ sub_out = self.graph.op_with_const(
+ "Sub", "preprocessor/mean", input_tensor, sub_const
+ )
# Backbone is not supported.
else:
- log.info("Given model's backbone is not supported, pre-processor algorithm can't be generated")
+ log.info(
+ "Given model's backbone is not supported, pre-processor algorithm can't be generated"
+ )
sys.exit(1)
# Find first Conv node and connect preprocessor directly to it.
conv_node = self.graph.find_node_by_op("Conv")
- log.info("Found {} node '{}' as stem entry".format(conv_node.op, conv_node.name))
+ log.info(
+ "Found {} node '{}' as stem entry".format(conv_node.op, conv_node.name)
+ )
conv_node.inputs[0] = sub_out[0]
# Disconnect the last node in one of the preprocessing branches with first TensorListStack parent node.
@@ -275,9 +401,17 @@ def find_head_end(self, head_name, descendant, end_op):
# and the Box Net end node has the shape [batch_size, num_anchors, 4].
# These end nodes can be be found by searching for all end_op's operation nodes and checking if the node two
# steps above in the graph has a name that begins with one of head_names for Class Net and Box Net respectively.
- for node in [node for node in self.graph.nodes if node.op == descendant and head_name in node.name]:
+ for node in [
+ node
+ for node in self.graph.nodes
+ if node.op == descendant and head_name in node.name
+ ]:
target_node = self.graph.find_descendant_by_op(node, end_op)
- log.info("Found {} node '{}' as the tip of {}".format(target_node.op, target_node.name, head_name))
+ log.info(
+ "Found {} node '{}' as the tip of {}".format(
+ target_node.op, target_node.name, head_name
+ )
+ )
return target_node
def extract_anchors_tensor(self, split):
@@ -314,14 +448,27 @@ def get_anchor(output_idx, op, depth=5):
anchors_h = get_anchor(2, "Mul")
anchors_w = get_anchor(3, "Mul")
- batched_anchors = np.concatenate([anchors_y, anchors_x, anchors_h, anchors_w], axis=2)
+ batched_anchors = np.concatenate(
+ [anchors_y, anchors_x, anchors_h, anchors_w], axis=2
+ )
# Identify num of anchors without repetitions.
- num_anchors = int(batched_anchors.shape[1]/self.batch_size)
+ num_anchors = int(batched_anchors.shape[1] / self.batch_size)
# Trim total number of anchors in order to not have copies introduced by growing number of batch_size.
- anchors = batched_anchors[0:num_anchors,0:num_anchors]
+ anchors = batched_anchors[0:num_anchors, 0:num_anchors]
return gs.Constant(name="nms/anchors:0", values=anchors)
- def NMS(self, box_net_tensor, class_net_tensor, anchors_tensor, background_class, score_activation, iou_threshold, nms_score_threshold, user_threshold, nms_name=None):
+ def NMS(
+ self,
+ box_net_tensor,
+ class_net_tensor,
+ anchors_tensor,
+ background_class,
+ score_activation,
+ iou_threshold,
+ nms_score_threshold,
+ user_threshold,
+ nms_name=None,
+ ):
# Helper function to create the NMS Plugin node with the selected inputs.
# EfficientNMS_TRT TensorRT Plugin is suitable for our use case.
# :param box_net_tensor: The box predictions from the Box Net.
@@ -341,35 +488,53 @@ def NMS(self, box_net_tensor, class_net_tensor, anchors_tensor, background_class
nms_name = "_" + nms_name
# Set score threshold.
- score_threshold = nms_score_threshold if user_threshold is None else user_threshold
+ score_threshold = (
+ nms_score_threshold if user_threshold is None else user_threshold
+ )
# NMS Outputs.
- nms_output_num_detections = gs.Variable(name="num_detections"+nms_name, dtype=np.int32, shape=[self.batch_size, 1])
- nms_output_boxes = gs.Variable(name="detection_boxes"+nms_name, dtype=np.float32,
- shape=[self.batch_size, self.first_stage_max_proposals, 4])
- nms_output_scores = gs.Variable(name="detection_scores"+nms_name, dtype=np.float32,
- shape=[self.batch_size, self.first_stage_max_proposals])
- nms_output_classes = gs.Variable(name="detection_classes"+nms_name, dtype=np.int32,
- shape=[self.batch_size, self.first_stage_max_proposals])
+ nms_output_num_detections = gs.Variable(
+ name="num_detections" + nms_name, dtype=np.int32, shape=[self.batch_size, 1]
+ )
+ nms_output_boxes = gs.Variable(
+ name="detection_boxes" + nms_name,
+ dtype=np.float32,
+ shape=[self.batch_size, self.first_stage_max_proposals, 4],
+ )
+ nms_output_scores = gs.Variable(
+ name="detection_scores" + nms_name,
+ dtype=np.float32,
+ shape=[self.batch_size, self.first_stage_max_proposals],
+ )
+ nms_output_classes = gs.Variable(
+ name="detection_classes" + nms_name,
+ dtype=np.int32,
+ shape=[self.batch_size, self.first_stage_max_proposals],
+ )
- nms_outputs = [nms_output_num_detections, nms_output_boxes, nms_output_scores, nms_output_classes]
+ nms_outputs = [
+ nms_output_num_detections,
+ nms_output_boxes,
+ nms_output_scores,
+ nms_output_classes,
+ ]
# Plugin.
self.graph.plugin(
op="EfficientNMS_TRT",
- name="nms/non_maximum_suppression"+nms_name,
+ name="nms/non_maximum_suppression" + nms_name,
inputs=[box_net_tensor, class_net_tensor, anchors_tensor],
outputs=nms_outputs,
attrs={
- 'plugin_version': "1",
- 'background_class': background_class,
- 'max_output_boxes': self.first_stage_max_proposals,
- 'score_threshold': max(0.01, score_threshold),
- 'iou_threshold': iou_threshold,
- 'score_activation': score_activation,
- 'class_agnostic': False,
- 'box_coding': 1,
- }
+ "plugin_version": "1",
+ "background_class": background_class,
+ "max_output_boxes": self.first_stage_max_proposals,
+ "score_threshold": max(0.01, score_threshold),
+ "iou_threshold": iou_threshold,
+ "score_activation": score_activation,
+ "class_agnostic": False,
+ "box_coding": 1,
+ },
)
log.info("Created 'nms/non_maximum_suppression{}' NMS plugin".format(nms_name))
@@ -384,15 +549,26 @@ def CropAndResize(self, unsqeeze_input, relu_node_outputs, cnr_num):
# CropAndResizePlugin requires 4th dimension of 1: [N, B, 4, 1], so
# we need to add unsqeeze node to make tensor 4 dimensional.
- unsqueeze_node = self.graph.unsqueeze("CNR/detection_boxes_unsqueeze_"+cnr_num, unsqeeze_input)
+ unsqueeze_node = self.graph.unsqueeze(
+ "CNR/detection_boxes_unsqueeze_" + cnr_num, unsqeeze_input
+ )
# CropAndResizePlugin's inputs
feature_maps = relu_node_outputs
rois = unsqueeze_node[0]
# CropAndResize Outputs.
- cnr_pfmap = gs.Variable(name="cnr/pfmap_"+cnr_num, dtype=np.float32,
- shape=[self.batch_size, self.first_stage_max_proposals, feature_maps.shape[1], self.first_stage_crop_size, self.first_stage_crop_size])
+ cnr_pfmap = gs.Variable(
+ name="cnr/pfmap_" + cnr_num,
+ dtype=np.float32,
+ shape=[
+ self.batch_size,
+ self.first_stage_max_proposals,
+ feature_maps.shape[1],
+ self.first_stage_crop_size,
+ self.first_stage_crop_size,
+ ],
+ )
# Create the CropandResize Plugin node with the selected inputs.
# Two inputs are given to the CropAndResize TensorRT node:
@@ -400,19 +576,29 @@ def CropAndResize(self, unsqeeze_input, relu_node_outputs, cnr_num):
# - The rois (clipped and normalized detection boxes resulting from NMS): [batch_size, featuremap, 4, 1]
self.graph.plugin(
op="CropAndResize",
- name="cnr/crop_and_resize_"+cnr_num,
+ name="cnr/crop_and_resize_" + cnr_num,
inputs=[feature_maps, rois],
outputs=[cnr_pfmap],
attrs={
- 'crop_width': self.first_stage_crop_size,
- 'crop_height': self.first_stage_crop_size,
- }
+ "crop_width": self.first_stage_crop_size,
+ "crop_height": self.first_stage_crop_size,
+ },
)
log.info("Created {} CropAndResize plugin".format(cnr_num))
# Reshape node that is preparing CropAndResize's pfmap output shape for MaxPool node that comes next.
- reshape_shape = np.asarray([self.first_stage_max_proposals*self.batch_size, feature_maps.shape[1], self.first_stage_crop_size, self.first_stage_crop_size], dtype=np.int64)
- reshape_node = self.graph.op_with_const("Reshape", "cnr/reshape_"+cnr_num, cnr_pfmap, reshape_shape)
+ reshape_shape = np.asarray(
+ [
+ self.first_stage_max_proposals * self.batch_size,
+ feature_maps.shape[1],
+ self.first_stage_crop_size,
+ self.first_stage_crop_size,
+ ],
+ dtype=np.int64,
+ )
+ reshape_node = self.graph.op_with_const(
+ "Reshape", "cnr/reshape_" + cnr_num, cnr_pfmap, reshape_shape
+ )
return reshape_node[0]
@@ -423,7 +609,10 @@ def process_graph(self, first_nms_threshold=None, second_nms_threshold=None):
:param first_nms_threshold: Override the 1st NMS score threshold value. If set to None, use the value in the graph.
:param second_nms_threshold: Override the 2nd NMS score threshold value. If set to None, use the value in the graph.
"""
- def first_nms(background_class, score_activation, first_nms_threshold, nms_name=None):
+
+ def first_nms(
+ background_class, score_activation, first_nms_threshold, nms_name=None
+ ):
"""
Updates the graph to replace the 1st NMS op by EfficientNMS_TRT TensorRT plugin node.
:param background_class: Set EfficientNMS_TRT's background_class atribute.
@@ -432,35 +621,67 @@ def first_nms(background_class, score_activation, first_nms_threshold, nms_name=
:param nms_name: Set the NMS node name.
"""
# Supported models
- ssd_models = ['ssd_mobilenet_v1_fpn_keras', 'ssd_mobilenet_v2_fpn_keras', 'ssd_resnet50_v1_fpn_keras', 'ssd_resnet101_v1_fpn_keras', 'ssd_resnet152_v1_fpn_keras']
- frcnn_models = ['faster_rcnn_resnet50_keras', 'faster_rcnn_resnet101_keras', 'faster_rcnn_resnet152_keras', 'faster_rcnn_inception_resnet_v2_keras']
+ ssd_models = [
+ "ssd_mobilenet_v1_fpn_keras",
+ "ssd_mobilenet_v2_fpn_keras",
+ "ssd_resnet50_v1_fpn_keras",
+ "ssd_resnet101_v1_fpn_keras",
+ "ssd_resnet152_v1_fpn_keras",
+ ]
+ frcnn_models = [
+ "faster_rcnn_resnet50_keras",
+ "faster_rcnn_resnet101_keras",
+ "faster_rcnn_resnet152_keras",
+ "faster_rcnn_inception_resnet_v2_keras",
+ ]
# Getting SSD's Class and Box Nets final tensors.
if "ssd" in self.model:
# Find the concat node at the end of the class net (multi-scale class predictor).
- class_net_head_name = 'BoxPredictor/ConvolutionalClassHead_' if self.model == 'ssd_mobilenet_v2_keras' else 'WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalClassHead'
- class_net = self.find_head_end(class_net_head_name, "Transpose", "Concat")
+ class_net_head_name = (
+ "BoxPredictor/ConvolutionalClassHead_"
+ if self.model == "ssd_mobilenet_v2_keras"
+ else "WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalClassHead"
+ )
+ class_net = self.find_head_end(
+ class_net_head_name, "Transpose", "Concat"
+ )
# Final Class Net tensor
- class_net_tensor = self.graph.slice(class_net_head_name+"/slicer", class_net.outputs[0], 1, 91, 2)[0] # Remove background class
+ class_net_tensor = self.graph.slice(
+ class_net_head_name + "/slicer", class_net.outputs[0], 1, 91, 2
+ )[
+ 0
+ ] # Remove background class
# Find the concat or squeeze node at the end of the box net (multi-scale localization predictor).
- if self.model == 'ssd_mobilenet_v2_keras':
- box_net_head_name = 'BoxPredictor/ConvolutionalBoxHead_'
- box_net = self.find_head_end(box_net_head_name, "Transpose", "Squeeze")
+ if self.model == "ssd_mobilenet_v2_keras":
+ box_net_head_name = "BoxPredictor/ConvolutionalBoxHead_"
+ box_net = self.find_head_end(
+ box_net_head_name, "Transpose", "Squeeze"
+ )
else:
- box_net_head_name = 'WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalBoxHead'
- box_net = self.find_head_end(box_net_head_name, "Transpose", "Concat")
+ box_net_head_name = "WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalBoxHead"
+ box_net = self.find_head_end(
+ box_net_head_name, "Transpose", "Concat"
+ )
box_net_output = box_net.outputs[0]
# 0.1, 0.1, 0.2, 0.2 are localization head variance numbers, they scale box_net_output in order to get accurate coordinates.
- variance_adj = np.expand_dims(np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1))
+ variance_adj = np.expand_dims(
+ np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1)
+ )
# Final Box Net tensor.
- box_net_tensor = self.graph.op_with_const("Mul", box_net_head_name+"/scale", box_net_output, variance_adj)[0]
+ box_net_tensor = self.graph.op_with_const(
+ "Mul", box_net_head_name + "/scale", box_net_output, variance_adj
+ )[0]
# Getting Faster R-CNN's 1st Class and Box Nets tensors.
elif "faster_rcnn" in self.model:
# Identify Class Net and Box Net head names
- head_names = ['FirstStageBoxPredictor/ConvolutionalClassHead_0/ClassPredictor','FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor']
+ head_names = [
+ "FirstStageBoxPredictor/ConvolutionalClassHead_0/ClassPredictor",
+ "FirstStageBoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor",
+ ]
# Find the softmax node at the end of the class net (multi-scale class predictor).
class_net = self.find_head_end(head_names[0], "Transpose", "Softmax")
@@ -472,12 +693,18 @@ def first_nms(background_class, score_activation, first_nms_threshold, nms_name=
# Final Box Net tensor.
box_net_output = box_net.outputs[0]
- #Insert a squeeze node
- squeeze_node = self.graph.squeeze(head_names[1]+"/squeeze", box_net_output)
+ # Insert a squeeze node
+ squeeze_node = self.graph.squeeze(
+ head_names[1] + "/squeeze", box_net_output
+ )
# 0.1, 0.1, 0.2, 0.2 are localization head variance numbers, they scale box_net_output, in order to get accurate coordinates.
- variance_adj = np.expand_dims(np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1))
+ variance_adj = np.expand_dims(
+ np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1)
+ )
# Final Box Net tensor.
- box_net_tensor = self.graph.op_with_const("Mul", head_names[1]+"/scale", squeeze_node, variance_adj)[0]
+ box_net_tensor = self.graph.op_with_const(
+ "Mul", head_names[1] + "/scale", squeeze_node, variance_adj
+ )[0]
# Find the split node that separates the box net coordinates and feeds them into the box decoder.
box_net_split = self.graph.find_descendant_by_op(box_net, "Split")
@@ -487,7 +714,17 @@ def first_nms(background_class, score_activation, first_nms_threshold, nms_name=
anchors_tensor = self.extract_anchors_tensor(box_net_split)
# Create NMS node.
- nms_outputs = self.NMS(box_net_tensor, class_net_tensor, anchors_tensor, background_class, score_activation, self.first_stage_nms_iou_threshold, self.first_stage_nms_score_threshold, first_nms_threshold, nms_name)
+ nms_outputs = self.NMS(
+ box_net_tensor,
+ class_net_tensor,
+ anchors_tensor,
+ background_class,
+ score_activation,
+ self.first_stage_nms_iou_threshold,
+ self.first_stage_nms_score_threshold,
+ first_nms_threshold,
+ nms_name,
+ )
# Return NMS's outputs.
return nms_outputs
@@ -501,26 +738,47 @@ def first_cnr(input):
# Locate the last Relu node of the first backbone (pre 1st NMS). Relu node contains feature maps
# necessary for CropAndResize plugin.
relu_name = "StatefulPartitionedCall/model/"
- relu_node = [node for node in self.graph.nodes if node.op == "Relu" and relu_name in node.name][-1]
+ relu_node = [
+ node
+ for node in self.graph.nodes
+ if node.op == "Relu" and relu_name in node.name
+ ][-1]
# Before passing 1st NMS's detection boxes (rois) to CropAndResize, we need to clip and normalize them.
# Clipping happens for coordinates that are less than 0 and more than self.height.
# Normalization is just divison of every coordinate by self.height.
- clip_out = self.graph.clip("FirstNMS/detection_boxes_clipper", input, 0, self.height)
- div_const = np.expand_dims(np.asarray([self.height, self.width, self.height, self.width], dtype=np.float32), axis=(0, 1))
- div_out = self.graph.op_with_const("Div", "FirstNMS/detection_boxes_normalizer", clip_out[0], div_const)
+ clip_out = self.graph.clip(
+ "FirstNMS/detection_boxes_clipper", input, 0, self.height
+ )
+ div_const = np.expand_dims(
+ np.asarray(
+ [self.height, self.width, self.height, self.width], dtype=np.float32
+ ),
+ axis=(0, 1),
+ )
+ div_out = self.graph.op_with_const(
+ "Div", "FirstNMS/detection_boxes_normalizer", clip_out[0], div_const
+ )
# Linear transformation to convert box coordinates from (TopLeft, BottomRight) Corner encoding
# to CenterSize encoding. 1st NMS boxes are multiplied by transformation matrix in order to
# encode it into CenterSize format.
- matmul_const = np.matrix('0.5 0 -1 0; 0 0.5 0 -1; 0.5 0 1 0; 0 0.5 0 1', dtype=np.float32)
- matmul_out = self.graph.matmul("FirstNMS/detection_boxes_conversion", div_out[0], matmul_const)
+ matmul_const = np.matrix(
+ "0.5 0 -1 0; 0 0.5 0 -1; 0.5 0 1 0; 0 0.5 0 1", dtype=np.float32
+ )
+ matmul_out = self.graph.matmul(
+ "FirstNMS/detection_boxes_conversion", div_out[0], matmul_const
+ )
# Create Crop and Resize node.
cnr_output = self.CropAndResize(div_out, relu_node.outputs[0], "first")
# Find MaxPool node that summarizes CropAndResize structure.
- maxpool_node = [node for node in self.graph.nodes if node.op == "MaxPool" and "MaxPool2D/MaxPool" in node.name][0]
+ maxpool_node = [
+ node
+ for node in self.graph.nodes
+ if node.op == "MaxPool" and "MaxPool2D/MaxPool" in node.name
+ ][0]
maxpool_node.inputs[0] = cnr_output
# Return linear transformation node, it will be located between 1st and 2nd NMS,
@@ -528,7 +786,13 @@ def first_cnr(input):
# In case you are converting Mask R-CNN, feature maps are required for 2nd CropAndResize.
return matmul_out[0], relu_node.outputs[0]
- def second_nms(background_class, score_activation, encoded_boxes, second_nms_threshold, nms_name=None):
+ def second_nms(
+ background_class,
+ score_activation,
+ encoded_boxes,
+ second_nms_threshold,
+ nms_name=None,
+ ):
"""
Updates the graph to replace the 2nd (or final) NMS op by EfficientNMS_TRT TensorRT plugin node.
:param background_class: Set EfficientNMS_TRT's background_class atribute.
@@ -539,14 +803,20 @@ def second_nms(background_class, score_activation, encoded_boxes, second_nms_thr
"""
# Identify Class Net and Box Net head names.
- second_head_names = ['StatefulPartitionedCall/mask_rcnn_keras_box_predictor/mask_rcnn_class_head/ClassPredictor_dense',
- 'StatefulPartitionedCall/mask_rcnn_keras_box_predictor/mask_rcnn_box_head/BoxEncodingPredictor_dense']
+ second_head_names = [
+ "StatefulPartitionedCall/mask_rcnn_keras_box_predictor/mask_rcnn_class_head/ClassPredictor_dense",
+ "StatefulPartitionedCall/mask_rcnn_keras_box_predictor/mask_rcnn_box_head/BoxEncodingPredictor_dense",
+ ]
# Find the softmax node at the end of the 2nd class net (multi-scale class predictor).
- second_class_net = self.find_head_end(second_head_names[0], "MatMul", "Softmax")
+ second_class_net = self.find_head_end(
+ second_head_names[0], "MatMul", "Softmax"
+ )
# Faster R-CNN's slice operation to adjust third dimension of Class Net's last node tensor (adjusting class values).
- slice_out = self.graph.slice(second_head_names[0]+"/slicer", second_class_net.outputs[0], 1, 91, 2)
+ slice_out = self.graph.slice(
+ second_head_names[0] + "/slicer", second_class_net.outputs[0], 1, 91, 2
+ )
# Final Class Net tensor.
second_class_net_tensor = slice_out[0]
@@ -561,19 +831,56 @@ def second_nms(background_class, score_activation, encoded_boxes, second_nms_thr
# If use_matmul_crop_and_resize in pipeline.config is set to True, expect: [batch_size, first_stage_max_proposals, 4].
# Else use_matmul_crop_and_resize is either False or absent, expect: [batch_size, first_stage_max_proposals, num_classes, 4]
if self.matmul_crop_and_resize:
- reshape_shape_second = np.asarray([self.batch_size, self.first_stage_max_proposals, second_box_net.outputs[0].shape[1]], dtype=np.int64)
+ reshape_shape_second = np.asarray(
+ [
+ self.batch_size,
+ self.first_stage_max_proposals,
+ second_box_net.outputs[0].shape[1],
+ ],
+ dtype=np.int64,
+ )
else:
- reshape_shape_second = np.asarray([self.batch_size, self.first_stage_max_proposals, self.num_classes, second_box_net.outputs[0].shape[1]/self.num_classes], dtype=np.int64)
- reshape_node_second = self.graph.op_with_const("Reshape", second_head_names[1]+"/reshape", second_box_net_output, reshape_shape_second)
+ reshape_shape_second = np.asarray(
+ [
+ self.batch_size,
+ self.first_stage_max_proposals,
+ self.num_classes,
+ second_box_net.outputs[0].shape[1] / self.num_classes,
+ ],
+ dtype=np.int64,
+ )
+ reshape_node_second = self.graph.op_with_const(
+ "Reshape",
+ second_head_names[1] + "/reshape",
+ second_box_net_output,
+ reshape_shape_second,
+ )
# 0.1, 0.1, 0.2, 0.2 are localization head variance numbers, they scale second_box_net_output, in order to get accurate coordinates.
- second_scale_adj = np.expand_dims(np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1))
- second_scale_out = self.graph.op_with_const("Mul", second_head_names[1]+"/scale_second", reshape_node_second[0], second_scale_adj)
+ second_scale_adj = np.expand_dims(
+ np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1)
+ )
+ second_scale_out = self.graph.op_with_const(
+ "Mul",
+ second_head_names[1] + "/scale_second",
+ reshape_node_second[0],
+ second_scale_adj,
+ )
# Final Box Net tensor.
second_box_net_tensor = second_scale_out[0]
# Create NMS node.
- nms_outputs = self.NMS(second_box_net_tensor, second_class_net_tensor, encoded_boxes, background_class, score_activation, self.second_stage_iou_threshold, self.second_stage_nms_score_threshold, second_nms_threshold, nms_name)
+ nms_outputs = self.NMS(
+ second_box_net_tensor,
+ second_class_net_tensor,
+ encoded_boxes,
+ background_class,
+ score_activation,
+ self.second_stage_iou_threshold,
+ self.second_stage_nms_score_threshold,
+ second_nms_threshold,
+ nms_name,
+ )
return nms_outputs
@@ -585,24 +892,36 @@ def second_cnr(feature_maps, second_nms_outputs):
# Before passing 2nd NMS's detection boxes (rois) to second CropAndResize, we need to clip them.
# Clipping happens for coordinates that are less than 0 and more than 1 (binary).
- clip_out = self.graph.clip("SecondNMS/detection_boxes_clipper", second_nms_outputs[1], 0, 1)
+ clip_out = self.graph.clip(
+ "SecondNMS/detection_boxes_clipper", second_nms_outputs[1], 0, 1
+ )
# Create Crop and Resize node.
cnr_output = self.CropAndResize(clip_out, feature_maps, "second")
# Find MaxPool node that summarizes CropAndResize structure
- maxpool_node = [node for node in self.graph.nodes if node.op == "MaxPool" and "MaxPool2D/MaxPool_1" in node.name][0]
+ maxpool_node = [
+ node
+ for node in self.graph.nodes
+ if node.op == "MaxPool" and "MaxPool2D/MaxPool_1" in node.name
+ ][0]
maxpool_node.inputs[0] = cnr_output
# Reshape node that is preparing 2nd NMS class outputs for Add node that comes next.
# [self.batch_size, self.first_stage_max_proposals] -> [self.first_stage_max_proposals*self.batch_size]
- class_reshape_shape = np.asarray([self.first_stage_max_proposals*self.batch_size], dtype=np.int64)
- class_reshape_node = self.graph.op_with_const("Reshape", "Reshape_Class", second_nms_outputs[3], class_reshape_shape)
+ class_reshape_shape = np.asarray(
+ [self.first_stage_max_proposals * self.batch_size], dtype=np.int64
+ )
+ class_reshape_node = self.graph.op_with_const(
+ "Reshape", "Reshape_Class", second_nms_outputs[3], class_reshape_shape
+ )
# Find sigmoid node in the end of the network, applies sigmoid to get instance segmentation masks
- last_sigmoid_node = self.graph.find_descendant_by_op(maxpool_node, "Sigmoid", 40)
+ last_sigmoid_node = self.graph.find_descendant_by_op(
+ maxpool_node, "Sigmoid", 40
+ )
- if (self.num_classes > 1):
+ if self.num_classes > 1:
# Find first ancestor of Sigmoid of operation type Add. This Add node is one of the Gather node inputs,
# Gather node performs gather on 0th axis of data tensor and requires indices that set tesnors to be withing bounds,
# this Add node provides the bounds for Gather.
@@ -610,8 +929,21 @@ def second_cnr(feature_maps, second_nms_outputs):
add_node.inputs[1] = class_reshape_node[0]
# Final Reshape node, reshapes output of Sigmoid, important for various batch_size support.
- final_reshape_shape = np.asarray([self.batch_size, self.first_stage_max_proposals, self.mask_height, self.mask_width], dtype=np.int64)
- final_reshape_node = self.graph.op_with_const("Reshape", "Reshape_Final_Masks", last_sigmoid_node.outputs[0], final_reshape_shape)
+ final_reshape_shape = np.asarray(
+ [
+ self.batch_size,
+ self.first_stage_max_proposals,
+ self.mask_height,
+ self.mask_width,
+ ],
+ dtype=np.int64,
+ )
+ final_reshape_node = self.graph.op_with_const(
+ "Reshape",
+ "Reshape_Final_Masks",
+ last_sigmoid_node.outputs[0],
+ final_reshape_shape,
+ )
final_reshape_node[0].dtype = np.float32
final_reshape_node[0].name = "detection_masks"
@@ -623,17 +955,27 @@ def second_cnr(feature_maps, second_nms_outputs):
self.graph.outputs = first_nms(-1, True, first_nms_threshold)
self.sanitize()
# If your model is Faster R-CNN, you will need 2 NMS nodes with CropAndResize in between.
- elif "faster_rcnn" in self.model and self.mask_height is None and self.mask_width is None:
+ elif (
+ "faster_rcnn" in self.model
+ and self.mask_height is None
+ and self.mask_width is None
+ ):
first_nms_outputs = first_nms(0, False, first_nms_threshold, "rpn")
first_cnr_output, feature_maps = first_cnr(first_nms_outputs[1])
# Set graph outputs.
- self.graph.outputs = second_nms(-1, False, first_cnr_output, second_nms_threshold)
+ self.graph.outputs = second_nms(
+ -1, False, first_cnr_output, second_nms_threshold
+ )
self.sanitize()
# Mask R-CNN
- elif "faster_rcnn" in self.model and not (self.mask_height is None and self.mask_width is None):
+ elif "faster_rcnn" in self.model and not (
+ self.mask_height is None and self.mask_width is None
+ ):
first_nms_outputs = first_nms(0, False, first_nms_threshold, "rpn")
first_cnr_output, feature_maps = first_cnr(first_nms_outputs[1])
- second_nms_outputs = second_nms(-1, False, first_cnr_output, second_nms_threshold)
+ second_nms_outputs = second_nms(
+ -1, False, first_cnr_output, second_nms_threshold
+ )
second_cnr_output = second_cnr(feature_maps, second_nms_outputs)
# Append segmentation head output.
second_nms_outputs.append(second_cnr_output)
@@ -655,20 +997,57 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-p", "--pipeline_config", help="Pipeline configuration file to load", type=str)
- parser.add_argument("-m", "--saved_model", help="The TensorFlow saved model directory to load", type=str)
- parser.add_argument("-o", "--onnx", help="The output ONNX model file to write", type=str)
- parser.add_argument("-b", "--batch_size", help="Batch size for the model", type=int, default=1)
- parser.add_argument("-t1", "--first_nms_threshold", help="Override the score threshold for the 1st NMS operation", type=float)
- parser.add_argument("-t2", "--second_nms_threshold", help="Override the score threshold for the 2nd NMS operation", type=float)
- parser.add_argument("-d", "--debug", action='append', help="Add an extra output to debug a particular node")
- parser.add_argument("-f", "--input_format", default="NHWC", choices=["NHWC", "NCHW"],
- help="Set the input shape of the graph, as comma-separated dimensions in NCHW or NHWC format, default: NHWC")
- parser.add_argument("--tf2onnx", help="The path where to save the intermediate ONNX graph generated by tf2onnx, "
- "useful for debugging purposes, default: not saved", type=str)
+ parser.add_argument(
+ "-p", "--pipeline_config", help="Pipeline configuration file to load", type=str
+ )
+ parser.add_argument(
+ "-m",
+ "--saved_model",
+ help="The TensorFlow saved model directory to load",
+ type=str,
+ )
+ parser.add_argument(
+ "-o", "--onnx", help="The output ONNX model file to write", type=str
+ )
+ parser.add_argument(
+ "-b", "--batch_size", help="Batch size for the model", type=int, default=1
+ )
+ parser.add_argument(
+ "-t1",
+ "--first_nms_threshold",
+ help="Override the score threshold for the 1st NMS operation",
+ type=float,
+ )
+ parser.add_argument(
+ "-t2",
+ "--second_nms_threshold",
+ help="Override the score threshold for the 2nd NMS operation",
+ type=float,
+ )
+ parser.add_argument(
+ "-d",
+ "--debug",
+ action="append",
+ help="Add an extra output to debug a particular node",
+ )
+ parser.add_argument(
+ "-f",
+ "--input_format",
+ default="NHWC",
+ choices=["NHWC", "NCHW"],
+ help="Set the input shape of the graph, as comma-separated dimensions in NCHW or NHWC format, default: NHWC",
+ )
+ parser.add_argument(
+ "--tf2onnx",
+ help="The path where to save the intermediate ONNX graph generated by tf2onnx, "
+ "useful for debugging purposes, default: not saved",
+ type=str,
+ )
args = parser.parse_args()
if not all([args.pipeline_config, args.saved_model, args.onnx]):
parser.print_help()
- print("\nThese arguments are required: --pipeline_config, --saved_model and --onnx")
+ print(
+ "\nThese arguments are required: --pipeline_config, --saved_model and --onnx"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/tensorflow_object_detection_api/eval_coco.py b/samples/python/tensorflow_object_detection_api/eval_coco.py
index 5086c660..f04c17f3 100644
--- a/samples/python/tensorflow_object_detection_api/eval_coco.py
+++ b/samples/python/tensorflow_object_detection_api/eval_coco.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,23 +24,35 @@
from infer import TensorRTInfer
from image_batcher import ImageBatcher
+
def main(args):
try:
import object_detection.metrics.coco_tools as coco_tools
except ImportError:
- print("Could not import the 'object_detection.metrics.coco_tools' module from TFOD. Maybe you did not install TFOD API")
- print("Please install TensorFlow 2 Object Detection API, check https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/install.html")
+ print(
+ "Could not import the 'object_detection.metrics.coco_tools' module from TFOD. Maybe you did not install TFOD API"
+ )
+ print(
+ "Please install TensorFlow 2 Object Detection API, check https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/install.html"
+ )
sys.exit(1)
- trt_infer = TensorRTInfer(args.engine, args.preprocessor, args.detection_type, args.iou_threshold)
- batcher = ImageBatcher(args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor)
+ trt_infer = TensorRTInfer(
+ args.engine, args.preprocessor, args.detection_type, args.iou_threshold
+ )
+ batcher = ImageBatcher(
+ args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor
+ )
# Read annotations json as dictionary.
with open(args.annotations) as f:
data = json.load(f)
groundtruth = coco_tools.COCOWrapper(data, detection_type=args.detection_type)
detections_list = []
for batch, images, scales in batcher.get_batch():
- print("Processing Image {} / {}".format(batcher.image_index, batcher.num_images), end="\r")
+ print(
+ "Processing Image {} / {}".format(batcher.image_index, batcher.num_images),
+ end="\r",
+ )
detections = trt_infer.infer(batch, scales, args.nms_threshold)
for i in range(len(images)):
# Get inference image resolution.
@@ -49,43 +61,52 @@ def main(args):
for n in range(len(detections[i])):
source_id = int(os.path.splitext(os.path.basename(images[i]))[0])
det = detections[i][n]
- if args.detection_type == 'bbox':
+ if args.detection_type == "bbox":
coco_det = {
- 'image_id': source_id,
- 'category_id': det['class']+1, # adjust class num
- 'bbox': [det['xmin'], det['ymin'], det['xmax'] - det['xmin'], det['ymax'] - det['ymin']],
- 'score': det['score']
+ "image_id": source_id,
+ "category_id": det["class"] + 1, # adjust class num
+ "bbox": [
+ det["xmin"],
+ det["ymin"],
+ det["xmax"] - det["xmin"],
+ det["ymax"] - det["ymin"],
+ ],
+ "score": det["score"],
}
detections_list.append(coco_det)
- elif args.detection_type == 'segmentation':
+ elif args.detection_type == "segmentation":
# Get detection bbox resolution.
- det_width = round(det['xmax'] - det['xmin'])
- det_height = round(det['ymax'] - det['ymin'])
+ det_width = round(det["xmax"] - det["xmin"])
+ det_height = round(det["ymax"] - det["ymin"])
# Create an image out of predicted mask array.
- small_mask = Image.fromarray(det['mask'])
+ small_mask = Image.fromarray(det["mask"])
# Upsample mask to detection bbox's size.
- mask = small_mask.resize((det_width, det_height), resample=Image.BILINEAR)
+ mask = small_mask.resize(
+ (det_width, det_height), resample=Image.BILINEAR
+ )
# Create an original image sized template for correct mask placement.
pad = Image.new("L", (im_width, im_height))
# Place your mask according to detection bbox placement.
- pad.paste(mask, (round(det['xmin']), (round(det['ymin']))))
+ pad.paste(mask, (round(det["xmin"]), (round(det["ymin"]))))
# Reconvert mask into numpy array for evaluation.
padded_mask = np.array(pad)
# Add one more dimension of 1, this is required by ExportSingleImageDetectionMasksToCoco.
final_mask = padded_mask[np.newaxis, :, :]
# Export detection mask to COCO format
- coco_mask = coco_tools.ExportSingleImageDetectionMasksToCoco(image_id=source_id,
- category_id_set=set(list(range(1,91))),
- detection_classes=np.array([det['class']+1]),
- detection_scores=np.array([det['score']]),
- detection_masks=final_mask)
+ coco_mask = coco_tools.ExportSingleImageDetectionMasksToCoco(
+ image_id=source_id,
+ category_id_set=set(list(range(1, 91))),
+ detection_classes=np.array([det["class"] + 1]),
+ detection_scores=np.array([det["score"]]),
+ detection_masks=final_mask,
+ )
detections_list.append(coco_mask[0])
# Finish evalutions.
detections = groundtruth.LoadAnnotations(detections_list)
- if args.detection_type == 'bbox':
+ if args.detection_type == "bbox":
evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections, iou_type="bbox")
- elif args.detection_type == 'segmentation':
+ elif args.detection_type == "segmentation":
evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections, iou_type="segm")
evaluator.ComputeMetrics()
@@ -93,20 +114,46 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--engine", help="The TensorRT engine to infer with.")
- parser.add_argument("-i", "--input",
- help="The input to infer, either a single image path, or a directory of images.")
- parser.add_argument("-d", "--detection_type", default="bbox", choices=["bbox", "segmentation"],
- help="Detection type for COCO, either bbox or if you are using Mask R-CNN's instance segmentation - segmentation.")
- parser.add_argument("-a", "--annotations", help="Set the json file to use for COCO instance annotations.")
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.")
- parser.add_argument("--iou_threshold", default=0.5, type=float,
- help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0.")
- parser.add_argument("--preprocessor", default="fixed_shape_resizer", choices=["fixed_shape_resizer", "keep_aspect_ratio_resizer"],
- help="Select the image preprocessor to use based on your pipeline.config, either 'fixed_shape_resizer' or 'keep_aspect_ratio_resizer', default: fixed_shape_resizer.")
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="The input to infer, either a single image path, or a directory of images.",
+ )
+ parser.add_argument(
+ "-d",
+ "--detection_type",
+ default="bbox",
+ choices=["bbox", "segmentation"],
+ help="Detection type for COCO, either bbox or if you are using Mask R-CNN's instance segmentation - segmentation.",
+ )
+ parser.add_argument(
+ "-a",
+ "--annotations",
+ help="Set the json file to use for COCO instance annotations.",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.",
+ )
+ parser.add_argument(
+ "--iou_threshold",
+ default=0.5,
+ type=float,
+ help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0.",
+ )
+ parser.add_argument(
+ "--preprocessor",
+ default="fixed_shape_resizer",
+ choices=["fixed_shape_resizer", "keep_aspect_ratio_resizer"],
+ help="Select the image preprocessor to use based on your pipeline.config, either 'fixed_shape_resizer' or 'keep_aspect_ratio_resizer', default: fixed_shape_resizer.",
+ )
args = parser.parse_args()
if not all([args.engine, args.input, args.annotations, args.preprocessor]):
parser.print_help()
- print("\nThese arguments are required: --engine --input --output and --preprocessor")
+ print(
+ "\nThese arguments are required: --engine --input --output and --preprocessor"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/tensorflow_object_detection_api/image_batcher.py b/samples/python/tensorflow_object_detection_api/image_batcher.py
index c40e86c8..202e998d 100644
--- a/samples/python/tensorflow_object_detection_api/image_batcher.py
+++ b/samples/python/tensorflow_object_detection_api/image_batcher.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,7 +27,15 @@ class ImageBatcher:
Creates batches of pre-processed images.
"""
- def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False, preprocessor="fixed_shape_resizer"):
+ def __init__(
+ self,
+ input,
+ shape,
+ dtype,
+ max_num_images=None,
+ exact_batches=False,
+ preprocessor="fixed_shape_resizer",
+ ):
"""
:param input: The input directory to read images from.
:param shape: The tensor shape of the batch to prepare, either in NCHW or NHWC format.
@@ -45,10 +53,16 @@ def __init__(self, input, shape, dtype, max_num_images=None, exact_batches=False
extensions = [".jpg", ".jpeg", ".png", ".bmp"]
def is_image(path):
- return os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ return (
+ os.path.isfile(path) and os.path.splitext(path)[1].lower() in extensions
+ )
if os.path.isdir(input):
- self.images = [os.path.join(input, f) for f in os.listdir(input) if is_image(os.path.join(input, f))]
+ self.images = [
+ os.path.join(input, f)
+ for f in os.listdir(input)
+ if is_image(os.path.join(input, f))
+ ]
self.images.sort()
elif os.path.isfile(input):
if is_image(input):
@@ -85,7 +99,7 @@ def is_image(path):
if self.num_images < 1:
print("Not enough images to create batches")
sys.exit(1)
- self.images = self.images[0:self.num_images]
+ self.images = self.images[0 : self.num_images]
# Subdivide the list of images into batches
self.num_batches = 1 + int((self.num_images - 1) / self.batch_size)
@@ -133,7 +147,10 @@ def resize_pad(image, pad_color=(0, 0, 0)):
return image, scale
elif self.preprocessor == "keep_aspect_ratio_resizer":
scale = 1.0 / max(width_scale, height_scale)
- image = image.resize((round(width * scale), round(height * scale)), resample=Image.BILINEAR)
+ image = image.resize(
+ (round(width * scale), round(height * scale)),
+ resample=Image.BILINEAR,
+ )
pad = Image.new("RGB", (self.width, self.height))
pad.paste(pad_color, [0, 0, self.width, self.height])
pad.paste(image)
@@ -141,9 +158,12 @@ def resize_pad(image, pad_color=(0, 0, 0)):
scale = None
image = Image.open(image_path)
- image = image.convert(mode='RGB')
- if self.preprocessor == "fixed_shape_resizer" or self.preprocessor == "keep_aspect_ratio_resizer":
- #Resize & Pad with ImageNet mean values and keep as [0,255] Normalization
+ image = image.convert(mode="RGB")
+ if (
+ self.preprocessor == "fixed_shape_resizer"
+ or self.preprocessor == "keep_aspect_ratio_resizer"
+ ):
+ # Resize & Pad with ImageNet mean values and keep as [0,255] Normalization
image, scale = resize_pad(image, (124, 116, 104))
image = np.asarray(image, dtype=self.dtype)
else:
diff --git a/samples/python/tensorflow_object_detection_api/infer.py b/samples/python/tensorflow_object_detection_api/infer.py
index 3ea07863..298b7a0c 100644
--- a/samples/python/tensorflow_object_detection_api/infer.py
+++ b/samples/python/tensorflow_object_detection_api/infer.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,6 +28,7 @@
from image_batcher import ImageBatcher
from visualize import visualize_detections
+
class TensorRTInfer:
"""
Implements inference for the Model TensorRT engine.
@@ -68,11 +69,11 @@ def __init__(self, engine_path, preprocessor, detection_type, iou_threshold):
size *= s
allocation = common.cuda_call(cudart.cudaMalloc(size))
binding = {
- 'index': i,
- 'name': name,
- 'dtype': np.dtype(trt.nptype(dtype)),
- 'shape': list(shape),
- 'allocation': allocation,
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(trt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
}
self.allocations.append(allocation)
if is_input:
@@ -90,7 +91,7 @@ def input_spec(self):
Get the specs for the input tensor of the network. Useful to prepare memory allocations.
:return: Two items, the shape of the input tensor and its (numpy) datatype.
"""
- return self.inputs[0]['shape'], self.inputs[0]['dtype']
+ return self.inputs[0]["shape"], self.inputs[0]["dtype"]
def output_spec(self):
"""
@@ -99,7 +100,7 @@ def output_spec(self):
"""
specs = []
for o in self.outputs:
- specs.append((o['shape'], o['dtype']))
+ specs.append((o["shape"], o["dtype"]))
return specs
def infer(self, batch, scales=None, nms_threshold=None):
@@ -117,10 +118,12 @@ def infer(self, batch, scales=None, nms_threshold=None):
outputs.append(np.zeros(shape, dtype))
# Process I/O and execute the network
- common.memcpy_host_to_device(self.inputs[0]['allocation'], np.ascontiguousarray(batch))
+ common.memcpy_host_to_device(
+ self.inputs[0]["allocation"], np.ascontiguousarray(batch)
+ )
self.context.execute_v2(self.allocations)
for o in range(len(outputs)):
- common.memcpy_device_to_host(outputs[o], self.outputs[o]['allocation'])
+ common.memcpy_device_to_host(outputs[o], self.outputs[o]["allocation"])
# Process the results
nums = outputs[0]
@@ -131,14 +134,14 @@ def infer(self, batch, scales=None, nms_threshold=None):
if len(outputs) == 5:
masks = outputs[4]
detections = []
- normalized = (np.max(boxes) < 2.0)
+ normalized = np.max(boxes) < 2.0
for i in range(self.batch_size):
detections.append([])
for n in range(int(nums[i])):
# Depending on preprocessor, box scaling will be slightly different.
if self.preprocessor == "fixed_shape_resizer":
- scale_x = self.inputs[0]['shape'][1] if normalized else 1.0
- scale_y = self.inputs[0]['shape'][2] if normalized else 1.0
+ scale_x = self.inputs[0]["shape"][1] if normalized else 1.0
+ scale_y = self.inputs[0]["shape"][2] if normalized else 1.0
if scales and i < len(scales):
scale_x /= scales[i][0]
@@ -146,11 +149,11 @@ def infer(self, batch, scales=None, nms_threshold=None):
if nms_threshold and scores[i][n] < nms_threshold:
continue
# Depending on detection type you need slightly different data.
- if self.detection_type == 'bbox':
+ if self.detection_type == "bbox":
mask = None
# Segmentation is only supported with Mask R-CNN, which has
# fixed_shape_resizer as image_resizer (lookup pipeline.config)
- elif self.detection_type == 'segmentation':
+ elif self.detection_type == "segmentation":
# Select a mask
mask = masks[i][n]
# Slight scaling, to get binary masks after float32 -> uint8
@@ -161,7 +164,7 @@ def infer(self, batch, scales=None, nms_threshold=None):
elif self.preprocessor == "keep_aspect_ratio_resizer":
# No segmentation models with keep_aspect_ratio_resizer
mask = None
- scale = self.inputs[0]['shape'][2] if normalized else 1.0
+ scale = self.inputs[0]["shape"][2] if normalized else 1.0
if scales and i < len(scales):
scale /= scales[i]
scale_y = scale
@@ -169,15 +172,17 @@ def infer(self, batch, scales=None, nms_threshold=None):
if nms_threshold and scores[i][n] < nms_threshold:
continue
# Append to detections
- detections[i].append({
- 'ymin': boxes[i][n][0] * scale_y,
- 'xmin': boxes[i][n][1] * scale_x,
- 'ymax': boxes[i][n][2] * scale_y,
- 'xmax': boxes[i][n][3] * scale_x,
- 'score': scores[i][n],
- 'class': int(classes[i][n]),
- 'mask': mask,
- })
+ detections[i].append(
+ {
+ "ymin": boxes[i][n][0] * scale_y,
+ "xmin": boxes[i][n][1] * scale_x,
+ "ymax": boxes[i][n][2] * scale_y,
+ "xmax": boxes[i][n][3] * scale_x,
+ "score": scores[i][n],
+ "class": int(classes[i][n]),
+ "mask": mask,
+ }
+ )
return detections
@@ -191,10 +196,17 @@ def main(args):
for i, label in enumerate(f):
labels.append(label.strip())
- trt_infer = TensorRTInfer(args.engine, args.preprocessor, args.detection_type, args.iou_threshold)
- batcher = ImageBatcher(args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor)
+ trt_infer = TensorRTInfer(
+ args.engine, args.preprocessor, args.detection_type, args.iou_threshold
+ )
+ batcher = ImageBatcher(
+ args.input, *trt_infer.input_spec(), preprocessor=args.preprocessor
+ )
for batch, images, scales in batcher.get_batch():
- print("Processing Image {} / {}".format(batcher.image_index, batcher.num_images), end="\r")
+ print(
+ "Processing Image {} / {}".format(batcher.image_index, batcher.num_images),
+ end="\r",
+ )
detections = trt_infer.infer(batch, scales, args.nms_threshold)
for i in range(len(images)):
basename = os.path.splitext(os.path.basename(images[i]))[0]
@@ -204,7 +216,14 @@ def main(args):
# Text Results
output_results = ""
for d in detections[i]:
- line = [d['xmin'], d['ymin'], d['xmax'], d['ymax'], d['score'], d['class']]
+ line = [
+ d["xmin"],
+ d["ymin"],
+ d["xmax"],
+ d["ymax"],
+ d["score"],
+ d["class"],
+ ]
output_results += "\t".join([str(f) for f in line]) + "\n"
with open(os.path.join(args.output, "{}.txt".format(basename)), "w") as f:
f.write(output_results)
@@ -214,22 +233,54 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("-e", "--engine", default=None, help="The serialized TensorRT engine")
- parser.add_argument("-i", "--input", default=None, help="Path to the image or directory to process")
- parser.add_argument("-o", "--output", default=None, help="Directory where to save the visualization results")
- parser.add_argument("-l", "--labels", default="./labels_coco.txt",
- help="File to use for reading the class labels from, default: ./labels_coco.txt")
- parser.add_argument("-d", "--detection_type", default="bbox", choices=["bbox", "segmentation"],
- help="Detection type for COCO, either bbox or if you are using Mask R-CNN's instance segmentation - segmentation")
- parser.add_argument("-t", "--nms_threshold", type=float,
- help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.")
- parser.add_argument("--iou_threshold", default=0.5, type=float,
- help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0")
- parser.add_argument("--preprocessor", default="fixed_shape_resizer", choices=["fixed_shape_resizer", "keep_aspect_ratio_resizer"],
- help="Select the image preprocessor to use based on your pipeline.config, either 'fixed_shape_resizer' or 'keep_aspect_ratio_resizer', default: fixed_shape_resizer")
+ parser.add_argument(
+ "-e", "--engine", default=None, help="The serialized TensorRT engine"
+ )
+ parser.add_argument(
+ "-i", "--input", default=None, help="Path to the image or directory to process"
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ default=None,
+ help="Directory where to save the visualization results",
+ )
+ parser.add_argument(
+ "-l",
+ "--labels",
+ default="./labels_coco.txt",
+ help="File to use for reading the class labels from, default: ./labels_coco.txt",
+ )
+ parser.add_argument(
+ "-d",
+ "--detection_type",
+ default="bbox",
+ choices=["bbox", "segmentation"],
+ help="Detection type for COCO, either bbox or if you are using Mask R-CNN's instance segmentation - segmentation",
+ )
+ parser.add_argument(
+ "-t",
+ "--nms_threshold",
+ type=float,
+ help="Override the score threshold for the NMS operation, if higher than the threshold in the engine.",
+ )
+ parser.add_argument(
+ "--iou_threshold",
+ default=0.5,
+ type=float,
+ help="Select the IoU threshold for the mask segmentation. Range is 0 to 1. Pixel values more than threshold will become 1, less 0",
+ )
+ parser.add_argument(
+ "--preprocessor",
+ default="fixed_shape_resizer",
+ choices=["fixed_shape_resizer", "keep_aspect_ratio_resizer"],
+ help="Select the image preprocessor to use based on your pipeline.config, either 'fixed_shape_resizer' or 'keep_aspect_ratio_resizer', default: fixed_shape_resizer",
+ )
args = parser.parse_args()
if not all([args.engine, args.input, args.output, args.preprocessor]):
parser.print_help()
- print("\nThese arguments are required: --engine --input --output and --preprocessor")
+ print(
+ "\nThese arguments are required: --engine --input --output and --preprocessor"
+ )
sys.exit(1)
main(args)
diff --git a/samples/python/tensorflow_object_detection_api/onnx_utils.py b/samples/python/tensorflow_object_detection_api/onnx_utils.py
index b539197a..07819328 100644
--- a/samples/python/tensorflow_object_detection_api/onnx_utils.py
+++ b/samples/python/tensorflow_object_detection_api/onnx_utils.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,6 +23,7 @@
logging.getLogger("SSDHelper").setLevel(logging.INFO)
log = logging.getLogger("SSDHelper")
+
@gs.Graph.register()
def op_with_const(self, op, name, input, value):
"""
@@ -35,7 +36,10 @@ def op_with_const(self, op, name, input, value):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}': {}".format(op, name, value.squeeze()))
const = gs.Constant(name="{}_value:0".format(name), values=value)
- return self.layer(name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"]
+ )
+
@gs.Graph.register()
def matmul(self, name, input, value):
@@ -48,7 +52,10 @@ def matmul(self, name, input, value):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}': {}".format("MatMul", name, value.squeeze()))
const = gs.Constant(name="{}_value:0".format(name), values=value)
- return self.layer(name=name, op="MatMul", inputs=[input_tensor, const], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op="MatMul", inputs=[input_tensor, const], outputs=[name + ":0"]
+ )
+
@gs.Graph.register()
def clip(self, name, input, clip_min, clip_max):
@@ -61,9 +68,19 @@ def clip(self, name, input, clip_min, clip_max):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}".format("Clip", name))
- const_min = gs.Constant(name="{}_value:0".format(name), values=np.asarray([clip_min], dtype=np.float32))
- const_max = gs.Constant(name="{}_value:1".format(name), values=np.asarray([clip_max], dtype=np.float32))
- return self.layer(name=name, op="Clip", inputs=[input_tensor, const_min, const_max], outputs=[name + ":0"])
+ const_min = gs.Constant(
+ name="{}_value:0".format(name), values=np.asarray([clip_min], dtype=np.float32)
+ )
+ const_max = gs.Constant(
+ name="{}_value:1".format(name), values=np.asarray([clip_max], dtype=np.float32)
+ )
+ return self.layer(
+ name=name,
+ op="Clip",
+ inputs=[input_tensor, const_min, const_max],
+ outputs=[name + ":0"],
+ )
+
@gs.Graph.register()
def slice(self, name, input, starts, ends, axes):
@@ -79,10 +96,22 @@ def slice(self, name, input, starts, ends, axes):
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created {} node '{}".format("Slice", name))
- const_start = gs.Constant(name="{}_value:0".format(name), values=np.asarray([starts], dtype=np.int64))
- const_end = gs.Constant(name="{}_value:1".format(name), values=np.asarray([ends], dtype=np.int64))
- const_axes = gs.Constant(name="{}_value:2".format(name), values=np.asarray([axes], dtype=np.int64))
- return self.layer(name=name, op="Slice", inputs=[input_tensor, const_start, const_end, const_axes], outputs=[name + ":0"])
+ const_start = gs.Constant(
+ name="{}_value:0".format(name), values=np.asarray([starts], dtype=np.int64)
+ )
+ const_end = gs.Constant(
+ name="{}_value:1".format(name), values=np.asarray([ends], dtype=np.int64)
+ )
+ const_axes = gs.Constant(
+ name="{}_value:2".format(name), values=np.asarray([axes], dtype=np.int64)
+ )
+ return self.layer(
+ name=name,
+ op="Slice",
+ inputs=[input_tensor, const_start, const_end, const_axes],
+ outputs=[name + ":0"],
+ )
+
@gs.Graph.register()
def unsqueeze(self, name, input, axes=[3]):
@@ -96,7 +125,14 @@ def unsqueeze(self, name, input, axes=[3]):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Unsqueeze node '{}': {}".format(name, axes))
- return self.layer(name=name, op="Unsqueeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={'axes': axes})
+ return self.layer(
+ name=name,
+ op="Unsqueeze",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"axes": axes},
+ )
+
@gs.Graph.register()
def squeeze(self, name, input, axes=[2]):
@@ -110,7 +146,14 @@ def squeeze(self, name, input, axes=[2]):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Squeeze node '{}': {}".format(name, axes))
- return self.layer(name=name, op="Squeeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={'axes': axes})
+ return self.layer(
+ name=name,
+ op="Squeeze",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"axes": axes},
+ )
+
@gs.Graph.register()
def transpose(self, name, input, perm):
@@ -124,7 +167,14 @@ def transpose(self, name, input, perm):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Transpose node '{}': {}".format(name, perm))
- return self.layer(name=name, op="Transpose", inputs=[input_tensor], outputs=[name + ":0"], attrs={'perm': perm})
+ return self.layer(
+ name=name,
+ op="Transpose",
+ inputs=[input_tensor],
+ outputs=[name + ":0"],
+ attrs={"perm": perm},
+ )
+
@gs.Graph.register()
def sigmoid(self, name, input):
@@ -137,7 +187,10 @@ def sigmoid(self, name, input):
"""
input_tensor = input if type(input) is gs.Variable else input[0]
log.debug("Created Sigmoid node '{}'".format(name))
- return self.layer(name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"])
+ return self.layer(
+ name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"]
+ )
+
@gs.Graph.register()
def plugin(self, op, name, inputs, outputs, attrs):
@@ -154,7 +207,10 @@ def plugin(self, op, name, inputs, outputs, attrs):
"""
input_tensors = inputs if type(inputs) is list else [inputs]
log.debug("Created TRT Plugin node '{}': {}".format(name, attrs))
- return self.layer(op=op, name=name, inputs=input_tensors, outputs=outputs, attrs=attrs)
+ return self.layer(
+ op=op, name=name, inputs=input_tensors, outputs=outputs, attrs=attrs
+ )
+
@gs.Graph.register()
def find_node_by_op(self, op):
@@ -169,6 +225,7 @@ def find_node_by_op(self, op):
return node
return None
+
@gs.Graph.register()
def find_descendant_by_op(self, node, op, depth=10):
"""
diff --git a/samples/python/tensorflow_object_detection_api/visualize.py b/samples/python/tensorflow_object_detection_api/visualize.py
index f3e4ffc1..f88ed6f0 100644
--- a/samples/python/tensorflow_object_detection_api/visualize.py
+++ b/samples/python/tensorflow_object_detection_api/visualize.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +16,7 @@
#
import numpy as np
+
np.set_printoptions(threshold=np.inf, suppress=True)
import PIL.Image as Image
@@ -24,95 +25,228 @@
import PIL.ImageFilter as ImageFilter
-
-COLORS = ['GoldenRod', 'MediumTurquoise', 'GreenYellow', 'SteelBlue', 'DarkSeaGreen', 'SeaShell', 'LightGrey',
- 'IndianRed', 'DarkKhaki', 'LawnGreen', 'WhiteSmoke', 'Peru', 'LightCoral', 'FireBrick', 'OldLace',
- 'LightBlue', 'SlateGray', 'OliveDrab', 'NavajoWhite', 'PaleVioletRed', 'SpringGreen', 'AliceBlue', 'Violet',
- 'DeepSkyBlue', 'Red', 'MediumVioletRed', 'PaleTurquoise', 'Tomato', 'Azure', 'Yellow', 'Cornsilk',
- 'Aquamarine', 'CadetBlue', 'CornflowerBlue', 'DodgerBlue', 'Olive', 'Orchid', 'LemonChiffon', 'Sienna',
- 'OrangeRed', 'Orange', 'DarkSalmon', 'Magenta', 'Wheat', 'Lime', 'GhostWhite', 'SlateBlue', 'Aqua',
- 'MediumAquaMarine', 'LightSlateGrey', 'MediumSeaGreen', 'SandyBrown', 'YellowGreen', 'Plum', 'FloralWhite',
- 'LightPink', 'Thistle', 'DarkViolet', 'Pink', 'Crimson', 'Chocolate', 'DarkGrey', 'Ivory', 'PaleGreen',
- 'DarkGoldenRod', 'LavenderBlush', 'SlateGrey', 'DeepPink', 'Gold', 'Cyan', 'LightSteelBlue', 'MediumPurple',
- 'ForestGreen', 'DarkOrange', 'Tan', 'Salmon', 'PaleGoldenRod', 'LightGreen', 'LightSlateGray', 'HoneyDew',
- 'Fuchsia', 'LightSeaGreen', 'DarkOrchid', 'Green', 'Chartreuse', 'LimeGreen', 'AntiqueWhite', 'Beige',
- 'Gainsboro', 'Bisque', 'SaddleBrown', 'Silver', 'Lavender', 'Teal', 'LightCyan', 'PapayaWhip', 'Purple',
- 'Coral', 'BurlyWood', 'LightGray', 'Snow', 'MistyRose', 'PowderBlue', 'DarkCyan', 'White', 'Turquoise',
- 'MediumSlateBlue', 'PeachPuff', 'Moccasin', 'LightSalmon', 'SkyBlue', 'Khaki', 'MediumSpringGreen',
- 'BlueViolet', 'MintCream', 'Linen', 'SeaGreen', 'HotPink', 'LightYellow', 'BlanchedAlmond', 'RoyalBlue',
- 'RosyBrown', 'MediumOrchid', 'DarkTurquoise', 'LightGoldenRodYellow', 'LightSkyBlue']
+COLORS = [
+ "GoldenRod",
+ "MediumTurquoise",
+ "GreenYellow",
+ "SteelBlue",
+ "DarkSeaGreen",
+ "SeaShell",
+ "LightGrey",
+ "IndianRed",
+ "DarkKhaki",
+ "LawnGreen",
+ "WhiteSmoke",
+ "Peru",
+ "LightCoral",
+ "FireBrick",
+ "OldLace",
+ "LightBlue",
+ "SlateGray",
+ "OliveDrab",
+ "NavajoWhite",
+ "PaleVioletRed",
+ "SpringGreen",
+ "AliceBlue",
+ "Violet",
+ "DeepSkyBlue",
+ "Red",
+ "MediumVioletRed",
+ "PaleTurquoise",
+ "Tomato",
+ "Azure",
+ "Yellow",
+ "Cornsilk",
+ "Aquamarine",
+ "CadetBlue",
+ "CornflowerBlue",
+ "DodgerBlue",
+ "Olive",
+ "Orchid",
+ "LemonChiffon",
+ "Sienna",
+ "OrangeRed",
+ "Orange",
+ "DarkSalmon",
+ "Magenta",
+ "Wheat",
+ "Lime",
+ "GhostWhite",
+ "SlateBlue",
+ "Aqua",
+ "MediumAquaMarine",
+ "LightSlateGrey",
+ "MediumSeaGreen",
+ "SandyBrown",
+ "YellowGreen",
+ "Plum",
+ "FloralWhite",
+ "LightPink",
+ "Thistle",
+ "DarkViolet",
+ "Pink",
+ "Crimson",
+ "Chocolate",
+ "DarkGrey",
+ "Ivory",
+ "PaleGreen",
+ "DarkGoldenRod",
+ "LavenderBlush",
+ "SlateGrey",
+ "DeepPink",
+ "Gold",
+ "Cyan",
+ "LightSteelBlue",
+ "MediumPurple",
+ "ForestGreen",
+ "DarkOrange",
+ "Tan",
+ "Salmon",
+ "PaleGoldenRod",
+ "LightGreen",
+ "LightSlateGray",
+ "HoneyDew",
+ "Fuchsia",
+ "LightSeaGreen",
+ "DarkOrchid",
+ "Green",
+ "Chartreuse",
+ "LimeGreen",
+ "AntiqueWhite",
+ "Beige",
+ "Gainsboro",
+ "Bisque",
+ "SaddleBrown",
+ "Silver",
+ "Lavender",
+ "Teal",
+ "LightCyan",
+ "PapayaWhip",
+ "Purple",
+ "Coral",
+ "BurlyWood",
+ "LightGray",
+ "Snow",
+ "MistyRose",
+ "PowderBlue",
+ "DarkCyan",
+ "White",
+ "Turquoise",
+ "MediumSlateBlue",
+ "PeachPuff",
+ "Moccasin",
+ "LightSalmon",
+ "SkyBlue",
+ "Khaki",
+ "MediumSpringGreen",
+ "BlueViolet",
+ "MintCream",
+ "Linen",
+ "SeaGreen",
+ "HotPink",
+ "LightYellow",
+ "BlanchedAlmond",
+ "RoyalBlue",
+ "RosyBrown",
+ "MediumOrchid",
+ "DarkTurquoise",
+ "LightGoldenRodYellow",
+ "LightSkyBlue",
+]
-#Overlay mask with transparency on top of the image.
+# Overlay mask with transparency on top of the image.
def overlay(image, mask, color, alpha_transparency=0.5):
for channel in range(3):
- image[:, :, channel] = np.where(mask == 1,
- image[:, :, channel] *
- (1 - alpha_transparency) + alpha_transparency * color[channel] * 255,
- image[:, :, channel])
+ image[:, :, channel] = np.where(
+ mask == 1,
+ image[:, :, channel] * (1 - alpha_transparency)
+ + alpha_transparency * color[channel] * 255,
+ image[:, :, channel],
+ )
return image
+
def visualize_detections(image_path, output_path, detections, labels=[]):
- image = Image.open(image_path).convert(mode='RGB')
+ image = Image.open(image_path).convert(mode="RGB")
# Get image dimensions.
im_width, im_height = image.size
line_width = 2
font = ImageFont.load_default()
for d in detections:
- color = COLORS[d['class'] % len(COLORS)]
+ color = COLORS[d["class"] % len(COLORS)]
# Dynamically convert PIL color into RGB numpy array.
- pixel_color = Image.new("RGB",(1, 1), color)
+ pixel_color = Image.new("RGB", (1, 1), color)
# Normalize.
- np_color = (np.asarray(pixel_color)[0][0])/255
+ np_color = (np.asarray(pixel_color)[0][0]) / 255
# Process TF and TRT instance segmentation masks.
- if isinstance(d['mask'], np.ndarray) and d['mask'].shape == (33, 33):
+ if isinstance(d["mask"], np.ndarray) and d["mask"].shape == (33, 33):
# Get detection bbox resolution.
- det_width = round(d['xmax'] - d['xmin'])
- det_height = round(d['ymax'] - d['ymin'])
+ det_width = round(d["xmax"] - d["xmin"])
+ det_height = round(d["ymax"] - d["ymin"])
# Create an image out of predicted mask array.
- small_mask = Image.fromarray(d['mask'])
+ small_mask = Image.fromarray(d["mask"])
# Upsample mask to detection bbox's size.
mask = small_mask.resize((det_width, det_height), resample=Image.BILINEAR)
# Create an original image sized template for correct mask placement.
pad = Image.new("L", (im_width, im_height))
# Place your mask according to detection bbox placement.
- pad.paste(mask, (round(d['xmin']), (round(d['ymin']))))
+ pad.paste(mask, (round(d["xmin"]), (round(d["ymin"]))))
# Reconvert mask into numpy array for evaluation.
padded_mask = np.array(pad)
- #Creat np.array from original image, copy in order to modify.
+ # Create np.array from original image, copy in order to modify.
image_copy = np.asarray(image).copy()
# Image with overlaid mask.
masked_image = overlay(image_copy, padded_mask, np_color)
# Reconvert back to PIL.
image = Image.fromarray(masked_image)
# Separate clause for ground truth instance segmentation masks.
- elif isinstance(d['mask'], np.ndarray):
- #Creat np.array from original image, copy in order to modify.
+ elif isinstance(d["mask"], np.ndarray):
+ # Create np.array from original image, copy in order to modify.
image_copy = np.asarray(image).copy()
# Image with overlaid mask.
- masked_image = overlay(image_copy, d['mask'], np_color)
+ masked_image = overlay(image_copy, d["mask"], np_color)
# Reconvert back to PIL
image = Image.fromarray(masked_image)
# Bbox lines.
draw = ImageDraw.Draw(image)
- draw.line([(d['xmin'], d['ymin']), (d['xmin'], d['ymax']), (d['xmax'], d['ymax']), (d['xmax'], d['ymin']),
- (d['xmin'], d['ymin'])], width=line_width, fill=color)
- label = "Class {}".format(d['class'])
- if d['class'] < len(labels):
- label = "{}".format(labels[d['class']])
- score = d['score']
+ draw.line(
+ [
+ (d["xmin"], d["ymin"]),
+ (d["xmin"], d["ymax"]),
+ (d["xmax"], d["ymax"]),
+ (d["xmax"], d["ymin"]),
+ (d["xmin"], d["ymin"]),
+ ],
+ width=line_width,
+ fill=color,
+ )
+ label = "Class {}".format(d["class"])
+ if d["class"] < len(labels):
+ label = "{}".format(labels[d["class"]])
+ score = d["score"]
text = "{}: {}%".format(label, int(100 * score))
if score < 0:
text = label
left, top, right, bottom = font.getbbox(text)
text_width, text_height = right - left, bottom - top
- text_bottom = max(text_height, d['ymin'])
- text_left = d['xmin']
+ text_bottom = max(text_height, d["ymin"])
+ text_left = d["xmin"]
margin = np.ceil(0.05 * text_height)
- draw.rectangle([(text_left, text_bottom - text_height - 2 * margin), (text_left + text_width, text_bottom)],
- fill=color)
- draw.text((text_left + margin, text_bottom - text_height - margin), text, fill='black', font=font)
+ draw.rectangle(
+ [
+ (text_left, text_bottom - text_height - 2 * margin),
+ (text_left + text_width, text_bottom),
+ ],
+ fill=color,
+ )
+ draw.text(
+ (text_left + margin, text_bottom - text_height - margin),
+ text,
+ fill="black",
+ font=font,
+ )
if output_path is None:
return image
image.save(output_path)
@@ -123,7 +257,12 @@ def draw_text(draw, font, text, width, bar_height, offset, color):
left, top, right, bottom = font.getbbox(text)
text_width, text_height = right - left, bottom - top
draw.rectangle([(offset, 0), (offset + width, bar_height)], fill=color)
- draw.text((offset + (width - text_width) / 2, text_height - text_height / 2), text, fill='black', font=font)
+ draw.text(
+ (offset + (width - text_width) / 2, text_height - text_height / 2),
+ text,
+ fill="black",
+ font=font,
+ )
bar_height = 18
width = 0
@@ -132,7 +271,7 @@ def draw_text(draw, font, text, width, bar_height, offset, color):
width += im.width
height = max(height, im.height)
- concat = Image.new('RGB', (width, height + bar_height))
+ concat = Image.new("RGB", (width, height + bar_height))
draw = ImageDraw.Draw(concat)
font = ImageFont.load_default()
diff --git a/samples/python/yolov3_onnx/data_processing.py b/samples/python/yolov3_onnx/data_processing.py
index 8a68145f..998cbc5f 100644
--- a/samples/python/yolov3_onnx/data_processing.py
+++ b/samples/python/yolov3_onnx/data_processing.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,7 +31,9 @@ def load_label_categories(label_file_path):
return categories
-LABEL_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "coco_labels.txt")
+LABEL_FILE_PATH = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), "coco_labels.txt"
+)
ALL_CATEGORIES = load_label_categories(LABEL_FILE_PATH)
# Let's make sure that there are 80 classes, as expected for the COCO data set:
@@ -103,7 +105,14 @@ def _shuffle_and_normalize(self, image):
class PostprocessYOLO(object):
"""Class for post-processing the three outputs tensors from YOLOv3-608."""
- def __init__(self, yolo_masks, yolo_anchors, obj_threshold, nms_threshold, yolo_input_resolution):
+ def __init__(
+ self,
+ yolo_masks,
+ yolo_anchors,
+ obj_threshold,
+ nms_threshold,
+ yolo_input_resolution,
+ ):
"""Initialize with all values that will be kept when processing several frames.
Assuming 3 outputs of the network in the case of (large) YOLOv3.
@@ -135,7 +144,9 @@ def process(self, outputs, resolution_raw):
for output in outputs:
outputs_reshaped.append(self._reshape_output(output))
- boxes, categories, confidences = self._process_yolo_output(outputs_reshaped, resolution_raw)
+ boxes, categories, confidences = self._process_yolo_output(
+ outputs_reshaped, resolution_raw
+ )
return boxes, categories, confidences
@@ -311,8 +322,12 @@ def _nms_boxes(self, boxes, box_confidences):
keep.append(i)
xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]])
yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]])
- xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]])
- yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]])
+ xx2 = np.minimum(
+ x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]]
+ )
+ yy2 = np.minimum(
+ y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]]
+ )
width1 = np.maximum(0.0, xx2 - xx1 + 1)
height1 = np.maximum(0.0, yy2 - yy1 + 1)
diff --git a/samples/python/yolov3_onnx/onnx_to_tensorrt.py b/samples/python/yolov3_onnx/onnx_to_tensorrt.py
index c7e54d16..2ba322bc 100644
--- a/samples/python/yolov3_onnx/onnx_to_tensorrt.py
+++ b/samples/python/yolov3_onnx/onnx_to_tensorrt.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,23 +18,25 @@
from __future__ import print_function
+import os
+import sys
+
import numpy as np
import tensorrt as trt
-
+from data_processing import ALL_CATEGORIES, PostprocessYOLO, PreprocessYOLO
from PIL import ImageDraw
-from data_processing import PreprocessYOLO, PostprocessYOLO, ALL_CATEGORIES
-
-import sys, os
-
sys.path.insert(1, os.path.join(sys.path[0], ".."))
-import common
from downloader import getFilePath
+import common
+
TRT_LOGGER = trt.Logger()
-def draw_bboxes(image_raw, bboxes, confidences, categories, all_categories, bbox_color="blue"):
+def draw_bboxes(
+ image_raw, bboxes, confidences, categories, all_categories, bbox_color="blue"
+):
"""Draw the bounding boxes on the original input image and return it.
Keyword arguments:
@@ -58,7 +60,11 @@ def draw_bboxes(image_raw, bboxes, confidences, categories, all_categories, bbox
bottom = min(image_raw.height, np.floor(y_coord + height + 0.5).astype(int))
draw.rectangle(((left, top), (right, bottom)), outline=bbox_color)
- draw.text((left, top - 12), "{0} {1:.2f}".format(all_categories[category], score), fill=bbox_color)
+ draw.text(
+ (left, top - 12),
+ "{0} {1:.2f}".format(all_categories[category], score),
+ fill=bbox_color,
+ )
return image_raw
@@ -69,17 +75,21 @@ def get_engine(onnx_file_path, engine_file_path=""):
def build_engine():
"""Takes an ONNX file and creates a TensorRT engine to run inference with"""
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
- 0
+ 0
) as network, builder.create_builder_config() as config, trt.OnnxParser(
network, TRT_LOGGER
) as parser, trt.Runtime(
TRT_LOGGER
) as runtime:
- config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 28) # 256MiB
+ config.set_memory_pool_limit(
+ trt.MemoryPoolType.WORKSPACE, 1 << 28
+ ) # 256MiB
# Parse model file
if not os.path.exists(onnx_file_path):
print(
- "ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.".format(onnx_file_path)
+ "ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.".format(
+ onnx_file_path
+ )
)
exit(0)
print("Loading ONNX file from path {}...".format(onnx_file_path))
@@ -93,7 +103,11 @@ def build_engine():
# The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1
network.get_input(0).shape = [1, 3, 608, 608]
print("Completed parsing of ONNX file")
- print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
+ print(
+ "Building an engine from file {}; this may take a while...".format(
+ onnx_file_path
+ )
+ )
plan = builder.build_serialized_network(network, config)
engine = runtime.deserialize_cuda_engine(plan)
print("Completed creating Engine")
@@ -131,19 +145,34 @@ def main():
output_shapes = [(1, 255, 19, 19), (1, 255, 38, 38), (1, 255, 76, 76)]
# Do inference with TensorRT
trt_outputs = []
- with get_engine(onnx_file_path, engine_file_path) as engine, engine.create_execution_context() as context:
+ with get_engine(
+ onnx_file_path, engine_file_path
+ ) as engine, engine.create_execution_context() as context:
inputs, outputs, bindings, stream = common.allocate_buffers(engine)
# Do inference
print("Running inference on image {}...".format(input_image_path))
# Set host input to the image. The common.do_inference function will copy the input to the GPU before executing.
inputs[0].host = image
- trt_outputs = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
+ trt_outputs = common.do_inference(
+ context,
+ engine=engine,
+ bindings=bindings,
+ inputs=inputs,
+ outputs=outputs,
+ stream=stream,
+ )
# Before doing post-processing, we need to reshape the outputs as the common.do_inference will give us flat arrays.
- trt_outputs = [output.reshape(shape) for output, shape in zip(trt_outputs, output_shapes)]
+ trt_outputs = [
+ output.reshape(shape) for output, shape in zip(trt_outputs, output_shapes)
+ ]
postprocessor_args = {
- "yolo_masks": [(6, 7, 8), (3, 4, 5), (0, 1, 2)], # A list of 3 three-dimensional tuples for the YOLO masks
+ "yolo_masks": [
+ (6, 7, 8),
+ (3, 4, 5),
+ (0, 1, 2),
+ ], # A list of 3 three-dimensional tuples for the YOLO masks
"yolo_anchors": [
(10, 13),
(16, 30),
@@ -168,7 +197,11 @@ def main():
obj_detected_img = draw_bboxes(image_raw, boxes, scores, classes, ALL_CATEGORIES)
output_image_path = "dog_bboxes.png"
obj_detected_img.save(output_image_path, "PNG")
- print("Saved image with bounding boxes of detected objects to {}.".format(output_image_path))
+ print(
+ "Saved image with bounding boxes of detected objects to {}.".format(
+ output_image_path
+ )
+ )
# Free host and device memory used for inputs and outputs
common.free_buffers(inputs, outputs, stream)
diff --git a/samples/python/yolov3_onnx/yolov3_to_onnx.py b/samples/python/yolov3_onnx/yolov3_to_onnx.py
index 59f8b3a6..ffd9d19f 100644
--- a/samples/python/yolov3_onnx/yolov3_to_onnx.py
+++ b/samples/python/yolov3_onnx/yolov3_to_onnx.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -128,7 +128,9 @@ def _parse_params(self, param_line):
param_value = layer_indexes
elif isinstance(param_value_raw, str) and not param_value_raw.isalpha():
condition_param_value_positive = param_value_raw.isdigit()
- condition_param_value_negative = param_value_raw[0] == "-" and param_value_raw[1:].isdigit()
+ condition_param_value_negative = (
+ param_value_raw[0] == "-" and param_value_raw[1:].isdigit()
+ )
if condition_param_value_positive or condition_param_value_negative:
param_value = int(param_value_raw)
else:
@@ -276,17 +278,29 @@ def load_conv_weights(self, conv_params):
initializer = list()
inputs = list()
if conv_params.batch_normalize:
- bias_init, bias_input = self._create_param_tensors(conv_params, "bn", "bias")
- bn_scale_init, bn_scale_input = self._create_param_tensors(conv_params, "bn", "scale")
- bn_mean_init, bn_mean_input = self._create_param_tensors(conv_params, "bn", "mean")
- bn_var_init, bn_var_input = self._create_param_tensors(conv_params, "bn", "var")
+ bias_init, bias_input = self._create_param_tensors(
+ conv_params, "bn", "bias"
+ )
+ bn_scale_init, bn_scale_input = self._create_param_tensors(
+ conv_params, "bn", "scale"
+ )
+ bn_mean_init, bn_mean_input = self._create_param_tensors(
+ conv_params, "bn", "mean"
+ )
+ bn_var_init, bn_var_input = self._create_param_tensors(
+ conv_params, "bn", "var"
+ )
initializer.extend([bn_scale_init, bias_init, bn_mean_init, bn_var_init])
inputs.extend([bn_scale_input, bias_input, bn_mean_input, bn_var_input])
else:
- bias_init, bias_input = self._create_param_tensors(conv_params, "conv", "bias")
+ bias_init, bias_input = self._create_param_tensors(
+ conv_params, "conv", "bias"
+ )
initializer.append(bias_init)
inputs.append(bias_input)
- conv_init, conv_input = self._create_param_tensors(conv_params, "conv", "weights")
+ conv_init, conv_input = self._create_param_tensors(
+ conv_params, "conv", "weights"
+ )
initializer.append(conv_init)
inputs.append(conv_input)
return initializer, inputs
@@ -299,7 +313,11 @@ def _open_weights_file(self, weights_file_path):
"""
weights_file = open(weights_file_path, "rb")
length_header = 5
- np.ndarray(shape=(length_header,), dtype="int32", buffer=weights_file.read(length_header * 4))
+ np.ndarray(
+ shape=(length_header,),
+ dtype="int32",
+ buffer=weights_file.read(length_header * 4),
+ )
return weights_file
def _create_param_tensors(self, conv_params, param_category, suffix):
@@ -312,10 +330,16 @@ def _create_param_tensors(self, conv_params, param_category, suffix):
suffix -- a string determining the sub-type of above param_category (e.g.,
'weights' or 'bias')
"""
- param_name, param_data, param_data_shape = self._load_one_param_type(conv_params, param_category, suffix)
+ param_name, param_data, param_data_shape = self._load_one_param_type(
+ conv_params, param_category, suffix
+ )
- initializer_tensor = helper.make_tensor(param_name, TensorProto.FLOAT, param_data_shape, param_data)
- input_tensor = helper.make_tensor_value_info(param_name, TensorProto.FLOAT, param_data_shape)
+ initializer_tensor = helper.make_tensor(
+ param_name, TensorProto.FLOAT, param_data_shape, param_data
+ )
+ input_tensor = helper.make_tensor_value_info(
+ param_name, TensorProto.FLOAT, param_data_shape
+ )
return initializer_tensor, input_tensor
def _load_one_param_type(self, conv_params, param_category, suffix):
@@ -337,7 +361,11 @@ def _load_one_param_type(self, conv_params, param_category, suffix):
elif suffix == "bias":
param_shape = [channels_out]
param_size = np.product(np.array(param_shape))
- param_data = np.ndarray(shape=param_shape, dtype="float32", buffer=self.weights_file.read(param_size * 4))
+ param_data = np.ndarray(
+ shape=param_shape,
+ dtype="float32",
+ buffer=self.weights_file.read(param_size * 4),
+ )
param_data = param_data.flatten().astype(float)
return param_name, param_data, param_shape
@@ -385,7 +413,9 @@ def build_onnx_graph(self, layer_configs, weights_file_path, verbose=True):
output_dims = [
self.batch_size,
] + self.output_tensors[tensor_name]
- output_tensor = helper.make_tensor_value_info(tensor_name, TensorProto.FLOAT, output_dims)
+ output_tensor = helper.make_tensor_value_info(
+ tensor_name, TensorProto.FLOAT, output_dims
+ )
outputs.append(output_tensor)
inputs = [self.input_tensor]
weight_loader = WeightLoader(weights_file_path)
@@ -395,20 +425,30 @@ def build_onnx_graph(self, layer_configs, weights_file_path, verbose=True):
_, layer_type = layer_name.split("_", 1)
params = self.param_dict[layer_name]
if layer_type == "convolutional":
- initializer_layer, inputs_layer = weight_loader.load_conv_weights(params)
+ initializer_layer, inputs_layer = weight_loader.load_conv_weights(
+ params
+ )
initializer.extend(initializer_layer)
inputs.extend(inputs_layer)
elif layer_type == "upsample":
- initializer_layer, inputs_layer = weight_loader.load_resize_scales(params)
+ initializer_layer, inputs_layer = weight_loader.load_resize_scales(
+ params
+ )
initializer.extend(initializer_layer)
inputs.extend(inputs_layer)
del weight_loader
self.graph_def = helper.make_graph(
- nodes=self._nodes, name="YOLOv3-608", inputs=inputs, outputs=outputs, initializer=initializer
+ nodes=self._nodes,
+ name="YOLOv3-608",
+ inputs=inputs,
+ outputs=outputs,
+ initializer=initializer,
)
if verbose:
print(helper.printable_graph(self.graph_def))
- model_def = helper.make_model(self.graph_def, producer_name="NVIDIA TensorRT sample")
+ model_def = helper.make_model(
+ self.graph_def, producer_name="NVIDIA TensorRT sample"
+ )
return model_def
def _make_onnx_node(self, layer_name, layer_dict):
@@ -423,8 +463,12 @@ def _make_onnx_node(self, layer_name, layer_dict):
layer_type = layer_dict["type"]
if self.input_tensor is None:
if layer_type == "net":
- major_node_output_name, major_node_output_channels = self._make_input_tensor(layer_name, layer_dict)
- major_node_specs = MajorNodeSpecs(major_node_output_name, major_node_output_channels)
+ major_node_output_name, major_node_output_channels = (
+ self._make_input_tensor(layer_name, layer_dict)
+ )
+ major_node_specs = MajorNodeSpecs(
+ major_node_output_name, major_node_output_channels
+ )
else:
raise ValueError('The first node has to be of type "net".')
else:
@@ -435,10 +479,17 @@ def _make_onnx_node(self, layer_name, layer_dict):
node_creators["upsample"] = self._make_resize_node
if layer_type in node_creators.keys():
- major_node_output_name, major_node_output_channels = node_creators[layer_type](layer_name, layer_dict)
- major_node_specs = MajorNodeSpecs(major_node_output_name, major_node_output_channels)
+ major_node_output_name, major_node_output_channels = node_creators[
+ layer_type
+ ](layer_name, layer_dict)
+ major_node_specs = MajorNodeSpecs(
+ major_node_output_name, major_node_output_channels
+ )
else:
- print("Layer of type %s not supported, skipping ONNX node generation." % layer_type)
+ print(
+ "Layer of type %s not supported, skipping ONNX node generation."
+ % layer_type
+ )
major_node_specs = MajorNodeSpecs(layer_name, None)
return major_node_specs
@@ -491,7 +542,10 @@ def _make_conv_node(self, layer_name, layer_dict):
stride = layer_dict["stride"]
filters = layer_dict["filters"]
batch_normalize = False
- if "batch_normalize" in layer_dict.keys() and layer_dict["batch_normalize"] == 1:
+ if (
+ "batch_normalize" in layer_dict.keys()
+ and layer_dict["batch_normalize"] == 1
+ ):
batch_normalize = True
kernel_shape = [kernel_size, kernel_size]
@@ -542,7 +596,11 @@ def _make_conv_node(self, layer_name, layer_dict):
layer_name_lrelu = layer_name + "_lrelu"
lrelu_node = helper.make_node(
- "LeakyRelu", inputs=inputs, outputs=[layer_name_lrelu], name=layer_name_lrelu, alpha=self.alpha_lrelu
+ "LeakyRelu",
+ inputs=inputs,
+ outputs=[layer_name_lrelu],
+ name=layer_name_lrelu,
+ alpha=self.alpha_lrelu,
)
self._nodes.append(lrelu_node)
inputs = [layer_name_lrelu]
@@ -633,7 +691,9 @@ def _make_resize_node(self, layer_name, layer_dict):
"""
resize_scale_factors = float(layer_dict["stride"])
# Create the scale factor array with node parameters
- scales = np.array([1.0, 1.0, resize_scale_factors, resize_scale_factors]).astype(np.float32)
+ scales = np.array(
+ [1.0, 1.0, resize_scale_factors, resize_scale_factors]
+ ).astype(np.float32)
previous_node_specs = self._get_previous_node_specs()
inputs = [previous_node_specs.name]
diff --git a/samples/sampleAlgorithmSelector/CMakeLists.txt b/samples/sampleAlgorithmSelector/CMakeLists.txt
index ef9386b3..3b30570c 100644
--- a/samples/sampleAlgorithmSelector/CMakeLists.txt
+++ b/samples/sampleAlgorithmSelector/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleAlgorithmSelector/sampleAlgorithmSelector.cpp b/samples/sampleAlgorithmSelector/sampleAlgorithmSelector.cpp
index 0072f761..02fd9975 100644
--- a/samples/sampleAlgorithmSelector/sampleAlgorithmSelector.cpp
+++ b/samples/sampleAlgorithmSelector/sampleAlgorithmSelector.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleCharRNN/CMakeLists.txt b/samples/sampleCharRNN/CMakeLists.txt
index 89d82682..d52245fb 100644
--- a/samples/sampleCharRNN/CMakeLists.txt
+++ b/samples/sampleCharRNN/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleCharRNN/sampleCharRNN.cpp b/samples/sampleCharRNN/sampleCharRNN.cpp
index 73ba53cc..8ddbb2ac 100644
--- a/samples/sampleCharRNN/sampleCharRNN.cpp
+++ b/samples/sampleCharRNN/sampleCharRNN.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleDynamicReshape/CMakeLists.txt b/samples/sampleDynamicReshape/CMakeLists.txt
index 374b5566..548e9bd5 100644
--- a/samples/sampleDynamicReshape/CMakeLists.txt
+++ b/samples/sampleDynamicReshape/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleDynamicReshape/sampleDynamicReshape.cpp b/samples/sampleDynamicReshape/sampleDynamicReshape.cpp
index d91b1a68..0f880509 100644
--- a/samples/sampleDynamicReshape/sampleDynamicReshape.cpp
+++ b/samples/sampleDynamicReshape/sampleDynamicReshape.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleINT8API/CMakeLists.txt b/samples/sampleINT8API/CMakeLists.txt
index e8eed5c3..00a6e82b 100644
--- a/samples/sampleINT8API/CMakeLists.txt
+++ b/samples/sampleINT8API/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleINT8API/sampleINT8API.cpp b/samples/sampleINT8API/sampleINT8API.cpp
index a20acff3..7cf6e819 100644
--- a/samples/sampleINT8API/sampleINT8API.cpp
+++ b/samples/sampleINT8API/sampleINT8API.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleIOFormats/CMakeLists.txt b/samples/sampleIOFormats/CMakeLists.txt
index 4ec93187..4640a2ff 100755
--- a/samples/sampleIOFormats/CMakeLists.txt
+++ b/samples/sampleIOFormats/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +16,11 @@
#
SET(SAMPLE_SOURCES
sampleIOFormats.cpp
+ ../common/sampleDevice.cpp
+ ../common/sampleEngines.cpp
+ ../common/sampleOptions.cpp
+ ../common/sampleUtils.cpp
+ ../common/bfloat16.cpp
)
SET(SAMPLE_PARSERS "onnx")
diff --git a/samples/sampleIOFormats/sampleIOFormats.cpp b/samples/sampleIOFormats/sampleIOFormats.cpp
index 2c8b87af..9e167134 100644
--- a/samples/sampleIOFormats/sampleIOFormats.cpp
+++ b/samples/sampleIOFormats/sampleIOFormats.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,6 +33,7 @@
#include "half.h"
#include "logger.h"
#include "parserOnnxConfig.h"
+#include "sampleOptions.h"
#include "NvInfer.h"
#include "NvOnnxParser.h"
@@ -144,6 +145,15 @@ class BufferDesc
}
};
+//! Specification for a network I/O tensor: the expectation that SampleIOFormats::verify() checks the engine against.
+class TypeSpec
+{
+public:
+ DataType dtype; //!< data type expected for the engine's input and output tensors
+ TensorFormat format; //!< tensor format; main() copies it into SampleIOFormats::mTensorFormat before building
+ std::string formatName; //!< name of the format (e.g. "kCHW2"), used in log messages and in format verification
+};
+
class SampleBuffer
{
public:
@@ -245,30 +255,14 @@ class SampleIOFormats
bool build(int32_t dataWidth);
//!
- //! \brief Runs the TensorRT inference engine for this sample
- //!
- bool infer(SampleBuffer& inputBuf, SampleBuffer& outputBuf);
-
- //!
- //! \brief Used to run CPU reference and get result
- //!
- bool reference();
-
- //!
- //! \brief Used to compare the CPU reference with the TRT result
+ //! \brief Verify the built engine I/O types and formats.
//!
- void compareResult();
+ bool verify(TypeSpec const& spec);
//!
- //! \brief Reads the digit map from the file
- //!
- bool readDigits(SampleBuffer& buffer, int32_t groundTruthDigit);
-
- //!
- //! \brief Verifies that the output is correct and prints it
+ //! \brief Runs the TensorRT inference engine for this sample
//!
- template
- bool verifyOutput(SampleBuffer& outputBuf, int32_t groundTruthDigit) const;
+ bool infer(SampleBuffer& inputBuf, SampleBuffer& outputBuf);
private:
//!
@@ -293,6 +287,62 @@ class SampleIOFormats
int32_t mDigit;
};
+//!
+//! \brief Validates engine I/O datatypes and formats against a reference.
+//!
+//! \details Queries the I/O datatype and format description from the built engine. Both tensors must report
+//! `spec.dtype`; the input description must mention `spec.formatName` and the output description `kLINEAR`.
+//! Validating them is sufficient to ensure `ITensor::setType` and `ITensor::setAllowedFormats` work as expected.
+//!
+//! \return true if type and format validation succeeds.
+//!
+bool SampleIOFormats::verify(TypeSpec const& spec)
+{
+    assert(mEngine->getNbIOTensors() == 2);
+    char const* inputName = mEngine->getIOTensorName(0);
+    char const* outputName = mEngine->getIOTensorName(1);
+
+    auto verifyType = [](DataType actual, DataType expected) {
+        if (actual != expected)
+        {
+            sample::gLogError << "Expected " << expected << " data type, got " << actual << std::endl;
+            return false;
+        }
+        return true;
+    };
+
+    if (!verifyType(mEngine->getTensorDataType(inputName), spec.dtype))
+    {
+        return false;
+    }
+
+    if (!verifyType(mEngine->getTensorDataType(outputName), spec.dtype))
+    {
+        return false;
+    }
+
+    auto verifyFormat = [](std::string const& actual, std::string const& expected) {
+        if (actual.find(expected) == std::string::npos) // the engine's description must mention the expected name
+        {
+            sample::gLogError << "Expected " << expected << " format, got " << actual << std::endl;
+            return false;
+        }
+        return true;
+    };
+
+    if (!verifyFormat(std::string(mEngine->getTensorFormatDesc(inputName)), spec.formatName))
+    {
+        return false;
+    }
+
+    if (!verifyFormat(std::string(mEngine->getTensorFormatDesc(outputName)), "kLINEAR")) // output is always linear
+    {
+        return false;
+    }
+
+    return true;
+}
+
//!
//! \brief Creates the network, configures the builder and creates the network engine
//!
@@ -474,134 +524,6 @@ bool SampleIOFormats::infer(SampleBuffer& inputBuf, SampleBuffer& outputBuf)
return true;
}
-//!
-//! \brief Reads the digit map from file
-//!
-bool SampleIOFormats::readDigits(SampleBuffer& buffer, int32_t groundTruthDigit)
-{
- int32_t const inputH = buffer.dims.d[2];
- int32_t const inputW = buffer.dims.d[3];
-
- // Read a random digit file
- std::vector fileData(inputH * inputW);
- readPGMFile(
- locateFile(std::to_string(groundTruthDigit) + ".pgm", mParams.dataDirs), fileData.data(), inputH, inputW);
-
- // Print ASCII representation of digit
- for (int32_t i = 0; i < inputH * inputW; i++)
- {
- sample::gLogInfo << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % inputW) ? "" : "\n");
- }
- sample::gLogInfo << std::endl;
-
- float* inputBuf = reinterpret_cast(buffer.buffer);
-
- for (int32_t i = 0; i < inputH * inputW; i++)
- {
- inputBuf[i] = 1.0F - static_cast(fileData[i] / 255.0F);
- }
-
- return true;
-}
-
-//!
-//! \brief Verifies that the output is correct and prints it
-//!
-template
-bool SampleIOFormats::verifyOutput(SampleBuffer& outputBuf, int32_t groundTruthDigit) const
-{
- T const* prob = reinterpret_cast(outputBuf.buffer);
-
- float val{0.0F};
- float elem{0.0F};
- int32_t idx{0};
- int32_t const kDIGITS = 10;
-
- for (int32_t i = 0; i < kDIGITS; i++)
- {
- elem = static_cast(prob[i]);
- if (val < elem)
- {
- val = elem;
- idx = i;
- }
- }
- sample::gLogInfo << "Predicted Output: " << idx << std::endl;
-
- return (idx == groundTruthDigit && val > 0.9F);
-}
-
-int32_t calcIndex(SampleBuffer& buffer, int32_t c, int32_t h, int32_t w)
-{
- int32_t index;
-
- if (!buffer.desc.channelPivot)
- {
- index = c / buffer.desc.dims[4] * buffer.desc.dims[2] * buffer.desc.dims[3] * buffer.desc.dims[4]
- + h * buffer.desc.dims[3] * buffer.desc.dims[4] + w * buffer.desc.dims[4] + c % buffer.desc.dims[4];
- }
- else
- {
- index = h * buffer.desc.dims[3] * buffer.desc.dims[2] + w * buffer.desc.dims[3] + c;
- }
-
- return index;
-}
-
-//!
-//! \brief Reformats the buffer. Src and dst buffers should be of same datatype and dims.
-//!
-template
-void reformat(SampleBuffer& src, SampleBuffer& dst)
-{
- if (src.format == dst.format)
- {
- memcpy(dst.buffer, src.buffer, src.getBufferSize());
- return;
- }
-
- int32_t srcIndex, dstIndex;
-
- T* srcBuf = reinterpret_cast(src.buffer);
- T* dstBuf = reinterpret_cast(dst.buffer);
-
- for (int32_t c = 0; c < src.dims.d[1]; c++)
- {
- for (int32_t h = 0; h < src.dims.d[2]; h++)
- {
- for (int32_t w = 0; w < src.dims.d[3]; w++)
- {
- srcIndex = calcIndex(src, c, h, w);
- dstIndex = calcIndex(dst, c, h, w);
- dstBuf[dstIndex] = srcBuf[srcIndex];
- }
- }
- }
-}
-
-template
-void convertGoldenData(SampleBuffer& goldenInput, SampleBuffer& dstInput)
-{
- SampleBuffer tmpBuf(goldenInput.dims, sizeof(T), goldenInput.format, true);
-
- float* golden = reinterpret_cast(goldenInput.buffer);
- T* tmp = reinterpret_cast(tmpBuf.buffer);
-
- for (int32_t i = 0; i < goldenInput.desc.getElememtSize(); i++)
- {
- if (std::is_same::value)
- {
- tmp[i] = static_cast(1 - ((1.0F - golden[i]) * 255.0F - 128) / 255.0F);
- }
- else
- {
- tmp[i] = static_cast(golden[i]);
- }
- }
-
- reformat(tmpBuf, dstInput);
-}
-
//!
//! \brief Initializes members of the params struct using the command line args
//!
@@ -644,67 +566,29 @@ void printHelpInfo()
//!
template
bool process(SampleIOFormats& sample, sample::Logger::TestAtom const& sampleTest, SampleBuffer& inputBuf,
- SampleBuffer& outputBuf, SampleBuffer& goldenInput)
+ SampleBuffer& outputBuf, TypeSpec& spec)
{
sample::gLogInfo << "Building and running a GPU inference engine with specified I/O formats." << std::endl;
- inputBuf = SampleBuffer(sample.mInputDims, sizeof(T), sample.mTensorFormat, true);
- outputBuf = SampleBuffer(sample.mOutputDims, sizeof(T), TensorFormat::kLINEAR, false);
if (!sample.build(sizeof(T)))
{
return false;
}
- convertGoldenData(goldenInput, inputBuf);
-
- if (!sample.infer(inputBuf, outputBuf))
- {
- return false;
- }
-
- if (!sample.verifyOutput(outputBuf, sample.mDigit))
- {
- return false;
- }
-
- return true;
-}
-
-bool runFP32Reference(SampleIOFormats& sample, sample::Logger::TestAtom const& sampleTest, SampleBuffer& goldenInput,
- SampleBuffer& goldenOutput)
-{
- sample::gLogInfo << "Building and running a FP32 GPU inference to get golden input/output" << std::endl;
-
- if (!sample.build(sizeof(float)))
+ if (!sample.verify(spec))
{
return false;
}
- goldenInput = SampleBuffer(sample.mInputDims, sizeof(float), TensorFormat::kLINEAR, true);
- goldenOutput = SampleBuffer(sample.mOutputDims, sizeof(float), TensorFormat::kLINEAR, false);
-
- sample.readDigits(goldenInput, sample.mDigit);
-
- if (!sample.infer(goldenInput, goldenOutput))
- {
- return false;
- }
+ inputBuf = SampleBuffer(sample.mInputDims, sizeof(T), sample.mTensorFormat, true);
+ outputBuf = SampleBuffer(sample.mOutputDims, sizeof(T), TensorFormat::kLINEAR, false);
- if (!sample.verifyOutput(goldenOutput, sample.mDigit))
+ if (!sample.infer(inputBuf, outputBuf))
{
return false;
}
-
return true;
}
-//! Specification for a network I/O tensor.
-class IOSpec
-{
-public:
- TensorFormat format; //!< format
- std::string formatName; //!< name of the format
-};
-
int32_t main(int32_t argc, char** argv)
{
samplesCommon::Args args;
@@ -727,56 +611,45 @@ int32_t main(int32_t argc, char** argv)
samplesCommon::OnnxSampleParams params = initializeSampleParams(args);
- std::vector vecFP16TensorFmt = {
- IOSpec{TensorFormat::kLINEAR, "kLINEAR"},
- IOSpec{TensorFormat::kCHW2, "kCHW2"},
- IOSpec{TensorFormat::kHWC8, "kHWC8"},
- };
- std::vector vecINT8TensorFmt = {
- IOSpec{TensorFormat::kLINEAR, "kLINEAR"},
- IOSpec{TensorFormat::kCHW4, "kCHW4"},
- IOSpec{TensorFormat::kCHW32, "kCHW32"},
+ std::vector fp16TypeSpec = {
+ TypeSpec{DataType::kHALF, TensorFormat::kLINEAR, "kLINEAR"},
+ TypeSpec{DataType::kHALF, TensorFormat::kCHW2, "kCHW2"},
+ TypeSpec{DataType::kHALF, TensorFormat::kHWC8, "kHWC8"},
};
- SampleBuffer goldenInput, goldenOutput;
+ std::vector int8TypeSpec = {
+ TypeSpec{DataType::kINT8, TensorFormat::kLINEAR, "kLINEAR"},
+ TypeSpec{DataType::kINT8, TensorFormat::kCHW4, "kCHW4"},
+ TypeSpec{DataType::kINT8, TensorFormat::kCHW32, "kCHW32"},
+ };
SampleIOFormats sample(params);
- srand(unsigned(time(nullptr)));
- sample.mDigit = rand() % 10;
-
- sample::gLogInfo << "The test chooses MNIST as the network and recognizes a randomly generated digit" << std::endl;
sample::gLogInfo
- << "Firstly it runs the FP32 as the golden data, then INT8/FP16 with different formats will be tested"
- << std::endl
+ << "Build TRT engine with different IO data type and formats. Ensure that built engine abide by them"
<< std::endl;
- if (!runFP32Reference(sample, sampleTest, goldenInput, goldenOutput))
- {
- return sample::gLogger.reportFail(sampleTest);
- }
-
// Test FP16 formats
- for (auto spec : vecFP16TensorFmt)
+ for (auto spec : fp16TypeSpec)
{
sample::gLogInfo << "Testing datatype FP16 with format " << spec.formatName << std::endl;
sample.mTensorFormat = spec.format;
SampleBuffer inputBuf, outputBuf;
- if (!process(sample, sampleTest, inputBuf, outputBuf, goldenInput))
+ if (!process(sample, sampleTest, inputBuf, outputBuf, spec))
{
return sample::gLogger.reportFail(sampleTest);
}
}
// Test INT8 formats
- for (auto spec : vecINT8TensorFmt)
+ for (auto spec : int8TypeSpec)
{
sample::gLogInfo << "Testing datatype INT8 with format " << spec.formatName << std::endl;
sample.mTensorFormat = spec.format;
SampleBuffer inputBuf, outputBuf;
- if (!process(sample, sampleTest, inputBuf, outputBuf, goldenInput))
+ if (!process(sample, sampleTest, inputBuf, outputBuf, spec))
{
return sample::gLogger.reportFail(sampleTest);
}
diff --git a/samples/sampleNamedDimensions/CMakeLists.txt b/samples/sampleNamedDimensions/CMakeLists.txt
index f03d19b1..21662668 100644
--- a/samples/sampleNamedDimensions/CMakeLists.txt
+++ b/samples/sampleNamedDimensions/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleNamedDimensions/create_model.py b/samples/sampleNamedDimensions/create_model.py
index e4146aa5..575bd4e6 100644
--- a/samples/sampleNamedDimensions/create_model.py
+++ b/samples/sampleNamedDimensions/create_model.py
@@ -1,5 +1,5 @@
#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleNamedDimensions/sampleNamedDimensions.cpp b/samples/sampleNamedDimensions/sampleNamedDimensions.cpp
index 42298ba4..11e04841 100644
--- a/samples/sampleNamedDimensions/sampleNamedDimensions.cpp
+++ b/samples/sampleNamedDimensions/sampleNamedDimensions.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/samples/sampleNonZeroPlugin/README.md b/samples/sampleNonZeroPlugin/README.md
index 15e8e4c2..10e45109 100644
--- a/samples/sampleNonZeroPlugin/README.md
+++ b/samples/sampleNonZeroPlugin/README.md
@@ -16,7 +16,7 @@
## Description
This sample, sampleNonZeroPlugin, implements a plugin for the NonZero operation, customizable to output the non-zero indices in
-either a row major (each set of indices in the same row) or column major format (each set of indices in the same column).
+either a row order (each set of indices in the same row) or column order format (each set of indices in the same column).
NonZero is an operation where the non-zero indices of the input tensor is found.
@@ -36,7 +36,7 @@ Until `IPluginV3` (and associated interfaces), TensorRT plugins could not have o
on input shapes). `IPluginV3OneBuild` which exposes a build capability for `IPluginV3`, provides support for such data-dependent output shapes.
`NonZeroPlugin` in this sample is written to handle 2-D input tensors of shape $R \times C$. Assume that the tensor contains $K$ non-zero elements and that the
-non-zero indices are required in a row-major order. Then the output shape would be $K \times 2$.
+non-zero indices are required in a row ordering (each set of indices in its own row). Then the output shape would be $K \times 2$.
The output shapes are expressed to the TensorRT builder through the `IPluginV3OneBuild::getOutputShapes()` API. Expressing the second dimension of the output is
straightforward:
@@ -70,7 +70,7 @@ and let's not forget to declare that the size tensor is a scalar (0-D):
outputs[1].nbDims = 0;
```
-The `NonZeroPlugin` can also be configured to emit the non-zero indices in a column-major fashion through the `rowMajor` plugin attribute, by setting it to `0`.
+The `NonZeroPlugin` can also be configured to emit the non-zero indices in a column-order fashion through the `rowOrder` plugin attribute, by setting it to `0`.
In this case, the first output of the plugin will have shape $2 \times K$, and the output shape specification must be adjusted accordingly.
### Creating network and building the engine
@@ -95,7 +95,7 @@ Download the sample data from the [TensorRT release tarball](https://developer.n
2. Run the sample to build and run the MNIST engine from the ONNX model.
```
- ./sample_non_zero_plugin [-h or --help] [-d or --datadir=] [--columnMajor] [--fp16]
+ ./sample_non_zero_plugin [-h or --help] [-d or --datadir=] [--columnOrder] [--fp16]
```
3. Verify that the sample ran successfully. If the sample runs successfully you should see output similar to the following:
diff --git a/samples/sampleNonZeroPlugin/nonZeroKernel.cu b/samples/sampleNonZeroPlugin/nonZeroKernel.cu
index 7e015b2c..cdb4c615 100644
--- a/samples/sampleNonZeroPlugin/nonZeroKernel.cu
+++ b/samples/sampleNonZeroPlugin/nonZeroKernel.cu
@@ -17,8 +17,23 @@
#include "nonZeroKernel.h"
+inline __device__ int32_t isZero(float const& a) // yields 1 when `a` compares equal to 0.F, otherwise 0
+{
+ return a == 0.F;
+}
+
+inline __device__ int32_t isZero(half const& a) // half overload of the zero test; same 1/0 result convention
+{
+#if __CUDA_ARCH__ >= 530
+ return a == __float2half(0.F); // compare directly against a half-precision zero
+#else
+ return __half2float(a) == 0.F; // otherwise convert to float first and compare there
+#endif
+}
+
+template
__global__ void findNonZeroIndicesKernel(
- float const* X, int32_t* indices, int32_t* count, int32_t const* K, int32_t R, int32_t C, bool rowMajor)
+ T const* X, int32_t* indices, int32_t* count, int32_t const* K, int32_t R, int32_t C, int32_t rowOrder)
{
int32_t col = blockIdx.x * blockDim.x + threadIdx.x;
@@ -27,12 +42,12 @@ __global__ void findNonZeroIndicesKernel(
{
for (int32_t row = 0; row < R; ++row)
{
- if (X[row + R * col] != 0.F)
+ if (!isZero(X[row * C + col]))
{
int32_t index = atomicAdd(count, 1); // Increment count atomically and get the previous value
if (indices)
{
- if(!rowMajor)
+ if(rowOrder == 0)
{
indices[index] = row;
indices[index + *K] = col;
@@ -48,11 +63,20 @@ __global__ void findNonZeroIndicesKernel(
}
}
-void nonZeroIndicesImpl(
- float const* X, int32_t* indices, int32_t* count, int32_t const* K, int32_t R, int32_t C, bool rowMajor, cudaStream_t stream)
+template
+void nonZeroIndicesImpl(T const* X, int32_t* indices, int32_t* count, int32_t const* K, int32_t R, int32_t C,
+ bool rowOrder, cudaStream_t stream)
{
constexpr int32_t kBLOCK_SIZE = 256;
- int32_t const blocksPerGrid = (R + kBLOCK_SIZE - 1) / kBLOCK_SIZE;
-
- findNonZeroIndicesKernel<<