TensorRT inference for DETR and Deformable DETR #26

Open · wants to merge 5 commits into base branch `multiscale_detr_deformable`
3 changes: 3 additions & 0 deletions .gitignore
@@ -130,3 +130,6 @@ dmypy.json

# Pyre type checker
.pyre/

.vscode
detr_tf/custom_ops/ms_deform_attn/ms_deform_im2col.o
81 changes: 79 additions & 2 deletions README.md
@@ -7,8 +7,9 @@ Tensorflow implementation of DETR : Object Detection with Transformers, includin
* [3. Tutorials](#tutorials)
* [4. Finetuning](#finetuning)
* [5. Training](#training)
* [5. inference](#inference)
* [6. Acknowledgement](#acknowledgement)
* [5. Inference](#inference)
* [6. Inference with TensorRT](#inference-with-tensorrt)
* [7. Acknowledgement](#acknowledgement)


<b>DETR paper:</b> https://arxiv.org/pdf/2005.12872.pdf <br>
@@ -152,6 +153,82 @@ python webcam_inference.py

<img src="images/webcam_detr.png" width="400"></img>

## Inference with TensorRT

### Requirements:
```
cmake >= 3.8
TensorRT 8
```
To install TensorRT 8, follow [NVIDIA TensorRT official installation guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html).

Python package requirements:
```
onnx
tf2onnx
```

### Custom plugin for Deformable DETR
Deformable DETR uses a custom operation, Im2Col, in its Transformer layers. This operation is not supported by TensorRT, so a custom TensorRT plugin has to be built from source (a sketch of loading the built plugin at runtime follows the parameter list below).

```
cd detr_tensorrt/plugins/ms_deform_im2col
mkdir build && cd build
cmake .. \
    -DTRT_LIB=/path/to/tensorrt/lib/ \
    -DTRT_INCLUDE=/path/to/tensorrt/include/ \
    -DCUDA_ARCH_SM=/your_gpu_cuda_arch/
make -j
```
For more detail, see: `detr_tensorrt/plugins/ms_deform_im2col/README.txt`

Parameters:
- `-DTRT_LIB`: Path to TensorRT lib. It could be `YOUR_TENSORRT_DIR/lib` or `/usr/lib/x86_64-linux-gnu`
- `-DTRT_INCLUDE`: Path to TensorRT C++ include. It could be `YOUR_TENSORRT_DIR/include` or `/usr/include/x86_64-linux-gnu`
- `-DCUDA_ARCH_SM`: Compute capability of your NVIDIA GPU. Example: `70` for Tesla V100. Check [here](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for other GPUs.
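
Once the plugin has been built, the resulting shared library must be loaded before TensorRT can build or deserialize an engine that contains the custom op. A minimal sketch, assuming the CMake build produces a library such as `libms_deform_im2col_trt.so` inside the `build` directory (the exact file name depends on the CMake target):

```python
import ctypes
import tensorrt as trt

# Load the compiled plugin so its creator registers itself with TensorRT.
# The library path below is an assumption; point it at whatever the build produced.
ctypes.CDLL("detr_tensorrt/plugins/ms_deform_im2col/build/libms_deform_im2col_trt.so")

# Register all plugin creators (built-in ones plus the library loaded above).
trt.init_libnvinfer_plugins(trt.Logger(trt.Logger.WARNING), "")
```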

### Workflow
Tensorflow model --> ONNX --> TensorRT serialized engine

#### Export Tensorflow graph to ONNX graph:

For each model (`detr` or `deformable-detr`), run:
```
python3 detr_tensorrt/export_onnx.py MODEL_NAME
[--input_shape H W]
[--save_to DIR_TO_SAVE_ONNX_FILE]
```
Parameters:
- `--input_shape`: image height and width. Default: `1280 1920`
- `--save_to`: directory where the ONNX file will be saved. Default: `./weights/MODEL_NAME/MODEL_NAME_trt/`
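
`export_onnx.py` takes care of this step and relies on the `tf2onnx` package listed above. As a rough illustration of what a fixed-shape Tensorflow-to-ONNX export looks like (the tiny placeholder model and output path below are illustrative, not the script's actual internals):

```python
import tensorflow as tf
import tf2onnx

# Placeholder Keras model standing in for DETR / Deformable DETR.
model = tf.keras.Sequential([tf.keras.layers.Conv2D(8, 3, input_shape=(1280, 1920, 3))])

# Fixed input shape: batch 1, the README default resolution 1280x1920, 3 channels.
spec = (tf.TensorSpec((1, 1280, 1920, 3), tf.float32, name="input"),)

# The real script writes to ./weights/MODEL_NAME/MODEL_NAME_trt/MODEL_NAME.onnx by default.
tf2onnx.convert.from_keras(model, input_signature=spec, opset=13,
                           output_path="model.onnx")
```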

#### Convert ONNX model to TensorRT serialized engine:
```
python3 detr_tensorrt/onnx2engine.py MODEL_NAME
[--precision PRECISION]
[--onnx_dir ONNX_DIR]
[--verbose]
```
Parameters:
- `--precision`: precision of the model weights: `FP32`, `FP16`, or `MIX`. `MIX` leaves TensorRT free to run each layer in either FP32 or FP16; in most cases the inference times of FP16 and MIX differ little.
- `--onnx_dir`: directory containing the ONNX file to be converted to a TensorRT engine. The ONNX file must be named `MODEL_NAME.onnx`. Default: `./weights/MODEL_NAME/MODEL_NAME_trt/`
- `--verbose`: print TensorRT logs at all severity levels

The TensorRT serialized engine will be saved as `ONNX_DIR/MODEL_NAME_PRECISION.engine`.
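
`onnx2engine.py` wraps the engine construction; the same conversion can be done programmatically with the `TRTEngineBuilder` helper added in this PR. A minimal sketch for the `detr` model, assuming the README default paths and that the repository root is on `PYTHONPATH`:

```python
from detr_tensorrt.TRTEngineBuilder import TRTEngineBuilder

# Build an FP16 engine from the exported ONNX graph (paths follow the README defaults).
builder = TRTEngineBuilder("weights/detr/detr_trt/detr.onnx", FP16_allowed=True)
builder.set_workspace_size(4)  # GiB of workspace handed to the TensorRT builder
builder.export_engine("weights/detr/detr_trt/detr_FP16.engine")
```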

### Run inference
An example of inference with a test image: `images/test.jpeg`

```
python tensorrt_inference.py --engine_path ENGINE_PATH
```
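
`tensorrt_inference.py` also handles pre- and post-processing; the core engine execution can be reproduced with the `TRTExecutor` helper added in this PR. A minimal sketch (the binding name `"input"` and the `.host` buffer layout provided by `trt_helper.py` are assumptions based on the standard TensorRT Python samples; for Deformable DETR, load the custom plugin first as shown above):

```python
import numpy as np
from detr_tensorrt.TRTExecutor import TRTExecutor

executor = TRTExecutor(engine_path="weights/detr/detr_trt/detr_FP16.engine")
executor.print_bindings_info()  # shows binding names, shapes and dtypes

# Copy a preprocessed image into the pagelocked host buffer of the input binding.
image = np.random.rand(1, 1280, 1920, 3).astype(np.float32)  # stand-in for a real image
np.copyto(executor.dict_inputs["input"].host, image.ravel())

executor.execute()  # host-to-device copy, inference, device-to-host copy

for name, out in executor.dict_outputs.items():
    print(name, out.host.shape)
```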

Inference time in milliseconds:
| | DETR | Deformable DETR |
|---------------|------|-----------------|
| Tensorflow | 100 | 160 |
| TensorRT FP32 | 60 | 100 |
| TensorRT FP16 | 27 | 60 |

## Acknowledgement

87 changes: 87 additions & 0 deletions detr_tensorrt/TRTEngineBuilder.py
@@ -0,0 +1,87 @@
import tensorrt as trt
import os

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
network_creation_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def GiB(val):
    """Convert a size in GiB to bytes."""
    return val * 1 << 30


class TRTEngineBuilder():
    """
    Works with TensorRT 8. Should work fine with TensorRT 7.2.3 (not tested).

    Helper class to build a TensorRT engine from an ONNX graph file (including weights).
    The graph must have a defined input shape. For more detail, see the TensorRT Developer Guide:
    https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
    """
    def __init__(self, onnx_file_path, FP16_allowed=False, INT8_allowed=False, strict_type=False, calibrator=None, logger=TRT_LOGGER):
        """
        Parameters:
        -----------
        onnx_file_path: str
            Path to the ONNX graph file.
        FP16_allowed: bool
            Enable FP16 precision for the engine builder.
        INT8_allowed: bool
            Enable INT8 precision for the engine builder; the user must also provide a calibrator.
        strict_type: bool
            Force the builder to respect the requested precision.
        calibrator: instance extending tensorrt.IInt8Calibrator
            Used for INT8 quantization.
        """
        self.FP16_allowed = FP16_allowed
        self.INT8_allowed = INT8_allowed
        self.onnx_file_path = onnx_file_path
        self.calibrator = calibrator
        self.max_workspace_size = GiB(8)
        self.strict_type = strict_type
        self.logger = logger

    def set_workspace_size(self, workspace_size_GiB):
        self.max_workspace_size = GiB(workspace_size_GiB)

    def get_engine(self):
        """
        Set up the engine builder, read the ONNX graph and build the TensorRT engine.
        """
        global network_creation_flag
        with trt.Builder(self.logger) as builder, builder.create_network(network_creation_flag) as network, trt.OnnxParser(network, self.logger) as parser:
            builder.max_batch_size = 1
            config = builder.create_builder_config()
            config.max_workspace_size = self.max_workspace_size
            # FP16
            if self.FP16_allowed:
                config.set_flag(trt.BuilderFlag.FP16)
            # INT8
            if self.INT8_allowed:
                raise NotImplementedError()
            if self.strict_type:
                config.set_flag(trt.BuilderFlag.STRICT_TYPES)

            # Load and parse the ONNX model, then build the engine
            with open(self.onnx_file_path, 'rb') as model:
                if not parser.parse(model.read()):
                    print('ERROR: Failed to parse the ONNX file.')
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None
                else:
                    print("ONNX file is loaded")
            print("Building engine...")
            engine = builder.build_engine(network, config)
            if engine is None:
                raise Exception("TRT engine build error. Check the log.")
            print("Engine built")
            return engine

    def export_engine(self, engine_path):
        """Serialize the TensorRT engine and save it to disk."""
        engine = self.get_engine()
        assert engine is not None, "Error while parsing engine from ONNX"
        with open(engine_path, "wb") as f:
            print("Serializing and saving engine: " + engine_path)
            f.write(engine.serialize())
        print("Engine exported")


131 changes: 131 additions & 0 deletions detr_tensorrt/TRTExecutor.py
@@ -0,0 +1,131 @@
import ctypes
import pycuda.autoinit as cuda_init  # initializes a CUDA context for this process
from detr_tensorrt.trt_helper import *  # provides HostDeviceMem, allocate_buffers, do_inference, do_inference_async
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# trt.init_libnvinfer_plugins(None, "")


class TRTExecutor():
    """
    A helper class to execute a TensorRT engine (asynchronous execution on a CUDA stream).

    Attributes:
    -----------
    stream: pycuda.driver.Stream
    engine: tensorrt.ICudaEngine
    context: tensorrt.IExecutionContext
    inputs/outputs: list[HostDeviceMem]
        See trt_helper.py.
    bindings: list[int]
        GPU pointers for each input/output binding of the engine.
    dict_inputs/dict_outputs: dict[str, HostDeviceMem]
        key = binding name
        value = HostDeviceMem of the corresponding binding
    """
    def __init__(self, engine_path=None, has_dynamic_shape=False, stream=None, engine=None):
        """
        Parameters:
        ----------
        engine_path: str
            Path to a serialized TensorRT engine.
        has_dynamic_shape: bool
            If True, buffers are not allocated here; call set_binding_shape and allocate_mem first.
        stream: pycuda.driver.Stream
            If None, one will be created by the allocate_buffers function.
        """
        self.stream = stream
        if engine_path is not None:
            with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
                print("Reading engine ...")
                self.engine = runtime.deserialize_cuda_engine(f.read())
                assert self.engine is not None, "Read engine failed"
                print("Engine loaded")
        elif engine is not None:
            self.engine = engine
        self.context = self.engine.create_execution_context()
        if not has_dynamic_shape:
            self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.context, self.stream)
            self.dict_inputs = {mem_obj.name: mem_obj for mem_obj in self.inputs}
            self.dict_outputs = {mem_obj.name: mem_obj for mem_obj in self.outputs}

    def print_bindings_info(self):
        print("ID / Name / isInput / shape / dtype")
        for i in range(self.engine.num_bindings):
            print(f"Binding: {i}, name: {self.engine.get_binding_name(i)}, input: {self.engine.binding_is_input(i)}, shape: {self.engine.get_binding_shape(i)}, dtype: {self.engine.get_binding_dtype(i)}")

    def execute(self):
        do_inference_async(
            self.context,
            bindings=self.bindings,
            inputs=self.inputs,
            outputs=self.outputs,
            stream=self.stream
        )

    def set_binding_shape(self, binding: int, shape: tuple):
        self.context.set_binding_shape(binding, shape)

    def allocate_mem(self):
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.context, self.stream)
        self.dict_inputs = {mem_obj.name: mem_obj for mem_obj in self.inputs}
        self.dict_outputs = {mem_obj.name: mem_obj for mem_obj in self.outputs}


class TRTExecutor_Sync():
    """
    A helper class to execute a TensorRT engine (synchronous execution).

    Attributes:
    -----------
    engine: tensorrt.ICudaEngine
    context: tensorrt.IExecutionContext
    inputs/outputs: list[HostDeviceMem]
        See trt_helper.py.
    bindings: list[int]
        GPU pointers for each input/output binding of the engine.
    dict_inputs/dict_outputs: dict[str, HostDeviceMem]
        key = binding name
        value = HostDeviceMem of the corresponding binding
    """
    def __init__(self, engine_path=None, has_dynamic_shape=False, engine=None):
        """
        Parameters:
        ----------
        engine_path: str
            Path to a serialized TensorRT engine.
        has_dynamic_shape: bool
            If True, buffers are not allocated at construction time.
        """
        if engine_path is not None:
            with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
                print("Reading engine ...")
                self.engine = runtime.deserialize_cuda_engine(f.read())
                assert self.engine is not None, "Read engine failed"
                print("Engine loaded")
        elif engine is not None:
            self.engine = engine
        self.context = self.engine.create_execution_context()
        if not has_dynamic_shape:
            self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.context, is_async=False)
            self.dict_inputs = {mem_obj.name: mem_obj for mem_obj in self.inputs}
            self.dict_outputs = {mem_obj.name: mem_obj for mem_obj in self.outputs}

    def print_bindings_info(self):
        print("ID / Name / isInput / shape / dtype")
        for i in range(self.engine.num_bindings):
            print(f"Binding: {i}, name: {self.engine.get_binding_name(i)}, input: {self.engine.binding_is_input(i)}, shape: {self.engine.get_binding_shape(i)}, dtype: {self.engine.get_binding_dtype(i)}")

    def execute(self):
        do_inference(
            self.context,
            bindings=self.bindings,
            inputs=self.inputs,
            outputs=self.outputs,
        )

    def set_binding_shape(self, binding: int, shape: tuple):
        self.context.set_binding_shape(binding, shape)
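
For an engine built with dynamic input shapes, buffer allocation has to wait until a concrete shape is chosen; the `has_dynamic_shape` flag together with `set_binding_shape` and `allocate_mem` covers that path. A minimal sketch (engine path, binding index and shape are illustrative):

```python
from detr_tensorrt.TRTExecutor import TRTExecutor

# Dynamic-shape path: construct without buffers, fix the input shape, then allocate.
executor = TRTExecutor(engine_path="weights/detr/detr_trt/detr_FP32.engine",
                       has_dynamic_shape=True)
executor.set_binding_shape(0, (1, 720, 1280, 3))  # binding 0 assumed to be the image input
executor.allocate_mem()
executor.execute()
```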



