From d17efa7bd7acefeaa8cec364f63c48210e1ce091 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Thu, 25 May 2023 17:22:08 -0500 Subject: [PATCH 01/33] Working multiprocess solution Signed-off-by: Josh Minor --- CMakeLists.txt | 164 ++++-- src/config.h | 28 + src/model_instance.cc | 396 +++++++++++++ src/model_instance.h | 116 ++++ src/model_instance_main.cc | 86 +++ src/model_instance_utils.h | 25 + src/papi_profiler.cc | 15 +- src/papi_profiler.h | 1 - src/tflite.cc | 1098 ++++++++++++++++++------------------ src/tflite_utils.cc | 46 +- src/tflite_utils.h | 8 +- 11 files changed, 1353 insertions(+), 630 deletions(-) create mode 100644 src/config.h create mode 100644 src/model_instance.cc create mode 100644 src/model_instance.h create mode 100644 src/model_instance_main.cc create mode 100644 src/model_instance_utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 940188a..177b8b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,6 +118,9 @@ if(NOT (ACL_VERSION VERSION_GREATER "21.05")) list(APPEND ACL_BUILD_FLAGS "internal_only=0") endif() +# Enable REPROC++ +set(REPROC++ ON) + # # Dependencies # @@ -141,8 +144,18 @@ FetchContent_Declare( GIT_REPOSITORY https://github.com/triton-inference-server/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} GIT_SHALLOW ON) +FetchContent_Declare( + tensorpipe + GIT_REPOSITORY https://github.com/pytorch/tensorpipe.git + GIT_TAG bb1473a4b38b18268e8693044afdb8635bc8351b + GIT_SHALLOW ON) +FetchContent_Declare( + reproc + GIT_REPOSITORY https://github.com/DaanDeMeyer/reproc + GIT_TAG v14.2.4 + GIT_SHALLOW ON) -set(MAKE_AVAILABLE_LIST repo-common repo-core repo-backend) +set(MAKE_AVAILABLE_LIST repo-common repo-core repo-backend tensorpipe reproc) if(NOT TFLITE_BAZEL_BUILD) FetchContent_Declare( @@ -335,58 +348,140 @@ if (PAPI_PROFILING_ENABLE) BUILD_COMMAND make -j$(nproc) UPDATE_COMMAND "" INSTALL_COMMAND make install - TEST_COMMAND make test - ) + TEST_COMMAND make test) endif() # -# Handle libs for TFLite Backend +# Handle libs for Model Instance standalone executable # -set(BACKEND_SRCS src/tflite.cc src/tflite_utils.cc src/tflite_utils.h) - +set(MODEL_INSTANCE_SRCS + src/model_instance_main.cc + src/model_instance.cc + src/model_instance.h + src/model_instance_utils.h) if(PAPI_PROFILING_ENABLE) - list(APPEND BACKEND_SRCS src/papi_profiler.cc) + list(APPEND MODEL_INSTANCE_SRCS src/papi_profiler.cc) endif() -add_library(triton-armnn-tflite-backend SHARED ${BACKEND_SRCS}) +add_executable(model_instance ${MODEL_INSTANCE_SRCS}) + +set(MODEL_INSTANCE_LINK_LIBS + tensorpipe + triton-core-serverstub + triton-backend-utils) +set(MODEL_INSTANCE_INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${TENSORFLOW_ROOT} # for tflite headers +) if(ARMNN_DELEGATE_ENABLE) - add_dependencies(triton-armnn-tflite-backend armnn) + add_dependencies(model_instance armnn) + list(APPEND MODEL_INSTANCE_INCLUDE_DIRS + ${ARMNN_LOCATION}/include # for armnn headers + ${ARMNN_LOCATION}/src/armnn/delegate/include # for delegate headers + ) + # As per https://review.mlplatform.org/c/ml/armnn/+/7327 + if(ARMNN_VERSION VERSION_GREATER_EQUAL "22.05") + list(APPEND MODEL_INSTANCE_INCLUDE_DIRS ${ARMNN_LOCATION}/src/armnn/profiling) + endif() + target_compile_definitions(model_instance PRIVATE ARMNN_DELEGATE_ENABLE=1) + # Link the armnn lib + target_link_libraries( + model_instance PRIVATE "-L${ARMNN_LOCATION}/lib" -larmnn -larmnnDelegate) endif() if(PAPI_PROFILING_ENABLE) - add_dependencies(triton-armnn-tflite-backend papi) + add_dependencies(model_instance papi) 
target_compile_definitions( - triton-armnn-tflite-backend + model_instance PRIVATE PAPI_PROFILING_ENABLE=1 ) - target_include_directories(triton-armnn-tflite-backend PRIVATE ${CMAKE_BINARY_DIR}/papi-prefix/include) + list(APPEND MODEL_INSTANCE_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/papi-prefix/include) # Note that linking the STATIC papi library results in a segfault on call to PAPI_library_init, use shared lib - target_link_libraries(triton-armnn-tflite-backend PRIVATE ${CMAKE_BINARY_DIR}/papi-prefix/lib/libpapi.so) + target_link_libraries(model_instance PRIVATE ${CMAKE_BINARY_DIR}/papi-prefix/lib/libpapi.so) +endif() + +if(TFLITE_BAZEL_BUILD) + list(APPEND MODEL_INSTANCE_INCLUDE_DIRS + ${TENSORFLOW_ROOT}/bazel-tensorflow-lite/external/flatbuffers/include) + # Link the tensorflow lite library from bazel tfile build + target_link_libraries( + model_instance + PRIVATE "-L${TFLITE_LOCATION}/src/tensorflow-lite/bazel-bin/tensorflow/lite" + -ltensorflowlite) +else() + list(APPEND MODEL_INSTANCE_INCLUDE_DIRS + ${TFLITE_LIB_ROOT}/flatbuffers/include) + list(APPEND MODEL_INSTANCE_LINK_LIBS tensorflow-lite) endif() +target_include_directories(model_instance PRIVATE ${MODEL_INSTANCE_INCLUDE_DIRS}) +target_link_libraries(model_instance PRIVATE ${MODEL_INSTANCE_LINK_LIBS}) + +target_compile_features(model_instance PRIVATE cxx_std_11) +target_compile_options( + model_instance + PRIVATE + $<$,$,$>: + -Wall + -Wextra + -Wno-unused-parameter + -Wno-type-limits + -Wno-comment + -Werror>) + +set_target_properties( + model_instance + PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME model_instance + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}" + LINK_FLAGS + "-Wl,--no-as-needed") + +# +# Handle libs for TFLite Backend +# + +set(BACKEND_SRCS + src/tflite.cc + src/tflite_utils.cc + src/tflite_utils.h) + +add_library(triton-armnn-tflite-backend SHARED ${BACKEND_SRCS}) + add_library(TritonArmNNTFLiteBackend::triton-armnn-tflite-backend ALIAS triton-armnn-tflite-backend) set(BACKEND_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src ${TENSORFLOW_ROOT} # for tflite headers - ${ARMNN_LOCATION}/include # for armnn headers - ${ARMNN_LOCATION}/src/armnn/delegate/include # for delegate headers ) -# As per https://review.mlplatform.org/c/ml/armnn/+/7327 -if(ARMNN_VERSION VERSION_GREATER_EQUAL "22.05") - list(APPEND BACKEND_INCLUDE_DIRS ${ARMNN_LOCATION}/src/armnn/profiling) -endif() +set(BACKEND_LINK_LIBS + triton-core-serverapi triton-core-backendapi triton-core-serverstub + triton-backend-utils tensorpipe reproc++ ${CMAKE_DL_LIBS}) if(TFLITE_BAZEL_BUILD) list(APPEND BACKEND_INCLUDE_DIRS ${TENSORFLOW_ROOT}/bazel-tensorflow-lite/external/flatbuffers/include) + # Link the tensorflow lite library from bazel tfile build + target_link_libraries( + triton-armnn-tflite-backend + PRIVATE "-L${TFLITE_LOCATION}/src/tensorflow-lite/bazel-bin/tensorflow/lite" + -ltensorflowlite) else() list(APPEND BACKEND_INCLUDE_DIRS ${TFLITE_LIB_ROOT}/flatbuffers/include) + list(APPEND BACKEND_LINK_LIBS tensorflow-lite) +endif() + +if(ARMNN_DELEGATE_ENABLE) + target_compile_definitions(triton-armnn-tflite-backend PRIVATE ARMNN_DELEGATE_ENABLE=1) endif() target_include_directories(triton-armnn-tflite-backend @@ -404,12 +499,6 @@ target_compile_options( -Wno-comment -Werror>) -# ARMNN_DELEGATE_ENABLE exposed in header so set PUBLIC -if(${ARMNN_DELEGATE_ENABLE}) - target_compile_definitions(triton-armnn-tflite-backend - PUBLIC ARMNN_DELEGATE_ENABLE=1) -endif() # 
ARMNN_DELEGATE_ENABLE - set_target_properties( triton-armnn-tflite-backend PROPERTIES @@ -423,35 +512,18 @@ set_target_properties( LINK_FLAGS "-Wl,--no-as-needed,--version-script libtriton_armnn_tflite.ldscript") -set(BACKEND_LINK_LIBS - triton-core-serverapi triton-core-backendapi triton-core-serverstub - triton-backend-utils ${CMAKE_DL_LIBS}) - -if(TFLITE_BAZEL_BUILD) - # Link the tensorflow lite library from bazel tfile build - target_link_libraries( - triton-armnn-tflite-backend - PRIVATE "-L${TFLITE_LOCATION}/src/tensorflow-lite/bazel-bin/tensorflow/lite" - -ltensorflowlite) -else() - list(APPEND BACKEND_LINK_LIBS tensorflow-lite) -endif() - target_link_libraries(triton-armnn-tflite-backend PRIVATE ${BACKEND_LINK_LIBS}) -if(ARMNN_DELEGATE_ENABLE) - # Link the armnn lib - target_link_libraries( - triton-armnn-tflite-backend PRIVATE "-L${ARMNN_LOCATION}/lib" -larmnn - -larmnnDelegate) -endif() - # # Install # include(GNUInstallDirs) set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonArmNNTFLiteBackend) +install( + TARGETS model_instance + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/armnn_tflite) + install( TARGETS triton-armnn-tflite-backend EXPORT triton-armnn-tflite-backend-targets diff --git a/src/config.h b/src/config.h new file mode 100644 index 0000000..fcb243a --- /dev/null +++ b/src/config.h @@ -0,0 +1,28 @@ +// +// Copyright © 2023 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +// This class is used to map an optimizer option to an index in an array so +// options can be sent across a tensorpipe payload +enum OptimizerOption { + TFLITE_NUM_THREADS, + XNNPACK_ENABLE, + XNNPACK_CPU_NUM_THREADS, + +#ifdef ARMNN_DELEGATE_ENABLE + ARMNN_CPU_ENABLE, + ARMNN_GPU_ENABLE, + ARMNN_CPU_NUM_THREADS, + ARMNN_CPU_REDUCE_FP32_TO_FP16, + ARMNN_CPU_REDUCE_FP32_TO_BF16, + ARMNN_CPU_FAST_MATH_ENABLED, + ARMNN_GPU_FAST_MATH_ENABLED, + ARMNN_GPU_REDUCE_FP32_TO_FP16, + ARMNN_GPU_REDUCE_FP32_TO_BF16, +#endif // ARMNN_DELEGATE_ENABLE + + COUNT // Just used to track the number of options +}; \ No newline at end of file diff --git a/src/model_instance.cc b/src/model_instance.cc new file mode 100644 index 0000000..2505353 --- /dev/null +++ b/src/model_instance.cc @@ -0,0 +1,396 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "model_instance.h" + +#include +#include + +#include "config.h" +#include "model_instance_utils.h" + +// Triton backend headers +#include "triton/backend/backend_common.h" + +// TFLite headers +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/type_to_tflitetype.h" + +#ifdef ARMNN_DELEGATE_ENABLE +// ArmNN headers +#include "armnn/ArmNN.hpp" +#include "armnn_delegate.hpp" +#endif // ARMNN_DELEGATE_ENABLE + +void +ModelInstance::Finalize() +{ + listener_->close(); + pipe_->close(); +} + +void +ModelInstance::Start(const std::string& addr) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("ModelInstance starts on: ") + addr).c_str()); + listener_ = context_->listen({addr}); + listener_->accept([&, this]( + const tensorpipe::Error& error, + std::shared_ptr pipe) { + if (error) { + if (error.isOfType()) { + // Expected. 
+ } else { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Unexpected error when accepting incoming pipe: ") + + error.what()) + .c_str()); + } + return; + } + pipe_ = std::move(pipe); + ReceiveFromPipe(); + }); +} + +TfLiteStatus +ModelInstance::BuildInterpreter(tensorpipe::Descriptor descriptor) +{ + // Build the tflite interpreter + tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver; + tflite::InterpreterBuilder builder(*model_, resolver); + builder(&interpreter_); + if (!interpreter_) { + return kTfLiteError; + } + + // Set interpreter threads + if (interpreter_->SetNumThreads(std::stoi( + descriptor.payloads[OptimizerOption::TFLITE_NUM_THREADS].metadata)) != + kTfLiteOk) { + return kTfLiteError; + } + +#ifdef ARMNN_DELEGATE_ENABLE + armnn::OptimizerOptions armnn_optimizer_options_cpu; + armnn::OptimizerOptions armnn_optimizer_options_gpu; + bool armnn_cpu_delegate_enabled = + descriptor.payloads[OptimizerOption::ARMNN_CPU_ENABLE].metadata == + std::string("y"); + + bool armnn_gpu_delegate_enabled = + descriptor.payloads[OptimizerOption::ARMNN_GPU_ENABLE].metadata == + std::string("y"); + + if (armnn_cpu_delegate_enabled || armnn_gpu_delegate_enabled) { + armnnDelegate::DelegateOptions armnn_delegate_options = + armnnDelegate::TfLiteArmnnDelegateOptionsDefault(); + + // Set backend prefs based on gpu or cpu selection + if (armnn_gpu_delegate_enabled) { + armnn_delegate_options.SetBackends( + {armnn::Compute::GpuAcc, armnn::Compute::CpuAcc}); + armnn_optimizer_options_gpu.m_ReduceFp32ToFp16 = + descriptor.payloads[OptimizerOption::ARMNN_GPU_REDUCE_FP32_TO_FP16] + .metadata == std::string("on"); + armnn_optimizer_options_gpu.m_ReduceFp32ToBf16 = + descriptor.payloads[OptimizerOption::ARMNN_GPU_REDUCE_FP32_TO_BF16] + .metadata == std::string("on"); + armnn::BackendOptions gpu_fast_math_option( + "GpuAcc", + {{"FastMathEnabled", + descriptor.payloads[OptimizerOption::ARMNN_GPU_FAST_MATH_ENABLED] + .metadata == std::string("on")}}); + armnn_optimizer_options_gpu.m_ModelOptions.push_back( + gpu_fast_math_option); + armnn_delegate_options.SetOptimizerOptions(armnn_optimizer_options_gpu); + } else { + // Set backend pref to Neon ACL backend + armnn_delegate_options.SetBackends({armnn::Compute::CpuAcc}); + armnn_optimizer_options_cpu.m_ReduceFp32ToFp16 = + descriptor.payloads[OptimizerOption::ARMNN_CPU_REDUCE_FP32_TO_FP16] + .metadata == std::string("on"); + armnn_optimizer_options_cpu.m_ReduceFp32ToBf16 = + descriptor.payloads[OptimizerOption::ARMNN_CPU_REDUCE_FP32_TO_BF16] + .metadata == std::string("on"); + armnn::BackendOptions cpu_fast_math_option( + "CpuAcc", + {{"FastMathEnabled", + descriptor.payloads[OptimizerOption::ARMNN_CPU_FAST_MATH_ENABLED] + .metadata == std::string("on")}}); + armnn_optimizer_options_cpu.m_ModelOptions.push_back( + cpu_fast_math_option); + armnn::BackendOptions num_threads_option( + "CpuAcc", + {{"NumberOfThreads", + static_cast(std::stoi( + descriptor.payloads[OptimizerOption::ARMNN_CPU_NUM_THREADS] + .metadata))}}); + armnn_optimizer_options_cpu.m_ModelOptions.push_back(num_threads_option); + armnn_delegate_options.SetOptimizerOptions(armnn_optimizer_options_cpu); + } + + // Create ArmNN Delegate with options registered in model state + std::unique_ptr< + TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)> + armnn_delegate( + armnnDelegate::TfLiteArmnnDelegateCreate(armnn_delegate_options), + armnnDelegate::TfLiteArmnnDelegateDelete); + + // Instruct the Interpreter to use the armnnDelegate + if 
(interpreter_->ModifyGraphWithDelegate(std::move(armnn_delegate)) != + kTfLiteOk) { + return kTfLiteError; + } + LogDelegation("armnn"); + } else if ( + descriptor.payloads[OptimizerOption::XNNPACK_ENABLE].metadata == + std::string("y")) { +#else + if (descriptor.payloads[OptimizerOption::XNNPACK_ENABLE].metadata == + std::string("y")) { +#endif // ARMNN_DELEGATE_ENABLE + // Create the XNNPack Delegate + TfLiteXNNPackDelegateOptions options = + TfLiteXNNPackDelegateOptionsDefault(); + + options.num_threads = std::stoi( + descriptor.payloads[OptimizerOption::XNNPACK_CPU_NUM_THREADS].metadata); + + tflite::Interpreter::TfLiteDelegatePtr xnnpack_delegate( + TfLiteXNNPackDelegateCreate(&options), + [](TfLiteDelegate* xnnpack_delegate) { + TfLiteXNNPackDelegateDelete(xnnpack_delegate); + }); + + // Instruct the Interpreter to use the xnnpack + if (interpreter_->ModifyGraphWithDelegate(std::move(xnnpack_delegate)) != + kTfLiteOk) { + return kTfLiteError; + } + LogDelegation("xnnpack"); + } else { + LOG_MESSAGE(TRITONSERVER_LOG_INFO, "No delegates used for model execution"); + } + +#ifdef PAPI_PROFILING_ENABLE + interpreter_->AddProfiler(papi_profiler_.get()); +#endif // PAPI_PROFILING_ENABLE + + return kTfLiteOk; +} + +void +ModelInstance::LogDelegation(const std::string& delegate_name) +{ + std::unordered_set checked_node_ids; + unsigned int num_delegated_kernels = 0; + for (uint64_t i = 0; i < interpreter_->execution_plan().size(); i++) { + int node_id = interpreter_->execution_plan()[i]; + if (checked_node_ids.find(node_id) != checked_node_ids.end()) { + continue; + } + const TfLiteNode& node = + interpreter_->node_and_registration(node_id)->first; + + if (node.delegate != nullptr) { + num_delegated_kernels++; + checked_node_ids.insert(node_id); + } + } + bool fully_delegated = + (num_delegated_kernels == 1 && + interpreter_->execution_plan().size() == 1); + + if (fully_delegated) { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, ("Applied " + delegate_name + + " delegate, and the model graph will be " + "completely executed by the delegate.") + .c_str()); + } else if (num_delegated_kernels > 0) { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + ("Applied " + delegate_name + + " delegate, and the model graph will be paritally executed by the " + "delegate w/ " + + std::to_string(num_delegated_kernels) + " delegate kernels.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, ("Though " + delegate_name + + " delegate is applied, the model graph will " + "not be executed by the delegate.") + .c_str()); + } +} + +void +ModelInstance::ReceiveFromPipe() +{ + pipe_->readDescriptor([this]( + const tensorpipe::Error& error, + tensorpipe::Descriptor descriptor) { + if (error) { + // Error may happen when the pipe is closed + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Unexpected error when reading from accepted pipe: ") + + error.what()) + .c_str()); + return; + } + if (descriptor.metadata == "model_load") { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Loading model"); + LoadModelFromPipe(descriptor); + } else if (descriptor.metadata == "model_input") { + Infer(descriptor); + } + }); +} + +void +ModelInstance::LoadModelFromPipe(tensorpipe::Descriptor descriptor) +{ + // TODO: Make sure this can only be called once as it loads the model and + // builds the interpreter + tensorpipe::Allocation allocation; + allocation.payloads.resize(descriptor.payloads.size()); + allocation.payloads[OptimizerOption::COUNT].data = + new char[descriptor.payloads[OptimizerOption::COUNT].length]; + pipe_->read( + 
allocation, + [this, descriptor, allocation](const tensorpipe::Error& error) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + ("Failed to read model from pipe with err:" + error.what()) + .c_str()); + return; + } + // Load the tflite model from the buffer + tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates + builtin_op_resolver; + model_ = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast( + allocation.payloads[OptimizerOption::COUNT].data), + descriptor.payloads[OptimizerOption::COUNT].length); + + // Initalize the interpreter after loading the flatbuffers model + BuildInterpreter(descriptor); + + // Arm for getting more data + ReceiveFromPipe(); + }); +} + +void +ModelInstance::Infer(tensorpipe::Descriptor& descriptor) +{ + bool allocate_tensors = false; + + // Create allocation to hold incoming input tensor data + tensorpipe::Allocation allocation; + allocation.tensors.resize(descriptor.tensors.size()); + + // Get model inputs from request and ready the buffers (Allocation obj) to + // write tensor data + for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { + // If the size of the incoming tensor + // is different from the last call, tell the interpreter to resize the + // input tensor and note that we are going to have to make another call to + // AllocateTensors below + + // First element of tensor_info is input tensor index, remaining is the dims + // of the input tensor + int input_tensor_index = std::stoi(descriptor.tensors[i].metadata); + + // Length holds the num bytes of the incoming vector + int length = descriptor.tensors[i].length; + + TfLiteIntArray* tflite_input_tensor_dims = + interpreter_->tensor(input_tensor_index)->dims; + int tflite_input_tensor_len = + interpreter_->tensor(input_tensor_index)->bytes; + std::vector tflite_input_shape( + tflite_input_tensor_dims->data, + (tflite_input_tensor_dims->data + tflite_input_tensor_dims->size)); + if (length != tflite_input_tensor_len) { + // Resize input tensors based on current total batch size + allocate_tensors = true; + + // Set the new batch size + tflite_input_shape[0] = length > tflite_input_tensor_len + ? 
length / tflite_input_tensor_len + : tflite_input_tensor_len / length; + + interpreter_->ResizeInputTensor(input_tensor_index, tflite_input_shape); + } + } + + // Once we have resized all input tensors in the loop above, + // now we can allocate the memory plan within the tflite runtime if + // necessary + if (allocate_tensors || first_inference_) { + if (interpreter_->AllocateTensors() != kTfLiteOk) { + return; + } + } + + // Assign Cpu buffers to read incoming tensor bytes into after allocate + // tensors is called + for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { + tensorpipe::CpuBuffer cpu_buffer{ + .ptr = interpreter_->tensor(std::stoi(descriptor.tensors[i].metadata)) + ->data.raw}; + allocation.tensors[i].buffer = cpu_buffer; + } + + pipe_->read(allocation, [this](const tensorpipe::Error& error) { + if (error) { + return; + } + // At this point our input tensors should be written to by the read + // function, + // now we invoke the interpreter and read the output + if (interpreter_->Invoke() != kTfLiteOk) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to invoke model"); + return; + } + + first_inference_ = false; + + // Write output back to client + tensorpipe::Message tp_msg; + + for (uint64_t i = 0; i < interpreter_->outputs().size(); ++i) { + int output_index = interpreter_->outputs()[i]; + TfLiteTensor* output_tensor = interpreter_->tensor(output_index); + tensorpipe::Message::Tensor tensor; + // We use the output tensor name as the metadata in the request + tensor.metadata = std::string(output_tensor->name); + tensor.length = output_tensor->bytes; + tensor.buffer = tensorpipe::CpuBuffer{.ptr = output_tensor->data.raw}; + tp_msg.tensors.push_back(tensor); + } + pipe_->write(tp_msg, [](const tensorpipe::Error& error) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + ("Failed to send inference response to client. Details:" + + error.what()) + .c_str()); + } + }); + // Arm for getting more data + ReceiveFromPipe(); + }); +} diff --git a/src/model_instance.h b/src/model_instance.h new file mode 100644 index 0000000..591843a --- /dev/null +++ b/src/model_instance.h @@ -0,0 +1,116 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/optional_debug_tools.h" +#include "tensorpipe/tensorpipe.h" + +#ifdef PAPI_PROFILING_ENABLE +#include "papi.h" +#include "papi_profiler.h" +#endif // PAPI_PROFILING_ENABLE + +/*! + * \brief ModelInstance for backend end execution of model. + * + * Tensorpipe Receiver is the communicator implemented by tcp. + */ +class ModelInstance { + public: + /*! + * \brief Receiver constructor + */ + ModelInstance() + { + context_ = std::make_shared(); + auto transportContext = tensorpipe::transport::shm::create(); + context_->registerTransport(0 /* priority */, "shm", transportContext); + // Register basic shm channel + auto basicChannel = tensorpipe::channel::basic::create(); + context_->registerChannel(0 /* low priority */, "basic", basicChannel); + } + + /*! + * \brief ModelInstance destructor + */ + ~ModelInstance() { Finalize(); } + + /*! + * \brief Start server + * \param addr Networking address, e.g., 'tcp://127.0.0.1:50051' + */ + void Start(const std::string& addr); + + /*! + * \brief Finalize ModelInstance + * + * Finalize() is not thread-safe and only one thread can invoke this API. 
+ */ + void Finalize(); + + /*! + * \brief Issue a receive request pipe + */ + void ReceiveFromPipe(); + + private: + /*! + * \brief Callback for new connection is accepted. + */ + void OnAccepted(const tensorpipe::Error&, std::shared_ptr); + + /*! + * \brief Callback for loading a tflite model. + */ + void LoadModelFromPipe(tensorpipe::Descriptor descriptor); + + TfLiteStatus BuildInterpreter(tensorpipe::Descriptor descriptor); + + void LogDelegation(const std::string& delegate_name); + + /*! + * \brief Callback for inferencing on a loaded tflite model. + */ + void Infer(tensorpipe::Descriptor& descriptor); + + /*! + * \brief global context of tensorpipe + */ + std::shared_ptr context_; + + /*! + * \brief pipe for client connection + */ + std::shared_ptr pipe_; + + /*! + * \brief listener to build pipe + */ + std::shared_ptr listener_{nullptr}; + + /*! + * \brief tflite interpreter + */ + std::unique_ptr interpreter_; + + /*! + * \brief tflite model + */ + std::unique_ptr model_; + + // Unique model instance name + std::string model_instance_name_; + + // State variable to register whether inference has been called at least once + bool first_inference_ = true; + +#ifdef PAPI_PROFILING_ENABLE + std::unique_ptr papi_profiler_ = MaybeCreatePapiProfiler(); +#endif // PAPI_PROFILING_ENABLE +}; \ No newline at end of file diff --git a/src/model_instance_main.cc b/src/model_instance_main.cc new file mode 100644 index 0000000..f5bd47b --- /dev/null +++ b/src/model_instance_main.cc @@ -0,0 +1,86 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include + +#include +#include + +#include "model_instance.h" + +// Triton backend headers +#include "triton/backend/backend_common.h" + +#ifdef PAPI_PROFILING_ENABLE +#include "papi.h" +#endif // PAPI_PROFILING_ENABLE + +int +main(int argc, char* argv[]) +{ +#ifdef PAPI_PROFILING_ENABLE + // Init PAPI library + if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to init PAPI lib"); + return 1; + } + if (PAPI_thread_init(pthread_self) != PAPI_OK) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to init PAPI thread lib"); + return 1; + } +#endif // PAPI_PROFILING_ENABLE + + // Parse listen address + if (argc != 2) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + "Args should be model_instance "); + + return 1; + } + const char* bind_addr = argv[1]; + + // block signals in this thread and subsequently + // spawned threads + sigset_t sigset; + sigemptyset(&sigset); + sigaddset(&sigset, SIGINT); + sigaddset(&sigset, SIGTERM); + pthread_sigmask(SIG_BLOCK, &sigset, nullptr); + + std::atomic shutdown_requested(false); + std::mutex cv_mutex; + std::condition_variable cv; + + auto signal_handler = [&shutdown_requested, &cv, &sigset]() { + int signum = 0; + // wait until a signal is delivered: + sigwait(&sigset, &signum); + shutdown_requested.store(true); + // notify all waiting workers to check their predicate: + cv.notify_all(); + return signum; + }; + + auto ft_signal_handler = std::async(std::launch::async, signal_handler); + + ModelInstance model_instance; + + // Will listen on the address provided as the first argument in the list + model_instance.Start(std::string(bind_addr)); + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Model instance waiting for SIGTERM or SIGINT ([CTRL]+[c])..."); + + // wait for signal handler to complete + int signal = ft_signal_handler.get(); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Received signal: ") 
+ std::to_string(signal)).c_str()); + + return 0; +} \ No newline at end of file diff --git a/src/model_instance_utils.h b/src/model_instance_utils.h new file mode 100644 index 0000000..3074a63 --- /dev/null +++ b/src/model_instance_utils.h @@ -0,0 +1,25 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#ifdef PAPI_PROFILING_ENABLE +#include "papi.h" + +bool +PAPIEventValid(std::string& event_name) +{ + int event_set = PAPI_NULL; + bool valid = false; + if (PAPI_create_eventset(&event_set) == PAPI_OK) { + valid = PAPI_add_named_event(event_set, event_name.c_str()) == PAPI_OK; + if (valid) { + if (PAPI_cleanup_eventset(event_set) != PAPI_OK) { + } + } + if (PAPI_destroy_eventset(&event_set) != PAPI_OK) { + } + } + return valid; +} +#endif // PAPI_PROFILING_ENABLE \ No newline at end of file diff --git a/src/papi_profiler.cc b/src/papi_profiler.cc index 36b63cd..fe98be9 100644 --- a/src/papi_profiler.cc +++ b/src/papi_profiler.cc @@ -5,7 +5,6 @@ #include "papi_profiler.h" -#include #include #include #include @@ -14,16 +13,20 @@ #include #include -#include "triton/backend/backend_model.h" +// Triton backend headers +#include "papi.h" +#include "triton/backend/backend_common.h" constexpr uint32_t kInvalidEventHandle = static_cast(~0) - 1; void handle_error(int retval) { - throw triton::backend::BackendModelException(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("PAPI error: " + std::string(PAPI_strerror(retval))).c_str())); + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + ("PAPI error: " + std::to_string(retval) + ", " + PAPI_strerror(retval)) + .c_str()); + exit(1); } class PapiProfiler : public tflite::Profiler { @@ -92,7 +95,7 @@ MaybeCreatePapiProfiler() if (getenv("PAPI_EVENTS") == NULL) { LOG_MESSAGE( TRITONSERVER_LOG_WARN, - "PAPI_EVENTS not specified, op level profiling disabled"); + "PAPI_EVENTS not specified, op level profiling disabled!"); return nullptr; } return std::unique_ptr(new PapiProfiler()); diff --git a/src/papi_profiler.h b/src/papi_profiler.h index 434f90f..470bb40 100644 --- a/src/papi_profiler.h +++ b/src/papi_profiler.h @@ -8,7 +8,6 @@ #include #include "tensorflow/lite/core/api/profiler.h" -#include "triton/core/tritonbackend.h" // Creates a profiler which reports the papi traced events. diff --git a/src/tflite.cc b/src/tflite.cc index 134516a..9753afd 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -3,11 +3,18 @@ // SPDX-License-Identifier: MIT // +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include #include +#include #include #include #include +#include #include #include #include @@ -17,7 +24,14 @@ #include #include +// Local headers +#include "config.h" #include "tflite_utils.h" + +// Tensorpipe headers +#include "tensorpipe/tensorpipe.h" + +// Triton headers #include "triton/backend/backend_common.h" #include "triton/backend/backend_input_collector.h" #include "triton/backend/backend_memory.h" @@ -33,17 +47,9 @@ #include "tensorflow/lite/model.h" #include "tensorflow/lite/type_to_tflitetype.h" -#ifdef ARMNN_DELEGATE_ENABLE -// ArmNN headers -#include "armnn/ArmNN.hpp" -#include "armnn_delegate.hpp" -#endif // ARMNN_DELEGATE_ENABLE - -#ifdef PAPI_PROFILING_ENABLE -#include - -#include "papi_profiler.h" -#endif // PAPI_PROFILING_ENABLE +// Reproc headers +#include "reproc++/drain.hpp" +#include "reproc++/reproc.hpp" // // TFLite Backend that implements the TRITONBACKEND API. 
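The reproc and tensorpipe headers added above support the core change in this patch: each Triton model instance is proxied to a standalone model_instance child process over a tensorpipe shm pipe. The following is a minimal client-side sketch of that handshake, condensed from ModelState::InitTensorPipe() and ModelInstanceState::SendModel() later in this diff; the function name SendModelLoad and the hard-coded option values are illustrative only, most option slots and all error handling are omitted, and the OptimizerOption enum comes from the new src/config.h.

#include "config.h"
#include "tensorpipe/tensorpipe.h"

// Sketch only: condensed from the backend-side code added in this patch.
void
SendModelLoad(
    std::shared_ptr<tensorpipe::Context> context,
    const std::string& instance_name, const void* model_data,
    size_t model_bytes)
{
  // The model_instance child process listens on "shm://<instance name>".
  std::shared_ptr<tensorpipe::Pipe> pipe =
      context->connect("shm://" + instance_name);

  tensorpipe::Message msg;
  msg.metadata = "model_load";

  // One metadata-only payload slot per OptimizerOption, plus one extra slot
  // carrying the serialized flatbuffer model at index OptimizerOption::COUNT.
  msg.payloads.resize(OptimizerOption::COUNT + 1);
  msg.payloads[OptimizerOption::TFLITE_NUM_THREADS].metadata = "4";
  msg.payloads[OptimizerOption::XNNPACK_ENABLE].metadata = "n";

  tensorpipe::Message::Payload model_payload{
      .data = const_cast<void*>(model_data),
      .length = model_bytes,
      .metadata = instance_name};
  msg.payloads[OptimizerOption::COUNT] = model_payload;

  pipe->write(msg, [](const tensorpipe::Error& error) {
    // On the child side, ReceiveFromPipe() dispatches on the message metadata:
    // "model_load" builds the interpreter, and subsequent "model_input"
    // messages carry input tensors, answered with one tensor per model output.
  });
}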
@@ -61,22 +67,18 @@ namespace triton { namespace backend { namespace tensorflowlite { class ModelState : public BackendModel { public: static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state, - int32_t* armnn_threads); + TRITONBACKEND_Model* triton_model, ModelState** state); ~ModelState(); - // Load a serialized tflite model using 'artifact_name' as the name for the - // tflite model file. Return in 'model_path' the full path to the - // tflite model file. Return in 'model' the TFLite network, - // representing the model. - TRITONSERVER_Error* LoadModel( - const std::string& artifact_name, std::string* model_path, - common::TritonJson::Value& model_config, - std::unique_ptr* model); + TRITONSERVER_Error* LoadModel(); + + TRITONSERVER_Error* InitConfig(); // Validate that model configuration is supported by this backend. TRITONSERVER_Error* ValidateModelConfig(); + void InitTensorPipe(); + // Default TFLite runtime options int32_t tflite_num_threads_ = static_cast(std::thread::hardware_concurrency()); @@ -85,9 +87,16 @@ class ModelState : public BackendModel { // ArmNN Delegate options bool use_armnn_delegate_cpu_ = false; bool use_armnn_delegate_gpu_ = false; - armnn::OptimizerOptions armnn_optimizer_options_cpu_; - armnn::OptimizerOptions armnn_optimizer_options_gpu_; - int32_t* armnn_threads_; + + int32_t armnn_cpu_num_threads_ = + static_cast(std::thread::hardware_concurrency()); + std::string armnn_cpu_reduce_fp32_to_fp16_ = "off"; + std::string armnn_cpu_reduce_fp32_to_bf16_ = "off"; + std::string armnn_cpu_fast_math_enabled_ = "off"; + + std::string armnn_gpu_fast_math_enabled_ = "off"; + std::string armnn_gpu_reduce_fp32_to_fp16_ = "off"; + std::string armnn_gpu_reduce_fp32_to_bf16_ = "off"; #endif // ARMNN_DELEGATE_ENABLE // XNNPACK Delegate options @@ -104,6 +113,16 @@ class ModelState : public BackendModel { // that output in the model. std::unordered_map output_index_map_; std::unordered_map output_dtype_map_; + std::unordered_map> output_shape_map_; + + // The pointer to the tflite network + std::unique_ptr model_; + + // Global context of tensorpipe + std::shared_ptr context_; + + // Path string for the model_instance binary + const char* model_instance_location_; private: ModelState(TRITONBACKEND_Model* triton_model); @@ -112,9 +131,7 @@ class ModelState : public BackendModel { TRITONSERVER_Error* -ModelState::Create( - TRITONBACKEND_Model* triton_model, ModelState** state, - int32_t* armnn_threads) +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) { try { *state = new ModelState(triton_model); @@ -126,57 +143,32 @@ ModelState::Create( RETURN_IF_ERROR(ex.err_); } -#ifdef ARMNN_DELEGATE_ENABLE - (*state)->armnn_threads_ = armnn_threads; -#endif - return nullptr; // success } ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) { // Here we can add information to the model state that can be shared across - // model instances. See onnx backend for example. MALI GPU optimization level - // may be candidate. + // model instances. See onnx backend for example. MALI GPU optimization level + // may be candidate. 
+ InitTensorPipe(); + THROW_IF_BACKEND_MODEL_ERROR(InitConfig()); + THROW_IF_BACKEND_MODEL_ERROR(LoadModel()); + + // Get the directory of the backend to find the path to the model instance + // binary + TRITONBACKEND_Backend* backend; + TRITONBACKEND_ArtifactType artifact_type; + TRITONBACKEND_ModelBackend(triton_model, &backend); + TRITONBACKEND_BackendArtifacts( + backend, &artifact_type, &model_instance_location_); } ModelState::~ModelState() {} TRITONSERVER_Error* -ModelState::LoadModel( - const std::string& artifact_name, std::string* model_path, - common::TritonJson::Value& model_config, - std::unique_ptr* model) +ModelState::InitConfig() { - // Find the TFLite model file that describes the model. If the model - // configuration doesn't have an explicit model file specified then - // use the default name ("model.tflite"). - std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.tflite"; - } - - *model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(*model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + *model_path + - "' for model instance '" + Name() + "'"); - } - - // Load the Tflite FlatBufferModel into memory - *model = tflite::FlatBufferModel::BuildFromFile((*model_path).c_str()); - - if (!*model) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to load model " + Name()).c_str()); - } - // Handle tflite default interpeter options set in parameters { triton::common::TritonJson::Value params; @@ -241,10 +233,8 @@ ModelState::LoadModel( if (param_key == "reduce_fp32_to_fp16") { RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - if (value_string == "on") { - armnn_optimizer_options_cpu_.m_ReduceFp32ToFp16 = true; - } else if (value_string == "off") { - armnn_optimizer_options_cpu_.m_ReduceFp32ToFp16 = false; + if (value_string == "on" || value_string == "off") { + armnn_cpu_reduce_fp32_to_fp16_ = value_string; } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, @@ -255,10 +245,8 @@ ModelState::LoadModel( } else if (param_key == "reduce_fp32_to_bf16") { RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - if (value_string == "on") { - armnn_optimizer_options_cpu_.m_ReduceFp32ToBf16 = true; - } else if (value_string == "off") { - armnn_optimizer_options_cpu_.m_ReduceFp32ToBf16 = false; + if (value_string == "on" || value_string == "off") { + armnn_cpu_reduce_fp32_to_bf16_ = value_string; } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, @@ -269,16 +257,8 @@ ModelState::LoadModel( } else if (param_key == "fast_math_enabled") { RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - if (value_string == "on") { - armnn::BackendOptions option( - "CpuAcc", {{"FastMathEnabled", true}}); - armnn_optimizer_options_cpu_.m_ModelOptions.push_back( - option); - } else if (value_string == "off") { - armnn::BackendOptions option( - "CpuAcc", {{"FastMathEnabled", false}}); - armnn_optimizer_options_cpu_.m_ModelOptions.push_back( - option); + if (value_string == "on" || value_string == "off") { + armnn_cpu_fast_math_enabled_ = value_string; } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, @@ -287,41 +267,18 @@ ModelState::LoadModel( value_string + "' is requested"); } } else if (param_key == "num_threads") { - int32_t num_threads; 
RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - RETURN_IF_ERROR(ParseIntValue(value_string, &num_threads)); - if (num_threads < 0) { + RETURN_IF_ERROR( + ParseIntValue(value_string, &armnn_cpu_num_threads_)); + if (armnn_cpu_num_threads_ < -1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, std::string( "armnn thread count '" + value_string + - "' is not in range [1-64]") + "' is not in range [-1-64]") .c_str()); } - - // Here we do an ugly hack to prevent armnn/acl thread - // issues For now we make sure the next armnn accelerated - // model loaded does not request more threads than the - // previous, as this creates a segfault - if (num_threads > *armnn_threads_) { - num_threads = *armnn_threads_; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Model threads requested larger than " - "that of first model loaded: ") + - value_string + " > " + - std::to_string(*armnn_threads_) + - ". Using smaller thread value instead.") - .c_str()); - } else { - *armnn_threads_ = num_threads; - } - armnn::BackendOptions option( - "CpuAcc", {{"NumberOfThreads", - static_cast(num_threads)}}); - armnn_optimizer_options_cpu_.m_ModelOptions.push_back( - option); } else { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, @@ -386,7 +343,6 @@ ModelState::LoadModel( RETURN_IF_ERROR(ea.MemberAsString("name", &name)); if (name == "armnn") { use_armnn_delegate_gpu_ = true; - armnn::OptimizerOptions armnn_optimizer_options_gpu_; LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string( @@ -403,10 +359,8 @@ ModelState::LoadModel( if (param_key == "reduce_fp32_to_fp16") { RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - if (value_string == "on") { - armnn_optimizer_options_gpu_.m_ReduceFp32ToFp16 = true; - } else if (value_string == "off") { - armnn_optimizer_options_gpu_.m_ReduceFp32ToFp16 = false; + if (value_string == "on" || value_string == "off") { + armnn_gpu_reduce_fp32_to_fp16_ == value_string; } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, @@ -417,10 +371,8 @@ ModelState::LoadModel( } else if (param_key == "reduce_fp32_to_bf16") { RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - if (value_string == "on") { - armnn_optimizer_options_gpu_.m_ReduceFp32ToBf16 = true; - } else if (value_string == "off") { - armnn_optimizer_options_gpu_.m_ReduceFp32ToBf16 = false; + if (value_string == "on" || value_string == "off") { + armnn_gpu_reduce_fp32_to_bf16_ == value_string; } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, @@ -431,16 +383,8 @@ ModelState::LoadModel( } else if (param_key == "fast_math_enabled") { RETURN_IF_ERROR(params.MemberAsString( param_key.c_str(), &value_string)); - if (value_string == "on") { - armnn::BackendOptions option( - "GpuAcc", {{"FastMathEnabled", true}}); - armnn_optimizer_options_gpu_.m_ModelOptions.push_back( - option); - } else if (value_string == "off") { - armnn::BackendOptions option( - "GpuAcc", {{"FastMathEnabled", false}}); - armnn_optimizer_options_gpu_.m_ModelOptions.push_back( - option); + if (value_string == "on" || value_string == "off") { + armnn_gpu_fast_math_enabled_ == value_string; } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, @@ -472,7 +416,46 @@ ModelState::LoadModel( } } - return nullptr; // success + return nullptr; +} + +TRITONSERVER_Error* +ModelState::LoadModel() +{ + std::string artifact_filename; + RETURN_IF_ERROR(ModelConfig().MemberAsString( + "default_model_filename", 
&artifact_filename)); + + // Find the TFLite model file that describes the model. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.tflite"). + std::string cc_model_filename = artifact_filename; + if (cc_model_filename.empty()) { + cc_model_filename = "model.tflite"; + } + + std::string model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + { + bool exists; + RETURN_IF_ERROR(FileExists(model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + model_path + + "' for model instance '" + Name() + "'"); + } + + // Load the Tflite FlatBufferModel into memory + model_ = tflite::FlatBufferModel::BuildFromFile((model_path).c_str()); + + if (!model_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("failed to load model " + Name()).c_str()); + } + + return nullptr; } TRITONSERVER_Error* @@ -487,17 +470,9 @@ ModelState::ValidateModelConfig() // To check input and output names we will load and release the model during // the validation process without allocating memory for inference - std::string model_path; - std::unique_ptr model; std::unique_ptr interpreter; - std::string artifact_filename; - RETURN_IF_ERROR(ModelConfig().MemberAsString( - "default_model_filename", &artifact_filename)); - RETURN_IF_ERROR( - LoadModel(artifact_filename, &model_path, ModelConfig(), &model)); - tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver; - tflite::InterpreterBuilder builder(*model, resolver); + tflite::InterpreterBuilder builder(*model_, resolver); builder(&interpreter); if (!interpreter) { return TRITONSERVER_ErrorNew( @@ -521,11 +496,17 @@ ModelState::ValidateModelConfig() ConvertTFLiteTypeToDataType(interpreter->tensor(inputs[i])->type); } - // Populate output name and dtype map + // Populate output name, dtype, shape map for (size_t i = 0; i < num_outputs; i++) { - output_index_map_[interpreter->GetOutputName(i)] = outputs[i]; - output_dtype_map_[interpreter->GetOutputName(i)] = - ConvertTFLiteTypeToDataType(interpreter->tensor(outputs[i])->type); + TfLiteTensor* output_tensor = interpreter->tensor(outputs[i]); + TfLiteIntArray* tflite_output_tensor_dims = output_tensor->dims; + std::vector output_shape_vector = std::vector( + tflite_output_tensor_dims->data, + (tflite_output_tensor_dims->data + tflite_output_tensor_dims->size)); + output_shape_map_[output_tensor->name] = output_shape_vector; + output_index_map_[output_tensor->name] = outputs[i]; + output_dtype_map_[output_tensor->name] = + ConvertTFLiteTypeToDataType(output_tensor->type); } triton::common::TritonJson::Value ios; @@ -698,19 +679,32 @@ ModelState::AutoCompleteConfig() return nullptr; // success } +void +ModelState::InitTensorPipe() +{ + context_ = std::make_shared(); + auto transportContext = tensorpipe::transport::shm::create(); + // Consider here also registering tcp transport if shm not avail + context_->registerTransport(0 /* priority */, "shm", transportContext); + // Register basic shm channel + auto basicChannel = tensorpipe::channel::basic::create(); + context_->registerChannel(0 /* low priority */, "basic", basicChannel); +} // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each TRITONBACKEND_ModelInstance. +// This class acts as a manager for a subprocess which handles the actual tflite +// inference. 
// class ModelInstanceState : public BackendModelInstance { public: static TRITONSERVER_Error* Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); + const std::string& model_instance_name, ModelInstanceState** state); virtual ~ModelInstanceState(); // Get the state of the model that corresponds to this instance. @@ -723,49 +717,52 @@ class ModelInstanceState : public BackendModelInstance { private: ModelInstanceState( ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* BuildInterpreter(); - void LogDelegation(const std::string& delegate_name); - void Execute( - std::vector* responses, - const uint32_t response_count); - void SetInputTensors( + TRITONBACKEND_ModelInstance* triton_model_instance, + const std::string& model_instance_name); + TRITONSERVER_Error* ConnectModelInstance(); + TRITONSERVER_Error* SendModel(); + TRITONSERVER_Error* LaunchModelInstance(); + void DestroyModelInstance(); + bool ModelInstanceRunning(); + TRITONSERVER_Error* SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, BackendInputCollector* collector, - std::vector* input_memories); - void ReadOutputTensors( + std::vector* input_memories, tensorpipe::Message* tp_msg); + void Execute( + std::vector* responses, + const uint32_t response_count, tensorpipe::Message* tp_msg, + std::unordered_map>& inference_output); + TRITONSERVER_Error* ReadOutputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses); + std::vector* responses, + const std::unordered_map>& + inference_output); + // Pointer to the model state shared between instances ModelState* model_state_; - // The full path to the TFLite model file. 
- std::string model_path_; + // Name of the model instance used as a unique indenfier for this + // instance + const std::string model_instance_name_; - // The pointer to the tflite network - std::unique_ptr model_; - - // The pointer to the tflite interpreter instance - std::unique_ptr interpreter_; - - // State variable to register whether inference has been called at least once - bool first_inference_ = true; + // Tensorpipe to send input tensors over + std::shared_ptr pipe_; -#ifdef PAPI_PROFILING_ENABLE - std::unique_ptr papi_profiler_ = MaybeCreatePapiProfiler(); -#endif // PAPI_PROFILING_ENABLE + // Process object for our backend model instance + reproc::process model_instance_process_; }; TRITONSERVER_Error* ModelInstanceState::Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) + const std::string& model_instance_name, ModelInstanceState** state) { try { - *state = new ModelInstanceState(model_state, triton_model_instance); + *state = new ModelInstanceState( + model_state, triton_model_instance, model_instance_name); } catch (const BackendModelInstanceException& ex) { RETURN_ERROR_IF_TRUE( @@ -778,170 +775,210 @@ ModelInstanceState::Create( } ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + const std::string& model_instance_name) : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state) + model_state_(model_state), model_instance_name_(model_instance_name) { - // Load the TFLite network - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( - ArtifactFilename(), &model_path_, model_state->ModelConfig(), &model_)); + THROW_IF_BACKEND_INSTANCE_ERROR(LaunchModelInstance()); - // Build interpreter - THROW_IF_BACKEND_INSTANCE_ERROR(BuildInterpreter()); + // This is a gross thing to do, the backend deadlocks if the tensorpipe + // context tries to connect to the model_instance process to before it's + // ready. + sleep(2); + pipe_ = model_state_->context_->connect("shm://" + model_instance_name_); -#ifdef PAPI_PROFILING_ENABLE - interpreter_->AddProfiler(papi_profiler_.get()); -#endif // PAPI_PROFILING_ENABLE + THROW_IF_BACKEND_INSTANCE_ERROR(SendModel()); } -ModelInstanceState::~ModelInstanceState() +ModelInstanceState::~ModelInstanceState() {} + +TRITONSERVER_Error* +ModelInstanceState::LaunchModelInstance() { - // Consider the function ReleaseNonPersistentMemory here for our interpreter - interpreter_.reset(); + std::vector model_instance_args = { + std::string(model_state_->model_instance_location_) + "/model_instance", + std::string("shm://") + model_instance_name_}; + + // We have the model_instance process inherit the parent's standard streams so + // the it reads directly from the stdin and writes directly to the + // stdout/stderr triton. 
+ reproc::options options; + options.redirect.out.type = reproc::redirect::type::parent; + options.redirect.err.type = reproc::redirect::type::parent; + options.env.behavior = reproc::env::extend; + + // For the child process to use Triton logging infra, we have to give it the + // location of the actual tritonserver.so lib, as the backend is just linked + // against a stub + std::string* tritonserver_lib_path; + dl_iterate_phdr( + [](struct dl_phdr_info* info, size_t size, void* data) -> int { + if (std::string(info->dlpi_name).find("tritonserver.so") != + std::string::npos) { + *(reinterpret_cast(data)) = + new std::string(info->dlpi_name); + return 1; + } + return 0; + }, + &tritonserver_lib_path); + + auto base_path = [](const std::string& str) -> std::string { + size_t found; + found = str.find_last_of("/\\"); + return str.substr(0, found); + }; + + options.env.extra = std::unordered_map{ + {"LD_LIBRARY_PATH", base_path(*tritonserver_lib_path)}}; + + std::error_code ec = + model_instance_process_.start(model_instance_args, options); + + RETURN_ERROR_IF_TRUE( + ec == std::errc::no_such_file_or_directory, TRITONSERVER_ERROR_INTERNAL, + std::string( + "model_instance binary not found. Make sure it's available from the " + "PATH.")); + RETURN_ERROR_IF_TRUE( + ec, TRITONSERVER_ERROR_INTERNAL, + (std::string("Failed to launch model instance process: ") + + ec.message())); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Launched model instance: ") + model_instance_name_) + .c_str()); + + // logging_thread_.reset( + // new std::thread(&ModelInstanceState::ModelInstanceLogHandler, this)); + + return nullptr; +} + +bool +ModelInstanceState::ModelInstanceRunning() +{ + int events = 0; + std::error_code ec; + std::tie(events, ec) = model_instance_process_.poll( + reproc::event::exit, reproc::milliseconds(1000)); + return !ec && ((events & reproc::event::exit) != 0); } TRITONSERVER_Error* -ModelInstanceState::BuildInterpreter() +ModelInstanceState::SendModel() { - // Build the tflite interpreter - tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver; - tflite::InterpreterBuilder builder(*model_, resolver); - builder(&interpreter_); - if (!interpreter_) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to build tflite interpreter for model " + Name()).c_str()); + tensorpipe::Message tp_msg; + tp_msg.metadata = "model_load"; + + // Size the payloads vector + tp_msg.payloads.resize(OptimizerOption::COUNT + 1); + + // Place deserialized flatbuffer model in msg payload field + const tflite::Allocation* model_allocation = + model_state_->model_->allocation(); + tensorpipe::Message::Payload model_payload{ + .data = const_cast(model_allocation->base()), + .length = model_allocation->bytes(), + .metadata = std::string(model_instance_name_), + }; + tp_msg.payloads[OptimizerOption::COUNT] = model_payload; + + // Define a helper function for generating payloads for our options + auto gen_metadata = [](std::string s) { + tensorpipe::Message::Payload result{.metadata = s}; + return result; + }; + + // Add in model configuration data to message + tp_msg.payloads[OptimizerOption::TFLITE_NUM_THREADS] = + gen_metadata(std::to_string(model_state_->tflite_num_threads_)); + + // Add in use xnnpack + std::string use_xnnpack = std::string("n"); + if (model_state_->use_xnnpack_delegate_ && + Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU) { + use_xnnpack = std::string("y"); } + tp_msg.payloads[OptimizerOption::XNNPACK_ENABLE] = gen_metadata(use_xnnpack); - // Tell 
interpreter to use max threads available to system - if (interpreter_->SetNumThreads(model_state_->tflite_num_threads_) != - kTfLiteOk) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to set number of threads for interpreter for model " + Name()) - .c_str()); - } + // Add in xnnpack threads + tp_msg.payloads[OptimizerOption::XNNPACK_CPU_NUM_THREADS] = + gen_metadata(std::to_string(model_state_->num_threads_xnnpack_)); #ifdef ARMNN_DELEGATE_ENABLE - bool armnn_gpu_delegate_enabled = - model_state_->use_armnn_delegate_gpu_ && - Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU; - bool armnn_cpu_delegate_enabled = - model_state_->use_armnn_delegate_cpu_ && - Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU; - if (armnn_cpu_delegate_enabled || armnn_gpu_delegate_enabled) { - armnnDelegate::DelegateOptions armnn_delegate_options = - armnnDelegate::TfLiteArmnnDelegateOptionsDefault(); - - // Set backend prefs based on gpu or cpu selection - if (armnn_gpu_delegate_enabled) { - armnn_delegate_options.SetBackends( - {armnn::Compute::GpuAcc, armnn::Compute::CpuAcc}); - armnn_delegate_options.SetOptimizerOptions( - model_state_->armnn_optimizer_options_gpu_); - } else { - // Set backend pref to Neon ACL backend - armnn_delegate_options.SetBackends({armnn::Compute::CpuAcc}); - armnn_delegate_options.SetOptimizerOptions( - model_state_->armnn_optimizer_options_cpu_); - } + // Add in use armnn cpu + std::string use_armnn_cpu = std::string("n"); + if (model_state_->use_armnn_delegate_cpu_ && + Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU) { + use_armnn_cpu = std::string("y"); + } + tp_msg.payloads[OptimizerOption::ARMNN_CPU_ENABLE] = + gen_metadata(use_armnn_cpu); + + // Add in use armnn gpu + std::string use_armnn_gpu = std::string("n"); + if (model_state_->use_armnn_delegate_gpu_ && + Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + use_armnn_gpu = std::string("y"); + } + tp_msg.payloads[OptimizerOption::ARMNN_GPU_ENABLE] = + gen_metadata(use_armnn_gpu); - // Create ArmNN Delegate with options registered in model state - std::unique_ptr< - TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)> - armnn_delegate( - armnnDelegate::TfLiteArmnnDelegateCreate(armnn_delegate_options), - armnnDelegate::TfLiteArmnnDelegateDelete); + // Add in armnn threads + tp_msg.payloads[OptimizerOption::ARMNN_CPU_NUM_THREADS] = + gen_metadata(std::to_string(model_state_->armnn_cpu_num_threads_)); - // Instruct the Interpreter to use the armnnDelegate - if (interpreter_->ModifyGraphWithDelegate(std::move(armnn_delegate)) != - kTfLiteOk) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to use armnn delegate for model " + Name()).c_str()); - } - LogDelegation("armnn"); - } else if ( - model_state_->use_xnnpack_delegate_ && - Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU) { -#else - if (model_state_->use_xnnpack_delegate_ && - Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU) { -#endif // ARMNN_DELEGATE_ENABLE - // Create the XNNPack Delegate - TfLiteXNNPackDelegateOptions options = - TfLiteXNNPackDelegateOptionsDefault(); + // Add in use cpu and gpu options + tp_msg.payloads[OptimizerOption::ARMNN_CPU_FAST_MATH_ENABLED] = + gen_metadata(model_state_->armnn_cpu_fast_math_enabled_); - options.num_threads = model_state_->num_threads_xnnpack_; + tp_msg.payloads[OptimizerOption::ARMNN_CPU_REDUCE_FP32_TO_FP16] = + gen_metadata(model_state_->armnn_cpu_reduce_fp32_to_fp16_); - tflite::Interpreter::TfLiteDelegatePtr xnnpack_delegate( - TfLiteXNNPackDelegateCreate(&options), - 
[](TfLiteDelegate* xnnpack_delegate) { - TfLiteXNNPackDelegateDelete(xnnpack_delegate); - }); + tp_msg.payloads[OptimizerOption::ARMNN_CPU_REDUCE_FP32_TO_BF16] = + gen_metadata(model_state_->armnn_cpu_reduce_fp32_to_bf16_); - // Instruct the Interpreter to use the xnnpack - if (interpreter_->ModifyGraphWithDelegate(std::move(xnnpack_delegate)) != - kTfLiteOk) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to use xnnpack delegate for model " + Name()).c_str()); - } - LogDelegation("xnnpack"); - } + tp_msg.payloads[OptimizerOption::ARMNN_GPU_FAST_MATH_ENABLED] = + gen_metadata(model_state_->armnn_gpu_fast_math_enabled_); - return nullptr; -} + tp_msg.payloads[OptimizerOption::ARMNN_GPU_REDUCE_FP32_TO_BF16] = + gen_metadata(model_state_->armnn_gpu_reduce_fp32_to_bf16_); -void -ModelInstanceState::LogDelegation(const std::string& delegate_name) -{ - std::unordered_set checked_node_ids; - uint32_t num_delegated_kernels = 0; - for (uint64_t i = 0; i < interpreter_->execution_plan().size(); i++) { - int32_t node_id = interpreter_->execution_plan()[i]; - if (checked_node_ids.find(node_id) != checked_node_ids.end()) { - continue; - } - const TfLiteNode& node = - interpreter_->node_and_registration(node_id)->first; + tp_msg.payloads[OptimizerOption::ARMNN_GPU_REDUCE_FP32_TO_FP16] = + gen_metadata(model_state_->armnn_gpu_reduce_fp32_to_fp16_); +#endif // ARMNN_DELEGATE_ENABLE - if (node.delegate != nullptr) { - num_delegated_kernels++; - checked_node_ids.insert(node_id); + // Write the message + auto done = std::make_shared>(); + pipe_->write(tp_msg, [done](const tensorpipe::Error& error) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + ("Failed to send model load message: " + error.what()).c_str()); + done->set_value(false); + } else { + done->set_value(true); } - } - bool fully_delegated = - (num_delegated_kernels == 1 && - interpreter_->execution_plan().size() == 1); - - if (fully_delegated) { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, ("Applied " + delegate_name + - " delegate, and the model graph will be " - "completely executed by the delegate.") - .c_str()); - } else if (num_delegated_kernels > 0) { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - ("Applied " + delegate_name + - " delegate, and the model graph will be paritally executed by the " - "delegate w/ " + - std::to_string(num_delegated_kernels) + " delegate kernels.") - .c_str()); - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, ("Though " + delegate_name + - " delegate is applied, the model graph will " - "not be executed by the delegate.") - .c_str()); - } + }); + RETURN_ERROR_IF_TRUE( + !done->get_future().get(), TRITONSERVER_ERROR_INTERNAL, + std::string("Failed to send model load message.")); + return nullptr; } void ModelInstanceState::ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count) { + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -964,31 +1001,6 @@ ModelInstanceState::ProcessRequests( .c_str())); return; } - - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = 
TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RequestsRespondWithError(requests, request_count, err); - return; - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; } // Make sure the maximum batch size is not exceeded. The @@ -1022,8 +1034,9 @@ ModelInstanceState::ProcessRequests( // can skip them in the output tensors). std::vector responses; responses.reserve(request_count); + bool all_response_failed = false; - for (size_t i = 0; i < request_count; i++) { + for (size_t i = 0; i < request_count; ++i) { TRITONBACKEND_Response* response; auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); if (err == nullptr) { @@ -1035,22 +1048,84 @@ ModelInstanceState::ProcessRequests( } } - std::vector input_memories; - BackendInputCollector collector( - requests, request_count, &responses, model_state_->TritonMemoryManager(), - false, nullptr); + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } - // Note here we are copying the triton input buffers to the tflite allocated - // buffers - SetInputTensors( - total_batch_size, requests, request_count, &responses, &collector, - &input_memories); + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. 
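+  // Note that a failure below does not return early: all_response_failed is
+  // set instead, so later stages are skipped while per-request statistics
+  // reporting and request release at the end of this function still run.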
+  if (!all_response_failed) {
+    if ((total_batch_size != 1) &&
+        (total_batch_size > (size_t)max_batch_size)) {
+      RESPOND_ALL_AND_SET_TRUE_IF_ERROR(
+          responses, request_count, all_response_failed,
+          TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_INTERNAL,
+              std::string(
+                  "batch size " + std::to_string(total_batch_size) + " for '" +
+                  Name() + "', max allowed is " +
+                  std::to_string(max_batch_size))
+                  .c_str()));
+    }
+  }
+
+  // Here we allocate the space for the tensorpipe message that's used to
+  // communicate with our backend ModelInstance process
+  tensorpipe::Message tp_msg;
+
+  // Here we allocate space for the buffers that the inference results are
+  // read back into from the ModelInstance process upon success
+  std::unordered_map> inference_output;
+
+  std::vector input_memories;
+  std::unique_ptr collector;
+
+  if (!all_response_failed) {
+    collector.reset(new BackendInputCollector(
+        requests, request_count, &responses,
+        model_state_->TritonMemoryManager(), false, nullptr));
+    // Note here we gather the triton input buffers and attach them to the
+    // tensorpipe message sent to the model instance process
+    RESPOND_ALL_AND_SET_TRUE_IF_ERROR(
+        responses, request_count, all_response_failed,
+        SetInputTensors(
+            total_batch_size, requests, request_count, &responses,
+            collector.get(), &input_memories, &tp_msg));
+  }

   uint64_t compute_start_ns = 0;
   SET_TIMESTAMP(compute_start_ns);

   // Run...
-  Execute(&responses, request_count);
+  if (!all_response_failed) {
+    Execute(&responses, request_count, &tp_msg, inference_output);
+  }

   uint64_t compute_end_ns = 0;
   SET_TIMESTAMP(compute_end_ns);
@@ -1061,7 +1136,11 @@ ModelInstanceState::ProcessRequests(
   }
   input_memories.clear();

-  ReadOutputTensors(total_batch_size, requests, request_count, &responses);
+  if (!all_response_failed) {
+    ReadOutputTensors(
+        total_batch_size, requests, request_count, &responses,
+        inference_output);
+  }

   uint64_t exec_end_ns = 0;
   SET_TIMESTAMP(exec_end_ns);
@@ -1080,7 +1159,7 @@ ModelInstanceState::ProcessRequests(
   }

   // Report statistics for each request.
-  for (uint32_t r = 0; r < request_count; ++r) {
+  for (uint64_t r = 0; r < request_count; ++r) {
     auto& request = requests[r];
     LOG_IF_ERROR(
         TRITONBACKEND_ModelInstanceReportStatistics(
@@ -1094,73 +1173,56 @@ ModelInstanceState::ProcessRequests(
         "failed releasing request");
   }

-  // Report the entire batch statistics.
-  LOG_IF_ERROR(
-      TRITONBACKEND_ModelInstanceReportBatchStatistics(
-          TritonModelInstance(), total_batch_size, exec_start_ns,
-          compute_start_ns, compute_end_ns, exec_end_ns),
-      "failed reporting batch request statistics");
-}
-
-void
-ModelInstanceState::Execute(
-    std::vector* responses,
-    const uint32_t response_count)
-{
-  static TfLiteStatus status;
-  status = interpreter_->Invoke();
-  if (status != kTfLiteOk) {
-    SendErrorForResponses(
-        responses, response_count,
-        TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL, ("TFLite execute failure")));
+  if (!all_response_failed) {
+    // Report the entire batch statistics.
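+    // Batch statistics are only reported when the batch was actually
+    // executed, i.e. when not every response has already been marked failed.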
+ LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + TritonModelInstance(), total_batch_size, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting batch request statistics"); } - first_inference_ = false; } -void +TRITONSERVER_Error* ModelInstanceState::SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, BackendInputCollector* collector, - std::vector* input_memories) + std::vector* input_memories, tensorpipe::Message* tp_msg) { const int32_t max_batch_size = model_state_->MaxBatchSize(); - bool allocate_tensors = false; + + // Construct tensorpipe message + tp_msg->metadata = "model_input"; + tp_msg->tensors.resize(model_state_->input_index_map_.size()); // All requests must have equally-sized input tensors so use any // request as the representative for the input tensors. uint32_t input_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + for (uint64_t input_idx = 0; input_idx < input_count; input_idx++) { TRITONBACKEND_Input* input; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, + RETURN_IF_ERROR( TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); const char* input_name; TRITONSERVER_DataType input_datatype; const int64_t* input_shape; uint32_t input_dims_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, - &input_dims_count, nullptr, nullptr)); + uint64_t byte_size; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + &byte_size, nullptr)); // Return an error if the input name within the request DNE in model if (model_state_->input_index_map_.count(input_name) == 0) { - SendErrorForResponses( - responses, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Model input: " + std::string(input_name) + - " is not a valid input name for '" + Name() + "'") - .c_str())); + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Model input: " + std::string(input_name) + + " is not a valid input name for '" + Name() + "'") + .c_str()); } // The shape for the entire input patch, [total_batch_size, ...] @@ -1177,93 +1239,125 @@ ModelInstanceState::SetInputTensors( " is: " + std::to_string(total_batch_size) + "\n")) .c_str()); - // Get the batch input tensor shape and compare against the shape of the - // input tensor as is registered with the current interpreter. 
If the size - // is different from the last call, tell the interpreter to resize the - // input tensor and note that we are going to have to make another call to - // AllocateTensors below - std::vector batchn_tflite_size_vector( - begin(batchn_shape), end(batchn_shape)); - TfLiteIntArray* tflite_input_tensor_dims = - interpreter_->tensor(model_state_->input_index_map_[input_name])->dims; - std::vector tflite_input_shape( - tflite_input_tensor_dims->data, - (tflite_input_tensor_dims->data + tflite_input_tensor_dims->size)); - if (batchn_tflite_size_vector != tflite_input_shape) { - // Resize input tensors based on current total batch size - allocate_tensors = true; - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string( - "resizing input " + std::string(input_name) + - " with total batch size: " + std::to_string(total_batch_size) + - "\n")) - .c_str()); - interpreter_->ResizeInputTensor( - model_state_->input_index_map_[input_name], - batchn_tflite_size_vector); - } - } - - // Once we have resized all input tensors in the loop above, - // now we can allocate the memory plan within the tflite runtime if - // necessary - if (allocate_tensors || first_inference_) { - if (interpreter_->AllocateTensors() != kTfLiteOk) { - SendErrorForResponses( - responses, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "TfLite interpreter failed to allocate tensor inputs")); - } - } - - // With the memory now allocated appropriately for all input tensors, we can - // call process tensor for each - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - TRITONBACKEND_InputProperties( - input, &input_name, nullptr, nullptr, nullptr, nullptr, nullptr)); + // We use the metadata string field to pass the input tensor index. + tp_msg->tensors[input_idx].metadata = + std::to_string(model_state_->input_index_map_[input_name]); // Even if running on MALI GPU, we use CPU memory std::vector> alloc_perference; alloc_perference = {{TRITONSERVER_MEMORY_CPU, 0}}; - const char* input_buffer; size_t batchn_byte_size; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; - TfLiteTensor* tflite_input_tensor = - interpreter_->tensor(model_state_->input_index_map_[input_name]); - char* tflite_input_buffer = tflite_input_tensor->data.raw; - - // Here we use ProcessTensor to copy the data from triton into the buffer - // allocated by the tflite interpreter. I don't believe the data copy can - // be avoided using the tflite runtime - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - collector->ProcessTensor( - input_name, tflite_input_buffer, tflite_input_tensor->bytes, - alloc_perference, &input_buffer, &batchn_byte_size, &memory_type, - &memory_type_id)); + + // Here we use ProcessTensor to manage the input buffer for the tensor. In + // the overload of this function, the backend input collector manages the + // memory, as opposed to copying it into the destination buffer we could + // pass, `buffer`. 
At the end of this call, cpu_buffer will point to the + // contiguous memory for the potentially batched input tensors + tensorpipe::CpuBuffer cpu_buffer; + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, + const_cast(reinterpret_cast(&cpu_buffer.ptr)), + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Set the space for the tensors for tensorpipe message + tp_msg->tensors[input_idx].length = static_cast(batchn_byte_size); + tp_msg->tensors[input_idx].buffer = cpu_buffer; } // Finalize Backend Input Collector... collector->Finalize(); + + return nullptr; } void +ModelInstanceState::Execute( + std::vector* responses, + const uint32_t response_count, tensorpipe::Message* tp_msg, + std::unordered_map>& inference_output) +{ + // Write tensor across pipe and wait for completion asynchronously + std::promise done; + pipe_->write( + *tp_msg, + [this, &inference_output, &done](const tensorpipe::Error& error) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string( + "Failed to send model_input request to server. Details: ") + + error.what()) + .c_str()); + done.set_value(false); + return; + } + // Read a response from the server with description of incoming + // result tensors so we can get ready to write the data + pipe_->readDescriptor([this, &inference_output, &done]( + const tensorpipe::Error& error, + tensorpipe::Descriptor descriptor) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string( + "Unexpected error when reading descriptor from accepted " + "pipe. Details: ") + + error.what()) + .c_str()); + done.set_value(false); + return; + } + + // Create a cpu buffer instance and assign its buffer + // pointer to that of the tflite allocated buffer for our + // output tensor + tensorpipe::Allocation allocation; + allocation.tensors.resize(descriptor.tensors.size()); + for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { + inference_output[descriptor.tensors[i].metadata].resize( + descriptor.tensors[i].length); + + allocation.tensors[i].buffer = tensorpipe::CpuBuffer{ + .ptr = static_cast( + inference_output[descriptor.tensors[i].metadata].data())}; + } + + // Read the data from the server response into the tensor + // buffer assigned above + pipe_->read(allocation, [&done](const tensorpipe::Error& error) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string( + "Unexpected error when reading data from accepted " + "pipe. 
Details: ") + + error.what()) + .c_str()); + done.set_value(false); + return; + } + done.set_value(true); + }); + }); + }); + + if (!done.get_future().get()) { + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, ("TFLite execute failure"))); + } +} + +TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses) + std::vector* responses, + const std::unordered_map>& inference_output) { BackendOutputResponder responder( requests, request_count, responses, model_state_->MaxBatchSize(), @@ -1271,106 +1365,28 @@ ModelInstanceState::ReadOutputTensors( for (const auto& map_entry : model_state_->output_index_map_) { std::string output_name = map_entry.first; - int tensor_index = map_entry.second; - - TfLiteTensor* tflite_output_tensor = interpreter_->tensor(tensor_index); - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTFLiteTypeToDataType(tflite_output_tensor->type); - TRITONSERVER_DataType config_datatype = - model_state_->output_dtype_map_[output_name]; - if (config_datatype != output_dtype) { - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype TYPE_") + - TRITONSERVER_DataTypeString(output_dtype) + - " for inference output '" + output_name + "', expecting TYPE_" + - TRITONSERVER_DataTypeString(config_datatype)) - .c_str())); - } - - // Assign data pointer to head of data container for output tensor - const char* output_buffer = - static_cast(tflite_output_tensor->data.raw); - - // Set output shape - std::vector batchn_shape; - TfLiteIntArray* dims = tflite_output_tensor->dims; - for (int32_t i = 0; i < dims->size; i++) { - batchn_shape.push_back(dims->data[i]); - } + std::vector output_shape = + model_state_->output_shape_map_[output_name]; + output_shape[0] = total_batch_size; responder.ProcessTensor( - output_name, output_dtype, batchn_shape, output_buffer, - TRITONSERVER_MEMORY_CPU, 0); + output_name, model_state_->output_dtype_map_[output_name], output_shape, + inference_output.at(output_name).data(), TRITONSERVER_MEMORY_CPU, 0); } // Finalize and wait for any pending buffer copies. responder.Finalize(); + + return nullptr; } ///////////// extern "C" { -int32_t armnn_threads = INT_MAX; - TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { -#ifdef PAPI_PROFILING_ENABLE - // Init PAPI library - RETURN_ERROR_IF_FALSE( - PAPI_library_init(PAPI_VER_CURRENT) == PAPI_VER_CURRENT, - TRITONSERVER_ERROR_UNAVAILABLE, std::string("Failed to init PAPI lib")); - RETURN_ERROR_IF_FALSE( - PAPI_thread_init(pthread_self) == PAPI_OK, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("Failed to init PAPI thread lib")); - - // The backend configuration may contain information needed by the - // backend, such a command-line arguments. 
- TRITONSERVER_Message* backend_config_message; - RETURN_IF_ERROR( - TRITONBACKEND_BackendConfig(backend, &backend_config_message)); - const char* buffer; - size_t byte_size; - RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( - backend_config_message, &buffer, &byte_size)); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("backend configuration:\n") + buffer).c_str()); - triton::common::TritonJson::Value backend_config; - if (byte_size != 0) { - RETURN_IF_ERROR(backend_config.Parse(buffer, byte_size)); - } - triton::common::TritonJson::Value cmdline; - if (backend_config.Find("cmdline", &cmdline)) { - triton::common::TritonJson::Value value; - std::string value_str; - if (cmdline.Find("papi-events", &value)) { - RETURN_IF_ERROR(value.AsString(&value_str)); - std::stringstream ss(value_str); - while (ss.good()) { - std::string substr; - std::getline(ss, substr, ','); - // Validate counter is a valid papi counter - RETURN_ERROR_IF_FALSE( - PAPIEventValid(substr), TRITONSERVER_ERROR_INVALID_ARG, - std::string("PAPI event '") + substr + - "' is requested but invalid"); - } - // Set environment for papi to do high level op profiling - RETURN_ERROR_IF_TRUE( - setenv("PAPI_EVENTS", value_str.c_str(), 1), - TRITONSERVER_ERROR_INVALID_ARG, - std::string("Could not set PAPI_EVENTS env variable")); - } - } -#endif // PAPI_PROFILING_ENABLE - const char* cname; RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); std::string name(cname); @@ -1433,7 +1449,7 @@ TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) // Create a ModelState object and associate it with the // TRITONBACKEND_Model. ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, &model_state, &armnn_threads)); + RETURN_IF_ERROR(ModelState::Create(model, &model_state)); RETURN_IF_ERROR( TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); @@ -1489,7 +1505,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) // TRITONBACKEND_ModelInstance. ModelInstanceState* instance_state; RETURN_IF_ERROR( - ModelInstanceState::Create(model_state, instance, &instance_state)); + ModelInstanceState::Create(model_state, instance, name, &instance_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( instance, reinterpret_cast(instance_state))); diff --git a/src/tflite_utils.cc b/src/tflite_utils.cc index 5be2fbd..40d4fd3 100644 --- a/src/tflite_utils.cc +++ b/src/tflite_utils.cc @@ -1,8 +1,11 @@ +// +// Copyright © 2023 Arm Ltd. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + #include "tflite_utils.h" -#ifdef PAPI_PROFILING_ENABLE -#include -#endif // PAPI_PROFILING_ENABLE +#include namespace triton { namespace backend { namespace tensorflowlite { @@ -115,37 +118,18 @@ ModelConfigDataTypeToTFLiteType(const std::string& data_type_str) return std::make_pair(true, type); } -#ifdef PAPI_PROFILING_ENABLE -bool -PAPIEventValid(std::string& event_name) +std::vector +StringToIntVector(std::string const& s) { - int event_set = PAPI_NULL; - bool valid = false; - if (PAPI_create_eventset(&event_set) == PAPI_OK) { - valid = PAPI_add_named_event(event_set, event_name.c_str()) == PAPI_OK; - if (valid) { - if (PAPI_cleanup_eventset(event_set) != PAPI_OK) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string( - "Call to cleanup event_set failed when trying to check " - "event ") + - event_name) - .c_str()); - } - } - if (PAPI_destroy_eventset(&event_set) != PAPI_OK) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Call to destroy event_set failed when trying to check " - "event ") + - event_name) - .c_str()); - } + std::stringstream iss(s); + + int val; + std::vector result; + while (iss >> val) { + result.push_back(val); } - return valid; + return result; } -#endif // PAPI_PROFILING_ENABLE }}} // namespace triton::backend::tensorflowlite diff --git a/src/tflite_utils.h b/src/tflite_utils.h index 9438937..745c38e 100644 --- a/src/tflite_utils.h +++ b/src/tflite_utils.h @@ -27,6 +27,8 @@ std::pair ConvertDataTypeToTFLiteType( std::pair ModelConfigDataTypeToTFLiteType( const std::string& data_type_str); +std::vector StringToIntVector(std::string const& s); + template std::string VectorToString(std::vector const& v) @@ -34,15 +36,11 @@ VectorToString(std::vector const& v) std::stringstream ss; for (size_t i = 0; i < v.size(); i++) { if (i != 0) { - ss << ", "; + ss << ","; } ss << v[i]; } return ss.str(); } -#ifdef PAPI_PROFILING_ENABLE -bool PAPIEventValid(std::string& event_name); -#endif // PAPI_PROFILING_ENABLE - }}} // namespace triton::backend::tensorflowlite From 31384e5850838f29cfc80e26a449465f93530f32 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 30 May 2023 10:48:34 -0500 Subject: [PATCH 02/33] Support papi op profiling per model Signed-off-by: Josh Minor --- CMakeLists.txt | 7 +++++++ README.md | 10 +++++++++- src/tflite.cc | 50 +++++++++++++++++++++++++++++++++++--------------- 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 177b8b6..105fda6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -484,6 +484,13 @@ if(ARMNN_DELEGATE_ENABLE) target_compile_definitions(triton-armnn-tflite-backend PRIVATE ARMNN_DELEGATE_ENABLE=1) endif() +if(PAPI_PROFILING_ENABLE) + target_compile_definitions( + triton-armnn-tflite-backend + PRIVATE PAPI_PROFILING_ENABLE=1 + ) +endif() + target_include_directories(triton-armnn-tflite-backend PRIVATE ${BACKEND_INCLUDE_DIRS}) diff --git a/README.md b/README.md index 81f54d4..4ea154a 100644 --- a/README.md +++ b/README.md @@ -194,5 +194,13 @@ instance_group [ ``` ## Enabling PAPI events -This backend supports PAPI performance counter sampling. This is exposed through the PAPI High Level API. We support performance counter tracing at the tflite operator level using tflite tracing instrumentation. To enable this, when launching triton pass the flag `--backend-config=armnn_tflite,papi-events=PAPI_TOT_CYC,PAPI_LD_INS`. 
Internally, the events listed get set to the environment variable `PAPI_EVENTS` as per the PAPI High Level API documentation. Results of this will be written to a newly created `papi_hl_output` folder in the directory you launched the server from. +This backend supports PAPI performance counter sampling. This is exposed through the PAPI High Level API. We support performance counter tracing at the tflite operator level using tflite tracing instrumentation. To enable this, you can use the following in your model config: +``` +parameters { + key: "papi_events" + value: { + string_value:"PAPI_TOT_CYC,PAPI_LD_INS" + } +} +``` Internally, the events listed get set to the environment variable `PAPI_EVENTS` as per the PAPI High Level API documentation. Results of this will be written to a newly created `papi_hl_output` folder in the directory you launched the server from. diff --git a/src/tflite.cc b/src/tflite.cc index 9753afd..75734b3 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -124,6 +124,11 @@ class ModelState : public BackendModel { // Path string for the model_instance binary const char* model_instance_location_; +#ifdef PAPI_PROFILING_ENABLE + // String holding comma-separated list of events for child inference process + std::string papi_events_ = ""; +#endif // PAPI_PROFILING_ENABLE + private: ModelState(TRITONBACKEND_Model* triton_model); TRITONSERVER_Error* AutoCompleteConfig(); @@ -468,6 +473,22 @@ ModelState::ValidateModelConfig() TRITONSERVER_LOG_VERBOSE, (std::string("model configuration:\n") + buffer.Contents()).c_str()); +#ifdef PAPI_PROFILING_ENABLE + // Take this opportunity to handle papi events + triton::common::TritonJson::Value params; + if (ModelConfig().Find("parameters", ¶ms)) { + auto err = GetParameterValue(params, "papi_events", &papi_events_); + // papi_events is not required so clear error if not found + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + } +#endif // PAPI_PROFILING_ENABLE + // To check input and output names we will load and release the model during // the validation process without allocating memory for inference std::unique_ptr interpreter; @@ -830,9 +851,17 @@ ModelInstanceState::LaunchModelInstance() return str.substr(0, found); }; - options.env.extra = std::unordered_map{ + std::unordered_map model_instance_env{ {"LD_LIBRARY_PATH", base_path(*tritonserver_lib_path)}}; +#ifdef PAPI_PROFILING_ENABLE + if (!model_state_->papi_events_.empty()) { + model_instance_env.insert({"PAPI_EVENTS", model_state_->papi_events_}); + } +#endif // PAPI_PROFILING_ENABLE + + options.env.extra = model_instance_env; + std::error_code ec = model_instance_process_.start(model_instance_args, options); @@ -851,9 +880,6 @@ ModelInstanceState::LaunchModelInstance() (std::string("Launched model instance: ") + model_instance_name_) .c_str()); - // logging_thread_.reset( - // new std::thread(&ModelInstanceState::ModelInstanceLogHandler, this)); - return nullptr; } @@ -952,20 +978,14 @@ ModelInstanceState::SendModel() #endif // ARMNN_DELEGATE_ENABLE // Write the message - auto done = std::make_shared>(); + auto done = std::make_shared>(); pipe_->write(tp_msg, [done](const tensorpipe::Error& error) { - if (error) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - ("Failed to send model load message: " + error.what()).c_str()); - done->set_value(false); - } else { - done->set_value(true); - } + done->set_value(error); }); + const tensorpipe::Error& error = 
done->get_future().get(); RETURN_ERROR_IF_TRUE( - !done->get_future().get(), TRITONSERVER_ERROR_INTERNAL, - std::string("Failed to send model load message.")); + error, TRITONSERVER_ERROR_INTERNAL, + ("Failed to send model load message: " + error.what())); return nullptr; } From 5ac8615272aea3a4250bbf9cb89bfc1e932e1f27 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 30 May 2023 13:45:10 -0500 Subject: [PATCH 03/33] Use cma channel first, and improve error handling Signed-off-by: Josh Minor --- src/model_instance.cc | 20 ++++++++++++-------- src/model_instance.h | 5 ++++- src/tflite.cc | 5 ++++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index 2505353..cd6c5ce 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -30,8 +30,8 @@ void ModelInstance::Finalize() { - listener_->close(); pipe_->close(); + listener_->close(); } void @@ -240,13 +240,17 @@ ModelInstance::ReceiveFromPipe() const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) { if (error) { - // Error may happen when the pipe is closed - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Unexpected error when reading from accepted pipe: ") + - error.what()) - .c_str()); - return; + if (error.isOfType()) { + // Expected. + } else { + // Error may happen when the pipe is closed + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Unexpected error when reading from accepted pipe: ") + + error.what()) + .c_str()); + return; + } } if (descriptor.metadata == "model_load") { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Loading model"); diff --git a/src/model_instance.h b/src/model_instance.h index 591843a..c2dc440 100644 --- a/src/model_instance.h +++ b/src/model_instance.h @@ -31,7 +31,10 @@ class ModelInstance { context_ = std::make_shared(); auto transportContext = tensorpipe::transport::shm::create(); context_->registerTransport(0 /* priority */, "shm", transportContext); - // Register basic shm channel + // Register cma shm channel + auto cmaChannel = tensorpipe::channel::cma::create(); + context_->registerChannel(1 /* low priority */, "cma", cmaChannel); + // Register basic channel auto basicChannel = tensorpipe::channel::basic::create(); context_->registerChannel(0 /* low priority */, "basic", basicChannel); } diff --git a/src/tflite.cc b/src/tflite.cc index 75734b3..7d0f8d6 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -707,7 +707,10 @@ ModelState::InitTensorPipe() auto transportContext = tensorpipe::transport::shm::create(); // Consider here also registering tcp transport if shm not avail context_->registerTransport(0 /* priority */, "shm", transportContext); - // Register basic shm channel + // Register cma shm channel + auto cmaChannel = tensorpipe::channel::cma::create(); + context_->registerChannel(1 /* low priority */, "cma", cmaChannel); + // Register basic channel auto basicChannel = tensorpipe::channel::basic::create(); context_->registerChannel(0 /* low priority */, "basic", basicChannel); } From 6344cfd90abaa51363058165ebb20cb50b9f64e0 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 30 May 2023 17:22:12 -0500 Subject: [PATCH 04/33] Make parent process handle the connection listen Signed-off-by: Josh Minor --- src/model_instance.cc | 27 ++----------- src/model_instance.h | 10 +---- src/model_instance_main.cc | 6 +-- src/tflite.cc | 81 +++++++++++++++++++++++++------------- 4 files changed, 60 insertions(+), 64 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index cd6c5ce..3966f46 100644 --- 
a/src/model_instance.cc +++ b/src/model_instance.cc @@ -31,34 +31,13 @@ void ModelInstance::Finalize() { pipe_->close(); - listener_->close(); } void ModelInstance::Start(const std::string& addr) { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("ModelInstance starts on: ") + addr).c_str()); - listener_ = context_->listen({addr}); - listener_->accept([&, this]( - const tensorpipe::Error& error, - std::shared_ptr pipe) { - if (error) { - if (error.isOfType()) { - // Expected. - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Unexpected error when accepting incoming pipe: ") + - error.what()) - .c_str()); - } - return; - } - pipe_ = std::move(pipe); - ReceiveFromPipe(); - }); + pipe_ = context_->connect(addr); + ReceiveFromPipe(); } TfLiteStatus @@ -253,7 +232,7 @@ ModelInstance::ReceiveFromPipe() } } if (descriptor.metadata == "model_load") { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Loading model"); + LOG_MESSAGE(TRITONSERVER_LOG_INFO, "Loading model"); LoadModelFromPipe(descriptor); } else if (descriptor.metadata == "model_input") { Infer(descriptor); diff --git a/src/model_instance.h b/src/model_instance.h index c2dc440..a3c16cc 100644 --- a/src/model_instance.h +++ b/src/model_instance.h @@ -33,10 +33,7 @@ class ModelInstance { context_->registerTransport(0 /* priority */, "shm", transportContext); // Register cma shm channel auto cmaChannel = tensorpipe::channel::cma::create(); - context_->registerChannel(1 /* low priority */, "cma", cmaChannel); - // Register basic channel - auto basicChannel = tensorpipe::channel::basic::create(); - context_->registerChannel(0 /* low priority */, "basic", basicChannel); + context_->registerChannel(0 /* low priority */, "cma", cmaChannel); } /*! @@ -92,11 +89,6 @@ class ModelInstance { */ std::shared_ptr pipe_; - /*! - * \brief listener to build pipe - */ - std::shared_ptr listener_{nullptr}; - /*! 
* \brief tflite interpreter */ diff --git a/src/model_instance_main.cc b/src/model_instance_main.cc index f5bd47b..1fdd9c0 100644 --- a/src/model_instance_main.cc +++ b/src/model_instance_main.cc @@ -41,7 +41,7 @@ main(int argc, char* argv[]) return 1; } - const char* bind_addr = argv[1]; + const char* addr = argv[1]; // block signals in this thread and subsequently // spawned threads @@ -69,8 +69,8 @@ main(int argc, char* argv[]) ModelInstance model_instance; - // Will listen on the address provided as the first argument in the list - model_instance.Start(std::string(bind_addr)); + // Will connect to the address provided as the first argument in the list + model_instance.Start(std::string(addr)); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, diff --git a/src/tflite.cc b/src/tflite.cc index 7d0f8d6..f0bb468 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -709,10 +708,7 @@ ModelState::InitTensorPipe() context_->registerTransport(0 /* priority */, "shm", transportContext); // Register cma shm channel auto cmaChannel = tensorpipe::channel::cma::create(); - context_->registerChannel(1 /* low priority */, "cma", cmaChannel); - // Register basic channel - auto basicChannel = tensorpipe::channel::basic::create(); - context_->registerChannel(0 /* low priority */, "basic", basicChannel); + context_->registerChannel(0 /* low priority */, "cma", cmaChannel); } // @@ -746,7 +742,6 @@ class ModelInstanceState : public BackendModelInstance { TRITONSERVER_Error* ConnectModelInstance(); TRITONSERVER_Error* SendModel(); TRITONSERVER_Error* LaunchModelInstance(); - void DestroyModelInstance(); bool ModelInstanceRunning(); TRITONSERVER_Error* SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, @@ -772,6 +767,9 @@ class ModelInstanceState : public BackendModelInstance { // instance const std::string model_instance_name_; + // Tensorpipe listener to establish connection with child process + std::shared_ptr listener_{nullptr}; + // Tensorpipe to send input tensors over std::shared_ptr pipe_; @@ -805,21 +803,39 @@ ModelInstanceState::ModelInstanceState( model_state_(model_state), model_instance_name_(model_instance_name) { THROW_IF_BACKEND_INSTANCE_ERROR(LaunchModelInstance()); - - // This is a gross thing to do, the backend deadlocks if the tensorpipe - // context tries to connect to the model_instance process to before it's - // ready. 
- sleep(2); - pipe_ = model_state_->context_->connect("shm://" + model_instance_name_); - - THROW_IF_BACKEND_INSTANCE_ERROR(SendModel()); } -ModelInstanceState::~ModelInstanceState() {} +ModelInstanceState::~ModelInstanceState() +{ + pipe_->close(); + listener_->close(); +} TRITONSERVER_Error* ModelInstanceState::LaunchModelInstance() { + // Start listening for child process to connect to shm channel + listener_ = model_state_->context_->listen({"shm://" + model_instance_name_}); + auto done = std::make_shared>(); + listener_->accept([&, this]( + const tensorpipe::Error& error, + std::shared_ptr pipe) { + // When the child process connects, we act here in this lambda function + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Unexpected error when accepting incoming pipe: ") + + error.what()) + .c_str()); + + done->set_value(false); + return; + } + pipe_ = std::move(pipe); + + done->set_value(true); + }); + std::vector model_instance_args = { std::string(model_state_->model_instance_location_) + "/model_instance", std::string("shm://") + model_instance_name_}; @@ -883,6 +899,16 @@ ModelInstanceState::LaunchModelInstance() (std::string("Launched model instance: ") + model_instance_name_) .c_str()); + // If the process did not come up in time something has gone wrong + RETURN_ERROR_IF_TRUE( + done->get_future().wait_for(std::chrono::seconds(5)) == + std::future_status::timeout, + TRITONSERVER_ERROR_INTERNAL, + std::string( + "Model instance failed: process did not connect back to parent")); + + SendModel(); + return nullptr; } @@ -960,7 +986,7 @@ ModelInstanceState::SendModel() tp_msg.payloads[OptimizerOption::ARMNN_CPU_NUM_THREADS] = gen_metadata(std::to_string(model_state_->armnn_cpu_num_threads_)); - // Add in use cpu and gpu options + // Add in armnn cpu and gpu options tp_msg.payloads[OptimizerOption::ARMNN_CPU_FAST_MATH_ENABLED] = gen_metadata(model_state_->armnn_cpu_fast_math_enabled_); @@ -981,14 +1007,13 @@ ModelInstanceState::SendModel() #endif // ARMNN_DELEGATE_ENABLE // Write the message - auto done = std::make_shared>(); + auto done = std::make_shared>(); pipe_->write(tp_msg, [done](const tensorpipe::Error& error) { - done->set_value(error); + done->set_value(!error); }); - const tensorpipe::Error& error = done->get_future().get(); - RETURN_ERROR_IF_TRUE( - error, TRITONSERVER_ERROR_INTERNAL, - ("Failed to send model load message: " + error.what())); + RETURN_ERROR_IF_FALSE( + done->get_future().get(), TRITONSERVER_ERROR_INTERNAL, + std::string("Failed to send model load message")); return nullptr; } @@ -1303,7 +1328,7 @@ ModelInstanceState::Execute( std::unordered_map>& inference_output) { // Write tensor across pipe and wait for completion asynchronously - std::promise done; + auto done = std::make_shared>(); pipe_->write( *tp_msg, [this, &inference_output, &done](const tensorpipe::Error& error) { @@ -1314,7 +1339,7 @@ ModelInstanceState::Execute( "Failed to send model_input request to server. Details: ") + error.what()) .c_str()); - done.set_value(false); + done->set_value(false); return; } // Read a response from the server with description of incoming @@ -1330,7 +1355,7 @@ ModelInstanceState::Execute( "pipe. Details: ") + error.what()) .c_str()); - done.set_value(false); + done->set_value(false); return; } @@ -1359,15 +1384,15 @@ ModelInstanceState::Execute( "pipe. 
Details: ") + error.what()) .c_str()); - done.set_value(false); + done->set_value(false); return; } - done.set_value(true); + done->set_value(true); }); }); }); - if (!done.get_future().get()) { + if (!done->get_future().get()) { SendErrorForResponses( responses, response_count, TRITONSERVER_ErrorNew( From 53adc1a5e5faab582cab70b41fa2addb49e7c5aa Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 31 May 2023 08:57:48 -0500 Subject: [PATCH 05/33] Fix build.yml Signed-off-by: Josh Minor --- .github/workflows/build.yml | 2 +- src/tflite.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2e7fee2..b07fe31 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -175,7 +175,7 @@ jobs: -DTRITON_BACKEND_REPO_TAG=${{env.TRITON_REPO_TAG}} \ -DTRITON_CORE_REPO_TAG=${{env.TRITON_REPO_TAG}} \ -DTRITON_COMMON_REPO_TAG=${{env.TRITON_REPO_TAG}} \ - -PAPI_PROFILING_ENABLE=ON \ + -DPAPI_PROFILING_ENABLE=ON \ -DTRITON_ENABLE_MALI_GPU=${{env.TRITON_ENABLE_MALI_GPU}} \ -DTFLITE_ENABLE_RUY=${{env.TFLITE_ENABLE_RUY}} \ -DTFLITE_BAZEL_BUILD=${{env.TFLITE_BAZEL_BUILD}} \ diff --git a/src/tflite.cc b/src/tflite.cc index f0bb468..bf38dee 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -832,7 +832,6 @@ ModelInstanceState::LaunchModelInstance() return; } pipe_ = std::move(pipe); - done->set_value(true); }); From 64f6c8172c33d402b6ad4f21f52c549715080c5c Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 31 May 2023 10:10:29 -0500 Subject: [PATCH 06/33] Fix cleanup of child process Signed-off-by: Josh Minor --- src/model_instance.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index 3966f46..a483c41 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -228,7 +228,7 @@ ModelInstance::ReceiveFromPipe() (std::string("Unexpected error when reading from accepted pipe: ") + error.what()) .c_str()); - return; + exit(1); } } if (descriptor.metadata == "model_load") { From 81eb50ada9ef0de93baee93064ea4e2e8567a5ea Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 31 May 2023 11:08:24 -0500 Subject: [PATCH 07/33] Handle model execution failure correctly Signed-off-by: Josh Minor --- src/model_instance.cc | 39 +++++++++++++++++++++------------------ src/tflite.cc | 30 ++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index a483c41..49574fe 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -319,34 +319,33 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) } } + bool success = true; + // Once we have resized all input tensors in the loop above, // now we can allocate the memory plan within the tflite runtime if // necessary if (allocate_tensors || first_inference_) { if (interpreter_->AllocateTensors() != kTfLiteOk) { - return; + success = false; } } // Assign Cpu buffers to read incoming tensor bytes into after allocate // tensors is called for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { - tensorpipe::CpuBuffer cpu_buffer{ + allocation.tensors[i].buffer = tensorpipe::CpuBuffer{ .ptr = interpreter_->tensor(std::stoi(descriptor.tensors[i].metadata)) ->data.raw}; - allocation.tensors[i].buffer = cpu_buffer; } - pipe_->read(allocation, [this](const tensorpipe::Error& error) { + pipe_->read(allocation, [this, &success](const tensorpipe::Error& error) { if (error) { - return; + success = false; } // At this point our input 
tensors should be written to by the read - // function, - // now we invoke the interpreter and read the output + // function, now we invoke the interpreter and read the output if (interpreter_->Invoke() != kTfLiteOk) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to invoke model"); - return; + success = false; } first_inference_ = false; @@ -354,15 +353,19 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) // Write output back to client tensorpipe::Message tp_msg; - for (uint64_t i = 0; i < interpreter_->outputs().size(); ++i) { - int output_index = interpreter_->outputs()[i]; - TfLiteTensor* output_tensor = interpreter_->tensor(output_index); - tensorpipe::Message::Tensor tensor; - // We use the output tensor name as the metadata in the request - tensor.metadata = std::string(output_tensor->name); - tensor.length = output_tensor->bytes; - tensor.buffer = tensorpipe::CpuBuffer{.ptr = output_tensor->data.raw}; - tp_msg.tensors.push_back(tensor); + if (!success) { + tp_msg.metadata = "f"; + } else { + for (uint64_t i = 0; i < interpreter_->outputs().size(); ++i) { + int output_index = interpreter_->outputs()[i]; + TfLiteTensor* output_tensor = interpreter_->tensor(output_index); + tensorpipe::Message::Tensor tensor; + // We use the output tensor name as the metadata in the request + tensor.metadata = std::string(output_tensor->name); + tensor.length = output_tensor->bytes; + tensor.buffer = tensorpipe::CpuBuffer{.ptr = output_tensor->data.raw}; + tp_msg.tensors.push_back(tensor); + } } pipe_->write(tp_msg, [](const tensorpipe::Error& error) { if (error) { diff --git a/src/tflite.cc b/src/tflite.cc index bf38dee..0d78a52 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -749,7 +749,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector* responses, BackendInputCollector* collector, std::vector* input_memories, tensorpipe::Message* tp_msg); - void Execute( + TRITONSERVER_Error* Execute( std::vector* responses, const uint32_t response_count, tensorpipe::Message* tp_msg, std::unordered_map>& inference_output); @@ -1171,7 +1171,9 @@ ModelInstanceState::ProcessRequests( // Run... 
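+  // Execute() now returns an error instead of responding itself, so a failure
+  // here marks every response as failed via RESPOND_ALL_AND_SET_TRUE_IF_ERROR.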
if (!all_response_failed) { - Execute(&responses, request_count, &tp_msg, inference_output); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + Execute(&responses, request_count, &tp_msg, inference_output)); } uint64_t compute_end_ns = 0; @@ -1320,7 +1322,7 @@ ModelInstanceState::SetInputTensors( return nullptr; } -void +TRITONSERVER_Error* ModelInstanceState::Execute( std::vector* responses, const uint32_t response_count, tensorpipe::Message* tp_msg, @@ -1358,10 +1360,20 @@ ModelInstanceState::Execute( return; } + tensorpipe::Allocation allocation; + + // If there was a problem running the inference we get that back in + // the message metadata + if (descriptor.metadata == "f") { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to run inference"); + pipe_->read(allocation, [&done](const tensorpipe::Error& error) {}); + done->set_value(false); + return; + } + // Create a cpu buffer instance and assign its buffer // pointer to that of the tflite allocated buffer for our // output tensor - tensorpipe::Allocation allocation; allocation.tensors.resize(descriptor.tensors.size()); for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { inference_output[descriptor.tensors[i].metadata].resize( @@ -1391,12 +1403,10 @@ ModelInstanceState::Execute( }); }); - if (!done->get_future().get()) { - SendErrorForResponses( - responses, response_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, ("TFLite execute failure"))); - } + RETURN_ERROR_IF_FALSE( + done->get_future().get(), TRITONSERVER_ERROR_INTERNAL, + std::string("TFLite execute failure")); + return nullptr; } TRITONSERVER_Error* From c1c81fd4d5c0fc66d69bc00a74501deb7e546d82 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 31 May 2023 11:27:32 -0500 Subject: [PATCH 08/33] Improve error handling Signed-off-by: Josh Minor --- src/model_instance.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index 49574fe..acfd51e 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -279,6 +279,7 @@ void ModelInstance::Infer(tensorpipe::Descriptor& descriptor) { bool allocate_tensors = false; + bool success = true; // Create allocation to hold incoming input tensor data tensorpipe::Allocation allocation; @@ -319,8 +320,6 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) } } - bool success = true; - // Once we have resized all input tensors in the loop above, // now we can allocate the memory plan within the tflite runtime if // necessary @@ -339,9 +338,8 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) } pipe_->read(allocation, [this, &success](const tensorpipe::Error& error) { - if (error) { - success = false; - } + success = !error; + // At this point our input tensors should be written to by the read // function, now we invoke the interpreter and read the output if (interpreter_->Invoke() != kTfLiteOk) { From a94b986ea13cc2bedb8c4bc54546962afb4cec96 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Fri, 2 Jun 2023 15:42:34 -0500 Subject: [PATCH 09/33] Fix issue with MMap input tensors in models Signed-off-by: Josh Minor --- src/model_instance.cc | 34 ++++++------- src/model_instance.h | 3 ++ src/tflite.cc | 109 +++++++++++++++++++----------------------- 3 files changed, 70 insertions(+), 76 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index acfd51e..846d44c 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -281,9 +281,9 @@ 
ModelInstance::Infer(tensorpipe::Descriptor& descriptor) bool allocate_tensors = false; bool success = true; - // Create allocation to hold incoming input tensor data - tensorpipe::Allocation allocation; - allocation.tensors.resize(descriptor.tensors.size()); + if (first_inference_) { + allocation_.tensors.resize(descriptor.tensors.size()); + } // Get model inputs from request and ready the buffers (Allocation obj) to // write tensor data @@ -297,24 +297,26 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) // of the input tensor int input_tensor_index = std::stoi(descriptor.tensors[i].metadata); - // Length holds the num bytes of the incoming vector - int length = descriptor.tensors[i].length; + // incoming_length holds the num bytes of the incoming vector + int incoming_length = descriptor.tensors[i].length; - TfLiteIntArray* tflite_input_tensor_dims = - interpreter_->tensor(input_tensor_index)->dims; int tflite_input_tensor_len = interpreter_->tensor(input_tensor_index)->bytes; - std::vector tflite_input_shape( - tflite_input_tensor_dims->data, - (tflite_input_tensor_dims->data + tflite_input_tensor_dims->size)); - if (length != tflite_input_tensor_len) { + + if (incoming_length != tflite_input_tensor_len) { // Resize input tensors based on current total batch size + TfLiteIntArray* tflite_input_tensor_dims = + interpreter_->tensor(input_tensor_index)->dims; + std::vector tflite_input_shape( + tflite_input_tensor_dims->data, + (tflite_input_tensor_dims->data + tflite_input_tensor_dims->size)); + allocate_tensors = true; // Set the new batch size - tflite_input_shape[0] = length > tflite_input_tensor_len - ? length / tflite_input_tensor_len - : tflite_input_tensor_len / length; + tflite_input_shape[0] = incoming_length > tflite_input_tensor_len + ? 
incoming_length / tflite_input_tensor_len + : tflite_input_tensor_len / incoming_length; interpreter_->ResizeInputTensor(input_tensor_index, tflite_input_shape); } @@ -332,12 +334,12 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) // Assign Cpu buffers to read incoming tensor bytes into after allocate // tensors is called for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { - allocation.tensors[i].buffer = tensorpipe::CpuBuffer{ + allocation_.tensors[i].buffer = tensorpipe::CpuBuffer{ .ptr = interpreter_->tensor(std::stoi(descriptor.tensors[i].metadata)) ->data.raw}; } - pipe_->read(allocation, [this, &success](const tensorpipe::Error& error) { + pipe_->read(allocation_, [this, &success](const tensorpipe::Error& error) { success = !error; // At this point our input tensors should be written to by the read diff --git a/src/model_instance.h b/src/model_instance.h index a3c16cc..18bf8d8 100644 --- a/src/model_instance.h +++ b/src/model_instance.h @@ -105,6 +105,9 @@ class ModelInstance { // State variable to register whether inference has been called at least once bool first_inference_ = true; + // Tensorpipe allocation that we can reuse + tensorpipe::Allocation allocation_; + #ifdef PAPI_PROFILING_ENABLE std::unique_ptr papi_profiler_ = MaybeCreatePapiProfiler(); #endif // PAPI_PROFILING_ENABLE diff --git a/src/tflite.cc b/src/tflite.cc index 0d78a52..6b594fa 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -511,9 +511,13 @@ ModelState::ValidateModelConfig() // Populate input name map for (size_t i = 0; i < num_inputs; i++) { - input_index_map_[interpreter->GetInputName(i)] = inputs[i]; - input_dtype_map_[interpreter->GetInputName(i)] = - ConvertTFLiteTypeToDataType(interpreter->tensor(inputs[i])->type); + TfLiteTensor* input_tensor = interpreter->tensor(inputs[i]); + if (input_tensor->allocation_type == kTfLiteArenaRw) { + // Only worry about inputs that require user input + input_index_map_[input_tensor->name] = inputs[i]; + input_dtype_map_[input_tensor->name] = + ConvertTFLiteTypeToDataType(input_tensor->type); + } } // Populate output name, dtype, shape map @@ -533,6 +537,10 @@ ModelState::ValidateModelConfig() // Validate model inputs RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &ios)); + RETURN_ERROR_IF_FALSE( + input_index_map_.size() == ios.ArraySize(), TRITONSERVER_ERROR_INTERNAL, + std::string( + "Number of required inputs for model does not match provided")); for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; @@ -543,39 +551,31 @@ ModelState::ValidateModelConfig() RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); // Return an error if the input name within the model config DNE in model - if (input_index_map_.count(io_name) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Model input: " + std::string(io_name) + - " is not a valid input name for '" + Name() + "'") - .c_str()); - } + RETURN_ERROR_IF_TRUE( + input_index_map_.count(io_name) == 0, TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Model input: " + std::string(io_name) + + " is not a valid input name for '" + Name() + "'")); + // Validate data type std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); const auto pr = ModelConfigDataTypeToTFLiteType(io_dtype); - if (!pr.first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for input '" + io_name + - "' for model '" + Name() + "'") - .c_str()); - } + RETURN_ERROR_IF_TRUE( + !pr.first, 
TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for input '" + io_name + + "' for model '" + Name() + "'")); // Validate datatype matches expected from model TRITONSERVER_DataType config_dtype = TRITONSERVER_StringToDataType(io_dtype.substr(strlen("TYPE_")).c_str()); - if (config_dtype != input_dtype_map_[io_name]) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("data type " + io_dtype + " for input '" + io_name + - "' does not match expected of '" + - TRITONSERVER_DataTypeString(input_dtype_map_[io_name]) + "'" + - "' for model '" + Name() + "'") - .c_str()); - } + RETURN_ERROR_IF_TRUE( + config_dtype != input_dtype_map_[io_name], TRITONSERVER_ERROR_INTERNAL, + ("data type " + io_dtype + " for input '" + io_name + + "' does not match expected of '" + + TRITONSERVER_DataTypeString(input_dtype_map_[io_name]) + "'" + + "' for model '" + Name() + "'")); // Validate input shape matches expected from model TfLiteIntArray* tflite_dims = interpreter->tensor(inputs[i])->dims; @@ -621,38 +621,29 @@ ModelState::ValidateModelConfig() RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); // Return an error if the output name within the model config DNE in model - if (output_index_map_.count(io_name) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Model output: " + std::string(io_name) + - " is not a valid output name for '" + Name() + "'") - .c_str()); - } + RETURN_ERROR_IF_TRUE( + output_index_map_.count(io_name) == 0, TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Model output: " + std::string(io_name) + + " is not a valid output name for '" + Name() + "'")); // Validate data type std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); const auto pr = ModelConfigDataTypeToTFLiteType(io_dtype); - if (!pr.first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for output '" + io_name + - "' for model '" + Name() + "'") - .c_str()); - } + RETURN_ERROR_IF_TRUE( + !pr.first, TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for output '" + io_name + + "' for model '" + Name() + "'")); // Validate datatype matches expected from model TRITONSERVER_DataType config_dtype = TRITONSERVER_StringToDataType(io_dtype.substr(strlen("TYPE_")).c_str()); - if (config_dtype != output_dtype_map_[io_name]) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("data type " + io_dtype + " for output '" + io_name + - "' does not match expected of '" + - TRITONSERVER_DataTypeString(output_dtype_map_[io_name]) + "'" + - "' for model '" + Name() + "'") - .c_str()); - } + RETURN_ERROR_IF_TRUE( + config_dtype != output_dtype_map_[io_name], TRITONSERVER_ERROR_INTERNAL, + ("data type " + io_dtype + " for output '" + io_name + + "' does not match expected of '" + + TRITONSERVER_DataTypeString(output_dtype_map_[io_name]) + "'" + + "' for model '" + Name() + "'")); // Validate output shape matches expected from model TfLiteIntArray* tflite_dims = interpreter->tensor(outputs[i])->dims; @@ -671,15 +662,13 @@ ModelState::ValidateModelConfig() if (max_batch_size_ > 0) { config_output_shape.insert(config_output_shape.begin(), 1); } - if (config_output_shape != model_output_shape) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("shape " + VectorToString(config_output_shape) + " for output '" + - io_name + "' does not match expected of '" + - VectorToString(model_output_shape) + "'" + "' for model '" + - Name() + "'") - 
.c_str()); - } + RETURN_ERROR_IF_TRUE( + config_output_shape != model_output_shape, + TRITONSERVER_ERROR_INTERNAL, + ("shape " + VectorToString(config_output_shape) + " for output '" + + io_name + "' does not match expected of '" + + VectorToString(model_output_shape) + "'" + "' for model '" + Name() + + "'")); } } From 7a6ab0c4993dd45c82be86d1a1fce2bb09442700 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Fri, 2 Jun 2023 16:01:08 -0500 Subject: [PATCH 10/33] Improve efficiency of model instance message passing Signed-off-by: Josh Minor --- src/model_instance.cc | 96 ++++++++++++++++++++++--------------------- src/model_instance.h | 61 ++++++++------------------- 2 files changed, 65 insertions(+), 92 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index 846d44c..dad07a0 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -283,6 +283,7 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) if (first_inference_) { allocation_.tensors.resize(descriptor.tensors.size()); + tp_response_msg_.tensors.resize(interpreter_->outputs().size()); } // Get model inputs from request and ready the buffers (Allocation obj) to @@ -329,54 +330,55 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) if (interpreter_->AllocateTensors() != kTfLiteOk) { success = false; } - } - - // Assign Cpu buffers to read incoming tensor bytes into after allocate - // tensors is called - for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { - allocation_.tensors[i].buffer = tensorpipe::CpuBuffer{ - .ptr = interpreter_->tensor(std::stoi(descriptor.tensors[i].metadata)) - ->data.raw}; - } - - pipe_->read(allocation_, [this, &success](const tensorpipe::Error& error) { - success = !error; - - // At this point our input tensors should be written to by the read - // function, now we invoke the interpreter and read the output - if (interpreter_->Invoke() != kTfLiteOk) { - success = false; + // Assign Cpu buffers to read incoming tensor bytes into after allocate + // tensors is called + for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { + allocation_.tensors[i].buffer = tensorpipe::CpuBuffer{ + .ptr = interpreter_->tensor(std::stoi(descriptor.tensors[i].metadata)) + ->data.raw}; } + } - first_inference_ = false; - - // Write output back to client - tensorpipe::Message tp_msg; + pipe_->read( + allocation_, + [this, &success, &allocate_tensors](const tensorpipe::Error& error) { + success = !error; + + // At this point our input tensors should be written to by the read + // function, now we invoke the interpreter and read the output + if (interpreter_->Invoke() != kTfLiteOk) { + success = false; + } - if (!success) { - tp_msg.metadata = "f"; - } else { - for (uint64_t i = 0; i < interpreter_->outputs().size(); ++i) { - int output_index = interpreter_->outputs()[i]; - TfLiteTensor* output_tensor = interpreter_->tensor(output_index); - tensorpipe::Message::Tensor tensor; - // We use the output tensor name as the metadata in the request - tensor.metadata = std::string(output_tensor->name); - tensor.length = output_tensor->bytes; - tensor.buffer = tensorpipe::CpuBuffer{.ptr = output_tensor->data.raw}; - tp_msg.tensors.push_back(tensor); - } - } - pipe_->write(tp_msg, [](const tensorpipe::Error& error) { - if (error) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - ("Failed to send inference response to client. 
Details:" + - error.what()) - .c_str()); - } - }); - // Arm for getting more data - ReceiveFromPipe(); - }); + first_inference_ = false; + + // Write output back to client + if (!success) { + tp_response_msg_.metadata = "f"; + } else if (allocate_tensors) { + // If we (re)allocated tensors then we need to update response message + for (uint64_t i = 0; i < interpreter_->outputs().size(); ++i) { + int output_index = interpreter_->outputs()[i]; + TfLiteTensor* output_tensor = interpreter_->tensor(output_index); + tensorpipe::Message::Tensor tensor; + // We use the output tensor name as the metadata in the request + tensor.metadata = std::string(output_tensor->name); + tensor.length = output_tensor->bytes; + tensor.buffer = + tensorpipe::CpuBuffer{.ptr = output_tensor->data.raw}; + tp_response_msg_.tensors[i] = tensor; + } + } + pipe_->write(tp_response_msg_, [](const tensorpipe::Error& error) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + ("Failed to send inference response to client. Details:" + + error.what()) + .c_str()); + } + }); + // Arm for getting more data + ReceiveFromPipe(); + }); } diff --git a/src/model_instance.h b/src/model_instance.h index 18bf8d8..28a491e 100644 --- a/src/model_instance.h +++ b/src/model_instance.h @@ -16,16 +16,9 @@ #include "papi_profiler.h" #endif // PAPI_PROFILING_ENABLE -/*! - * \brief ModelInstance for backend end execution of model. - * - * Tensorpipe Receiver is the communicator implemented by tcp. - */ +// ModelInstance for backend end execution of model class ModelInstance { public: - /*! - * \brief Receiver constructor - */ ModelInstance() { context_ = std::make_shared(); @@ -36,67 +29,42 @@ class ModelInstance { context_->registerChannel(0 /* low priority */, "cma", cmaChannel); } - /*! - * \brief ModelInstance destructor - */ ~ModelInstance() { Finalize(); } - /*! - * \brief Start server - * \param addr Networking address, e.g., 'tcp://127.0.0.1:50051' - */ + // Start model instance and attempt to connect to passed address void Start(const std::string& addr); - /*! - * \brief Finalize ModelInstance - * - * Finalize() is not thread-safe and only one thread can invoke this API. - */ + // Cleanup void Finalize(); - /*! - * \brief Issue a receive request pipe - */ + // Issue a receive request pipe void ReceiveFromPipe(); private: - /*! - * \brief Callback for new connection is accepted. - */ + // Callback for new connection is accepted. void OnAccepted(const tensorpipe::Error&, std::shared_ptr); - /*! - * \brief Callback for loading a tflite model. - */ + // Callback for loading a tflite model. void LoadModelFromPipe(tensorpipe::Descriptor descriptor); + // Builds the tflite interpreter based on passed descriptor TfLiteStatus BuildInterpreter(tensorpipe::Descriptor descriptor); void LogDelegation(const std::string& delegate_name); - /*! - * \brief Callback for inferencing on a loaded tflite model. - */ + // Callback for inferencing on a loaded tflite model. void Infer(tensorpipe::Descriptor& descriptor); - /*! - * \brief global context of tensorpipe - */ + // Global tensorpipe context std::shared_ptr context_; - /*! - * \brief pipe for client connection - */ + // Pipe for client connection std::shared_ptr pipe_; - /*! - * \brief tflite interpreter - */ + // Tflite interpreter std::unique_ptr interpreter_; - /*! 
- * \brief tflite model - */ + // Tflite model std::unique_ptr model_; // Unique model instance name @@ -105,9 +73,12 @@ class ModelInstance { // State variable to register whether inference has been called at least once bool first_inference_ = true; - // Tensorpipe allocation that we can reuse + // Tensorpipe allocation that we can reuse to write inputs into tensorpipe::Allocation allocation_; + // Tensorpipe response message we can reuse to write outputs into + tensorpipe::Message tp_response_msg_; + #ifdef PAPI_PROFILING_ENABLE std::unique_ptr papi_profiler_ = MaybeCreatePapiProfiler(); #endif // PAPI_PROFILING_ENABLE From c7c5e59dd177ca8b7e360c20e713ac72709f347c Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 6 Jun 2023 13:34:27 -0500 Subject: [PATCH 11/33] Fix bug in model instance tensor allocation Signed-off-by: Josh Minor --- CMakeLists.txt | 2 ++ src/model_instance.cc | 25 ++++++++++++++++++------- src/model_instance_utils.h | 17 ++++++++++++++++- src/tflite.cc | 37 +++++++++++++++++++++++-------------- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 105fda6..793bcb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +set(CMAKE_CXX_STANDARD 17) + set(TARGET_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) # Triton Options diff --git a/src/model_instance.cc b/src/model_instance.cc index dad07a0..1712d0e 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -219,21 +219,32 @@ ModelInstance::ReceiveFromPipe() const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) { if (error) { - if (error.isOfType()) { + if (error.isOfType()) { // Expected. + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Remote side hungup: ") + error.what()).c_str()); + exit(0); } else { - // Error may happen when the pipe is closed LOG_MESSAGE( TRITONSERVER_LOG_ERROR, (std::string("Unexpected error when reading from accepted pipe: ") + error.what()) .c_str()); - exit(1); } + exit(1); } if (descriptor.metadata == "model_load") { + for (auto pid : CurrentThreadIds()) { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + ("Thread id: " + std::to_string(pid)).c_str()); + } LOG_MESSAGE(TRITONSERVER_LOG_INFO, "Loading model"); LoadModelFromPipe(descriptor); + // int num_threads; + // PAPI_list_threads(NULL, &num_threads); + } else if (descriptor.metadata == "model_input") { Infer(descriptor); } @@ -278,7 +289,7 @@ ModelInstance::LoadModelFromPipe(tensorpipe::Descriptor descriptor) void ModelInstance::Infer(tensorpipe::Descriptor& descriptor) { - bool allocate_tensors = false; + bool allocate_tensors = first_inference_; bool success = true; if (first_inference_) { @@ -341,17 +352,17 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) pipe_->read( allocation_, - [this, &success, &allocate_tensors](const tensorpipe::Error& error) { + [this, &success, allocate_tensors](const tensorpipe::Error& error) { success = !error; // At this point our input tensors should be written to by the read // function, now we invoke the interpreter and read the output if (interpreter_->Invoke() != kTfLiteOk) { success = false; + } else { + first_inference_ = false; } - first_inference_ = false; - // Write output back to client if (!success) { tp_response_msg_.metadata = "f"; diff --git a/src/model_instance_utils.h b/src/model_instance_utils.h index 3074a63..ff1e159 100644 --- a/src/model_instance_utils.h +++ b/src/model_instance_utils.h @@ -3,6 +3,9 @@ // SPDX-License-Identifier: MIT // 
+#include +#include + #ifdef PAPI_PROFILING_ENABLE #include "papi.h" @@ -22,4 +25,16 @@ PAPIEventValid(std::string& event_name) } return valid; } -#endif // PAPI_PROFILING_ENABLE \ No newline at end of file +#endif // PAPI_PROFILING_ENABLE + +std::vector +CurrentThreadIds() +{ + std::vector r; + for (auto& p : std::filesystem::directory_iterator("/proc/self/task")) { + if (p.is_directory()) { + r.push_back(std::stoi(p.path().filename().string())); + } + } + return r; +} \ No newline at end of file diff --git a/src/tflite.cc b/src/tflite.cc index 6b594fa..f8fb1e3 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -1175,9 +1175,11 @@ ModelInstanceState::ProcessRequests( input_memories.clear(); if (!all_response_failed) { - ReadOutputTensors( - total_batch_size, requests, request_count, &responses, - inference_output); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ReadOutputTensors( + total_batch_size, requests, request_count, &responses, + inference_output)); } uint64_t exec_end_ns = 0; @@ -1332,7 +1334,7 @@ ModelInstanceState::Execute( done->set_value(false); return; } - // Read a response from the server with description of incoming + // Read a response from the client with description of incoming // result tensors so we can get ready to write the data pipe_->readDescriptor([this, &inference_output, &done]( const tensorpipe::Error& error, @@ -1373,7 +1375,7 @@ ModelInstanceState::Execute( inference_output[descriptor.tensors[i].metadata].data())}; } - // Read the data from the server response into the tensor + // Read the data from the client response into the tensor // buffer assigned above pipe_->read(allocation, [&done](const tensorpipe::Error& error) { if (error) { @@ -1409,15 +1411,22 @@ ModelInstanceState::ReadOutputTensors( requests, request_count, responses, model_state_->MaxBatchSize(), model_state_->TritonMemoryManager(), false, nullptr); - for (const auto& map_entry : model_state_->output_index_map_) { - std::string output_name = map_entry.first; - std::vector output_shape = - model_state_->output_shape_map_[output_name]; - output_shape[0] = total_batch_size; - - responder.ProcessTensor( - output_name, model_state_->output_dtype_map_[output_name], output_shape, - inference_output.at(output_name).data(), TRITONSERVER_MEMORY_CPU, 0); + // Respond to each output individually + try { + for (const auto& map_entry : model_state_->output_index_map_) { + const std::string& output_name = map_entry.first; + model_state_->output_shape_map_[output_name][0] = total_batch_size; + + responder.ProcessTensor( + output_name, model_state_->output_dtype_map_[output_name], + model_state_->output_shape_map_[output_name], + inference_output.at(output_name).data(), TRITONSERVER_MEMORY_CPU, 0); + } + } + catch (std::out_of_range& err) { + responder.Finalize(); + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, "Failed to process output tensor"); } // Finalize and wait for any pending buffer copies. 
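The next patch ([PATCH 12/33]) moves op-level profiling from the PAPI high-level region API to the low-level event-set API so counters can be attached to individual inference threads. A minimal sketch of that low-level pattern, not taken from the backend itself (the preset event name is borrowed from the README example, attaching to another thread is shown only as a comment, and error handling is trimmed):

```
#include <cstdio>
#include <pthread.h>
#include <papi.h>

int main() {
  // One-time library and threading init, as the model_instance main() also does.
  if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) return 1;
  if (PAPI_thread_init(pthread_self) != PAPI_OK) return 1;

  int event_set = PAPI_NULL;
  PAPI_create_eventset(&event_set);
  PAPI_add_named_event(event_set, "PAPI_TOT_CYC");
  // PAPI_attach(event_set, tid);  // optionally bind the event set to another thread's kernel tid

  PAPI_start(event_set);
  // ... region of interest, e.g. a single tflite operator invocation ...
  long long value = 0;
  PAPI_read(event_set, &value);  // sample without stopping
  PAPI_reset(event_set);         // zero the counter before the next region
  std::printf("PAPI_TOT_CYC: %lld\n", value);

  PAPI_stop(event_set, &value);
  PAPI_cleanup_eventset(event_set);
  PAPI_destroy_eventset(&event_set);
  return 0;
}
```

The PapiProfiler in the patches that follow applies the same create/add/attach/start sequence once per inference thread and reads the counters in EndEvent.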
From 20eeff4710bbbcbbf4afa7f4376ec9e6250a12c5 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 7 Jun 2023 14:12:09 -0500 Subject: [PATCH 12/33] Use papi low level api Signed-off-by: Josh Minor --- src/model_instance.cc | 27 ++++----- src/model_instance.h | 2 +- src/model_instance_main.cc | 4 +- src/model_instance_utils.h | 18 +++++- src/papi_profiler.cc | 111 ++++++++++++++++++++++++++++++++----- src/tflite.cc | 1 - 6 files changed, 127 insertions(+), 36 deletions(-) diff --git a/src/model_instance.cc b/src/model_instance.cc index 1712d0e..44890fa 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc @@ -160,10 +160,6 @@ ModelInstance::BuildInterpreter(tensorpipe::Descriptor descriptor) LOG_MESSAGE(TRITONSERVER_LOG_INFO, "No delegates used for model execution"); } -#ifdef PAPI_PROFILING_ENABLE - interpreter_->AddProfiler(papi_profiler_.get()); -#endif // PAPI_PROFILING_ENABLE - return kTfLiteOk; } @@ -219,12 +215,12 @@ ModelInstance::ReceiveFromPipe() const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) { if (error) { - if (error.isOfType()) { + if (error.isOfType()) { // Expected. LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Remote side hungup: ") + error.what()).c_str()); - exit(0); + return; } else { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, @@ -235,16 +231,7 @@ ModelInstance::ReceiveFromPipe() exit(1); } if (descriptor.metadata == "model_load") { - for (auto pid : CurrentThreadIds()) { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - ("Thread id: " + std::to_string(pid)).c_str()); - } - LOG_MESSAGE(TRITONSERVER_LOG_INFO, "Loading model"); LoadModelFromPipe(descriptor); - // int num_threads; - // PAPI_list_threads(NULL, &num_threads); - } else if (descriptor.metadata == "model_input") { Infer(descriptor); } @@ -360,9 +347,17 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) if (interpreter_->Invoke() != kTfLiteOk) { success = false; } else { - first_inference_ = false; +#ifdef PAPI_PROFILING_ENABLE + // After the first inference, all threads should be alive to profile + if (first_inference_) { + papi_profiler_ = MaybeCreatePapiProfiler(); + interpreter_->AddProfiler(papi_profiler_.get()); + } +#endif // PAPI_PROFILING_ENABLE } + first_inference_ = false; + // Write output back to client if (!success) { tp_response_msg_.metadata = "f"; diff --git a/src/model_instance.h b/src/model_instance.h index 28a491e..656f7ef 100644 --- a/src/model_instance.h +++ b/src/model_instance.h @@ -80,6 +80,6 @@ class ModelInstance { tensorpipe::Message tp_response_msg_; #ifdef PAPI_PROFILING_ENABLE - std::unique_ptr papi_profiler_ = MaybeCreatePapiProfiler(); + std::unique_ptr papi_profiler_; #endif // PAPI_PROFILING_ENABLE }; \ No newline at end of file diff --git a/src/model_instance_main.cc b/src/model_instance_main.cc index 1fdd9c0..eb02de3 100644 --- a/src/model_instance_main.cc +++ b/src/model_instance_main.cc @@ -73,13 +73,13 @@ main(int argc, char* argv[]) model_instance.Start(std::string(addr)); LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, + TRITONSERVER_LOG_INFO, "Model instance waiting for SIGTERM or SIGINT ([CTRL]+[c])..."); // wait for signal handler to complete int signal = ft_signal_handler.get(); LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, + TRITONSERVER_LOG_INFO, (std::string("Received signal: ") + std::to_string(signal)).c_str()); return 0; diff --git a/src/model_instance_utils.h b/src/model_instance_utils.h index ff1e159..3ab1b06 100644 --- a/src/model_instance_utils.h +++ b/src/model_instance_utils.h @@ -3,13 +3,18 @@ // SPDX-License-Identifier: MIT // +#pragma 
once + #include #include +// Triton backend headers +#include "triton/backend/backend_common.h" + #ifdef PAPI_PROFILING_ENABLE #include "papi.h" -bool +inline bool PAPIEventValid(std::string& event_name) { int event_set = PAPI_NULL; @@ -27,7 +32,7 @@ PAPIEventValid(std::string& event_name) } #endif // PAPI_PROFILING_ENABLE -std::vector +inline std::vector CurrentThreadIds() { std::vector r; @@ -37,4 +42,13 @@ CurrentThreadIds() } } return r; +} + +inline void +LogThreads() +{ + for (auto pid : CurrentThreadIds()) { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, ("Thread id: " + std::to_string(pid)).c_str()); + } } \ No newline at end of file diff --git a/src/papi_profiler.cc b/src/papi_profiler.cc index fe98be9..24a8a4e 100644 --- a/src/papi_profiler.cc +++ b/src/papi_profiler.cc @@ -10,35 +10,77 @@ #include #include +#include #include #include // Triton backend headers +#include "model_instance_utils.h" #include "papi.h" #include "triton/backend/backend_common.h" constexpr uint32_t kInvalidEventHandle = static_cast(~0) - 1; void -handle_error(int retval) +handle_error(int retval, int line, const std::string& file) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, - ("PAPI error: " + std::to_string(retval) + ", " + PAPI_strerror(retval)) + ("PAPI error at line " + file + ":" + std::to_string(line) + " " + + std::to_string(retval) + ", " + PAPI_strerror(retval)) .c_str()); exit(1); } class PapiProfiler : public tflite::Profiler { public: - PapiProfiler() + PapiProfiler(const std::vector& papi_events) : supported_event_types_( static_cast(EventType::DELEGATE_OPERATOR_INVOKE_EVENT) + - static_cast(EventType::OPERATOR_INVOKE_EVENT)) + static_cast(EventType::OPERATOR_INVOKE_EVENT)), + papi_events_(papi_events) { + int retval; + // The first 3 threads for the model instance don't do anything for + // inference, so we aren't interested in them + for (uint64_t i = 3; i < current_thread_ids_.size(); ++i) { + event_sets_.push_back(PAPI_NULL); + retval = PAPI_create_eventset(&event_sets_.back()); + if (retval != PAPI_OK) { + handle_error(retval, __LINE__, __FILE__); + } + for (auto& event_name : papi_events_) { + retval = PAPI_add_named_event(event_sets_.back(), event_name.c_str()); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + } + + // Attach event to thread + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + ("Attaching to " + std::to_string(current_thread_ids_[i])).c_str()); + retval = PAPI_attach(event_sets_.back(), current_thread_ids_[i]); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + } + event_values_.resize(current_thread_ids_.size() - 3); } - ~PapiProfiler() { PAPI_hl_stop(); } + ~PapiProfiler() + { + // Save results to file + for (auto& event : results_) { + LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("Operation " + event.first).c_str()); + for (auto& value : event.second) { + LOG_MESSAGE(TRITONSERVER_LOG_INFO, std::to_string(value).c_str()); + } + } + + for (auto& event_set : event_sets_) { + PAPI_cleanup_eventset(event_set); + PAPI_destroy_eventset(&event_set); + } + } // This function wants to return a handle to the profile event, which seems to // be a unique value. 
Because we are interested in the op names, we just has @@ -57,10 +99,24 @@ class PapiProfiler : public tflite::Profiler { std::string trace_event_tag = tag; trace_event_tag += ("_" + std::to_string(event_metadata1)); - // Begin tracking counters - int retval = PAPI_hl_region_begin(trace_event_tag.c_str()); - if (retval != PAPI_OK) - handle_error(retval); + int retval; + // For the event set attached to each thread, start or restart the event set + for (uint64_t i = 0; i < event_sets_.size(); ++i) { + int state; + PAPI_state(event_sets_[i], &state); + if (!(state & PAPI_RUNNING)) { + // Begin tracking counters + retval = PAPI_start(event_sets_[i]); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + + } else { + // Reset counters + retval = PAPI_reset(event_sets_[i]); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + } + } uint32_t event_handle = event_index_++; papi_regions_[event_handle] = trace_event_tag; @@ -72,9 +128,14 @@ class PapiProfiler : public tflite::Profiler { if (event_handle == kInvalidEventHandle) { return; } - int retval = PAPI_hl_region_end(papi_regions_[event_handle].c_str()); - if (retval != PAPI_OK) - handle_error(retval); + + int retval; + for (uint64_t i = 0; i < event_sets_.size(); ++i) { + retval = PAPI_read(event_sets_[i], &event_values_[i]); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + results_[papi_regions_[event_handle]].push_back(event_values_[i]); + } } protected: @@ -87,16 +148,38 @@ class PapiProfiler : public tflite::Profiler { uint32_t event_index_ = 0; std::unordered_map papi_regions_; const uint64_t supported_event_types_; + std::vector papi_events_; + std::vector event_sets_; + std::vector current_thread_ids_ = CurrentThreadIds(); + std::vector event_values_; + std::unordered_map> results_; }; std::unique_ptr MaybeCreatePapiProfiler() { - if (getenv("PAPI_EVENTS") == NULL) { + char* papi_events = getenv("PAPI_EVENTS"); + std::vector papi_events_vec; + if (papi_events == NULL) { LOG_MESSAGE( TRITONSERVER_LOG_WARN, "PAPI_EVENTS not specified, op level profiling disabled!"); return nullptr; + } else { + // Parse out all papi events indivdually + std::stringstream ss(papi_events); + while (ss.good()) { + std::string substr; + std::getline(ss, substr, ','); + if (!PAPIEventValid(substr)) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("Event: " + substr + " invalid, op level profiling disabled!") + .c_str()); + return nullptr; + } + papi_events_vec.push_back(substr); + } } - return std::unique_ptr(new PapiProfiler()); + return std::unique_ptr(new PapiProfiler(papi_events_vec)); } diff --git a/src/tflite.cc b/src/tflite.cc index f8fb1e3..1775841 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -47,7 +47,6 @@ #include "tensorflow/lite/type_to_tflitetype.h" // Reproc headers -#include "reproc++/drain.hpp" #include "reproc++/reproc.hpp" // From 35fb9540a5a9546dce21fcbec8eea28ed99accac Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Thu, 8 Jun 2023 08:48:47 -0500 Subject: [PATCH 13/33] Fix csv generation script Signed-off-by: Josh Minor --- .gitignore | 1 + src/papi_profiler.cc | 44 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index f772ec4..552674b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ /**/triton_qa_models /**/armnn_tflite_backend_triton_model_repo.tar.gz **/papi_hl_output* +*.csv diff --git a/src/papi_profiler.cc b/src/papi_profiler.cc index 24a8a4e..926d39a 100644 --- 
a/src/papi_profiler.cc +++ b/src/papi_profiler.cc @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -40,10 +42,16 @@ class PapiProfiler : public tflite::Profiler { static_cast(EventType::OPERATOR_INVOKE_EVENT)), papi_events_(papi_events) { + // We only care about the 4th thread in the process on, as these are used + // for inference + std::vector current_threads = CurrentThreadIds(); + inf_thread_ids_ = + std::vector(current_threads.begin() + 3, current_threads.end()); + int retval; // The first 3 threads for the model instance don't do anything for // inference, so we aren't interested in them - for (uint64_t i = 3; i < current_thread_ids_.size(); ++i) { + for (uint64_t i = 0; i < inf_thread_ids_.size(); ++i) { event_sets_.push_back(PAPI_NULL); retval = PAPI_create_eventset(&event_sets_.back()); if (retval != PAPI_OK) { @@ -58,23 +66,31 @@ class PapiProfiler : public tflite::Profiler { // Attach event to thread LOG_MESSAGE( TRITONSERVER_LOG_INFO, - ("Attaching to " + std::to_string(current_thread_ids_[i])).c_str()); - retval = PAPI_attach(event_sets_.back(), current_thread_ids_[i]); + ("Attaching to " + std::to_string(inf_thread_ids_[i])).c_str()); + retval = PAPI_attach(event_sets_.back(), inf_thread_ids_[i]); if (retval != PAPI_OK) handle_error(retval, __LINE__, __FILE__); } - event_values_.resize(current_thread_ids_.size() - 3); + event_values_.resize(papi_events_.size()); } ~PapiProfiler() { // Save results to file + std::ofstream myfile; + myfile.open("counters.csv"); + // Header + myfile << "op_id,thread_id,papi_event,value\n"; + // Iterate over map keyed on tflite operation id for (auto& event : results_) { - LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("Operation " + event.first).c_str()); - for (auto& value : event.second) { - LOG_MESSAGE(TRITONSERVER_LOG_INFO, std::to_string(value).c_str()); + for (uint64_t i = 0; i < event.second.size(); ++i) { + myfile << event.first << "," + << inf_thread_ids_[i / papi_events_.size() % event_sets_.size()] + << "," << papi_events_[i % papi_events_.size()] << "," + << event.second[i] << "\n"; } } + myfile.close(); for (auto& event_set : event_sets_) { PAPI_cleanup_eventset(event_set); @@ -130,11 +146,15 @@ class PapiProfiler : public tflite::Profiler { } int retval; + // For each thread we are profiling for (uint64_t i = 0; i < event_sets_.size(); ++i) { - retval = PAPI_read(event_sets_[i], &event_values_[i]); + retval = PAPI_read(event_sets_[i], event_values_.data()); if (retval != PAPI_OK) handle_error(retval, __LINE__, __FILE__); - results_[papi_regions_[event_handle]].push_back(event_values_[i]); + // For each of the events we collected a counter value for + for (auto val : event_values_) { + results_[papi_regions_[event_handle]].push_back(val); + } } } @@ -150,7 +170,11 @@ class PapiProfiler : public tflite::Profiler { const uint64_t supported_event_types_; std::vector papi_events_; std::vector event_sets_; - std::vector current_thread_ids_ = CurrentThreadIds(); + + // We only care about the 4th thread in the process on, as these are used for + // inference + std::vector inf_thread_ids_; + std::vector event_values_; std::unordered_map> results_; }; From 06ec744b0b7475a85f457b21ee02bb01dd3d29f6 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Thu, 8 Jun 2023 09:04:15 -0500 Subject: [PATCH 14/33] Append utc time to csv file name Signed-off-by: Josh Minor --- src/papi_profiler.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/papi_profiler.cc b/src/papi_profiler.cc index 926d39a..bae6295 
100644 --- a/src/papi_profiler.cc +++ b/src/papi_profiler.cc
@@ -9,6 +9,7 @@
 #include 
 #include 
+#include <chrono>
 #include 
 #include 
 #include 
@@ -78,7 +79,11 @@ class PapiProfiler : public tflite::Profiler {
   {
     // Save results to file
     std::ofstream myfile;
-    myfile.open("counters.csv");
+    auto now = std::chrono::system_clock::now();
+    auto utc =
+        std::chrono::duration_cast(now.time_since_epoch())
+            .count();
+    myfile.open(("counters_" + std::to_string(utc) + ".csv").c_str());
     // Header
     myfile << "op_id,thread_id,papi_event,value\n";
     // Iterate over map keyed on tflite operation id

From 0c87cbb8668f2b48c0bce6dcc24a814b37e59128 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Thu, 8 Jun 2023 09:33:18 -0500 Subject: [PATCH 15/33] Wait for model load message before claiming model is ready Signed-off-by: Josh Minor ---
 src/model_instance.cc      | 14 +++++++++++++-
 src/model_instance_main.cc |  4 +---
 src/tflite.cc              | 39 +++++++++++++++++++++++++++++++++-----
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/src/model_instance.cc b/src/model_instance.cc index 44890fa..184e8ba 100644 --- a/src/model_instance.cc +++ b/src/model_instance.cc
@@ -266,7 +266,19 @@ ModelInstance::LoadModelFromPipe(tensorpipe::Descriptor descriptor)
       descriptor.payloads[OptimizerOption::COUNT].length);
 
   // Initialize the interpreter after loading the flatbuffers model
-  BuildInterpreter(descriptor);
+  TfLiteStatus status = BuildInterpreter(descriptor);
+
+  tensorpipe::Message tp_msg;
+  tp_msg.metadata = status == kTfLiteOk ? "success" : "fail";
+
+  pipe_->write(tp_msg, [](const tensorpipe::Error& error) {
+    if (error) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          ("Failed to send model load ack: " + error.what()).c_str());
+      return;
+    }
+  });
 
   // Arm for getting more data
   ReceiveFromPipe();
diff --git a/src/model_instance_main.cc b/src/model_instance_main.cc index eb02de3..3991785 100644 --- a/src/model_instance_main.cc +++ b/src/model_instance_main.cc
@@ -72,9 +72,7 @@ main(int argc, char* argv[])
   // Will connect to the address provided as the first argument in the list
   model_instance.Start(std::string(addr));
 
-  LOG_MESSAGE(
-      TRITONSERVER_LOG_INFO,
-      "Model instance waiting for SIGTERM or SIGINT ([CTRL]+[c])...");
+  LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Model instance running...");
 
   // wait for signal handler to complete
   int signal = ft_signal_handler.get();
diff --git a/src/tflite.cc b/src/tflite.cc index 1775841..044ce53 100644 --- a/src/tflite.cc +++ b/src/tflite.cc
@@ -995,12 +995,41 @@ ModelInstanceState::SendModel()
 
   // Write the message
   auto done = std::make_shared<std::promise<bool>>();
-  pipe_->write(tp_msg, [done](const tensorpipe::Error& error) {
-    done->set_value(!error);
+  pipe_->write(tp_msg, [this, done](const tensorpipe::Error& error) {
+    // We now listen for a message to come back indicating the model load was
+    // successful
+    if (error) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          ("Failed to send model load message. 
Details:" + error.what()).c_str()); + done->set_value(false); + return; + } + pipe_->readDescriptor([this, done]( + const tensorpipe::Error& error, + tensorpipe::Descriptor descriptor) { + if (error) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Unexpected error when reading from accepted pipe: ") + + error.what()) + .c_str()); + done->set_value(false); + return; + } + tensorpipe::Allocation allocation; + pipe_->read( + allocation, [descriptor, done](const tensorpipe::Error& error) { + done->set_value(descriptor.metadata == "success"); + }); + }); }); - RETURN_ERROR_IF_FALSE( - done->get_future().get(), TRITONSERVER_ERROR_INTERNAL, - std::string("Failed to send model load message")); + RETURN_ERROR_IF_TRUE( + done->get_future().wait_for(std::chrono::seconds(30)) == + std::future_status::timeout, + TRITONSERVER_ERROR_INTERNAL, + std::string("Model instance failed: process did not send model load " + "acknowledgement")); return nullptr; } From 53acc2645d6e55b685425d2f8c1e85b49eb997b4 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Thu, 8 Jun 2023 10:21:34 -0500 Subject: [PATCH 16/33] Don't worry about papi hl Signed-off-by: Josh Minor --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 552674b..1346996 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,5 @@ /.devcontainer /**/triton_qa_models /**/armnn_tflite_backend_triton_model_repo.tar.gz -**/papi_hl_output* *.csv From 8b0e28e4212d4625b412acaf50c4b8d93de510fd Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Fri, 9 Jun 2023 15:16:22 -0500 Subject: [PATCH 17/33] Add support for model NUMA policies Signed-off-by: Josh Minor --- CMakeLists.txt | 51 ++++++- cmake/FindNuma.cmake | 43 ++++++ src/config.h | 60 ++++++++- src/{ => model_instance}/model_instance.cc | 114 +++++++++++++++- src/{ => model_instance}/model_instance.h | 28 ++++ .../model_instance_main.cc | 0 .../model_instance_utils.h | 10 ++ src/{ => model_instance}/papi_profiler.cc | 0 src/{ => model_instance}/papi_profiler.h | 0 src/tflite.cc | 126 +++++++++++++++++- 10 files changed, 420 insertions(+), 12 deletions(-) create mode 100644 cmake/FindNuma.cmake rename src/{ => model_instance}/model_instance.cc (81%) rename src/{ => model_instance}/model_instance.h (79%) rename src/{ => model_instance}/model_instance_main.cc (100%) rename src/{ => model_instance}/model_instance_utils.h (83%) rename src/{ => model_instance}/papi_profiler.cc (100%) rename src/{ => model_instance}/papi_profiler.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 793bcb4..5fc325e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,8 @@ endif() set(CMAKE_CXX_STANDARD 17) +SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}") + set(TARGET_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) # Triton Options @@ -123,6 +125,9 @@ endif() # Enable REPROC++ set(REPROC++ ON) +# Numa +option(LIBNUMA_ENABLE "Enable libnuma usage" OFF) + # # Dependencies # @@ -358,12 +363,12 @@ endif() # set(MODEL_INSTANCE_SRCS - src/model_instance_main.cc - src/model_instance.cc - src/model_instance.h - src/model_instance_utils.h) + src/model_instance/model_instance_main.cc + src/model_instance/model_instance.cc + src/model_instance/model_instance.h + src/model_instance/model_instance_utils.h) if(PAPI_PROFILING_ENABLE) - list(APPEND MODEL_INSTANCE_SRCS src/papi_profiler.cc) + list(APPEND MODEL_INSTANCE_SRCS src/model_instance/papi_profiler.cc) endif() add_executable(model_instance ${MODEL_INSTANCE_SRCS}) @@ -373,6 +378,28 @@ set(MODEL_INSTANCE_LINK_LIBS 
triton-core-serverstub triton-backend-utils) +# Handle discovery of libnuma +if(LIBNUMA_ENABLE) + find_package(Numa) + if(NUMA_FOUND) + # Here we just make numa available to all of our targets + link_directories(${NUMA_LIBRARY_DIR}) + list(APPEND CMAKE_REQUIRED_LIBRARIES numa) + list(APPEND CMAKE_REQUIRED_INCLUDES ${NUMA_INCLUDE_DIR}) + list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-L${NUMA_LIBRARY_DIR}") + check_symbol_exists(numa_node_of_cpu "numa.h" NUMA_V2) + if(NUMA_V2) + add_definitions(-DHAVE_LIBNUMA) + message(STATUS "libnuma found, building with support for NUMA nodes") + list(APPEND MODEL_INSTANCE_LINK_LIBS numa) + include_directories(SYSTEM ${NUMA_INCLUDE_DIR}) + else() + message(FATAL_ERROR "libnuma not found, but was requested via option LIBNUMA_ENABLE") + endif() + endif() + mark_as_advanced(NUMA_FOUND) +endif(LIBNUMA_ENABLE) + set(MODEL_INSTANCE_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src ${TENSORFLOW_ROOT} # for tflite headers @@ -405,6 +432,13 @@ if(PAPI_PROFILING_ENABLE) target_link_libraries(model_instance PRIVATE ${CMAKE_BINARY_DIR}/papi-prefix/lib/libpapi.so) endif() +if(LIBNUMA_ENABLE) + target_compile_definitions( + model_instance + PRIVATE LIBNUMA_ENABLE=1 + ) +endif() + if(TFLITE_BAZEL_BUILD) list(APPEND MODEL_INSTANCE_INCLUDE_DIRS ${TENSORFLOW_ROOT}/bazel-tensorflow-lite/external/flatbuffers/include) @@ -493,6 +527,13 @@ if(PAPI_PROFILING_ENABLE) ) endif() +if(LIBNUMA_ENABLE) + target_compile_definitions( + triton-armnn-tflite-backend + PRIVATE LIBNUMA_ENABLE=1 + ) +endif() + target_include_directories(triton-armnn-tflite-backend PRIVATE ${BACKEND_INCLUDE_DIRS}) diff --git a/cmake/FindNuma.cmake b/cmake/FindNuma.cmake new file mode 100644 index 0000000..94b23c8 --- /dev/null +++ b/cmake/FindNuma.cmake @@ -0,0 +1,43 @@ +# Module for locating libnuma +# +# Read-only variables: +# NUMA_FOUND +# Indicates that the library has been found. +# +# NUMA_INCLUDE_DIR +# Points to the libnuma include directory. +# +# NUMA_LIBRARY_DIR +# Points to the directory that contains the libraries. +# The content of this variable can be passed to link_directories. +# +# NUMA_LIBRARY +# Points to the libnuma that can be passed to target_link_libararies. 
+# +# Copyright (c) 2013-2020 MulticoreWare, Inc + +include(FindPackageHandleStandardArgs) + +find_path(NUMA_ROOT_DIR + NAMES include/numa.h + PATHS ENV NUMA_ROOT + DOC "NUMA root directory") + +find_path(NUMA_INCLUDE_DIR + NAMES numa.h + HINTS ${NUMA_ROOT_DIR} + PATH_SUFFIXES include + DOC "NUMA include directory") + +find_library(NUMA_LIBRARY + NAMES numa + HINTS ${NUMA_ROOT_DIR} + DOC "NUMA library") + +if (NUMA_LIBRARY) + get_filename_component(NUMA_LIBRARY_DIR ${NUMA_LIBRARY} PATH) +endif() + +mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARY_DIR NUMA_LIBRARY) + +find_package_handle_standard_args(NUMA REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY) \ No newline at end of file diff --git a/src/config.h b/src/config.h index fcb243a..225ebd6 100644 --- a/src/config.h +++ b/src/config.h @@ -5,12 +5,18 @@ #pragma once +#include +#include + // This class is used to map an optimizer option to an index in an array so // options can be sent across a tensorpipe payload enum OptimizerOption { TFLITE_NUM_THREADS, XNNPACK_ENABLE, XNNPACK_CPU_NUM_THREADS, + NUMA_ALLOC_POLICY, + NUMA_LOCAL_NODE_ID, + NUMA_REMOTE_NODE_ID, #ifdef ARMNN_DELEGATE_ENABLE ARMNN_CPU_ENABLE, @@ -25,4 +31,56 @@ enum OptimizerOption { #endif // ARMNN_DELEGATE_ENABLE COUNT // Just used to track the number of options -}; \ No newline at end of file +}; + +enum class AllocationPolicy { + LOCAL, + WEIGHT_REMOTE_RESULT_LOCAL, + WEIGHT_LOCAL_RESULT_REMOTE, + REMOTE, + NONE +}; + +inline AllocationPolicy +AllocationPolicyFromString(std::string str) +{ + // Convert copy of string to uppercase + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + + if (str == "LOCAL") { + return AllocationPolicy::LOCAL; + } else if (str == "WEIGHT_REMOTE_RESULT_LOCAL") { + return AllocationPolicy::WEIGHT_REMOTE_RESULT_LOCAL; + } else if (str == "WEIGHT_LOCAL_RESULT_REMOTE") { + return AllocationPolicy::WEIGHT_LOCAL_RESULT_REMOTE; + } else if (str == "REMOTE") { + return AllocationPolicy::REMOTE; + } else if (str == "NONE") { + return AllocationPolicy::NONE; + } else { + return AllocationPolicy::NONE; + } +} + +inline std::string +AllocationPolicyToString(const AllocationPolicy& alloc_policy) +{ + switch (alloc_policy) { + case AllocationPolicy::LOCAL: { + return "LOCAL"; + } + case AllocationPolicy::WEIGHT_REMOTE_RESULT_LOCAL: { + return "WEIGHT_REMOTE_RESULT_LOCAL"; + } + case AllocationPolicy::WEIGHT_LOCAL_RESULT_REMOTE: { + return "WEIGHT_LOCAL_RESULT_REMOTE"; + } + case AllocationPolicy::REMOTE: { + return "REMOTE"; + } + case AllocationPolicy::NONE: { + return "NONE"; + } + } + return "NONE"; +} \ No newline at end of file diff --git a/src/model_instance.cc b/src/model_instance/model_instance.cc similarity index 81% rename from src/model_instance.cc rename to src/model_instance/model_instance.cc index 184e8ba..f330750 100644 --- a/src/model_instance.cc +++ b/src/model_instance/model_instance.cc @@ -8,7 +8,6 @@ #include #include -#include "config.h" #include "model_instance_utils.h" // Triton backend headers @@ -58,6 +57,16 @@ ModelInstance::BuildInterpreter(tensorpipe::Descriptor descriptor) return kTfLiteError; } + // Set numa parameters + numa_alloc_policy_ = AllocationPolicyFromString( + descriptor.payloads[OptimizerOption::NUMA_ALLOC_POLICY].metadata); + + local_numa_node_id_ = std::stoi( + descriptor.payloads[OptimizerOption::NUMA_LOCAL_NODE_ID].metadata); + + remote_numa_node_id_ = std::stoi( + descriptor.payloads[OptimizerOption::NUMA_REMOTE_NODE_ID].metadata); + #ifdef ARMNN_DELEGATE_ENABLE armnn::OptimizerOptions 
armnn_optimizer_options_cpu; armnn::OptimizerOptions armnn_optimizer_options_gpu; @@ -266,18 +275,22 @@ ModelInstance::LoadModelFromPipe(tensorpipe::Descriptor descriptor) descriptor.payloads[OptimizerOption::COUNT].length); // Initalize the interpreter after loading the flatbuffers model - TfLiteStatus status = BuildInterpreter(descriptor); - tensorpipe::Message tp_msg; + TfLiteStatus status = BuildInterpreter(descriptor); tp_msg.metadata = status == kTfLiteOk ? "success" : "fail"; - pipe_->write(tp_msg, [](const tensorpipe::Error& error) { + pipe_->write(tp_msg, [this](const tensorpipe::Error& error) { if (error) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, ("Failed send model load ack:" + error.what()).c_str()); return; } +#ifdef LIBNUMA_ENABLE + // Assuming we wrote the message successfully, now our model is + // loaded, and we can apply the numa policy + InitNuma(local_numa_node_id_, remote_numa_node_id_); +#endif // LIBNUMA_ENABLE }); // Arm for getting more data @@ -340,6 +353,7 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) if (interpreter_->AllocateTensors() != kTfLiteOk) { success = false; } + // Assign Cpu buffers to read incoming tensor bytes into after allocate // tensors is called for (uint64_t i = 0; i < descriptor.tensors.size(); ++i) { @@ -400,3 +414,95 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) ReceiveFromPipe(); }); } + +#ifdef LIBNUMA_ENABLE +void +ModelInstance::InitNuma(int local_node_id, int remote_node_id) +{ + if (numa_alloc_policy_ == AllocationPolicy::NONE) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, "Allocation policy ignored, policy is NONE"); + return; + } + + if (numa_available() < 0) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + "System does not support NUMA API, Allocation policy ignored"); + return; + } else if (num_numa_nodes_ < 2) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + "Only one numa node available to system. 
Allocation policy " + "ignored\n"); + return; + } + + // Set numa mem pollicies + // In the case of the split policies, we need to explictly move the pages of + // the weights to the target numa node, as it goes against the set memory + // policy this process was launched with + switch (numa_alloc_policy_) { + case AllocationPolicy::WEIGHT_LOCAL_RESULT_REMOTE: { + MoveModelWeights(local_node_id); + break; + } + case AllocationPolicy::WEIGHT_REMOTE_RESULT_LOCAL: { + MoveModelWeights(remote_node_id); + break; + } + default: { + break; + } + } +} + +void +ModelInstance::MoveModelWeights(int numa_node_id) +{ + // Get pointer to base of mmapped model file + const void* model_file_base = model_->allocation()->base(); + int page_size = getpagesize(); + + std::vector pages(model_->allocation()->bytes() / page_size + 1); + + char* begin = (char*)model_file_base; + char* end = begin + model_->allocation()->bytes(); + + int i = 0; + for (char* piter = (char*)AlignPage(model_file_base); piter < end; + piter += page_size) { + pages[i++] = (void*)piter; + } + + std::vector dst(pages.size(), numa_node_id); + std::vector status(pages.size(), 0); + + // Touch all pages of the file to force mapping to phys mem + volatile char c; + for (char* piter = (char*)AlignPage(model_file_base); piter < end; + piter += page_size) { + c = *piter; + } + // This is just to avoid the unused var compiler warning + (void)c; + + // With all pages mapped, now move them to target numa node + int ret = numa_move_pages( + 0, pages.size(), pages.data(), dst.data(), status.data(), + MPOL_MF_MOVE_ALL); + + if (ret < 0) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Numa move page error: ") + strerror(errno)).c_str()); + for (auto& i : status) { + if (i < 0) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Page error status: ") + strerror(i)).c_str()); + } + } + } +} +#endif // LIBNUMA_ENABLE diff --git a/src/model_instance.h b/src/model_instance/model_instance.h similarity index 79% rename from src/model_instance.h rename to src/model_instance/model_instance.h index 656f7ef..7a16c2a 100644 --- a/src/model_instance.h +++ b/src/model_instance/model_instance.h @@ -5,17 +5,25 @@ #pragma once +#include "config.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/optional_debug_tools.h" #include "tensorpipe/tensorpipe.h" + #ifdef PAPI_PROFILING_ENABLE #include "papi.h" #include "papi_profiler.h" #endif // PAPI_PROFILING_ENABLE +#ifdef LIBNUMA_ENABLE +// Lib Numa headers +#include +#include +#endif // LIBNUMA_ENABLE + // ModelInstance for backend end execution of model class ModelInstance { public: @@ -55,6 +63,26 @@ class ModelInstance { // Callback for inferencing on a loaded tflite model. 
void Infer(tensorpipe::Descriptor& descriptor); + // Numa policy for instance + AllocationPolicy numa_alloc_policy_; + + // Local numa node id + int local_numa_node_id_ = 0; + + // remote numa node id + int remote_numa_node_id_ = 1; + +#ifdef LIBNUMA_ENABLE + // Initalize numa policy for this model + void InitNuma(int local_node_id, int remote_node_id); + + // Move model weights to target numa node + void MoveModelWeights(int numa_node_id); + + // Numa nodes available to the instance + const int num_numa_nodes_ = numa_max_node() + 1; +#endif // LIBNUMA_ENABLE + // Global tensorpipe context std::shared_ptr context_; diff --git a/src/model_instance_main.cc b/src/model_instance/model_instance_main.cc similarity index 100% rename from src/model_instance_main.cc rename to src/model_instance/model_instance_main.cc diff --git a/src/model_instance_utils.h b/src/model_instance/model_instance_utils.h similarity index 83% rename from src/model_instance_utils.h rename to src/model_instance/model_instance_utils.h index 3ab1b06..209fc06 100644 --- a/src/model_instance_utils.h +++ b/src/model_instance/model_instance_utils.h @@ -5,6 +5,8 @@ #pragma once +#include + #include #include @@ -51,4 +53,12 @@ LogThreads() LOG_MESSAGE( TRITONSERVER_LOG_INFO, ("Thread id: " + std::to_string(pid)).c_str()); } +} + +// Get base page address for given pointer +inline static void* +AlignPage(const void* ptr) +{ + static uintptr_t PAGE_MASK = ~(uintptr_t(getpagesize() - 1)); + return (void*)(((uintptr_t)ptr) & PAGE_MASK); } \ No newline at end of file diff --git a/src/papi_profiler.cc b/src/model_instance/papi_profiler.cc similarity index 100% rename from src/papi_profiler.cc rename to src/model_instance/papi_profiler.cc diff --git a/src/papi_profiler.h b/src/model_instance/papi_profiler.h similarity index 100% rename from src/papi_profiler.h rename to src/model_instance/papi_profiler.h diff --git a/src/tflite.cc b/src/tflite.cc index 044ce53..682c8c8 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -49,6 +49,12 @@ // Reproc headers #include "reproc++/reproc.hpp" +#ifdef LIBNUMA_ENABLE +// Lib Numa headers +#include +#include +#endif // LIBNUMA_ENABLE + // // TFLite Backend that implements the TRITONBACKEND API. 
// @@ -127,6 +133,15 @@ class ModelState : public BackendModel { std::string papi_events_ = ""; #endif // PAPI_PROFILING_ENABLE + // Numa policy for instance + AllocationPolicy numa_alloc_policy_ = AllocationPolicy::NONE; + + // Local numa node id + int local_numa_node_id_ = 0; + + // remote numa node id + int remote_numa_node_id_ = 1; + private: ModelState(TRITONBACKEND_Model* triton_model); TRITONSERVER_Error* AutoCompleteConfig(); @@ -199,6 +214,75 @@ ModelState::InitConfig() .c_str()); } } + + // Handle numa parameters + err = GetParameterValue(params, "numa_alloc_policy", &value_str); + + // numa_alloc_policy is not required so clear error if not found + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + numa_alloc_policy_ = AllocationPolicyFromString(value_str); + } + +#ifdef LIBNUMA_ENABLE + err = GetParameterValue(params, "local_numa_node_id", &value_str); + + // local_numa_node_id is not required so clear error if not found + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + RETURN_IF_ERROR(ParseIntValue(value_str, &local_numa_node_id_)); + if (local_numa_node_id_ < 0 || local_numa_node_id_ > numa_max_node()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string( + "parameter 'local_numa_node_id_' must be non-negative " + "or less than max numa node id for tflite model '") + + Name() + "'") + .c_str()); + } + } + + // Handle remote_numa_node_id parameter + err = GetParameterValue(params, "remote_numa_node_id", &value_str); + + // remote_numa_node_id is not required so clear error if not found + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + RETURN_IF_ERROR(ParseIntValue(value_str, &remote_numa_node_id_)); + if (remote_numa_node_id_ < 0 || + remote_numa_node_id_ > numa_max_node()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string( + "parameter 'remote_numa_node_id_' must be non-negative " + "or less than max numa node id for tflite model '") + + Name() + "'") + .c_str()); + } + } + +#else + RETURN_ERROR_IF_TRUE( + numa_alloc_policy_ != AllocationPolicy::NONE, + TRITONSERVER_ERROR_INVALID_ARG, + std::string("Backend built without NUMA support, only valid " + "allocation policy is 'NONE'")) +#endif // LIBNUMA_ENABLE } } @@ -827,8 +911,36 @@ ModelInstanceState::LaunchModelInstance() std::string(model_state_->model_instance_location_) + "/model_instance", std::string("shm://") + model_instance_name_}; - // We have the model_instance process inherit the parent's standard streams so - // the it reads directly from the stdin and writes directly to the +#ifdef LIBNUMA_ENABLE + // Model instance will always be pinned to numa node set as local, it's the + // membinding we change + switch (model_state_->numa_alloc_policy_) { + case AllocationPolicy::LOCAL: + case AllocationPolicy::WEIGHT_REMOTE_RESULT_LOCAL: + // In the case of local result tensors (heap), membind to local numa node + model_instance_args.insert( + model_instance_args.begin(), + {"numactl", "--membind", + std::to_string(model_state_->local_numa_node_id_), "--cpunodebind", + std::to_string(model_state_->local_numa_node_id_)}); + break; + case AllocationPolicy::WEIGHT_LOCAL_RESULT_REMOTE: + case AllocationPolicy::REMOTE: + 
// In the case of remote result tensors (heap), membind to local numa node + model_instance_args.insert( + model_instance_args.begin(), + {"numactl", "--membind", + std::to_string(model_state_->remote_numa_node_id_), "--cpunodebind", + std::to_string(model_state_->local_numa_node_id_)}); + break; + default: { + break; + } + } +#endif // LIBNUMA_ENABLE + + // We have the model_instance process inherit the parent's standard streams + // so the it reads directly from the stdin and writes directly to the // stdout/stderr triton. reproc::options options; options.redirect.out.type = reproc::redirect::type::parent; @@ -938,6 +1050,16 @@ ModelInstanceState::SendModel() tp_msg.payloads[OptimizerOption::TFLITE_NUM_THREADS] = gen_metadata(std::to_string(model_state_->tflite_num_threads_)); + // Add in numa config data to message + tp_msg.payloads[OptimizerOption::NUMA_ALLOC_POLICY] = + gen_metadata(AllocationPolicyToString(model_state_->numa_alloc_policy_)); + + tp_msg.payloads[OptimizerOption::NUMA_LOCAL_NODE_ID] = + gen_metadata(std::to_string(model_state_->local_numa_node_id_)); + + tp_msg.payloads[OptimizerOption::NUMA_REMOTE_NODE_ID] = + gen_metadata(std::to_string(model_state_->remote_numa_node_id_)); + // Add in use xnnpack std::string use_xnnpack = std::string("n"); if (model_state_->use_xnnpack_delegate_ && From adf2c0b1b5913eaaf5d1c993630f2ba42cd28841 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Mon, 12 Jun 2023 10:37:55 -0500 Subject: [PATCH 18/33] Fix non numa build Signed-off-by: Josh Minor --- src/model_instance/model_instance.cc | 14 +++++++------- src/tflite.cc | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/model_instance/model_instance.cc b/src/model_instance/model_instance.cc index f330750..6d8a13a 100644 --- a/src/model_instance/model_instance.cc +++ b/src/model_instance/model_instance.cc @@ -276,10 +276,9 @@ ModelInstance::LoadModelFromPipe(tensorpipe::Descriptor descriptor) // Initalize the interpreter after loading the flatbuffers model tensorpipe::Message tp_msg; - TfLiteStatus status = BuildInterpreter(descriptor); - tp_msg.metadata = status == kTfLiteOk ? "success" : "fail"; - - pipe_->write(tp_msg, [this](const tensorpipe::Error& error) { + bool success = BuildInterpreter(descriptor) == kTfLiteOk; + tp_msg.metadata = success ? 
"success" : "fail"; + pipe_->write(tp_msg, [this, success](const tensorpipe::Error& error) { if (error) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, @@ -287,9 +286,10 @@ ModelInstance::LoadModelFromPipe(tensorpipe::Descriptor descriptor) return; } #ifdef LIBNUMA_ENABLE - // Assuming we wrote the message successfully, now our model is - // loaded, and we can apply the numa policy - InitNuma(local_numa_node_id_, remote_numa_node_id_); + if (success) { + // Model is loaded, apply the numa policy + InitNuma(local_numa_node_id_, remote_numa_node_id_); + } #endif // LIBNUMA_ENABLE }); diff --git a/src/tflite.cc b/src/tflite.cc index 682c8c8..d5f931e 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -281,7 +281,7 @@ ModelState::InitConfig() numa_alloc_policy_ != AllocationPolicy::NONE, TRITONSERVER_ERROR_INVALID_ARG, std::string("Backend built without NUMA support, only valid " - "allocation policy is 'NONE'")) + "allocation policy is 'NONE'")); #endif // LIBNUMA_ENABLE } } From 40c32d1e7d4d480a8f46dca187696df194ddc3df Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Mon, 12 Jun 2023 13:56:02 -0500 Subject: [PATCH 19/33] Add time to profiling data by default Signed-off-by: Josh Minor --- src/model_instance/papi_profiler.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index bae6295..9f8b98e 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -73,6 +73,9 @@ class PapiProfiler : public tflite::Profiler { handle_error(retval, __LINE__, __FILE__); } event_values_.resize(papi_events_.size()); + + // Separately we will also track time stamp deltas + papi_events_.push_back("TIME_NS"); } ~PapiProfiler() @@ -141,6 +144,7 @@ class PapiProfiler : public tflite::Profiler { uint32_t event_handle = event_index_++; papi_regions_[event_handle] = trace_event_tag; + timings_[event_handle] = PAPI_get_real_nsec(); return event_handle; } @@ -150,6 +154,8 @@ class PapiProfiler : public tflite::Profiler { return; } + timings_[event_handle] = PAPI_get_real_nsec() - timings_[event_handle]; + int retval; // For each thread we are profiling for (uint64_t i = 0; i < event_sets_.size(); ++i) { @@ -160,6 +166,8 @@ class PapiProfiler : public tflite::Profiler { for (auto val : event_values_) { results_[papi_regions_[event_handle]].push_back(val); } + // Push back the op timing + results_[papi_regions_[event_handle]].push_back(timings_[event_handle]); } } @@ -172,8 +180,13 @@ class PapiProfiler : public tflite::Profiler { private: uint32_t event_index_ = 0; std::unordered_map papi_regions_; + std::unordered_map timings_; const uint64_t supported_event_types_; + + // Vector holding the papi event names we are tracking std::vector papi_events_; + + // Vector holding papi event set data structures (one per tracked inf thread) std::vector event_sets_; // We only care about the 4th thread in the process on, as these are used for From c81b65ccb9c7b1016845a66d753a1677e932eb38 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Mon, 12 Jun 2023 15:42:28 -0500 Subject: [PATCH 20/33] Explicitly call terminate on child process Signed-off-by: Josh Minor --- src/tflite.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tflite.cc b/src/tflite.cc index d5f931e..67e501a 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -881,6 +881,7 @@ ModelInstanceState::~ModelInstanceState() { pipe_->close(); listener_->close(); + model_instance_process_.terminate(); } TRITONSERVER_Error* From 
a939d817439a5ce6698f4f33dcb5b9fd4c8fcc4a Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Mon, 12 Jun 2023 16:15:40 -0500 Subject: [PATCH 21/33] Put timeout on cleanup Signed-off-by: Josh Minor --- src/tflite.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/tflite.cc b/src/tflite.cc index 67e501a..53fd8d1 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -881,7 +881,13 @@ ModelInstanceState::~ModelInstanceState() { pipe_->close(); listener_->close(); - model_instance_process_.terminate(); + reproc::stop_actions stop = { + {reproc::stop::terminate, reproc::milliseconds(10000)}, + {reproc::stop::kill, reproc::milliseconds(2000)}, + {reproc::stop::noop, reproc::milliseconds(0)}}; + reproc::options options; + options.stop = stop; + model_instance_process_.stop(options.stop); } TRITONSERVER_Error* From d90d739f0840d6593ae92bd23de0afcc68b1219d Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Mon, 12 Jun 2023 16:52:48 -0500 Subject: [PATCH 22/33] Fix exit handling in child process Signed-off-by: Josh Minor --- src/model_instance/model_instance.cc | 3 +-- src/model_instance/papi_profiler.cc | 7 ++++++- src/tflite.cc | 9 +++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/model_instance/model_instance.cc b/src/model_instance/model_instance.cc index 6d8a13a..31cbd5b 100644 --- a/src/model_instance/model_instance.cc +++ b/src/model_instance/model_instance.cc @@ -229,7 +229,6 @@ ModelInstance::ReceiveFromPipe() LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Remote side hungup: ") + error.what()).c_str()); - return; } else { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, @@ -237,7 +236,7 @@ ModelInstance::ReceiveFromPipe() error.what()) .c_str()); } - exit(1); + return; } if (descriptor.metadata == "model_load") { LoadModelFromPipe(descriptor); diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index 9f8b98e..081bb3a 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -32,6 +32,8 @@ handle_error(int retval, int line, const std::string& file) ("PAPI error at line " + file + ":" + std::to_string(line) + " " + std::to_string(retval) + ", " + PAPI_strerror(retval)) .c_str()); + + // TODO: graceful exit here exit(1); } @@ -74,7 +76,7 @@ class PapiProfiler : public tflite::Profiler { } event_values_.resize(papi_events_.size()); - // Separately we will also track time stamp deltas + // Separately we will also track operation timings in nanos papi_events_.push_back("TIME_NS"); } @@ -193,7 +195,10 @@ class PapiProfiler : public tflite::Profiler { // inference std::vector inf_thread_ids_; + // Vector to hold papi counter values when we read them std::vector event_values_; + + // Vector holding all counter values to be processed at end std::unordered_map> results_; }; diff --git a/src/tflite.cc b/src/tflite.cc index 53fd8d1..e4659f7 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -884,10 +884,15 @@ ModelInstanceState::~ModelInstanceState() reproc::stop_actions stop = { {reproc::stop::terminate, reproc::milliseconds(10000)}, {reproc::stop::kill, reproc::milliseconds(2000)}, - {reproc::stop::noop, reproc::milliseconds(0)}}; + {}}; reproc::options options; options.stop = stop; - model_instance_process_.stop(options.stop); + std::error_code ec; + int status = 0; + std::tie(status, ec) = model_instance_process_.stop(options.stop); + if (ec) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to stop child process"); + } } TRITONSERVER_Error* From c08cd3b08851767ee7dc611dfdb80bf160b94780 Mon Sep 17 
00:00:00 2001 From: Josh Minor Date: Thu, 27 Jul 2023 15:54:14 -0500 Subject: [PATCH 23/33] Add support for uncore papi events Signed-off-by: Josh Minor ---
 README.md                                  |  12 ++-
 qa/config-template.pbtxt                   |   9 ++
 qa/helpers/triton_model_config.py          |   2 +
 src/model_instance/model_instance_main.cc  |   5 +
 src/model_instance/papi_profiler.cc        | 123 +++++++++++++++++++---
 src/tflite.cc                              |  21 +++-
 6 files changed, 156 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md index 4ea154a..1ca2d1c 100644 --- a/README.md +++ b/README.md
@@ -194,7 +194,7 @@ instance_group [
 ```
 
 ## Enabling PAPI events
-This backend supports PAPI performance counter sampling. This is exposed through the PAPI High Level API. We support performance counter tracing at the tflite operator level using tflite tracing instrumentation. To enable this, you can use the following in your model config:
+This backend supports PAPI performance counter sampling. We support performance counter tracing at the tflite operator level using tflite tracing instrumentation. To enable this, you can use the following in your model config:
 ```
 parameters {
 key: "papi_events"
@@ -202,5 +202,13 @@ parameters {
 string_value:"PAPI_TOT_CYC,PAPI_LD_INS"
 }
 }
+parameters {
+  key: "papi_uncore_events"
+  value: {
+    string_value:"tx2_dmc0::UNC_DMC_READS:u:cpu=0"
+  }
+}
 ```
-Internally, the events listed get set to the environment variable `PAPI_EVENTS` as per the PAPI High Level API documentation. Results of this will be written to a newly created `papi_hl_output` folder in the directory you launched the server from.
+`papi_events` specifies per-core events, such as total load instructions, which are tracked at the thread level. `papi_uncore_events` specifies uncore events, which are tracked at the socket level, such as the userspace DRAM reads for socket 0 in the example above.
+
+Internally, the listed events are set via the environment variables `PAPI_EVENTS` and `PAPI_UNCORE_EVENTS`. Results are written to a newly created `counters_*.csv` file for you to use as you wish.
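For reference, a hypothetical excerpt of a generated `counters_*.csv` is shown below. The op id, thread id, and counter values are invented for illustration; uncore events are reported with a thread id of -1, and a `TIME_NS` pseudo-event recording the per-op wall time in nanoseconds is emitted alongside the configured counters.

```
op_id,thread_id,papi_event,value
CONV_2D_4,2137,PAPI_TOT_CYC,1843210
CONV_2D_4,2137,PAPI_LD_INS,612409
CONV_2D_4,-1,tx2_dmc0::UNC_DMC_READS:u:cpu=0,5127
```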
diff --git a/qa/config-template.pbtxt b/qa/config-template.pbtxt index a114043..561c36a 100644 --- a/qa/config-template.pbtxt +++ b/qa/config-template.pbtxt @@ -43,6 +43,15 @@ string_value:"{{ model.papi_events }}" } {% endif %} +{% if model.papi_uncore_events %} +parameters { +key: "papi_uncore_events" +value: { +string_value:"{{ model.papi_uncore_events }}" +} +} +{% endif %} + instance_group [ {% if model.gpu > 0 %} { diff --git a/qa/helpers/triton_model_config.py b/qa/helpers/triton_model_config.py index d5d3cfe..99a8176 100644 --- a/qa/helpers/triton_model_config.py +++ b/qa/helpers/triton_model_config.py @@ -42,6 +42,7 @@ def __init__( outputs: List[Model.TensorIO], tflite_num_threads: int = None, papi_events: str = None, + papi_uncore_events: str = None, gpu: int = 0, cpu: int = 1, max_batch_size: int = 0, @@ -64,6 +65,7 @@ def __init__( ) self.tflite_num_threads = tflite_num_threads self.papi_events = papi_events + self.papi_uncore_events = papi_uncore_events self.armnn_cpu = armnn_cpu self.armnn_gpu = armnn_gpu self.armnn_cpu_parameters = armnn_cpu_parameters diff --git a/src/model_instance/model_instance_main.cc b/src/model_instance/model_instance_main.cc index 3991785..6918375 100644 --- a/src/model_instance/model_instance_main.cc +++ b/src/model_instance/model_instance_main.cc @@ -26,6 +26,11 @@ main(int argc, char* argv[]) LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to init PAPI lib"); return 1; } + if (PAPI_multiplex_init() != PAPI_OK) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, "Failed to init multiplexing for PAPI lib"); + return 1; + } if (PAPI_thread_init(pthread_self) != PAPI_OK) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to init PAPI thread lib"); return 1; diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index 081bb3a..6007b40 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -39,11 +39,13 @@ handle_error(int retval, int line, const std::string& file) class PapiProfiler : public tflite::Profiler { public: - PapiProfiler(const std::vector& papi_events) + PapiProfiler( + const std::vector& papi_events, + const std::vector& papi_uncore_events) : supported_event_types_( static_cast(EventType::DELEGATE_OPERATOR_INVOKE_EVENT) + static_cast(EventType::OPERATOR_INVOKE_EVENT)), - papi_events_(papi_events) + papi_events_(papi_events), papi_uncore_events_(papi_uncore_events) { // We only care about the 4th thread in the process on, as these are used // for inference @@ -51,9 +53,8 @@ class PapiProfiler : public tflite::Profiler { inf_thread_ids_ = std::vector(current_threads.begin() + 3, current_threads.end()); + // Handle core specific events per inference thread int retval; - // The first 3 threads for the model instance don't do anything for - // inference, so we aren't interested in them for (uint64_t i = 0; i < inf_thread_ids_.size(); ++i) { event_sets_.push_back(PAPI_NULL); retval = PAPI_create_eventset(&event_sets_.back()); @@ -78,6 +79,20 @@ class PapiProfiler : public tflite::Profiler { // Separately we will also track operation timings in nanos papi_events_.push_back("TIME_NS"); + + // Handle uncore events separately + if (!papi_uncore_events_.empty()) { + retval = PAPI_create_eventset(&uncore_event_set_); + if (retval != PAPI_OK) { + handle_error(retval, __LINE__, __FILE__); + } + for (auto& event_name : papi_uncore_events_) { + retval = PAPI_add_named_event(uncore_event_set_, event_name.c_str()); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + } + 
uncore_event_values_.resize(papi_uncore_events_.size()); + } } ~PapiProfiler() @@ -88,17 +103,28 @@ class PapiProfiler : public tflite::Profiler { auto utc = std::chrono::duration_cast(now.time_since_epoch()) .count(); + myfile.open(("counters_" + std::to_string(utc) + ".csv").c_str()); // Header myfile << "op_id,thread_id,papi_event,value\n"; - // Iterate over map keyed on tflite operation id + // Iterate over map keyed on tflite operation id, with values being a vector + // of counter values for each tracked perf event + pid_t inf_thread_id; for (auto& event : results_) { + // Write all of the per-core events first, broken down by thread for (uint64_t i = 0; i < event.second.size(); ++i) { - myfile << event.first << "," - << inf_thread_ids_[i / papi_events_.size() % event_sets_.size()] - << "," << papi_events_[i % papi_events_.size()] << "," + inf_thread_id = + inf_thread_ids_[i / papi_events_.size() % event_sets_.size()]; + myfile << event.first << "," << inf_thread_id << "," + << papi_events_[i % papi_events_.size()] << "," << event.second[i] << "\n"; } + // Now write the uncore events with a dummy thread id of -1 + for (uint64_t i = 0; i < results_uncore_[event.first].size(); ++i) { + myfile << event.first << "," << -1 << "," + << papi_uncore_events_[i % papi_uncore_events_.size()] << "," + << results_uncore_[event.first][i] << "\n"; + } } myfile.close(); @@ -144,6 +170,24 @@ class PapiProfiler : public tflite::Profiler { } } + // Handle uncore events + if (!papi_uncore_events_.empty()) { + int state; + PAPI_state(uncore_event_set_, &state); + if (!(state & PAPI_RUNNING)) { + // Begin tracking counters + retval = PAPI_start(uncore_event_set_); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + + } else { + // Reset counters + retval = PAPI_reset(uncore_event_set_); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + } + } + uint32_t event_handle = event_index_++; papi_regions_[event_handle] = trace_event_tag; timings_[event_handle] = PAPI_get_real_nsec(); @@ -168,8 +212,20 @@ class PapiProfiler : public tflite::Profiler { for (auto val : event_values_) { results_[papi_regions_[event_handle]].push_back(val); } - // Push back the op timing - results_[papi_regions_[event_handle]].push_back(timings_[event_handle]); + } + + // Push back the op timing + results_[papi_regions_[event_handle]].push_back(timings_[event_handle]); + + // Handle uncore events + if (!papi_uncore_events_.empty()) { + retval = PAPI_read(uncore_event_set_, uncore_event_values_.data()); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + // For each of the events we collected a counter value for + for (auto val : uncore_event_values_) { + results_uncore_[papi_regions_[event_handle]].push_back(val); + } } } @@ -185,12 +241,20 @@ class PapiProfiler : public tflite::Profiler { std::unordered_map timings_; const uint64_t supported_event_types_; - // Vector holding the papi event names we are tracking + // Vector holding the papi event names we are tracking for each core/thread std::vector papi_events_; + // Vector holding the papi event names we are tracking which are socket + // specific + std::vector papi_uncore_events_; + // Vector holding papi event set data structures (one per tracked inf thread) std::vector event_sets_; + // Vector holding papi event set data structures for our uncore events because + // this is per socket, we only need one event set + int uncore_event_set_ = PAPI_NULL; + // We only care about the 4th thread in the process on, as these are 
used for // inference std::vector inf_thread_ids_; @@ -198,13 +262,20 @@ class PapiProfiler : public tflite::Profiler { // Vector to hold papi counter values when we read them std::vector event_values_; - // Vector holding all counter values to be processed at end + // Vector to hold papi uncore values when we read them + std::vector uncore_event_values_; + + // Vector holding all per core counter values to be processed at end std::unordered_map> results_; + + // Vector holding all per core counter values to be processed at end + std::unordered_map> results_uncore_; }; std::unique_ptr MaybeCreatePapiProfiler() { + // Per core events char* papi_events = getenv("PAPI_EVENTS"); std::vector papi_events_vec; if (papi_events == NULL) { @@ -228,5 +299,31 @@ MaybeCreatePapiProfiler() papi_events_vec.push_back(substr); } } - return std::unique_ptr(new PapiProfiler(papi_events_vec)); + + // Uncore events + char* papi_uncore_events = getenv("PAPI_UNCORE_EVENTS"); + std::vector papi_uncore_events_vec; + if (papi_uncore_events == NULL) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + "PAPI_UNCORE_EVENTS not specified, op level profiling disabled!"); + return nullptr; + } else { + // Parse out all papi events indivdually + std::stringstream ss(papi_uncore_events); + while (ss.good()) { + std::string substr; + std::getline(ss, substr, ','); + if (!PAPIEventValid(substr)) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("Event: " + substr + " invalid, op level profiling disabled!") + .c_str()); + return nullptr; + } + papi_uncore_events_vec.push_back(substr); + } + } + return std::unique_ptr( + new PapiProfiler(papi_events_vec, papi_uncore_events_vec)); } diff --git a/src/tflite.cc b/src/tflite.cc index e4659f7..4ea90ac 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -131,6 +131,10 @@ class ModelState : public BackendModel { #ifdef PAPI_PROFILING_ENABLE // String holding comma-separated list of events for child inference process std::string papi_events_ = ""; + + // String holding comma-separated list of uncore events for child inference + // process + std::string papi_uncore_events_ = ""; #endif // PAPI_PROFILING_ENABLE // Numa policy for instance @@ -568,6 +572,16 @@ ModelState::ValidateModelConfig() TRITONSERVER_ErrorDelete(err); } } + + err = GetParameterValue(params, "papi_uncore_events", &papi_uncore_events_); + // papi_events is not required so clear error if not found + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } } #endif // PAPI_PROFILING_ENABLE @@ -938,7 +952,8 @@ ModelInstanceState::LaunchModelInstance() break; case AllocationPolicy::WEIGHT_LOCAL_RESULT_REMOTE: case AllocationPolicy::REMOTE: - // In the case of remote result tensors (heap), membind to local numa node + // In the case of remote result tensors (heap), membind to remote numa + // node model_instance_args.insert( model_instance_args.begin(), {"numactl", "--membind", @@ -988,6 +1003,10 @@ ModelInstanceState::LaunchModelInstance() if (!model_state_->papi_events_.empty()) { model_instance_env.insert({"PAPI_EVENTS", model_state_->papi_events_}); } + if (!model_state_->papi_uncore_events_.empty()) { + model_instance_env.insert( + {"PAPI_UNCORE_EVENTS", model_state_->papi_uncore_events_}); + } #endif // PAPI_PROFILING_ENABLE options.env.extra = model_instance_env; From 7bfade4c12c87745ff3ab04c5f3c75a7cb211c7c Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Fri, 28 Jul 2023 15:31:31 -0500 Subject: [PATCH 24/33] Simplify perf counter 
infra Signed-off-by: Josh Minor --- src/model_instance/papi_profiler.cc | 162 +++++++++++++++------------- 1 file changed, 86 insertions(+), 76 deletions(-) diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index 6007b40..ecc34d5 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -53,32 +53,43 @@ class PapiProfiler : public tflite::Profiler { inf_thread_ids_ = std::vector(current_threads.begin() + 3, current_threads.end()); - // Handle core specific events per inference thread + papi_regions_.reserve(1000); + timings_.reserve(1000); + int retval; - for (uint64_t i = 0; i < inf_thread_ids_.size(); ++i) { - event_sets_.push_back(PAPI_NULL); - retval = PAPI_create_eventset(&event_sets_.back()); - if (retval != PAPI_OK) { - handle_error(retval, __LINE__, __FILE__); - } - for (auto& event_name : papi_events_) { - retval = PAPI_add_named_event(event_sets_.back(), event_name.c_str()); + + // Handle core specific events per inference thread + if (!papi_events_.empty()) { + for (uint64_t i = 0; i < inf_thread_ids_.size(); ++i) { + event_sets_.push_back(PAPI_NULL); + retval = PAPI_create_eventset(&event_sets_.back()); + if (retval != PAPI_OK) { + handle_error(retval, __LINE__, __FILE__); + } + for (auto& event_name : papi_events_) { + retval = PAPI_add_named_event(event_sets_.back(), event_name.c_str()); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + } + + // Attach event to thread + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + ("Attaching to " + std::to_string(inf_thread_ids_[i])).c_str()); + retval = PAPI_attach(event_sets_.back(), inf_thread_ids_[i]); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + + // Start eventset + retval = PAPI_start(event_sets_.back()); if (retval != PAPI_OK) handle_error(retval, __LINE__, __FILE__); } + event_values_.resize(papi_events_.size()); - // Attach event to thread - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - ("Attaching to " + std::to_string(inf_thread_ids_[i])).c_str()); - retval = PAPI_attach(event_sets_.back(), inf_thread_ids_[i]); - if (retval != PAPI_OK) - handle_error(retval, __LINE__, __FILE__); + // Separately we will also track operation timings in nanos + papi_events_.push_back("TIME_NS"); } - event_values_.resize(papi_events_.size()); - - // Separately we will also track operation timings in nanos - papi_events_.push_back("TIME_NS"); // Handle uncore events separately if (!papi_uncore_events_.empty()) { @@ -92,6 +103,10 @@ class PapiProfiler : public tflite::Profiler { handle_error(retval, __LINE__, __FILE__); } uncore_event_values_.resize(papi_uncore_events_.size()); + // Start uncore eventset + retval = PAPI_start(uncore_event_set_); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); } } @@ -119,6 +134,8 @@ class PapiProfiler : public tflite::Profiler { << papi_events_[i % papi_events_.size()] << "," << event.second[i] << "\n"; } + } + for (auto& event : results_uncore_) { // Now write the uncore events with a dummy thread id of -1 for (uint64_t i = 0; i < results_uncore_[event.first].size(); ++i) { myfile << event.first << "," << -1 << "," @@ -152,17 +169,9 @@ class PapiProfiler : public tflite::Profiler { trace_event_tag += ("_" + std::to_string(event_metadata1)); int retval; - // For the event set attached to each thread, start or restart the event set - for (uint64_t i = 0; i < event_sets_.size(); ++i) { - int state; - PAPI_state(event_sets_[i], &state); - if (!(state & PAPI_RUNNING)) { - // Begin tracking 
counters - retval = PAPI_start(event_sets_[i]); - if (retval != PAPI_OK) - handle_error(retval, __LINE__, __FILE__); - } else { + if (!papi_events_.empty()) { // Reset event set attached to each thread + for (uint64_t i = 0; i < event_sets_.size(); ++i) { // Reset counters retval = PAPI_reset(event_sets_[i]); if (retval != PAPI_OK) @@ -172,26 +181,16 @@ class PapiProfiler : public tflite::Profiler { // Handle uncore events if (!papi_uncore_events_.empty()) { - int state; - PAPI_state(uncore_event_set_, &state); - if (!(state & PAPI_RUNNING)) { - // Begin tracking counters - retval = PAPI_start(uncore_event_set_); - if (retval != PAPI_OK) - handle_error(retval, __LINE__, __FILE__); - - } else { - // Reset counters - retval = PAPI_reset(uncore_event_set_); - if (retval != PAPI_OK) - handle_error(retval, __LINE__, __FILE__); - } + // Reset counters + retval = PAPI_reset(uncore_event_set_); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); } - uint32_t event_handle = event_index_++; - papi_regions_[event_handle] = trace_event_tag; - timings_[event_handle] = PAPI_get_real_nsec(); - return event_handle; + event_index_++; + papi_regions_[event_index_] = std::move(trace_event_tag); + timings_[event_index_] = PAPI_get_real_nsec(); + return event_index_; } void EndEvent(uint32_t event_handle) override @@ -200,32 +199,44 @@ class PapiProfiler : public tflite::Profiler { return; } - timings_[event_handle] = PAPI_get_real_nsec() - timings_[event_handle]; + long long op_latency = PAPI_get_real_nsec() - timings_[event_handle]; - int retval; - // For each thread we are profiling - for (uint64_t i = 0; i < event_sets_.size(); ++i) { - retval = PAPI_read(event_sets_[i], event_values_.data()); - if (retval != PAPI_OK) - handle_error(retval, __LINE__, __FILE__); - // For each of the events we collected a counter value for - for (auto val : event_values_) { - results_[papi_regions_[event_handle]].push_back(val); - } + // For performance reserve space for 10000 elements for each perf event in + // results + if (results_[papi_regions_[event_handle]].empty()) { + results_[papi_regions_[event_handle]].reserve( + papi_events_.size() * 10000); + } + if (results_uncore_[papi_regions_[event_handle]].empty()) { + results_uncore_[papi_regions_[event_handle]].reserve( + papi_uncore_events_.size() * 10000); } - // Push back the op timing - results_[papi_regions_[event_handle]].push_back(timings_[event_handle]); + int retval; + if (!papi_events_.empty()) { // For each thread we are profiling + for (uint64_t i = 0; i < event_sets_.size(); ++i) { + retval = PAPI_read(event_sets_[i], event_values_.data()); + if (retval != PAPI_OK) + handle_error(retval, __LINE__, __FILE__); + // Write event counter values to end of results vector for current op + results_[papi_regions_[event_handle]].insert( + results_[papi_regions_[event_handle]].end(), event_values_.begin(), + event_values_.end()); + } + + // Push back the op timing + results_[papi_regions_[event_handle]].push_back(op_latency); + } // Handle uncore events if (!papi_uncore_events_.empty()) { retval = PAPI_read(uncore_event_set_, uncore_event_values_.data()); if (retval != PAPI_OK) handle_error(retval, __LINE__, __FILE__); // For each of the events we collected a counter value for - for (auto val : uncore_event_values_) { - results_uncore_[papi_regions_[event_handle]].push_back(val); - } + results_uncore_[papi_regions_[event_handle]].insert( + results_uncore_[papi_regions_[event_handle]].end(), + uncore_event_values_.begin(), uncore_event_values_.end()); } 
} @@ -278,12 +289,7 @@ MaybeCreatePapiProfiler() // Per core events char* papi_events = getenv("PAPI_EVENTS"); std::vector papi_events_vec; - if (papi_events == NULL) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - "PAPI_EVENTS not specified, op level profiling disabled!"); - return nullptr; - } else { + if (papi_events != NULL) { // Parse out all papi events indivdually std::stringstream ss(papi_events); while (ss.good()) { @@ -303,12 +309,7 @@ MaybeCreatePapiProfiler() // Uncore events char* papi_uncore_events = getenv("PAPI_UNCORE_EVENTS"); std::vector papi_uncore_events_vec; - if (papi_uncore_events == NULL) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - "PAPI_UNCORE_EVENTS not specified, op level profiling disabled!"); - return nullptr; - } else { + if (papi_uncore_events != NULL) { // Parse out all papi events indivdually std::stringstream ss(papi_uncore_events); while (ss.good()) { @@ -324,6 +325,15 @@ MaybeCreatePapiProfiler() papi_uncore_events_vec.push_back(substr); } } + + if ((papi_events == NULL) && (papi_uncore_events == NULL)) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + "PAPI_EVENTS nor PAPI_UNCORE_EVENTS specified, op level profiling " + "disabled!"); + return nullptr; + } + return std::unique_ptr( new PapiProfiler(papi_events_vec, papi_uncore_events_vec)); } From 0ba51340b915b221adc0f5e725d32197b6f27896 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Fri, 28 Jul 2023 16:13:38 -0500 Subject: [PATCH 25/33] Add sample id to csv file, and fix csv gen Signed-off-by: Josh Minor --- src/model_instance/papi_profiler.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index ecc34d5..5cd2659 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -121,24 +121,24 @@ class PapiProfiler : public tflite::Profiler { myfile.open(("counters_" + std::to_string(utc) + ".csv").c_str()); // Header - myfile << "op_id,thread_id,papi_event,value\n"; + myfile << "op_id,thread_id,sample_id,papi_event,value\n"; // Iterate over map keyed on tflite operation id, with values being a vector // of counter values for each tracked perf event - pid_t inf_thread_id; for (auto& event : results_) { - // Write all of the per-core events first, broken down by thread for (uint64_t i = 0; i < event.second.size(); ++i) { - inf_thread_id = - inf_thread_ids_[i / papi_events_.size() % event_sets_.size()]; - myfile << event.first << "," << inf_thread_id << "," + myfile << event.first << "," + << inf_thread_ids_[i / papi_events_.size() % event_sets_.size()] + << "," << i / (papi_events_.size() * event_sets_.size()) << "," << papi_events_[i % papi_events_.size()] << "," << event.second[i] << "\n"; } } + for (auto& event : results_uncore_) { // Now write the uncore events with a dummy thread id of -1 for (uint64_t i = 0; i < results_uncore_[event.first].size(); ++i) { myfile << event.first << "," << -1 << "," + << i / papi_uncore_events_.size() << "," << papi_uncore_events_[i % papi_uncore_events_.size()] << "," << results_uncore_[event.first][i] << "\n"; } @@ -223,10 +223,9 @@ class PapiProfiler : public tflite::Profiler { results_[papi_regions_[event_handle]].insert( results_[papi_regions_[event_handle]].end(), event_values_.begin(), event_values_.end()); + // Push back the op timing + results_[papi_regions_[event_handle]].push_back(op_latency); } - - // Push back the op timing - results_[papi_regions_[event_handle]].push_back(op_latency); } // Handle uncore events if 
(!papi_uncore_events_.empty()) { From 9737271fd1b4badd4ca5c453c39c247a046cf0e6 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 2 Aug 2023 16:25:25 -0500 Subject: [PATCH 26/33] Only keep one copy of op timinigs in csv Signed-off-by: Josh Minor --- src/model_instance/papi_profiler.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index 5cd2659..777ae60 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -86,9 +86,6 @@ class PapiProfiler : public tflite::Profiler { handle_error(retval, __LINE__, __FILE__); } event_values_.resize(papi_events_.size()); - - // Separately we will also track operation timings in nanos - papi_events_.push_back("TIME_NS"); } // Handle uncore events separately @@ -143,6 +140,15 @@ class PapiProfiler : public tflite::Profiler { << results_uncore_[event.first][i] << "\n"; } } + + for (auto& event : results_timings_) { + // Now write the timing events with a dummy thread id of -1 + for (uint64_t i = 0; i < results_timings_[event.first].size(); ++i) { + myfile << event.first << "," << -1 << "," << i << "," + << "TIME_NS" + << "," << results_timings_[event.first][i] << "\n"; + } + } myfile.close(); for (auto& event_set : event_sets_) { @@ -199,13 +205,16 @@ class PapiProfiler : public tflite::Profiler { return; } - long long op_latency = PAPI_get_real_nsec() - timings_[event_handle]; + // Push back the op timing + results_timings_[papi_regions_[event_handle]].push_back( + PAPI_get_real_nsec() - timings_[event_handle]); // For performance reserve space for 10000 elements for each perf event in // results if (results_[papi_regions_[event_handle]].empty()) { results_[papi_regions_[event_handle]].reserve( papi_events_.size() * 10000); + results_timings_.reserve(10000); } if (results_uncore_[papi_regions_[event_handle]].empty()) { results_uncore_[papi_regions_[event_handle]].reserve( @@ -223,8 +232,6 @@ class PapiProfiler : public tflite::Profiler { results_[papi_regions_[event_handle]].insert( results_[papi_regions_[event_handle]].end(), event_values_.begin(), event_values_.end()); - // Push back the op timing - results_[papi_regions_[event_handle]].push_back(op_latency); } } // Handle uncore events @@ -280,6 +287,9 @@ class PapiProfiler : public tflite::Profiler { // Vector holding all per core counter values to be processed at end std::unordered_map> results_uncore_; + + // Vector holding op timings + std::unordered_map> results_timings_; }; std::unique_ptr From 9096e599d375a768a4ca171338f57ef0a70f366f Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Thu, 17 Aug 2023 13:56:42 -0500 Subject: [PATCH 27/33] Add function to get list of avail cpus per socket Signed-off-by: Josh Minor --- CMakeLists.txt | 24 ++++++++++++++++++++++++ src/config.h | 1 + src/tflite.cc | 5 +++++ src/tflite_utils.cc | 18 ++++++++++++++++++ src/tflite_utils.h | 3 +++ 5 files changed, 51 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc325e..17dbb63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -189,6 +189,19 @@ configure_file(src/libtriton_armnn_tflite.ldscript include(ExternalProject) +# Handle hwloc +ExternalProject_Add( + hwloc + GIT_REPOSITORY https://github.com/open-mpi/hwloc + GIT_TAG hwloc-2.8.0 + GIT_SHALLOW ON + SOURCE_DIR ${CMAKE_BINARY_DIR}/hwloc + BINARY_DIR ${CMAKE_BINARY_DIR}/hwloc + CONFIGURE_COMMAND ./autogen.sh && ./configure --prefix= --enable-debug=$,"1","0" + BUILD_COMMAND make -j$(nproc) + 
UPDATE_COMMAND "" + INSTALL_COMMAND make install) + set(TFLITE_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/external/tensorflow_lite) if(TFLITE_BAZEL_BUILD) @@ -516,6 +529,10 @@ else() list(APPEND BACKEND_LINK_LIBS tensorflow-lite) endif() +add_dependencies(triton-armnn-tflite-backend hwloc) +list(APPEND BACKEND_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/hwloc-prefix/include) +target_link_libraries(triton-armnn-tflite-backend PRIVATE ${CMAKE_BINARY_DIR}/hwloc-prefix/lib/libhwloc.so) + if(ARMNN_DELEGATE_ENABLE) target_compile_definitions(triton-armnn-tflite-backend PRIVATE ARMNN_DELEGATE_ENABLE=1) endif() @@ -580,6 +597,13 @@ install( LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/armnn_tflite ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/armnn_tflite) +# Install hwloc libraries +install( + DIRECTORY ${CMAKE_BINARY_DIR}/hwloc-prefix/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/armnn_tflite + FILES_MATCHING + PATTERN "*.so*") + if(ARMNN_DELEGATE_ENABLE) # Install ArmNN libraries and license install( diff --git a/src/config.h b/src/config.h index 225ebd6..3bf4f24 100644 --- a/src/config.h +++ b/src/config.h @@ -7,6 +7,7 @@ #include #include +#include // This class is used to map an optimizer option to an index in an array so // options can be sent across a tensorpipe payload diff --git a/src/tflite.cc b/src/tflite.cc index 4ea90ac..169c044 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -146,6 +146,9 @@ class ModelState : public BackendModel { // remote numa node id int remote_numa_node_id_ = 1; + // Map managing list of avail cpus in system, keyed on socket + std::unordered_map> avail_cpus_; + private: ModelState(TRITONBACKEND_Model* triton_model); TRITONSERVER_Error* AutoCompleteConfig(); @@ -177,6 +180,8 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) THROW_IF_BACKEND_MODEL_ERROR(InitConfig()); THROW_IF_BACKEND_MODEL_ERROR(LoadModel()); + PopulateCpusMap(avail_cpus_); + // Get the directory of the backend to find the path to the model instance // binary TRITONBACKEND_Backend* backend; diff --git a/src/tflite_utils.cc b/src/tflite_utils.cc index 40d4fd3..786440e 100644 --- a/src/tflite_utils.cc +++ b/src/tflite_utils.cc @@ -131,5 +131,23 @@ StringToIntVector(std::string const& s) return result; } +void +PopulateCpusMap(std::unordered_map>& cpus) +{ + hwloc_topology_t topology; + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + + int num_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); + for (int cpu_id = 0; cpu_id < num_cpus; ++cpu_id) { + hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, cpu_id); + if (obj) { + hwloc_bitmap_t nodeset = obj->nodeset; + cpus[hwloc_bitmap_first(nodeset)].push_back(cpu_id); + } + } + hwloc_topology_destroy(topology); +} + }}} // namespace triton::backend::tensorflowlite diff --git a/src/tflite_utils.h b/src/tflite_utils.h index 745c38e..e227539 100644 --- a/src/tflite_utils.h +++ b/src/tflite_utils.h @@ -6,6 +6,7 @@ #include +#include "hwloc.h" #include "tensorflow/lite/model.h" #include "triton/backend/backend_model.h" #include "triton/core/tritonserver.h" @@ -43,4 +44,6 @@ VectorToString(std::vector const& v) return ss.str(); } +void PopulateCpusMap(std::unordered_map>&); + }}} // namespace triton::backend::tensorflowlite From 3caac119cd2b39efd887ace11acdb85c275e644e Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Fri, 18 Aug 2023 15:22:40 -0500 Subject: [PATCH 28/33] Fix bug in validating arbitrary batch sizes Signed-off-by: Josh Minor --- src/tflite.cc | 16 +++++++++------- 1 file changed, 9 
insertions(+), 7 deletions(-) diff --git a/src/tflite.cc b/src/tflite.cc index 169c044..e148f52 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -680,7 +680,8 @@ ModelState::ValidateModelConfig() "' for model '" + Name() + "'")); // Validate input shape matches expected from model - TfLiteIntArray* tflite_dims = interpreter->tensor(inputs[i])->dims; + const TfLiteIntArray* tflite_dims = + interpreter->tensor(inputs[i])->dims_signature; std::vector model_input_shape( tflite_dims->data, tflite_dims->data + tflite_dims->size); @@ -694,10 +695,10 @@ ModelState::ValidateModelConfig() RETURN_IF_ERROR(ParseShape(io, "dims", &config_input_shape)); } if (max_batch_size_ > 0) { - // if batching is supported, you tflite doesn't encode -1 as - // the dim like tf does, it's just a 1. So just insert a 1 as the - // batch dim for the config input shape to see if it lines up - config_input_shape.insert(config_input_shape.begin(), 1); + // if batching is supported, tflite encodes -1 as the signature dim like + // tf does. So just insert a -1 as the batch dim for the config input + // shape to see if it lines up + config_input_shape.insert(config_input_shape.begin(), -1); } if (config_input_shape != model_input_shape) { return TRITONSERVER_ErrorNew( @@ -748,7 +749,8 @@ ModelState::ValidateModelConfig() "' for model '" + Name() + "'")); // Validate output shape matches expected from model - TfLiteIntArray* tflite_dims = interpreter->tensor(outputs[i])->dims; + const TfLiteIntArray* tflite_dims = + interpreter->tensor(outputs[i])->dims_signature; std::vector model_output_shape( tflite_dims->data, tflite_dims->data + tflite_dims->size); @@ -762,7 +764,7 @@ ModelState::ValidateModelConfig() RETURN_IF_ERROR(ParseShape(io, "dims", &config_output_shape)); } if (max_batch_size_ > 0) { - config_output_shape.insert(config_output_shape.begin(), 1); + config_output_shape.insert(config_output_shape.begin(), -1); } RETURN_ERROR_IF_TRUE( config_output_shape != model_output_shape, From aba3fa354173bf738b71784a1fee5d27be22c860 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 22 Aug 2023 11:38:06 -0500 Subject: [PATCH 29/33] Implement thread pinning feature Signed-off-by: Josh Minor --- src/model_instance/model_instance.cc | 31 +++++- src/model_instance/model_instance.h | 6 + src/model_instance/model_instance_utils.h | 9 ++ src/model_instance/papi_profiler.cc | 17 ++- src/tflite.cc | 129 ++++++++++++++++++++-- 5 files changed, 172 insertions(+), 20 deletions(-) diff --git a/src/model_instance/model_instance.cc b/src/model_instance/model_instance.cc index 31cbd5b..6ec9d9e 100644 --- a/src/model_instance/model_instance.cc +++ b/src/model_instance/model_instance.cc @@ -5,6 +5,8 @@ #include "model_instance.h" +#include + #include #include @@ -57,6 +59,13 @@ ModelInstance::BuildInterpreter(tensorpipe::Descriptor descriptor) return kTfLiteError; } + // Get range of cpus to pin to if specified + for (std::vector::iterator it = + descriptor.payloads.begin() + OptimizerOption::COUNT + 1; + it < descriptor.payloads.end(); it++) { + cpus_.push_back(std::stoi(it->metadata)); + } + // Set numa parameters numa_alloc_policy_ = AllocationPolicyFromString( descriptor.payloads[OptimizerOption::NUMA_ALLOC_POLICY].metadata); @@ -372,13 +381,29 @@ ModelInstance::Infer(tensorpipe::Descriptor& descriptor) if (interpreter_->Invoke() != kTfLiteOk) { success = false; } else { -#ifdef PAPI_PROFILING_ENABLE - // After the first inference, all threads should be alive to profile + // After the first inference, all threads should be alive if 
(first_inference_) { +#ifdef PAPI_PROFILING_ENABLE papi_profiler_ = MaybeCreatePapiProfiler(); interpreter_->AddProfiler(papi_profiler_.get()); - } #endif // PAPI_PROFILING_ENABLE + + // If cpus are specified pin the inference threads + if (cpus_.size() > 0) { + int i = 0; + for (pid_t& tid : InferenceThreadIds()) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + // Selected cpu loops around if more threads than cpus + CPU_SET(cpus_[i++ % cpus_.size()], &cpuset); + int rc = sched_setaffinity(tid, sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + std::cout << "Error calling sched_setaffinity: " << rc + << "\n"; + } + } + } + } } first_inference_ = false; diff --git a/src/model_instance/model_instance.h b/src/model_instance/model_instance.h index 7a16c2a..52dd9ee 100644 --- a/src/model_instance/model_instance.h +++ b/src/model_instance/model_instance.h @@ -72,6 +72,9 @@ class ModelInstance { // remote numa node id int remote_numa_node_id_ = 1; + // thread ids + std::vector inference_thread_ids_; + #ifdef LIBNUMA_ENABLE // Initalize numa policy for this model void InitNuma(int local_node_id, int remote_node_id); @@ -107,6 +110,9 @@ class ModelInstance { // Tensorpipe response message we can reuse to write outputs into tensorpipe::Message tp_response_msg_; + // CPU Range + std::vector cpus_; + #ifdef PAPI_PROFILING_ENABLE std::unique_ptr papi_profiler_; #endif // PAPI_PROFILING_ENABLE diff --git a/src/model_instance/model_instance_utils.h b/src/model_instance/model_instance_utils.h index 209fc06..edc3a35 100644 --- a/src/model_instance/model_instance_utils.h +++ b/src/model_instance/model_instance_utils.h @@ -46,6 +46,15 @@ CurrentThreadIds() return r; } +inline std::vector +InferenceThreadIds() +{ + // We only care about the 4th thread in the process on, as these are used + // for inference + std::vector current_threads = CurrentThreadIds(); + return std::vector(current_threads.begin() + 3, current_threads.end()); +} + inline void LogThreads() { diff --git a/src/model_instance/papi_profiler.cc b/src/model_instance/papi_profiler.cc index 777ae60..aeba119 100644 --- a/src/model_instance/papi_profiler.cc +++ b/src/model_instance/papi_profiler.cc @@ -41,18 +41,15 @@ class PapiProfiler : public tflite::Profiler { public: PapiProfiler( const std::vector& papi_events, - const std::vector& papi_uncore_events) + const std::vector& papi_uncore_events, + const std::vector inf_thread_ids) : supported_event_types_( static_cast(EventType::DELEGATE_OPERATOR_INVOKE_EVENT) + static_cast(EventType::OPERATOR_INVOKE_EVENT)), - papi_events_(papi_events), papi_uncore_events_(papi_uncore_events) + papi_events_(papi_events), papi_uncore_events_(papi_uncore_events), + inf_thread_ids_(inf_thread_ids) { - // We only care about the 4th thread in the process on, as these are used - // for inference - std::vector current_threads = CurrentThreadIds(); - inf_thread_ids_ = - std::vector(current_threads.begin() + 3, current_threads.end()); - + // Reserve space for recording the data ahead of time papi_regions_.reserve(1000); timings_.reserve(1000); @@ -343,6 +340,6 @@ MaybeCreatePapiProfiler() return nullptr; } - return std::unique_ptr( - new PapiProfiler(papi_events_vec, papi_uncore_events_vec)); + return std::unique_ptr(new PapiProfiler( + papi_events_vec, papi_uncore_events_vec, InferenceThreadIds())); } diff --git a/src/tflite.cc b/src/tflite.cc index e148f52..27c5875 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,33 @@ namespace 
triton { namespace backend { namespace tensorflowlite { +// Custom object to store global state for this backend +struct ArmNNTFLiteBackendState { + // Map managing list of avail cpus in system, keyed on socket + // TODO: Change this to a bitmap + std::unordered_map> avail_cpus_; + std::unordered_map> used_cpus_; + + explicit ArmNNTFLiteBackendState(const std::vector cpus_to_use) + { + // Start with list of all available CPUs on system + PopulateCpusMap(avail_cpus_); + + // If we have a cpu restriction, modify avail_cpus accordingly + if (!cpus_to_use.empty()) { + for (auto& [socket_id, cpus] : avail_cpus_) { + std::vector valid_cpus; + std::set_union( + cpus_to_use.begin(), cpus_to_use.end(), cpus.begin(), cpus.end(), + std::back_inserter(valid_cpus)); + cpus = std::move(valid_cpus); + } + } + } + + ~ArmNNTFLiteBackendState() {} +}; + // // ModelState // @@ -119,6 +147,9 @@ class ModelState : public BackendModel { std::unordered_map output_dtype_map_; std::unordered_map> output_shape_map_; + // Pointer to shared backend state + std::shared_ptr backend_state_; + // The pointer to the tflite network std::unique_ptr model_; @@ -146,9 +177,6 @@ class ModelState : public BackendModel { // remote numa node id int remote_numa_node_id_ = 1; - // Map managing list of avail cpus in system, keyed on socket - std::unordered_map> avail_cpus_; - private: ModelState(TRITONBACKEND_Model* triton_model); TRITONSERVER_Error* AutoCompleteConfig(); @@ -168,6 +196,16 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) RETURN_IF_ERROR(ex.err_); } + TRITONBACKEND_Backend* backend; + RETURN_IF_ERROR(TRITONBACKEND_ModelBackend(triton_model, &backend)); + void* vbackendstate; + RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vbackendstate)); + RETURN_ERROR_IF_TRUE( + vbackendstate == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr state in TRITONBACKEND_ModelInitialize")); + (*state)->backend_state_.reset( + reinterpret_cast(vbackendstate)); + return nullptr; // success } @@ -180,8 +218,6 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) THROW_IF_BACKEND_MODEL_ERROR(InitConfig()); THROW_IF_BACKEND_MODEL_ERROR(LoadModel()); - PopulateCpusMap(avail_cpus_); - // Get the directory of the backend to find the path to the model instance // binary TRITONBACKEND_Backend* backend; @@ -945,8 +981,21 @@ ModelInstanceState::LaunchModelInstance() std::string("shm://") + model_instance_name_}; #ifdef LIBNUMA_ENABLE - // Model instance will always be pinned to numa node set as local, it's the - // membinding we change + // CPUS affinity always set to local node + std::vector& avail_cpus = + model_state_->backend_state_ + ->avail_cpus_[model_state_->local_numa_node_id_]; + model_state_->backend_state_->used_cpus_[model_instance_name_] = + std::vector( + avail_cpus.begin(), + avail_cpus.begin() + model_state_->tflite_num_threads_); + model_state_->backend_state_->avail_cpus_[model_state_->local_numa_node_id_] + .erase( + avail_cpus.begin(), + avail_cpus.begin() + model_state_->tflite_num_threads_); + + // Model instance will always be pinned to numa node set as local, it's + // the membinding we change switch (model_state_->numa_alloc_policy_) { case AllocationPolicy::LOCAL: case AllocationPolicy::WEIGHT_REMOTE_RESULT_LOCAL: @@ -971,6 +1020,16 @@ ModelInstanceState::LaunchModelInstance() break; } } +#else + model_state_->backend_state_->used_cpus_[model_instance_name_] = + std::vector( + model_state_->backend_state_->avail_cpus_[0].begin(), + 
model_state_->backend_state_->avail_cpus_[0].begin() + + model_state_->tflite_num_threads_); + model_state_->backend_state_->avail_cpus_[0].erase( + model_state_->backend_state_->avail_cpus_[0].begin(), + model_state_->backend_state_->avail_cpus_[0].begin() + + model_state_->tflite_num_threads_); #endif // LIBNUMA_ENABLE // We have the model_instance process inherit the parent's standard streams @@ -1151,6 +1210,12 @@ ModelInstanceState::SendModel() tp_msg.payloads[OptimizerOption::ARMNN_GPU_REDUCE_FP32_TO_FP16] = gen_metadata(model_state_->armnn_gpu_reduce_fp32_to_fp16_); + + // The rest of the remaining spots will go to what cpus to use for inference + for (auto& cpuid : + model_state_->backend_state_->used_cpus_[model_instance_name_]) { + tp_msg.payloads.push_back(gen_metadata(std::to_string(cpuid))); + } #endif // ARMNN_DELEGATE_ENABLE // Write the message @@ -1630,6 +1695,56 @@ extern "C" { TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { + // The backend configuration may contain information needed by the + // backend, such a command-line arguments. + TRITONSERVER_Message* backend_config_message; + RETURN_IF_ERROR( + TRITONBACKEND_BackendConfig(backend, &backend_config_message)); + const char* buffer; + size_t byte_size; + RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( + backend_config_message, &buffer, &byte_size)); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("backend configuration:\n") + buffer).c_str()); + triton::common::TritonJson::Value backend_config; + if (byte_size != 0) { + RETURN_IF_ERROR(backend_config.Parse(buffer, byte_size)); + } + triton::common::TritonJson::Value cmdline; + std::vector cpus_to_use; + if (backend_config.Find("cmdline", &cmdline)) { + triton::common::TritonJson::Value value; + std::string value_str; + if (cmdline.Find("cpus", &value)) { + RETURN_IF_ERROR(value.AsString(&value_str)); + std::stringstream ss(value_str); + std::vector range(2); + int i = 0; + while (ss.good()) { + std::string substr; + std::getline(ss, substr, '-'); + // Get range of cpu values + range[i++] = std::stoi(substr); + } + cpus_to_use.resize(range[1] - range[0] + 1); + std::iota(cpus_to_use.begin(), cpus_to_use.end(), range[0]); + } + } + + // If we have any global backend state we create and set it here + try { + ArmNNTFLiteBackendState* state = new ArmNNTFLiteBackendState(cpus_to_use); + RETURN_IF_ERROR( + TRITONBACKEND_BackendSetState(backend, reinterpret_cast(state))); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + const char* cname; RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); std::string name(cname); From 7367f6b03abd711d78f0f1d4911bbcff7df96f43 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 22 Aug 2023 11:49:28 -0500 Subject: [PATCH 30/33] Give back threads to avail cpus Signed-off-by: Josh Minor --- src/tflite.cc | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/tflite.cc b/src/tflite.cc index 27c5875..aaaae22 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -936,6 +936,16 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { + // Give back cpus to avail_cpus backend state object + std::vector& avail_cpus = + model_state_->backend_state_ + ->avail_cpus_[model_state_->local_numa_node_id_]; + avail_cpus.insert( + avail_cpus.begin(), + 
model_state_->backend_state_->used_cpus_[model_instance_name_].begin(), + model_state_->backend_state_->used_cpus_[model_instance_name_].end()); + + // Cleanup tensorpipe and reproc process pipe_->close(); listener_->close(); reproc::stop_actions stop = { @@ -980,19 +990,19 @@ ModelInstanceState::LaunchModelInstance() std::string(model_state_->model_instance_location_) + "/model_instance", std::string("shm://") + model_instance_name_}; + std::vector& avail_cpus = model_state_->backend_state_->avail_cpus_[0]; + #ifdef LIBNUMA_ENABLE // CPUS affinity always set to local node - std::vector& avail_cpus = - model_state_->backend_state_ - ->avail_cpus_[model_state_->local_numa_node_id_]; + avail_cpus = model_state_->backend_state_ + ->avail_cpus_[model_state_->local_numa_node_id_]; model_state_->backend_state_->used_cpus_[model_instance_name_] = std::vector( avail_cpus.begin(), avail_cpus.begin() + model_state_->tflite_num_threads_); - model_state_->backend_state_->avail_cpus_[model_state_->local_numa_node_id_] - .erase( - avail_cpus.begin(), - avail_cpus.begin() + model_state_->tflite_num_threads_); + avail_cpus.erase( + avail_cpus.begin(), + avail_cpus.begin() + model_state_->tflite_num_threads_); // Model instance will always be pinned to numa node set as local, it's // the membinding we change @@ -1023,13 +1033,11 @@ ModelInstanceState::LaunchModelInstance() #else model_state_->backend_state_->used_cpus_[model_instance_name_] = std::vector( - model_state_->backend_state_->avail_cpus_[0].begin(), - model_state_->backend_state_->avail_cpus_[0].begin() + - model_state_->tflite_num_threads_); - model_state_->backend_state_->avail_cpus_[0].erase( - model_state_->backend_state_->avail_cpus_[0].begin(), - model_state_->backend_state_->avail_cpus_[0].begin() + - model_state_->tflite_num_threads_); + avail_cpus.begin(), + avail_cpus.begin() + model_state_->tflite_num_threads_); + avail_cpus.erase( + avail_cpus.begin(), + avail_cpus.begin() + model_state_->tflite_num_threads_); #endif // LIBNUMA_ENABLE // We have the model_instance process inherit the parent's standard streams From 0b47640b31722095ca1f825f5475769296d6fa4a Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Tue, 22 Aug 2023 15:13:41 -0500 Subject: [PATCH 31/33] Fix issue with model unloading Signed-off-by: Josh Minor --- src/tflite.cc | 67 +++++++++++++++++++++------------------------ src/tflite_utils.cc | 13 +++++++-- 2 files changed, 41 insertions(+), 39 deletions(-) diff --git a/src/tflite.cc b/src/tflite.cc index aaaae22..ca76412 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -85,8 +85,6 @@ struct ArmNNTFLiteBackendState { } } } - - ~ArmNNTFLiteBackendState() {} }; // @@ -148,7 +146,7 @@ class ModelState : public BackendModel { std::unordered_map> output_shape_map_; // Pointer to shared backend state - std::shared_ptr backend_state_; + ArmNNTFLiteBackendState* backend_state_; // The pointer to the tflite network std::unique_ptr model_; @@ -203,8 +201,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) RETURN_ERROR_IF_TRUE( vbackendstate == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr state in TRITONBACKEND_ModelInitialize")); - (*state)->backend_state_.reset( - reinterpret_cast(vbackendstate)); + (*state)->backend_state_ = + reinterpret_cast(vbackendstate); return nullptr; // success } @@ -936,15 +934,6 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { - // Give back cpus to avail_cpus backend state object - std::vector& avail_cpus = - 
model_state_->backend_state_ - ->avail_cpus_[model_state_->local_numa_node_id_]; - avail_cpus.insert( - avail_cpus.begin(), - model_state_->backend_state_->used_cpus_[model_instance_name_].begin(), - model_state_->backend_state_->used_cpus_[model_instance_name_].end()); - // Cleanup tensorpipe and reproc process pipe_->close(); listener_->close(); @@ -960,6 +949,15 @@ ModelInstanceState::~ModelInstanceState() if (ec) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to stop child process"); } + + // Give back cpus to avail_cpus backend state object + std::vector& avail_cpus = + model_state_->backend_state_ + ->avail_cpus_[model_state_->local_numa_node_id_]; + avail_cpus.insert( + avail_cpus.begin(), + model_state_->backend_state_->used_cpus_[model_instance_name_].begin(), + model_state_->backend_state_->used_cpus_[model_instance_name_].end()); } TRITONSERVER_Error* @@ -990,20 +988,7 @@ ModelInstanceState::LaunchModelInstance() std::string(model_state_->model_instance_location_) + "/model_instance", std::string("shm://") + model_instance_name_}; - std::vector& avail_cpus = model_state_->backend_state_->avail_cpus_[0]; - #ifdef LIBNUMA_ENABLE - // CPUS affinity always set to local node - avail_cpus = model_state_->backend_state_ - ->avail_cpus_[model_state_->local_numa_node_id_]; - model_state_->backend_state_->used_cpus_[model_instance_name_] = - std::vector( - avail_cpus.begin(), - avail_cpus.begin() + model_state_->tflite_num_threads_); - avail_cpus.erase( - avail_cpus.begin(), - avail_cpus.begin() + model_state_->tflite_num_threads_); - // Model instance will always be pinned to numa node set as local, it's // the membinding we change switch (model_state_->numa_alloc_policy_) { @@ -1030,16 +1015,25 @@ ModelInstanceState::LaunchModelInstance() break; } } -#else - model_state_->backend_state_->used_cpus_[model_instance_name_] = - std::vector( - avail_cpus.begin(), - avail_cpus.begin() + model_state_->tflite_num_threads_); - avail_cpus.erase( - avail_cpus.begin(), - avail_cpus.begin() + model_state_->tflite_num_threads_); #endif // LIBNUMA_ENABLE + // CPUS affinity always set to local node + std::vector& avail_cpus = + model_state_->backend_state_ + ->avail_cpus_[model_state_->local_numa_node_id_]; + + RETURN_ERROR_IF_TRUE( + avail_cpus.empty(), TRITONSERVER_ERROR_INTERNAL, + std::string("not enough cpus left in system to pin on.")); + + // Assign cpus with max assignment being all cpus if thread count > num cores + int end_idx = std::min( + static_cast(model_state_->tflite_num_threads_), + static_cast(avail_cpus.size())); + model_state_->backend_state_->used_cpus_[model_instance_name_] = + std::vector(avail_cpus.begin(), avail_cpus.begin() + end_idx); + avail_cpus.erase(avail_cpus.begin(), avail_cpus.begin() + end_idx); + // We have the model_instance process inherit the parent's standard streams // so the it reads directly from the stdin and writes directly to the // stdout/stderr triton. 
@@ -1111,6 +1105,7 @@ ModelInstanceState::LaunchModelInstance() std::string( "Model instance failed: process did not connect back to parent")); + // Send the model across the wire to the instance SendModel(); return nullptr; @@ -1218,13 +1213,13 @@ ModelInstanceState::SendModel() tp_msg.payloads[OptimizerOption::ARMNN_GPU_REDUCE_FP32_TO_FP16] = gen_metadata(model_state_->armnn_gpu_reduce_fp32_to_fp16_); +#endif // ARMNN_DELEGATE_ENABLE // The rest of the remaining spots will go to what cpus to use for inference for (auto& cpuid : model_state_->backend_state_->used_cpus_[model_instance_name_]) { tp_msg.payloads.push_back(gen_metadata(std::to_string(cpuid))); } -#endif // ARMNN_DELEGATE_ENABLE // Write the message auto done = std::make_shared>(); diff --git a/src/tflite_utils.cc b/src/tflite_utils.cc index 786440e..d380c24 100644 --- a/src/tflite_utils.cc +++ b/src/tflite_utils.cc @@ -138,12 +138,19 @@ PopulateCpusMap(std::unordered_map>& cpus) hwloc_topology_init(&topology); hwloc_topology_load(topology); - int num_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); - for (int cpu_id = 0; cpu_id < num_cpus; ++cpu_id) { + int num_logical_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); + int smt_threads_per_core = + num_logical_cpus / hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); + for (int cpu_id = 0; cpu_id < num_logical_cpus; ++cpu_id) { hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, cpu_id); if (obj) { hwloc_bitmap_t nodeset = obj->nodeset; - cpus[hwloc_bitmap_first(nodeset)].push_back(cpu_id); + if (cpu_id % smt_threads_per_core) { + cpus[hwloc_bitmap_first(nodeset)].push_back(cpu_id); + } else { + cpus[hwloc_bitmap_first(nodeset)].insert( + cpus[hwloc_bitmap_first(nodeset)].begin(), cpu_id); + } } } hwloc_topology_destroy(topology); From 5c7ff58cfdfe0b9a7842c22c8fe7e8b94c6cf4e8 Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 23 Aug 2023 09:50:23 -0500 Subject: [PATCH 32/33] Add flag to control thread pinning Signed-off-by: Josh Minor --- src/tflite.cc | 71 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/src/tflite.cc b/src/tflite.cc index ca76412..272ae71 100644 --- a/src/tflite.cc +++ b/src/tflite.cc @@ -172,9 +172,12 @@ class ModelState : public BackendModel { // Local numa node id int local_numa_node_id_ = 0; - // remote numa node id + // Remote numa node id int remote_numa_node_id_ = 1; + // pin threads + bool pin_threads_ = false; + private: ModelState(TRITONBACKEND_Model* triton_model); TRITONSERVER_Error* AutoCompleteConfig(); @@ -258,6 +261,29 @@ ModelState::InitConfig() } } + // Handle pin_threads parameter + err = GetParameterValue(params, "pin_threads", &value_str); + // pin_threads is not required so clear error if not found + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (value_str == "on") { + pin_threads_ = true; + } else if (value_str == "off") { + pin_threads_ = false; + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("parameter 'pin_threads' must be 'on' or 'off' ") + + Name() + "'") + .c_str()); + } + } + // Handle numa parameters err = GetParameterValue(params, "numa_alloc_policy", &value_str); @@ -1017,22 +1043,25 @@ ModelInstanceState::LaunchModelInstance() } #endif // LIBNUMA_ENABLE - // CPUS affinity always set to local node - std::vector& avail_cpus = - model_state_->backend_state_ - 
->avail_cpus_[model_state_->local_numa_node_id_]; - - RETURN_ERROR_IF_TRUE( - avail_cpus.empty(), TRITONSERVER_ERROR_INTERNAL, - std::string("not enough cpus left in system to pin on.")); + if (model_state_->pin_threads_) { + // CPUS affinity always set to local node + std::vector& avail_cpus = + model_state_->backend_state_ + ->avail_cpus_[model_state_->local_numa_node_id_]; - // Assign cpus with max assignment being all cpus if thread count > num cores - int end_idx = std::min( - static_cast(model_state_->tflite_num_threads_), - static_cast(avail_cpus.size())); - model_state_->backend_state_->used_cpus_[model_instance_name_] = - std::vector(avail_cpus.begin(), avail_cpus.begin() + end_idx); - avail_cpus.erase(avail_cpus.begin(), avail_cpus.begin() + end_idx); + RETURN_ERROR_IF_TRUE( + avail_cpus.empty(), TRITONSERVER_ERROR_INTERNAL, + std::string("not enough cpus left in system to pin on.")); + + // Assign cpus with max assignment being all cpus if thread count > num + // cores + int end_idx = std::min( + static_cast(model_state_->tflite_num_threads_), + static_cast(avail_cpus.size())); + model_state_->backend_state_->used_cpus_[model_instance_name_] = + std::vector(avail_cpus.begin(), avail_cpus.begin() + end_idx); + avail_cpus.erase(avail_cpus.begin(), avail_cpus.begin() + end_idx); + } // We have the model_instance process inherit the parent's standard streams // so the it reads directly from the stdin and writes directly to the @@ -1215,10 +1244,12 @@ ModelInstanceState::SendModel() gen_metadata(model_state_->armnn_gpu_reduce_fp32_to_fp16_); #endif // ARMNN_DELEGATE_ENABLE - // The rest of the remaining spots will go to what cpus to use for inference - for (auto& cpuid : - model_state_->backend_state_->used_cpus_[model_instance_name_]) { - tp_msg.payloads.push_back(gen_metadata(std::to_string(cpuid))); + if (model_state_->pin_threads_) { + // The rest of the remaining spots will go to what cpus to use for inference + for (auto& cpuid : + model_state_->backend_state_->used_cpus_[model_instance_name_]) { + tp_msg.payloads.push_back(gen_metadata(std::to_string(cpuid))); + } } // Write the message From 6e875e42bb6296b30d2eb5d47696be82de4edf5d Mon Sep 17 00:00:00 2001 From: Josh Minor Date: Wed, 23 Aug 2023 11:13:21 -0500 Subject: [PATCH 33/33] Fix thread pinning strat Signed-off-by: Josh Minor --- src/tflite_utils.cc | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/tflite_utils.cc b/src/tflite_utils.cc index d380c24..b2ab3b8 100644 --- a/src/tflite_utils.cc +++ b/src/tflite_utils.cc @@ -138,23 +138,27 @@ PopulateCpusMap(std::unordered_map>& cpus) hwloc_topology_init(&topology); hwloc_topology_load(topology); - int num_logical_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); - int smt_threads_per_core = - num_logical_cpus / hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); - for (int cpu_id = 0; cpu_id < num_logical_cpus; ++cpu_id) { - hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, cpu_id); - if (obj) { - hwloc_bitmap_t nodeset = obj->nodeset; - if (cpu_id % smt_threads_per_core) { - cpus[hwloc_bitmap_first(nodeset)].push_back(cpu_id); - } else { - cpus[hwloc_bitmap_first(nodeset)].insert( - cpus[hwloc_bitmap_first(nodeset)].begin(), cpu_id); + int num_phys_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); + for (int i = 0; i < num_phys_cpus; ++i) { + hwloc_obj_t core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i); + if (core) { + hwloc_bitmap_t nodeset = core->nodeset; + for (unsigned int j 
= 0; j < core->arity; ++j) { + unsigned int cpu_id = core->children[j]->os_index; + // First insert first thread of cpu near front of list, then push all + // its children back + if (j == 0) { + cpus[hwloc_bitmap_first(nodeset)].insert( + cpus[hwloc_bitmap_first(nodeset)].begin() + + cpus[hwloc_bitmap_first(nodeset)].size() / core->arity, + cpu_id); + } else { + cpus[hwloc_bitmap_first(nodeset)].push_back(cpu_id); + } } } } + hwloc_topology_destroy(topology); } - - }}} // namespace triton::backend::tensorflowlite
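A note on the ordering the reworked `PopulateCpusMap` produces: for each NUMA node it ends up listing one hardware thread per physical core first and the remaining SMT siblings after them, so thread pinning spreads model instances across distinct physical cores before doubling up on hyperthread siblings. The following standalone sketch is illustrative only (the file name `print_cpu_order.cc` is made up, and it assumes hwloc 2.x headers and linking with `-lhwloc`); it prints that ordering for the current machine so it can be compared against `lstopo` output.

```
// print_cpu_order.cc -- illustrative sketch only, not part of this backend.
// Prints, per NUMA node, the PU ordering used for thread pinning:
// one hardware thread per physical core first, SMT siblings afterwards.
#include <hwloc.h>

#include <cstdio>
#include <map>
#include <vector>

int
main()
{
  hwloc_topology_t topology;
  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  // Collect primary hardware threads and SMT siblings separately, keyed on
  // the NUMA node the physical core belongs to
  std::map<int, std::vector<unsigned>> primary, siblings;
  int num_phys_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
  for (int i = 0; i < num_phys_cpus; ++i) {
    hwloc_obj_t core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i);
    if (!core) {
      continue;
    }
    int node = hwloc_bitmap_first(core->nodeset);
    // First child is the core's primary hardware thread, the rest are its
    // SMT siblings
    for (unsigned j = 0; j < core->arity; ++j) {
      (j == 0 ? primary : siblings)[node].push_back(
          core->children[j]->os_index);
    }
  }

  for (auto& [node, cpus] : primary) {
    std::printf("node %d:", node);
    for (unsigned cpu : cpus) {
      std::printf(" %u", cpu);
    }
    for (unsigned cpu : siblings[node]) {
      std::printf(" %u", cpu);
    }
    std::printf("\n");
  }

  hwloc_topology_destroy(topology);
  return 0;
}
```

Compiled with e.g. `g++ -std=c++17 print_cpu_order.cc -lhwloc`, the output should list each node's primary hardware threads followed by their SMT siblings, which should match the order the backend hands out to pinned model instances.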