[Allocator] Fix OOM issues in non-DPS mode (#498)

jhalakpatel · web-flow · commit ac1fe16bf514 · 2025-03-04T13:03:21.000-07:00
Summary:
- Restore querying of `rankOffset` from the output descriptor, which was
inadvertently removed in a previous merge. Update the related test
`tensorrt-runtime-to-executor.mlir`.
- Enhance debugging logs to provide clearer error messages.
- Revert `Allocator::track` to its previous implementation for cases
where the incoming pointer is not managed internally by `Allocator`.
- Modify `Allocator::track` to correctly handle cases where an incoming
pointer is already managed internally. This adjustment is necessary as
`Allocator` now tracks pointers internally when they are returned as
function results.
- Refine `Allocator::safeDeallocate` to ensure it only releases pointers
that are managed internally.
- Correct a typo in `Allocator::safeDeallocate` for
`PointerType::pinned_host`, which previously caused an error. That case now
logs a message deferring memory deallocation to `PinnedMemoryAllocator`.
- Address an issue in external memref creation by avoiding retracking of
pointers already managed by `Allocator`. This prevents them from being
redundantly tracked as externally managed.
- Ensure that when populating arguments for an Enqueue function, the
session allocator tracks pointers as internally managed if they are
managed by the client allocator. Ensure the session tracker does not
assume ownership for deallocation.
- Prevent `~Allocator()` from releasing pointers that have already been
released internally.

Signed-off-by: Jhalak Patel <jhalakp@nvidia.com>
diff --git a/mlir-tensorrt/compiler/lib/Conversion/TensorRTRuntimeToExecutor/TensorRTRuntimeToExecutor.cpp b/mlir-tensorrt/compiler/lib/Conversion/TensorRTRuntimeToExecutor/TensorRTRuntimeToExecutor.cpp
@@ -375,6 +375,10 @@ struct ConvertEnqueueAllocToCall
     for (auto [idx, result] : llvm::enumerate(op.getResults())) {
       MemRefType memrefType = cast<MemRefType>(result.getType());
       unsigned rank = memrefType.getRank();
+
+      // Skip the rank offset that is populated by the callee.
+      outputDescOffset++;
+
       Value devicePtrOffset = b.create<executor::GetOffsetOp>(
           i64Type, structType,
           ArrayRef<OpFoldResult>{
diff --git a/mlir-tensorrt/compiler/test/Conversion/TensorRTRuntimeToExecutor/tensorrt-runtime-to-executor.mlir b/mlir-tensorrt/compiler/test/Conversion/TensorRTRuntimeToExecutor/tensorrt-runtime-to-executor.mlir
@@ -79,24 +79,26 @@ func.func @convert_enqueue_alloc(%arg0: memref<?xf32, #device>,
 //       CHECK:     %[[v12:.+]] = executor.table.get %[[v0]][3] : <!executor.ptr<device>, !executor.ptr<device>, i64, i64, i64>
 //       CHECK:     %[[v13:.+]] = executor.table.create(%[[v8]], %[[c0_i64]], %[[c2_i64]], %[[v9]], %[[v10]], %[[v11]], %[[c0_i64]], %[[c1_i64]], %[[v12]] : !executor.ptr<device>, i64, i64, i64, i64, !executor.ptr<device>, i64, i64, i64) : <!executor.ptr<device>, i64, i64, i64, i64, !executor.ptr<device>, i64, i64, i64>
 //       CHECK:     executor.call @_trtrt_enqueue_alloc(%[[v3]], %[[v2]], %[[v4]], %[[v13]]) : (!executor.ptr<host>, !executor.ptr<host>, !executor.ptr<host>, !executor.table<!executor.ptr<device>, i64, i64, i64, i64, !executor.ptr<device>, i64, i64, i64>) -> ()
-//       CHECK:     %[[v14:.+]] = executor.load %[[v4]] + %[[v6]] : (!executor.ptr<host>, i64) -> i64
-//       CHECK:     %[[v15:.+]] = executor.inttoptr %[[v14]] : (i64) -> !executor.ptr<device>
-//       CHECK:     %[[v16:.+]] = executor.getoffset[0, 2] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
-//       CHECK:     %[[v17:.+]] = executor.load %[[v4]] + %[[v16]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v14:.+]] = executor.getoffset[0, 2] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
+//       CHECK:     %[[v15:.+]] = executor.load %[[v4]] + %[[v14]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v16:.+]] = executor.inttoptr %[[v15]] : (i64) -> !executor.ptr<device>
 //       CHECK:     %[[v18:.+]] = executor.getoffset[0, 3] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
 //       CHECK:     %[[v19:.+]] = executor.load %[[v4]] + %[[v18]] : (!executor.ptr<host>, i64) -> i64
-//       CHECK:     %[[v20:.+]] = executor.table.create(%[[v15]], %[[v15]], %[[c0_i64]], %[[v17]], %[[v19]] : !executor.ptr<device>, !executor.ptr<device>, i64, i64, i64) : <!executor.ptr<device>, !executor.ptr<device>, i64, i64, i64>
-//       CHECK:     %[[v21:.+]] = builtin.unrealized_conversion_cast %[[v20]] : !executor.table<!executor.ptr<device>, !executor.ptr<device>, i64, i64, i64> to memref<?xf32, #executor.memory_type<device>>
-//       CHECK:     %[[v22:.+]] = executor.getoffset[0, 4] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
-//       CHECK:     %[[v23:.+]] = executor.load %[[v4]] + %[[v22]] : (!executor.ptr<host>, i64) -> i64
-//       CHECK:     %[[v24:.+]] = executor.inttoptr %[[v23]] : (i64) -> !executor.ptr<host>
-//       CHECK:     %[[v25:.+]] = executor.load %[[v4]] + %[[v7]] : (!executor.ptr<host>, i64) -> i64
-//       CHECK:     %[[v26:.+]] = executor.getoffset[0, 6] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
-//       CHECK:     %[[v27:.+]] = executor.load %[[v4]] + %[[v26]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v20:.+]] = executor.getoffset[0, 4] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
+//       CHECK:     %[[v21:.+]] = executor.load %[[v4]] + %[[v20]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v22:.+]] = executor.table.create(%[[v16]], %[[v16]], %[[c0_i64]], %[[v19]], %[[v21]] : !executor.ptr<device>, !executor.ptr<device>, i64, i64, i64) : <!executor.ptr<device>, !executor.ptr<device>, i64, i64, i64>
+//       CHECK:     %[[v23:.+]] = builtin.unrealized_conversion_cast %[[v22]] : !executor.table<!executor.ptr<device>, !executor.ptr<device>, i64, i64, i64> to memref<?xf32, #executor.memory_type<device>>
+//       CHECK:     %[[v24:.+]] = executor.getoffset[0, 6] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
+//       CHECK:     %[[v25:.+]] = executor.load %[[v4]] + %[[v24]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v26:.+]] = executor.inttoptr %[[v25]] : (i64) -> !executor.ptr<host>
 //       CHECK:     %[[v28:.+]] = executor.getoffset[0, 7] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
 //       CHECK:     %[[v29:.+]] = executor.load %[[v4]] + %[[v28]] : (!executor.ptr<host>, i64) -> i64
 //       CHECK:     %[[v30:.+]] = executor.getoffset[0, 8] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
 //       CHECK:     %[[v31:.+]] = executor.load %[[v4]] + %[[v30]] : (!executor.ptr<host>, i64) -> i64
-//       CHECK:     %[[v32:.+]] = executor.table.create(%[[v24]], %[[v24]], %[[c0_i64]], %[[v25]], %[[v27]], %[[v29]], %[[v31]] : !executor.ptr<host>, !executor.ptr<host>, i64, i64, i64, i64, i64) : <!executor.ptr<host>, !executor.ptr<host>, i64, i64, i64, i64, i64>
-//       CHECK:     %[[v33:.+]] = builtin.unrealized_conversion_cast %[[v32]] : !executor.table<!executor.ptr<host>, !executor.ptr<host>, i64, i64, i64, i64, i64> to memref<?x?xf32, #executor.memory_type<host>>
-//       CHECK:     return %[[v21]], %[[v33]] : memref<?xf32, #executor.memory_type<device>>, memref<?x?xf32, #executor.memory_type<host>>
+//       CHECK:     %[[v32:.+]] = executor.getoffset[0, 9] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
+//       CHECK:     %[[v33:.+]] = executor.load %[[v4]] + %[[v32]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v34:.+]] = executor.getoffset[0, 10] : () -> i64, !executor.table<i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64>
+//       CHECK:     %[[v35:.+]] = executor.load %[[v4]] + %[[v34]] : (!executor.ptr<host>, i64) -> i64
+//       CHECK:     %[[v36:.+]] = executor.table.create(%[[v26]], %[[v26]], %[[c0_i64]], %[[v29]], %[[v31]], %[[v33]], %[[v35]] : !executor.ptr<host>, !executor.ptr<host>, i64, i64, i64, i64, i64) : <!executor.ptr<host>, !executor.ptr<host>, i64, i64, i64, i64, i64>
+//       CHECK:     %[[v37:.+]] = builtin.unrealized_conversion_cast %[[v36]] : !executor.table<!executor.ptr<host>, !executor.ptr<host>, i64, i64, i64, i64, i64> to memref<?x?xf32, #executor.memory_type<host>>
+//       CHECK:     return %[[v23]], %[[v37]] : memref<?xf32, #executor.memory_type<device>>, memref<?x?xf32, #executor.memory_type<host>>
diff --git a/mlir-tensorrt/compiler/test/python/mlir_tensorrt_runtime/test_create_memref.py b/mlir-tensorrt/compiler/test/python/mlir_tensorrt_runtime/test_create_memref.py
@@ -397,24 +397,25 @@ def test_released_internally():
     arr = np.array([5.0, 4.0, 2.0])
 
     def memref_alloc():
+        # The data is externally owned, so the memref will not be released internally.
         memref = client.create_host_memref_view(
             int(arr.ctypes.data), shape=[3], dtype=runtime.ScalarTypeCode.f64
         )
         return np.from_dlpack(
             memref
-        )  # Ensure we have an externally reference to the pointer.
+        )  # Ensure we have an external reference to the pointer.
 
     _ = memref_alloc()
     print(
         "Memref released internally: ", client.is_released_internally(arr.ctypes.data)
     )
 
 
-print("Test memref is released internally with an external reference")
+print("Test memref is not released internally with an external reference")
 test_released_internally()
 
-# CHECK-LABEL: Test memref is released internally with an external reference
-# CHECK-NEXT: Memref released internally:  True
+# CHECK-LABEL: Test memref is not released internally with an external reference
+# CHECK-NEXT: Memref released internally:  False
 
 
 def test_memref_lifetime():
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h
@@ -689,7 +689,7 @@ class MemRefValue : public RuntimeValue {
 
   const std::optional<ScalarType> &getScalarType() const { return scalarType; }
 
-  RuntimeClient *getClient() { return client; }
+  RuntimeClient *getClient() const { return client; }
 
 private:
   MemRefValue(RuntimeClient *client, mlirtrt::runtime::PointerType addressSpace,
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Status.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Status.h
@@ -239,6 +239,17 @@ class StatusOr {
     }                                                                          \
   } while (false);
 
+#ifndef NDEBUG
+#define MTRT_ERROR_IF(errexpr, msg)                                            \
+  do {                                                                         \
+    if (errexpr) {                                                             \
+      llvm::report_fatal_error(msg);                                           \
+    }                                                                          \
+  } while (false);
+#else // In Release mode, compiles to a no-op.
+#define MTRT_ERROR_IF(errexpr, msg)
+#endif
+
 } // namespace mlirtrt
 
 #endif // MLIR_TENSORRT_SUPPORT_STATUS_H
diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
@@ -535,6 +535,8 @@ MTRT_Status mtrtGetScalarTypeCodeFromDLDataType(DLDataType dtype,
 
 static void dlpackManagedTensorDeleter(DLManagedTensor *tensor) {
   if (tensor) {
+    MTRT_DBGF("Deleting DLManagedTensor. Data pointer: %p",
+              tensor->dl_tensor.data);
     delete[] tensor->dl_tensor.shape;
     delete[] tensor->dl_tensor.strides;
     if (tensor->manager_ctx) {
diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp